Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison mergeGenomicMatrixFiles.py @ 3:12a1ea920524
Creating a tool to merge genomic datasets
author | melissacline |
---|---|
date | Wed, 11 Feb 2015 16:44:33 -0800 |
parents | |
children | d0674221a6ae |
comparison
equal
deleted
inserted
replaced
2:d1104ad3646a | 3:12a1ea920524 |
---|---|
1 import string,os,sys | |
2 | |
def header(samples, infile):
    """Register the sample names from the header row of ``infile``.

    The first line of ``infile`` is split on tabs.  The first field (the
    corner cell above the row labels) is skipped; every remaining field is
    treated as a sample name.  A name that is not yet a key of ``samples``
    is added with the current size of ``samples`` as its value, so repeated
    calls on several files extend the same index without renumbering.

    Args:
        samples: dict mapping sample name -> column index; updated in place.
        infile: path of a tab-delimited text file.

    Returns:
        None.
    """
    # ``with`` closes the file even if reading raises.
    with open(infile, 'r') as fin:
        first_line = fin.readline()

    # Remove only the line terminator.  Stripping all whitespace would also
    # eat a leading tab when the corner cell is empty, which shifts every
    # name one field to the left and drops the first sample.
    fields = first_line.rstrip('\r\n').split('\t')

    for sample in fields[1:]:
        if sample not in samples:
            samples[sample] = len(samples)
    return
12 | |
def process(genes, samples, dataMatrix, infile):
    """Merge the data rows of ``infile`` into ``dataMatrix``.

    The first line of ``infile`` is the header: field ``i`` (for ``i >= 1``)
    names the sample whose values sit in field ``i`` of every data row.
    Field 0 of each data row is the gene (row) identifier.  A gene seen for
    the first time is added to ``genes`` with the next free row index and a
    new row of empty strings, ``len(samples)`` wide, is appended to
    ``dataMatrix``.  Each value is then stored at
    ``dataMatrix[genes[gene]][samples[sample]]``; a value written earlier
    for the same cell is overwritten.

    Args:
        genes: dict mapping gene name -> row index; updated in place.
        samples: dict mapping sample name -> column index.  Every sample
            named in the header of ``infile`` must already be a key,
            otherwise ``KeyError`` is raised.
        dataMatrix: list of rows (lists of strings); updated in place.
        infile: path of a tab-delimited text file.

    Returns:
        None.

    Raises:
        KeyError: a header sample name is missing from ``samples``.
        IndexError: a data row has more fields than the header row.
    """
    width = len(samples)

    # ``with`` closes the file even if parsing raises.
    with open(infile, 'r') as fin:
        # Strip only the line terminator so that an empty corner cell
        # (a header starting with a tab) keeps the columns aligned.
        columns = fin.readline().rstrip('\r\n').split('\t')

        for raw in fin:
            # Drop the terminator only when one is present; slicing off the
            # last character unconditionally would corrupt the final value
            # of a file that does not end with a newline.
            line = raw.rstrip('\r\n')
            if not line:
                # Skip blank lines instead of treating the first one as
                # end-of-file, which would silently drop all later rows.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * width)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[columns[i]]] = data[i]
    return
41 | |
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to ``outfile`` as tab-delimited text.

    The first line is ``sample`` followed by the sample names ordered by
    their column index in ``samples``.  Each following line is a gene name
    followed by that gene's values, in the same sample order.  Genes are
    written in ascending order of their row index in ``genes`` so the
    output does not depend on dict iteration order.

    Args:
        dataMatrix: list of rows; ``dataMatrix[genes[g]][samples[s]]`` is
            the string value for gene ``g`` and sample ``s``.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene name -> row index.
        outfile: path of the file to create or overwrite.

    Returns:
        None.
    """
    # Invert the name -> index mapping into a list ordered by column index.
    ordered_samples = [""] * len(samples)
    for name, pos in samples.items():
        ordered_samples[pos] = name

    # ``with`` closes (and flushes) the file even if a write raises.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + ordered_samples) + "\n")

        # Sort by row index: plain dict iteration order is arbitrary on
        # Python 2, which would scramble the gene rows between runs.
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            cells = [row[samples[name]] for name in ordered_samples]
            fout.write("\t".join([gene] + cells) + "\n")
    return
65 | |
if __name__ == '__main__':
    # Usage: <script> output inputfile inputfile [inputfile ...]
    # At least two input files are required (argv has >= 4 entries).
    if len(sys.argv) < 4:
        # Parenthesised single-argument print works on Python 2 and 3.
        print("python mergeGenomicMatrixFiles.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        # Non-zero status so callers can detect the usage error.
        sys.exit(1)

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    genes = {}       # gene name -> row index in dataMatrix
    samples = {}     # sample name -> column index in dataMatrix
    dataMatrix = []  # one list of string values per gene

    # First pass: collect every sample name so each matrix row can be
    # allocated at its final width before any values are stored.
    for infile in inFiles:
        header(samples, infile)

    # Second pass: load the values of every file into the shared matrix.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)