3
|
1 import string,os,sys
|
|
2
|
|
def header(samples, infile):
    """Register the sample names found in the header row of *infile*.

    The first line of *infile* is read as a tab-separated header.  Its first
    cell (the label above the gene column) is skipped; every remaining cell
    is treated as a sample name.  Each name not yet present in *samples* is
    added with the next free index (``len(samples)`` at insertion time), so
    names registered earlier keep their positions.

    samples -- dict mapping sample name to column index; updated in place.
    infile  -- path of a tab-separated text file whose first line is the
               header row.

    Returns None.
    """
    # ``with`` guarantees the file is closed even if reading raises.
    with open(infile, 'r') as fin:
        # Strip trailing whitespace only (this covers the line ending).  A
        # full strip() would also remove a leading tab when the first header
        # cell is empty, shifting every sample name one column to the left.
        fields = fin.readline().rstrip().split('\t')

    for sample in fields[1:]:
        if sample not in samples:
            samples[sample] = len(samples)
    return
|
|
12
|
|
def process(genes, samples, dataMatrix, infile):
    """Merge the data rows of *infile* into *dataMatrix*.

    The first line of *infile* is the tab-separated header; cell ``i`` of
    the header names the sample stored in cell ``i`` of every data row.
    Cell 0 of each data row is the gene identifier.  A gene not yet present
    in *genes* is given the next free row index (``len(genes)``) and a new
    row of empty strings, one per entry of *samples*, is appended to
    *dataMatrix*.  Each value is then stored at
    ``dataMatrix[genes[gene]][samples[sample]]``.

    genes      -- dict mapping gene identifier to row index; updated in place.
    samples    -- dict mapping sample name to column index; read only.
    dataMatrix -- list of rows (lists of strings); updated in place.
    infile     -- path of the tab-separated input file.

    Blank lines are skipped.  Raises IndexError when a row has more cells
    than the header, and KeyError when a header name is missing from
    *samples*.

    Returns None.
    """
    maxLength = len(samples)

    # ``with`` guarantees the file is closed even if parsing raises.
    with open(infile, 'r') as fin:
        # Strip trailing whitespace only: a full strip() would drop a leading
        # tab (empty first header cell) and misalign header and data columns.
        newSamples = fin.readline().rstrip().split('\t')

        for line in fin:
            # Remove just the line terminator.  Slicing off the last
            # character would eat real data when the final line has no
            # trailing newline, and would leave '\r' behind for CRLF files.
            line = line.rstrip('\r\n')
            if line == "":
                # A blank line is not the end of the data; keep reading.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]
    return
|
|
41
|
|
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to *outfile* as tab-separated text.

    The first line is the literal label ``sample`` followed by every sample
    name, ordered by the column index stored in *samples*.  Each following
    line is a gene identifier followed by that gene's values in the same
    column order, read from ``dataMatrix[genes[gene]][samples[sample]]``.
    Gene lines are written in the order of the row index stored in *genes*.

    dataMatrix -- list of rows (lists of strings).
    samples    -- dict mapping sample name to column index.
    genes      -- dict mapping gene identifier to row index.
    outfile    -- path of the file to create or overwrite.

    Returns None.
    """
    # Order names by their stored index instead of relying on dict iteration
    # order, which is arbitrary on older interpreters; this keeps the output
    # layout deterministic.
    sList = sorted(samples, key=samples.get)
    geneList = sorted(genes, key=genes.get)

    # ``with`` guarantees the file is flushed and closed even on error.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in geneList:
            row = dataMatrix[genes[gene]]
            values = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + values) + "\n")
    return
|
|
65
|
|
if __name__ == '__main__':
    # Command line: <script> output inputfile inputfile [inputfile ...]
    # At least two input files are required (argv has four or more entries).
    if len(sys.argv) < 4:
        # Single parenthesised strings print identically under the Python 2
        # print statement and the Python 3 print function.
        print("python mergeFilesByColumn.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        # A usage error must not report success to the calling shell.
        sys.exit(1)

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    genes = {}        # gene identifier -> row index in dataMatrix
    samples = {}      # sample name -> column index in dataMatrix
    dataMatrix = []   # one list of string values per gene

    # First pass: collect every sample name so that the full column count is
    # known before any data row is allocated.
    for infile in inFiles:
        header(samples, infile)

    # Second pass: load the values of every input file into the matrix.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)
|