4
|
1 #!/usr/bin/env python
|
|
2
|
3
|
3 import string,os,sys
|
|
4
|
|
def header(samples, infile):
    """Register the sample names found in the header row of *infile*.

    The first line of the tab-delimited file is read; its first cell (the
    corner label above the row-name column) is skipped and every remaining
    cell is treated as a sample name.

    Args:
        samples: dict mapping sample name -> column index. It is updated in
            place: each name not yet present is assigned the next free
            index (the current size of the dict).
        infile: path of the tab-delimited input file.

    Returns:
        None. The result is the mutation of *samples*.
    """
    # The context manager closes the file even if reading raises.
    with open(infile, 'r') as fin:
        first_line = fin.readline()

    # Strip trailing whitespace only. Stripping the left side as well would
    # swallow the leading tab of a header whose first cell is empty
    # ("\tS1\tS2"), and the [1:] below would then drop the first real sample.
    names = first_line.rstrip().split('\t')[1:]
    for name in names:
        if name not in samples:
            samples[name] = len(samples)
    return
|
|
14
|
|
def process(genes, samples, dataMatrix, infile):
    """Load the data rows of *infile* into the shared matrix.

    Each data row is ``gene<TAB>value<TAB>value...``; the value in column
    ``i`` belongs to the sample named in column ``i`` of the file's header
    row. Values are stored as the strings read from the file.

    Args:
        genes: dict mapping gene name -> row index in *dataMatrix*. Genes
            not seen before are added with the next free index.
        samples: dict mapping sample name -> column index. It is only read
            here, so it must already contain every sample named in this
            file's header.
        dataMatrix: list of rows (lists of strings), extended in place. A
            new row is created with ``len(samples)`` empty-string cells.
        infile: path of the tab-delimited input file.

    Returns:
        None. The result is the mutation of *genes* and *dataMatrix*.

    Raises:
        KeyError: if a header cell that has data under it is not a key of
            *samples*.
        IndexError: if a data row has more cells than the header row.
    """
    row_width = len(samples)

    # The context manager closes the file even if a lookup below raises.
    with open(infile, 'r') as fin:
        # Keep the leading cell (even when empty) so that column i of a
        # data row lines up with newSamples[i]; strip only the right side.
        newSamples = fin.readline().rstrip().split('\t')

        for raw in fin:
            # Remove only the line terminator. Slicing off the last
            # character would eat real data on a final line that has no
            # trailing newline.
            line = raw.rstrip('\r\n')
            if line == "":
                # A blank line is not end-of-file; skip it rather than
                # silently dropping the rest of the file.
                continue

            data = line.split("\t")
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * row_width)

            x = genes[gene]
            # A gene that appears again overwrites its earlier values for
            # the same sample.
            for i in range(1, len(data)):
                y = samples[newSamples[i]]
                dataMatrix[x][y] = data[i]
    return
|
|
43
|
|
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to *outfile* as tab-delimited text.

    The first line is ``sample`` followed by the sample names in column
    order. Each following line is a gene name followed by that gene's
    values in the same column order.

    Args:
        dataMatrix: list of rows; each row is a list of strings indexed by
            the column indices stored in *samples*.
        samples: dict mapping sample name -> column index. The indices are
            assumed to be exactly 0..len(samples)-1.
        genes: dict mapping gene name -> row index in *dataMatrix*.
        outfile: path of the file to create or overwrite.

    Returns:
        None.
    """
    # Invert the name -> index mapping into a list ordered by column.
    sList = [""] * len(samples)
    for sample in samples:
        sList[samples[sample]] = sample

    # The context manager closes the file even if a lookup below raises.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")

        # Emit genes in row-index order so the output is deterministic and
        # does not depend on dict iteration order.
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            cells = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + cells) + "\n")
    return
|
|
67
|
|
if __name__ == '__main__':
    # Command line: <script> <output file> <input file> <input file> ...
    # The check below requires at least two input files.
    if len(sys.argv) < 4:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("python mergeFilesByColumn.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        sys.exit()

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    genes = {}       # gene name -> row index in dataMatrix
    samples = {}     # sample name -> column index in dataMatrix
    dataMatrix = []  # one list of string values per gene

    # First pass: collect every sample name, so the full column count is
    # known before any matrix row is allocated.
    for infile in inFiles:
        header(samples, infile)

    # Second pass: fill in the values from each file.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)
|