comparison mergeGenomicMatrixFiles.py @ 6:2035405538b4

Uploaded
author melissacline
date Thu, 12 Feb 2015 01:15:58 -0500
parents
children 1d150e860c4d
comparison
equal deleted inserted replaced
5:6c23a3b58eb8 6:2035405538b4
1 #!/usr/bin/env python
2
3 import string,os,sys
4
def header(samples, infile):
    """Register the sample names from the header row of *infile*.

    The first line of the file is split on tabs; the first cell (the
    row-label column) is ignored and every remaining cell is treated as a
    sample name.  Each name not already present in *samples* is assigned the
    next free column index (the current size of the dictionary).

    Args:
        samples: dict mapping sample name -> column index; updated in place.
        infile: path of a tab-delimited matrix file.

    Returns:
        None.  The result is the mutation of *samples*.
    """
    # The context manager closes the file even if reading raises.
    with open(infile, 'r') as fin:
        first_line = fin.readline()

    # Strip trailing whitespace only.  Stripping the front as well would eat
    # the leading tab of a header whose first cell is empty and shift every
    # sample name one column to the left, losing the first sample.
    columns = first_line.rstrip().split('\t')

    for sample in columns[1:]:
        if sample not in samples:
            samples[sample] = len(samples)
    return
14
def process(genes, samples, dataMatrix, infile):
    """Merge the values of one tab-delimited matrix file into *dataMatrix*.

    The first line of *infile* is the header (row-label cell followed by
    sample names); every following non-empty line is a row label followed by
    one value per sample.  Values are stored as the strings read from the
    file.  A row label seen for the first time gets a new row pre-filled with
    empty strings; a label seen before has its existing cells overwritten for
    the samples present in this file.

    Args:
        genes: dict mapping row label -> row index in *dataMatrix*; updated
            in place.
        samples: dict mapping sample name -> column index.  It must already
            contain every sample named in this file's header.
        dataMatrix: list of rows (lists of strings); updated in place.
        infile: path of the tab-delimited matrix file to read.

    Returns:
        None.

    Raises:
        KeyError: a header sample name is missing from *samples*.
        IndexError: a data row has more fields than the header.
    """
    # New rows are sized for every sample known at call time.
    row_width = len(samples)

    with open(infile, 'r') as fin:
        # Keep any leading tab so header cell i lines up with data field i.
        columns = fin.readline().rstrip().split('\t')

        for line in fin:
            # Remove only the line terminator: blindly slicing off the last
            # character corrupts the final value when the file does not end
            # with a newline, and leaves a stray "\r" on CRLF files.
            line = line.rstrip('\r\n')
            if not line:
                # Skip blank lines instead of treating them as end of file,
                # so rows after a stray blank line are not silently dropped.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * row_width)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[columns[i]]] = data[i]
    return
43
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to *outfile* as tab-delimited text.

    The first line is the literal cell "sample" followed by the sample names
    in column-index order.  Each following line is a row label followed by
    that row's value for every sample, in the same column order.  Rows are
    written in row-index order (the order in which labels were first
    registered), so the output does not depend on dictionary iteration order.

    Args:
        dataMatrix: list of rows; ``dataMatrix[r][c]`` is the string value
            for row index ``r`` and column index ``c``.
        samples: dict mapping sample name -> column index.
        genes: dict mapping row label -> row index.
        outfile: path of the file to create or overwrite.

    Returns:
        None.
    """
    # Order names by their stored index.
    ordered_samples = sorted(samples, key=samples.get)
    ordered_genes = sorted(genes, key=genes.get)

    # The context manager closes (and flushes) the file even on error.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + ordered_samples) + "\n")
        for gene in ordered_genes:
            row = dataMatrix[genes[gene]]
            cells = [row[samples[sample]] for sample in ordered_samples]
            fout.write("\t".join([gene] + cells) + "\n")
    return
67
if __name__ == '__main__':
    # Command line: <script> output inputfile inputfile [inputfile ...]
    # Fewer than two input files prints the usage text and exits.
    if len(sys.argv) < 4:
        # Single-argument parenthesised print produces the same output on
        # Python 2 and is valid syntax on Python 3.
        print("python mergeFilesByColumn.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        sys.exit()

    inFiles = sys.argv[2:]
    outfile = sys.argv[1]

    genes = {}       # row label -> row index in dataMatrix
    samples = {}     # sample name -> column index in dataMatrix
    dataMatrix = []  # one list of string cells per row label

    # Pass 1: collect every sample name first, because process() sizes each
    # new row from the number of samples known at the time it is called.
    for infile in inFiles:
        header(samples, infile)

    # Pass 2: fill in the values from every input file.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)