comparison mergeGenomicMatrixFiles.py @ 3:12a1ea920524

Creating a tool to merge genomic datasets
author melissacline
date Wed, 11 Feb 2015 16:44:33 -0800
parents
children d0674221a6ae
comparison
equal deleted inserted replaced
2:d1104ad3646a 3:12a1ea920524
1 import string,os,sys
2
def header(samples, infile):
    """Register the sample names found in the header row of ``infile``.

    The first line of ``infile`` is read as a tab-separated header. Its first
    cell (the row-label column) is skipped; every remaining cell is taken as
    a sample name. Names not already in ``samples`` are added, mapped to the
    next free column index (the size of ``samples`` at insertion time), so
    existing entries keep their index.

    Args:
        samples: dict mapping sample name -> column index; updated in place.
        infile: path of the tab-separated matrix file to read.

    Returns:
        None.
    """
    # ``with`` closes the handle even if reading raises.
    with open(infile, 'r') as fin:
        # Strip trailing whitespace only: stripping the front as well would
        # swallow an empty first cell and drop the first sample name.
        columns = fin.readline().rstrip().split('\t')

    for sample in columns[1:]:
        if sample not in samples:
            samples[sample] = len(samples)
    return
12
def process(genes, samples, dataMatrix, infile):
    """Merge the values of one tab-separated matrix file into ``dataMatrix``.

    The first line of ``infile`` is the header: cell 0 is the row-label
    column and the remaining cells are sample names. Every following line is
    ``gene<TAB>value<TAB>value...``. Each value is stored at
    ``dataMatrix[genes[gene]][samples[sample]]``, overwriting whatever was
    there. A gene seen for the first time gets the next free row index and a
    new row of empty strings, one per entry in ``samples``.

    Args:
        genes: dict mapping gene name -> row index; updated in place.
        samples: dict mapping sample name -> column index. It must already
            contain every sample named in this file's header.
        dataMatrix: list of rows (lists of strings); updated in place.
        infile: path of the tab-separated matrix file to read.

    Returns:
        None.

    Raises:
        KeyError: a header sample name is missing from ``samples``.
        IndexError: a data row has more cells than the header row.
    """
    maxLength = len(samples)

    # ``with`` closes the handle even if parsing raises.
    with open(infile, 'r') as fin:
        # Strip trailing whitespace only, so an empty first header cell
        # still occupies index 0 and sample names stay aligned with the data.
        newSamples = fin.readline().rstrip().split('\t')

        for line in fin:
            # Remove only the line terminator. Slicing off the last character
            # would eat real data on a final line with no newline, and would
            # leave a stray '\r' on CRLF files.
            line = line.rstrip('\r\n')
            if line == "":
                # Skip blank lines rather than treating them as end of file.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            x = genes[gene]
            for i in range(1, len(data)):
                y = samples[newSamples[i]]
                dataMatrix[x][y] = data[i]
    return
41
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to ``outfile`` as tab-separated text.

    The first line is ``sample`` followed by the sample names in column-index
    order. One line per gene follows: the gene name, then that gene's value
    for each sample in the same column order. Gene rows are written in
    row-index order so the output is deterministic.

    Assumes the values of ``samples`` are the distinct indices
    ``0..len(samples)-1``, and likewise for ``genes``.

    Args:
        dataMatrix: list of rows (lists of strings), indexed
            ``dataMatrix[genes[gene]][samples[sample]]``.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene name -> row index.
        outfile: path of the file to create or overwrite.

    Returns:
        None.
    """
    # Sample names ordered by their column index.
    sList = sorted(samples, key=samples.get)
    columns = [samples[sample] for sample in sList]

    # ``with`` flushes and closes the handle even if a write raises.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")

        # Iterate genes by row index, not dict order, for a stable layout.
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            fout.write("\t".join([gene] + [row[y] for y in columns]) + "\n")
    return
65
if __name__ == '__main__':
    # Command line: <output> <input> <input> [...]. The length check requires
    # the output path plus at least two input matrices.
    if len(sys.argv) < 4:
        # print(...) with a single string behaves the same on Python 2 and 3.
        print("python mergeGenomicMatrixFiles.py output inputfile(s)")
        print("**********memory intensive, not for genomic data with a huge number of probes")
        print("this is merging data A+B=C\n")
        # Non-zero status so calling tools can detect the usage error.
        sys.exit(1)

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    genes = {}       # gene name -> row index in dataMatrix
    samples = {}     # sample name -> column index in dataMatrix
    dataMatrix = []  # dataMatrix[row][column] holds the value as a string

    # Pass 1: collect every sample name first, so each row created in pass 2
    # already has one slot per sample across all inputs.
    for infile in inFiles:
        header(samples, infile)

    # Pass 2: load the values; later files overwrite earlier ones for the
    # same gene/sample cell.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)