annotate mergeGenomicMatrixFiles.py @ 19:371579dd9bc6

Uploaded
author melissacline
date Fri, 20 Mar 2015 18:09:15 -0400
parents 2035405538b4
children 1d150e860c4d
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
2035405538b4 Uploaded
melissacline
parents:
diff changeset
1 #!/usr/bin/env python
2035405538b4 Uploaded
melissacline
parents:
diff changeset
2
2035405538b4 Uploaded
melissacline
parents:
diff changeset
3 import string,os,sys
2035405538b4 Uploaded
melissacline
parents:
diff changeset
4
2035405538b4 Uploaded
melissacline
parents:
diff changeset
def header(samples, infile):
    """Register the sample names found on the first line of *infile*.

    The first line is split on tabs; the first field (the row-label column)
    is skipped and every remaining field is treated as a sample name.  Each
    name not already present in *samples* is assigned the next free column
    index, ``len(samples)``.

    samples -- dict mapping sample name -> column index; updated in place.
    infile  -- path of a tab-delimited matrix file.

    Returns None.
    """
    # 'with' guarantees the handle is closed even if reading fails.
    with open(infile, 'r') as fin:
        # Only strip on the right: a plain strip() would also swallow a
        # leading tab (empty first header cell), shifting every column left
        # so that the first real sample is discarded by the [1:] below.
        fields = fin.readline().rstrip().split('\t')

    for sample in fields[1:]:
        if sample not in samples:
            samples[sample] = len(samples)
    return
2035405538b4 Uploaded
melissacline
parents:
diff changeset
14
2035405538b4 Uploaded
melissacline
parents:
diff changeset
def process(genes, samples, dataMatrix, infile):
    """Merge the values of one tab-delimited matrix file into *dataMatrix*.

    The first line of *infile* is the header (row-label column followed by
    sample names).  Every following non-blank line is a row: the first field
    is the gene/probe identifier, the remaining fields are that row's values
    in header order.

    genes      -- dict mapping gene id -> row index in *dataMatrix*; a gene
                  seen for the first time is appended with index len(genes).
    samples    -- dict mapping sample name -> column index.  It must already
                  contain every sample named in this file's header; a missing
                  name raises KeyError.
    dataMatrix -- list of rows (lists of strings), updated in place.  New
                  rows are created with len(samples) empty-string cells.
    infile     -- path of the file to read.

    A value for a (gene, sample) cell that was already filled is overwritten.
    Returns None.
    """
    maxLength = len(samples)

    # 'with' guarantees the handle is closed even if a row is malformed.
    with open(infile, 'r') as fin:
        # Right-strip only, so an empty first header cell (leading tab) does
        # not shift the sample names out of line with the data columns.
        newSamples = fin.readline().rstrip().split('\t')

        for line in fin:
            # Drop just the line terminator.  Slicing off the last character
            # unconditionally would eat a real character when the final line
            # has no trailing newline, and would leave '\r' on CRLF input.
            line = line.rstrip('\r\n')
            if not line:
                # A blank line is not end-of-file; skip it rather than
                # silently discarding the rest of the file.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                # Column i of this file belongs to header name newSamples[i];
                # samples[...] gives its column in the merged matrix.
                row[samples[newSamples[i]]] = data[i]
    return
2035405538b4 Uploaded
melissacline
parents:
diff changeset
43
2035405538b4 Uploaded
melissacline
parents:
diff changeset
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to *outfile* as tab-delimited text.

    The first line is the literal label "sample" followed by the sample
    names in column-index order.  Each following line is a gene id followed
    by that gene's values in the same column order.

    dataMatrix -- list of rows (lists of strings), indexed by genes[gene].
    samples    -- dict mapping sample name -> column index.
    genes      -- dict mapping gene id -> row index.
    outfile    -- path of the file to create (overwritten if it exists).

    Returns None.
    """
    # Invert the name -> index mapping into a list ordered by column index.
    sList = [""] * len(samples)
    for sample in samples:
        sList[samples[sample]] = sample

    # 'with' guarantees the output is flushed and closed even on error.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")

        # Emit rows by their stored index rather than dict iteration order,
        # so the output order is deterministic on every Python version.
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            values = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + values) + "\n")
    return
2035405538b4 Uploaded
melissacline
parents:
diff changeset
67
2035405538b4 Uploaded
melissacline
parents:
diff changeset
if __name__ == '__main__':
    # Expected command line: <script> output inputfile inputfile [...]
    # (the check below requires an output path plus at least two inputs).
    if len(sys.argv) < 4:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("python mergeGenomicMatrixFiles.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        # Non-zero status so callers can tell a usage error from success.
        sys.exit(1)

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    genes = {}       # gene id -> row index in dataMatrix
    samples = {}     # sample name -> column index in dataMatrix
    dataMatrix = []  # list of rows; each row is a list of string values

    # Pass 1: collect every sample name first, so that each row created in
    # pass 2 already has one cell per sample across all input files.
    for infile in inFiles:
        header(samples, infile)

    # Pass 2: fill in the values file by file.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)