Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison mergeGenomicMatrixFiles.py @ 6:2035405538b4
Uploaded
author | melissacline |
---|---|
date | Thu, 12 Feb 2015 01:15:58 -0500 |
parents | |
children | 1d150e860c4d |
comparison
equal
deleted
inserted
replaced
5:6c23a3b58eb8 | 6:2035405538b4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import string,os,sys | |
4 | |
def header(samples, infile):
    """Register the sample names found in a matrix file's header line.

    Only the first line of ``infile`` is read. It is stripped of
    surrounding whitespace, split on tabs, and its first field (the label
    of the row-name column) is discarded. Every remaining name that is not
    yet a key of ``samples`` is assigned the next free column index, i.e.
    the current size of ``samples``.

    Args:
        samples: dict mapping sample name -> column index; updated in place.
        infile: path of a tab-delimited matrix file.

    Returns:
        None. The result is the mutation of ``samples``.
    """
    # ``with`` closes the handle even if reading raises.
    with open(infile, 'r') as fin:
        firstLine = fin.readline()

    # str methods instead of string.split/string.strip, which no longer
    # exist in Python 3; the stripping/splitting semantics are unchanged.
    for sample in firstLine.strip().split('\t')[1:]:
        if sample not in samples:
            samples[sample] = len(samples)
    return
14 | |
def process(genes, samples, dataMatrix, infile):
    """Merge the data rows of one matrix file into the shared matrix.

    The first line of ``infile`` is the header; its i-th tab-separated
    field names the sample that column i of every following row belongs
    to. For each data row the first field is the gene (row) name. A gene
    seen for the first time gets the next free row index and a new row of
    ``len(samples)`` empty strings appended to ``dataMatrix``. Each value
    is then stored at ``dataMatrix[genes[gene]][samples[sample]]``; a
    later value for the same gene/sample pair overwrites an earlier one.

    Args:
        genes: dict mapping gene name -> row index; updated in place.
        samples: dict mapping sample name -> column index. It must already
            contain every sample named in this file's header.
        dataMatrix: list of rows (lists of strings); updated in place.
        infile: path of a tab-delimited matrix file.

    Returns:
        None.

    Raises:
        KeyError: a header sample name is missing from ``samples``.
        IndexError: a data row has more fields than the header.
    """
    maxLength = len(samples)

    # ``with`` closes the handle even if a row raises.
    with open(infile, 'r') as fin:
        # Header line, stripped of surrounding whitespace before splitting.
        newSamples = fin.readline().strip().split('\t')

        for rawLine in fin:
            # Remove only the line terminator. Slicing off the last
            # character unconditionally would eat real data when the final
            # line has no trailing newline, and would leave a stray "\r"
            # on CRLF files.
            line = rawLine.rstrip('\r\n')
            if line == "":
                # A blank line is skipped rather than treated as end of
                # file, so rows after it are not silently lost.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]
    return
43 | |
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to ``outfile`` as tab-delimited text.

    The first line is ``sample`` followed by the sample names in column
    index order. Each following line is a gene name followed by that
    gene's values in the same column order. Rows are written in ascending
    row-index order (the order in which genes were first registered), so
    the output does not depend on dict iteration order.

    Args:
        dataMatrix: list of rows; ``dataMatrix[r][c]`` is a string.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene name -> row index.
        outfile: path of the file to (over)write.

    Returns:
        None.
    """
    # Invert the sample -> index mapping into a list ordered by index.
    sList = [""] * len(samples)
    for sample, pos in samples.items():
        sList[pos] = sample

    # Order rows by their stored index instead of dict iteration order.
    geneList = sorted(genes, key=genes.get)

    # ``with`` closes (and flushes) the handle even if a row raises.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in geneList:
            row = dataMatrix[genes[gene]]
            values = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + values) + "\n")
    return
67 | |
if __name__ == '__main__':
    # Command line: <script> output input input [input ...]
    # Fewer than three arguments after the script name prints the usage
    # text and exits without doing any work.
    if len(sys.argv) < 4:
        print("python mergeFilesByColumn.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        sys.exit()

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    # Shared state filled in by the two passes below.
    genes = {}        # gene name   -> row index
    samples = {}      # sample name -> column index
    dataMatrix = []   # dataMatrix[row][column] -> value string

    # Pass 1: collect every sample name so the column count is known
    # before any row is allocated.
    for infile in inFiles:
        header(samples, infile)

    # Pass 2: read the data rows of every file into the matrix.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)