annotate mergeGenomicMatrixFiles.py @ 3:12a1ea920524

Creating a tool to merge genomic datasets
author melissacline
date Wed, 11 Feb 2015 16:44:33 -0800
parents
children d0674221a6ae
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
1 import string,os,sys
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
2
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def header(samples, infile):
    """Register the sample names found in the header row of *infile*.

    The first line of the file is split on tabs; the first cell (the
    row-label column) is ignored and every remaining cell is treated as a
    sample name.  Each name not already present in *samples* is assigned the
    next free column index (``len(samples)`` at the time it is added).

    samples -- dict mapping sample name -> column index; updated in place.
    infile  -- path of a tab-delimited matrix file.

    Returns None.
    """
    # ``with`` guarantees the handle is closed even if reading fails.
    with open(infile, 'r') as fin:
        first_line = fin.readline()

    # Strip only trailing whitespace: stripping the left side as well would
    # swallow an empty first cell ("\tS1\tS2") and shift every sample name
    # one column to the left, losing the first sample.
    newSamples = first_line.rstrip().split('\t')[1:]
    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
12
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def process(genes, samples, dataMatrix, infile):
    """Load the data rows of *infile* into the merged matrix.

    The header row supplies the sample name for each column.  Every
    following non-empty line is split on tabs: the first cell is the gene
    (row) identifier, and each remaining cell is stored at
    ``dataMatrix[genes[gene]][samples[sample]]``.  A gene seen for the first
    time gets the next free row index and a new row of empty strings whose
    length is ``len(samples)`` as measured when this function is called.

    genes      -- dict mapping gene name -> row index; updated in place.
    samples    -- dict mapping sample name -> column index; must already
                  contain every sample named in the header of *infile*
                  (a missing name raises KeyError).
    dataMatrix -- list of rows (lists of strings); updated in place.
    infile     -- path of a tab-delimited matrix file.

    A data row with more cells than the header raises IndexError.
    Returns None.
    """
    maxLength = len(samples)

    with open(infile, 'r') as fin:
        lines = iter(fin)
        # Strip only trailing whitespace so an empty first header cell keeps
        # the sample names aligned with the data columns.
        newSamples = next(lines, "").rstrip().split('\t')

        for raw in lines:
            # Remove only the line terminator.  Slicing off the last
            # character would eat real data when the final line has no
            # newline and would leave a stray "\r" on CRLF files.
            line = raw.rstrip('\r\n')
            if not line:
                # A blank line is not end-of-file; skip it and keep reading.
                continue

            data = line.split('\t')
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
41
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to *outfile* as tab-delimited text.

    The first line is the literal label ``sample`` followed by the sample
    names ordered by their column index.  Each following line is a gene name
    followed by that gene's values in the same column order.

    dataMatrix -- list of rows (lists of strings), indexed
                  ``[genes[gene]][samples[sample]]``.
    samples    -- dict mapping sample name -> column index.
    genes      -- dict mapping gene name -> row index.
    outfile    -- path of the file to create or overwrite.

    Returns None.
    """
    # Order both axes by their assigned index so the output is deterministic
    # and does not depend on dict iteration order.
    sList = sorted(samples, key=samples.get)
    gList = sorted(genes, key=genes.get)

    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in gList:
            row = dataMatrix[genes[gene]]
            values = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + values) + "\n")
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
65
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
# Command-line entry point: merge two or more tab-delimited matrix files
# (first argument = output path, remaining arguments = input paths).
if __name__ == '__main__':
    if len(sys.argv) < 4:
        # Single-argument print(...) is valid in both Python 2 and Python 3.
        print("python mergeGenomicMatrixFiles.py output inputfile(s)")
        print("**********memory intensive, not for very genomic data with hugo number of probes")
        print("this is merging data A+B=C\n")
        # A usage error must not report success to the calling process.
        sys.exit(1)

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    genes = {}       # gene name   -> row index in dataMatrix
    samples = {}     # sample name -> column index in dataMatrix
    dataMatrix = []  # list of rows, one per gene

    # First pass: collect every sample name so that all rows created in the
    # second pass already have the final width.
    for infile in inFiles:
        header(samples, infile)

    # Second pass: fill in the values.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)