annotate mergeGenomicMatrixFiles.py @ 4:d0674221a6ae

Added a proper first line
author melissacline
date Wed, 11 Feb 2015 17:28:52 -0800
parents 12a1ea920524
children 1d83dbbee373
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
d0674221a6ae Added a proper first line
melissacline
parents: 3
diff changeset
1 #!/usr/bin/env python
d0674221a6ae Added a proper first line
melissacline
parents: 3
diff changeset
2
3
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
3 import string,os,sys
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
4
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def header(samples, infile):
    """Register the sample names from the header row of *infile*.

    The first line of *infile* is read as a tab-separated header. Its
    first field is the row-label column and is discarded; every other
    field is treated as a sample name. Each name not already present in
    *samples* is added, mapped to the next free column index
    (``len(samples)`` at the time of insertion).

    Parameters:
        samples: dict mapping sample name -> column index; updated in place.
        infile:  path of the tab-separated matrix file to read.

    Returns:
        None.
    """
    # 'with' guarantees the handle is closed even if reading fails.
    with open(infile, 'r') as fin:
        first_line = fin.readline()

    # Strip trailing whitespace only.  Stripping the front as well would
    # swallow the tab after an empty first cell and shift every sample
    # name one column to the left, losing the first real sample.
    names = first_line.rstrip().split('\t')[1:]

    for name in names:
        if name not in samples:
            samples[name] = len(samples)
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
14
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def process(genes, samples, dataMatrix, infile):
    """Merge the values of one tab-separated matrix file into *dataMatrix*.

    The first line of *infile* is the header: a row-label cell followed
    by sample names.  Every following non-empty line is a row label (the
    gene) followed by one value per header sample.  A gene seen for the
    first time gets the next row index and a fresh row of empty strings,
    one cell per entry in *samples*.  Each value is then stored at
    ``dataMatrix[genes[gene]][samples[sample]]``, overwriting whatever
    was there.

    Parameters:
        genes:      dict mapping gene name -> row index; updated in place.
        samples:    dict mapping sample name -> column index.  Every
                    sample named in the header of *infile* must already
                    be a key (a missing one raises KeyError).
        dataMatrix: list of rows (lists of strings); updated in place.
        infile:     path of the tab-separated matrix file to read.

    Returns:
        None.

    Raises:
        IndexError: a data row has more values than the header has samples.
        KeyError:   a header sample is not present in *samples*.
    """
    width = len(samples)

    with open(infile, 'r') as fin:
        # Strip trailing whitespace only, so an empty first header cell
        # does not shift the sample names out of line with the data.
        columns = fin.readline().rstrip().split('\t')[1:]

        for raw in fin:
            # Remove just the line terminator: the final line may lack
            # one, and a CRLF file must not leave '\r' on the last value.
            line = raw.rstrip('\r\n')
            if not line:
                # A blank line is skipped instead of ending the parse.
                continue

            fields = line.split('\t')
            gene = fields[0]
            values = fields[1:]
            if len(values) > len(columns):
                raise IndexError(
                    "%s: row %r has %d values but the header names only "
                    "%d samples" % (infile, gene, len(values), len(columns)))

            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * width)

            row = dataMatrix[genes[gene]]
            # zip stops at the shorter sequence, so a short row simply
            # leaves its remaining cells untouched.
            for sample, value in zip(columns, values):
                row[samples[sample]] = value
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
43
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def outputMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix to *outfile* as tab-separated text.

    The first line is the literal label ``sample`` followed by the
    sample names in column-index order.  Each following line is a gene
    name followed by that gene's values, in the same column order.
    Genes are written in ascending row-index order, so the output is
    deterministic regardless of dict iteration order.

    Parameters:
        dataMatrix: list of rows; ``dataMatrix[genes[g]][samples[s]]`` is
                    the string value for gene ``g`` and sample ``s``.
        samples:    dict mapping sample name -> column index.
        genes:      dict mapping gene name -> row index.
        outfile:    path of the file to create or overwrite.

    Returns:
        None.
    """
    # Put each sample name at the position given by its column index.
    ordered = [""] * len(samples)
    for name, pos in samples.items():
        ordered[pos] = name

    # 'with' closes (and flushes) the output even if a lookup fails.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + ordered) + "\n")
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            cells = [row[samples[name]] for name in ordered]
            fout.write("\t".join([gene] + cells) + "\n")
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
67
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
if __name__ == '__main__':
    # Command line: <script> output inputfile inputfile [inputfile ...]
    # At least two inputs are required (argv holds four or more entries).
    if len(sys.argv) < 4:
        # sys.stderr.write works identically on Python 2 and 3, unlike
        # the print statement; usage errors exit with a non-zero status.
        sys.stderr.write(
            "python mergeGenomicMatrixFiles.py output inputfile(s)\n"
            "**********memory intensive, not for genomic data with a huge"
            " number of probes\n"
            "this is merging data A+B=C\n\n")
        sys.exit(1)

    outfile = sys.argv[1]
    inFiles = sys.argv[2:]

    genes = {}       # gene name -> row index in dataMatrix
    samples = {}     # sample name -> column index in dataMatrix
    dataMatrix = []  # one list of string values per gene

    # Pass 1: collect every sample name first, so that each row created
    # in pass 2 already has one cell per sample across all inputs.
    for infile in inFiles:
        header(samples, infile)

    # Pass 2: fill in the values file by file.
    for infile in inFiles:
        process(genes, samples, dataMatrix, infile)

    outputMatrix(dataMatrix, samples, genes, outfile)