annotate mergeGenomicMatrixFiles.py @ 14:d0b8c8eee9d5

Committing changes that hadn't made it in yet, merge hell
author melissacline
date Tue, 10 Mar 2015 19:32:14 -0700
parents 30aab34424a9
children 1d83dbbee373
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
2035405538b4 Uploaded
melissacline
parents:
diff changeset
1 #!/usr/bin/env python
2035405538b4 Uploaded
melissacline
parents:
diff changeset
2
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
3 import argparse
6
2035405538b4 Uploaded
melissacline
parents:
diff changeset
4 import string,os,sys
2035405538b4 Uploaded
melissacline
parents:
diff changeset
5
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def header(samples, sourceFiles, infile, labelThisFile):
    """Register the sample (column) names found in the header row of infile.

    Only the first line of the tab-delimited file is read.  Every column
    name after the first (row-label) column that is not already a key of
    ``samples`` is assigned the next free column index, and the label of the
    contributing file is recorded in ``sourceFiles``.  Both dictionaries are
    updated in place.

    Args:
        samples: dict mapping sample name -> column index in the merged
            matrix.
        sourceFiles: dict mapping sample name -> label of the file in which
            the sample was first seen.
        infile: path of the tab-delimited matrix file to read.
        labelThisFile: label recorded for samples first seen in this file;
            when None, the file path itself is used as the label.

    Returns:
        None.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    # Plain text mode: Python 3 translates "\r\n" and "\r" line endings by
    # default, and the legacy 'U' mode flag was removed in Python 3.11.
    with open(infile, "r") as fin:
        headerLine = fin.readline()

    # Strip trailing whitespace only.  Stripping the left side as well would
    # swallow the leading tab of a header whose first cell is empty and so
    # drop the first real sample name.
    newSamples = headerLine.rstrip().split("\t")[1:]
    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
            sourceFiles[sample] = labelToUse
    return
2035405538b4 Uploaded
melissacline
parents:
diff changeset
20
2035405538b4 Uploaded
melissacline
parents:
diff changeset
def process(genes, samples, dataMatrix, infile):
    """Load the data rows of one tab-delimited matrix into the merged matrix.

    The first line of ``infile`` is the header; its column names are looked
    up in ``samples`` to find the destination column of each value.  Each
    following non-empty line is one row whose first field is the gene (row)
    name.  A gene seen for the first time gets the next free row index in
    ``genes`` and a new row of empty strings appended to ``dataMatrix``.

    Args:
        genes: dict mapping gene name -> row index in ``dataMatrix``;
            updated in place.
        samples: dict mapping sample name -> column index; must already
            contain every sample named in this file's header.
        dataMatrix: list of rows (lists of strings); updated in place.
        infile: path of the tab-delimited matrix file to read.

    Returns:
        None.

    Raises:
        KeyError: if a header column name is missing from ``samples``.
        IndexError: if a data row has more fields than the header.
    """
    # Width of a freshly created row; every new row starts out all-empty.
    rowWidth = len(samples)

    # Plain text mode: Python 3 translates "\r\n" and "\r" line endings by
    # default, and the legacy 'U' mode flag was removed in Python 3.11.
    with open(infile, "r") as fin:
        # Keep the first (row-label) header cell so that header positions
        # line up with data-field positions.  Only trailing whitespace is
        # stripped: stripping the left side would shift every column when
        # the first header cell is empty.
        newSamples = fin.readline().rstrip().split("\t")

        for rawLine in fin:
            # Remove just the line terminator.  Slicing off the last
            # character unconditionally would corrupt the final value of a
            # file that does not end with a newline.
            line = rawLine.rstrip("\r\n")
            if line == "":
                # Skip blank lines instead of treating them as end of data.
                continue

            data = line.split("\t")
            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * rowWidth)

            row = dataMatrix[genes[gene]]
            for i in range(1, len(data)):
                row[samples[newSamples[i]]] = data[i]
    return
2035405538b4 Uploaded
melissacline
parents:
diff changeset
49
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
50
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def outputSourceMatrix(sourceData, outputFileName):
    """Write a two-column, tab-delimited table of sample name and source.

    The file starts with the header line ``Sample<TAB>Source`` followed by
    one line per entry of ``sourceData``, in the dictionary's iteration
    order.

    Args:
        sourceData: dict mapping sample name -> source label.
        outputFileName: path of the file to create or overwrite.

    Returns:
        None.
    """
    # The context manager closes the file even if a write fails part-way.
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample, source in sourceData.items():
            fout.write("%s\t%s\n" % (thisSample, source))
    return
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
58
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
59
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def outputMergedMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix as a tab-delimited file.

    The first line is ``sample`` followed by the sample names in column
    order.  Each further line is a gene name followed by that gene's values,
    one per sample column.  Rows are written in ascending row-index order,
    so the output does not depend on the iteration order of ``genes``.

    Args:
        dataMatrix: list of rows (lists of strings), indexed by the values
            of ``genes`` and then by the values of ``samples``.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene name -> row index.
        outfile: path of the file to create or overwrite.

    Returns:
        None.
    """
    # Invert the name -> index mapping into a list ordered by column index.
    columnNames = [""] * len(samples)
    for sample, pos in samples.items():
        columnNames[pos] = sample

    # The context manager closes the file even if a write fails part-way.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + columnNames) + "\n")
        for gene in sorted(genes, key=genes.get):
            row = dataMatrix[genes[gene]]
            values = [row[samples[name]] for name in columnNames]
            fout.write("\t".join([gene] + values) + "\n")
    return
2035405538b4 Uploaded
melissacline
parents:
diff changeset
83
2035405538b4 Uploaded
melissacline
parents:
diff changeset
if __name__ == '__main__':
    #
    # Command-line entry point.  Two input matrices are read, in which
    # columns represent samples and rows represent genes or measurements.
    # Two files are written: outMergedData holds the input data merged into
    # a single matrix, and outSourceMatrix is a two-column table naming the
    # file (or user-supplied label) that each sample column came from.
    # Each sample is attributed to the first file in which it appears.
    #
    parser = argparse.ArgumentParser()
    parser.add_argument("inFileA", help="First input file")
    parser.add_argument("inFileB", help="Second input file")
    parser.add_argument("outMergedData",
                        help="Filename for the merged dataset")
    parser.add_argument("outSourceMatrix",
                        help="""Filename for a Nx2 matrix that indicates
                        the source file of each column""")
    parser.add_argument("--aLabel",
                        help="User-friendly label for the first input file")
    parser.add_argument("--bLabel",
                        help="User-friendly label for the second input file")
    args = parser.parse_args()

    genes = {}
    samples = {}
    sourceFiles = {}
    dataMatrix = []

    inputs = [(args.inFileA, args.aLabel), (args.inFileB, args.bLabel)]

    # First pass: collect every sample name so the final column count is
    # known before any data row is allocated.
    for inputPath, inputLabel in inputs:
        header(samples, sourceFiles, inputPath, inputLabel)

    # Second pass: fill the merged matrix from each file in turn.
    for inputPath, _ in inputs:
        process(genes, samples, dataMatrix, inputPath)

    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)