annotate mergeGenomicMatrixFiles.py @ 26:0e44976a61b0

Trying without the install version specification for synapseClient
author melissacline
date Mon, 20 Jul 2015 15:56:06 -0700
parents 1d83dbbee373
children eb5acf81e609
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
d0674221a6ae Added a proper first line
melissacline
parents: 3
diff changeset
1 #!/usr/bin/env python
d0674221a6ae Added a proper first line
melissacline
parents: 3
diff changeset
2
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
3 import argparse
3
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
4 import string,os,sys
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
5
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def header(samples, sourceFiles, infile, labelThisFile):
    """Register the sample (column) names found on the first line of infile.

    The first line of infile is split on tabs; the first field (the row-ID
    column heading) is discarded and every remaining field is treated as a
    sample name.

    Parameters:
        samples: dict mapping sample name -> column index in the merged
            matrix. Updated in place; each previously unseen sample is
            assigned the next free index (the current size of the dict).
        sourceFiles: dict mapping sample name -> label of the file in which
            the sample was first seen. Updated in place.
        infile: path of the tab-delimited matrix file to read.
        labelThisFile: label to record in sourceFiles for samples first seen
            in this file, or None to use infile itself as the label.

    Returns:
        None. Results are delivered through samples and sourceFiles.
    """
    labelToUse = infile if labelThisFile is None else labelThisFile

    # Default text mode is used instead of the legacy 'U' flag: Python 3
    # already translates \r\n and \r line endings in text mode, and 'U' is
    # rejected outright from Python 3.11 on. The with-block guarantees the
    # handle is closed even if reading fails.
    with open(infile) as fin:
        firstLine = fin.readline()

    # rstrip() rather than strip(): trailing whitespace and the line ending
    # are removed, but an empty leading cell (a blank top-left corner) must be
    # kept so that the [1:] slice drops that cell and not the first sample.
    newSamples = firstLine.rstrip().split('\t')[1:]

    for sample in newSamples:
        if sample not in samples:
            samples[sample] = len(samples)
            # Only the first file that mentions a sample is recorded as its
            # source; later files do not overwrite the label.
            sourceFiles[sample] = labelToUse
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
20
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
def process(genes, samples, dataMatrix, infile):
    """Copy the values of one tab-delimited matrix file into dataMatrix.

    The first line of infile names the columns (its first field is the
    row-ID heading and is ignored). Every following non-blank line is one row:
    the first field is the gene (row) name and field i holds the value for
    the sample named in header field i.

    Parameters:
        genes: dict mapping gene name -> row index in dataMatrix. Updated in
            place; a previously unseen gene gets the next free index and a
            new row is appended to dataMatrix for it.
        samples: dict mapping sample name -> column index. Read only. It must
            already contain every sample named in this file's header line,
            otherwise the lookup raises KeyError.
        dataMatrix: list of rows (lists of strings). Updated in place. New
            rows are created with len(samples) empty-string cells.
        infile: path of the tab-delimited matrix file to read.

    Returns:
        None. Results are delivered through genes and dataMatrix.

    Raises:
        IndexError: a data row has more fields than the header line.
    """
    maxLength = len(samples)

    # Default text mode replaces the legacy 'U' flag (rejected from Python
    # 3.11 on); Python 3 text mode already normalises line endings. The
    # with-block closes the handle on every exit path.
    with open(infile) as fin:
        # rstrip(), not strip(): keep an empty leading cell so that header
        # field i stays aligned with data field i.
        newSamples = fin.readline().rstrip().split('\t')
        columnNames = newSamples[1:]

        for line in fin:
            # Remove only the line terminator. Slicing off the last character
            # unconditionally would eat a data character when the final line
            # has no trailing newline, and trailing tabs must survive because
            # they denote empty cells.
            line = line.rstrip('\r\n')
            if line == "":
                # A blank line is skipped rather than treated as end of file,
                # so rows that follow it are not silently dropped.
                continue

            data = line.split("\t")
            if len(data) - 1 > len(columnNames):
                raise IndexError(
                    "%s: row '%s' has %d values but the header names only "
                    "%d columns" % (infile, data[0], len(data) - 1,
                                    len(columnNames)))

            gene = data[0]
            if gene not in genes:
                genes[gene] = len(genes)
                dataMatrix.append([""] * maxLength)

            row = dataMatrix[genes[gene]]
            # zip stops at the shorter sequence, so a short row simply leaves
            # its remaining cells untouched.
            for sample, value in zip(columnNames, data[1:]):
                row[samples[sample]] = value
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
49
7
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
50
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def outputSourceMatrix(sourceData, outputFileName):
    """Write the sample -> source table as a two-column tab-delimited file.

    The file starts with the heading line "Sample<TAB>Source", followed by
    one line per entry of sourceData in the dict's iteration order.

    Parameters:
        sourceData: dict mapping sample name -> source label.
        outputFileName: path of the file to create (or overwrite).

    Returns:
        None.
    """
    # The with-block closes the file even when a write raises, so a failure
    # cannot leave the handle open.
    with open(outputFileName, "w") as fout:
        fout.write("Sample\tSource\n")
        for thisSample, source in sourceData.items():
            fout.write("%s\t%s\n" % (thisSample, source))
    return
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
58
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
59
1d150e860c4d Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
melissacline
parents: 6
diff changeset
def outputMergedMatrix(dataMatrix, samples, genes, outfile):
    """Write the merged matrix as a tab-delimited file.

    The first line is "sample" followed by the sample names in column-index
    order. Each following line is a gene name followed by that gene's values,
    in the same column order. Genes are written in row-index order.

    Parameters:
        dataMatrix: list of rows; dataMatrix[genes[g]][samples[s]] is the
            string value for gene g and sample s.
        samples: dict mapping sample name -> column index.
        genes: dict mapping gene name -> row index.
        outfile: path of the file to create (or overwrite).

    Returns:
        None.
    """
    # Invert the name -> index mapping into a list ordered by column index.
    sList = [""] * len(samples)
    for sample, pos in samples.items():
        sList[pos] = sample

    # Iterating the genes dict directly yields an arbitrary order on
    # interpreters without ordered dicts; sorting by the stored row index
    # makes the output order deterministic everywhere.
    geneOrder = sorted(genes, key=genes.get)

    # The with-block closes the file on every exit path.
    with open(outfile, "w") as fout:
        fout.write("\t".join(["sample"] + sList) + "\n")
        for gene in geneOrder:
            row = dataMatrix[genes[gene]]
            values = [row[samples[sample]] for sample in sList]
            fout.write("\t".join([gene] + values) + "\n")
    return
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
83
12a1ea920524 Creating a tool to merge genomic datasets
melissacline
parents:
diff changeset
if __name__ == '__main__':
    #
    # Command-line entry point. Exactly two input matrices are accepted
    # (inFileA and inFileB); in each, columns represent samples and rows
    # represent genes or measurements. Two files are written:
    # outMergedData holds both inputs merged into one matrix, and
    # outSourceMatrix is a two-column table naming the file (or the
    # user-supplied label) each sample column came from. A sample is
    # attributed to a single file only.
    #
    parser = argparse.ArgumentParser()
    parser.add_argument("inFileA", type=str, help="First input file")
    parser.add_argument("inFileB", type=str, help="Second input file")
    parser.add_argument("outMergedData", type=str,
                        help="Filename for the merged dataset")
    parser.add_argument("outSourceMatrix", type=str,
                        help="""Filename for a Nx2 matrix that indicates
                        the source file of each column""")
    parser.add_argument("--aLabel", type=str, default=None,
                        help="User-friendly label for the first input file")
    parser.add_argument("--bLabel", type=str, default=None,
                        help="User-friendly label for the second input file")
    args = parser.parse_args()

    genes = {}
    samples = {}
    sourceFiles = {}
    dataMatrix = []

    inputs = [(args.inFileA, args.aLabel), (args.inFileB, args.bLabel)]

    # First pass: collect the sample names of every input, so the full set of
    # columns is known before any data row is read.
    for inputFile, inputLabel in inputs:
        header(samples, sourceFiles, inputFile, inputLabel)

    # Second pass: read the data rows of each input, in the same order.
    for inputFile, _unusedLabel in inputs:
        process(genes, samples, dataMatrix, inputFile)

    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)