Mercurial > repos > melissacline > ucsc_cancer_utilities
annotate mergeXenaMutation.py @ 55:1093078e7976
merge mutation data conform to new mutationVector data standard
author | jingchunzhu |
---|---|
date | Fri, 18 Sep 2015 10:24:39 -0700 |
parents | 9806198df91f |
children |
rev | line source |
---|---|
18 | 1 #!/usr/bin/env python |
2 | |
3 import argparse | |
4 import string, os, sys | |
5 | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
6 requiredCOLs = ["chr", "start","end","reference","alt"] |
18 | 7 |
8 def headerError(filename, column, ferror): | |
9 ferror.write(filename +" does not have column " + column+"\n") | |
10 ferror.close() | |
11 sys.exit(1) | |
12 | |
13 def findAnyValueInList (values, dataList): | |
14 for value in values: | |
15 for i in range(0,len(dataList)): | |
16 if value == dataList[i]: | |
17 return i | |
18 return -1 | |
19 | |
20 def header (infile, ferror): | |
21 fin= open(infile,'U') | |
22 | |
23 columnDic ={} | |
24 #header | |
25 line = fin.readline() | |
26 fin.close() | |
27 if line [0]=="#": | |
28 line = line[1:-1] | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
29 data = string.split(string.strip(line),"\t") |
18 | 30 |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
31 columnDic["chr"]= findAnyValueInList (["chr","chrom", "Chr"], data) |
18 | 32 if columnDic["chr"] ==-1: |
33 headerError(infile, "chr", ferror) | |
34 | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
35 columnDic["start"]= findAnyValueInList (["start","chrStart","Start"], data) |
18 | 36 if columnDic["start"] == -1: |
37 headerError(infile, "start", ferror) | |
38 | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
39 columnDic["end"]= findAnyValueInList (["end","chrEnd", "End"], data) |
18 | 40 if columnDic["end"] == -1: |
41 headerError(infile, "end", ferror) | |
42 | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
43 columnDic["alt"]= findAnyValueInList (["alt","Alt"], data) |
18 | 44 if columnDic["alt"] == -1: |
45 headerError(infile, "alt", ferror) | |
46 | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
47 columnDic["reference"]= findAnyValueInList (["reference","ref","Reference","Ref"], data) |
18 | 48 if columnDic["reference"] == -1: |
49 headerError(infile, "reference", ferror) | |
50 | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
51 #columnDic["gene"]= findAnyValueInList (["gene","Gene"], data) |
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
52 #if columnDic["gene"] == -1: |
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
53 # headerError(infile, "gene", ferror) |
18 | 54 |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
55 #columnDic["effect"]= findAnyValueInList (["effect"], data) |
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
56 #if columnDic["effect"] == -1: |
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
57 # headerError(infile, "effect", ferror) |
18 | 58 |
59 requiredCols = columnDic.keys() | |
60 requiredColsPos = columnDic.values() | |
61 for i in range(1,len(data)): | |
62 if i not in requiredColsPos: | |
63 columnDic [data[i]]=i | |
64 | |
65 return columnDic | |
66 | |
20 | 67 def summarizeColumns(inFiles, fileColumn, allCols, ferror): |
18 | 68 for infile in inFiles: |
69 columnDic = header (infile, ferror) | |
70 fileColumn [infile] = columnDic | |
71 for col in columnDic: | |
72 if col not in allCols: | |
73 allCols.append(col) | |
74 return | |
75 | |
76 def outputHeader (requiredCOLs,allCols,fout): | |
55
1093078e7976
merge mutation data conform to new mutationVector data standard
jingchunzhu
parents:
41
diff
changeset
|
77 fout.write("sample") |
18 | 78 for col in requiredCOLs: |
79 fout.write("\t"+col) | |
80 for col in allCols: | |
81 if col not in requiredCOLs: | |
82 fout.write("\t"+col) | |
83 fout.write("\n") | |
84 fout.close() | |
85 return | |
86 | |
87 def processAndOutput(infile,requiredCOLs,allCols,columnDic,fout): | |
88 fin = open(infile,'U') | |
89 fin.readline() | |
90 while 1: | |
91 line = fin.readline()[:-1] | |
92 if line =="": | |
93 break | |
94 data = string.split(line,'\t') | |
95 fout.write(data[0]) | |
96 for col in requiredCOLs: | |
97 pos = columnDic[col] | |
98 fout.write("\t"+ data[pos]) | |
99 for col in allCols: | |
100 if col not in requiredCOLs: | |
101 if col in columnDic: | |
102 pos = columnDic[col] | |
103 fout.write("\t"+ data[pos]) | |
104 else: | |
105 fout.write("\t") | |
106 fout.write("\n") | |
107 fin.close() | |
108 return | |
109 | |
110 def collectSource(inFile, label, sampleDic): | |
111 fin = open(inFile,'U') | |
112 fin.readline() | |
113 while 1: | |
114 line = fin.readline()[:-1] | |
115 if line =="": | |
116 break | |
117 sample = string.split(line,'\t')[0] | |
118 if sample not in sampleDic: | |
119 sampleDic[sample]=[] | |
41 | 120 if label not in sampleDic[sample]: |
18 | 121 sampleDic[sample].append(label) |
122 fin.close() | |
123 return | |
124 | |
125 def outputSampleDic (sampleDic, outPhenotypeFile): | |
126 fout = open(outPhenotypeFile,'w') | |
127 fout.write("sample\tsource\n") | |
128 for sample in sampleDic: | |
129 source = sampleDic[sample] | |
130 source.sort() | |
131 fout.write(sample+"\t"+string.join(source,", ")+"\n") | |
132 fout.close() | |
133 return | |
134 | |
135 if __name__ == '__main__' : | |
136 if len(sys.argv[:]) <6: | |
137 print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)" | |
138 print "this is merging data A+B=C for mutation by position type of data\n" | |
139 sys.exit(1) | |
140 | |
20 | 141 # |
142 # The input files to this script are two or more matrices, in which | |
143 # columns represent samples and rows represent genes or measurements. | |
144 # There are two output files: outMergedData contains the input data merged | |
145 # into a single matrix, and outSourceMatrix is a two-column matrix | |
146 # indicating which file each sample (or column label) came from. This | |
147 # assumes that each sample came from at most one file. | |
148 # | |
149 parser = argparse.ArgumentParser() | |
150 parser.add_argument("outMergedData", type=str, | |
151 help="Filename for the merged dataset") | |
152 parser.add_argument("outSourceMatrix", type=str, | |
153 help="""Filename for a Nx2 matrix that indicates | |
18 | 154 the source file of each column""") |
20 | 155 parser.add_argument("errorLog", type=str, |
156 help="""Error log""") | |
157 parser.add_argument("inFileA", type=str, help="First input file") | |
158 parser.add_argument("inFileB", type=str, help="Second input file") | |
159 parser.add_argument("--aLabel", type=str, default=None, | |
160 help="User-friendly label for the first input file") | |
161 parser.add_argument("--bLabel", type=str, default=None, | |
162 help="User-friendly label for the second input file") | |
163 args = parser.parse_args() | |
18 | 164 |
165 | |
20 | 166 #inFiles = sys.argv[4:] |
167 inFiles = list() | |
168 inFiles.append(args.inFileA) | |
169 inFiles.append(args.inFileB) | |
170 errofile = args.errorLog | |
171 outfile = args.outMergedData | |
172 #print outfile | |
173 outPhenotypeFile = args.outSourceMatrix | |
174 #print outPhenotypeFile | |
18 | 175 |
20 | 176 ferror = open(errofile,'w') |
177 | |
178 #get all the columns, build fileColumn dictionary | |
179 fileColumn={} | |
180 allCols =[] | |
181 summarizeColumns(inFiles, fileColumn, allCols, ferror) | |
182 ferror.close() | |
18 | 183 |
20 | 184 #output header line |
185 fout = open(outfile,'w') | |
186 outputHeader (requiredCOLs,allCols,fout) | |
187 | |
188 #process and output combined mutationXena file | |
189 fout = open(outfile,'a') | |
18 | 190 |
20 | 191 columnDic = fileColumn[args.inFileA] |
192 processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) | |
193 columnDic = fileColumn[args.inFileB] | |
194 processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) | |
195 fout.close() | |
196 | |
197 #collect sample from source information | |
198 sampleDic ={} | |
199 if args.aLabel is None: | |
200 collectSource(args.inFileA, args.inFileA, sampleDic) | |
201 else: | |
202 collectSource(args.inFileA, args.aLabel, sampleDic) | |
203 if args.bLabel is None: | |
204 collectSource(args.inFileB, args.inFileB, sampleDic) | |
205 else: | |
206 collectSource(args.inFileB, args.bLabel, sampleDic) | |
18 | 207 |
208 | |
20 | 209 #output sample source information as phenotype matrix |
210 outputSampleDic (sampleDic, outPhenotypeFile) | |
18 | 211 |
212 | |
213 | |
214 |