Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison mergeXenaMutation.py @ 20:914bc8ee6222
Debugged the merge mutation data tool
author | melissacline |
---|---|
date | Fri, 20 Mar 2015 15:50:22 -0700 |
parents | 15cb5a49cdbc |
children | 9806198df91f |
comparison
equal
deleted
inserted
replaced
19:371579dd9bc6 | 20:914bc8ee6222 |
---|---|
62 if i not in requiredColsPos: | 62 if i not in requiredColsPos: |
63 columnDic [data[i]]=i | 63 columnDic [data[i]]=i |
64 | 64 |
65 return columnDic | 65 return columnDic |
66 | 66 |
67 def summarizeColumns(infiles, fileColumn, allCols, ferror): | 67 def summarizeColumns(inFiles, fileColumn, allCols, ferror): |
68 for infile in inFiles: | 68 for infile in inFiles: |
69 columnDic = header (infile, ferror) | 69 columnDic = header (infile, ferror) |
70 fileColumn [infile] = columnDic | 70 fileColumn [infile] = columnDic |
71 for col in columnDic: | 71 for col in columnDic: |
72 if col not in allCols: | 72 if col not in allCols: |
136 if len(sys.argv[:]) <6: | 136 if len(sys.argv[:]) <6: |
137 print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)" | 137 print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)" |
138 print "this is merging data A+B=C for mutation by position type of data\n" | 138 print "this is merging data A+B=C for mutation by position type of data\n" |
139 sys.exit(1) | 139 sys.exit(1) |
140 | 140 |
141 # | 141 # |
142 # The input files to this script are two or more matrices, in which | 142 # The input files to this script are two or more matrices, in which |
143 # columns represent samples and rows represent genes or measurements. | 143 # columns represent samples and rows represent genes or measurements. |
144 # There are two output files: outMergedData contains the input data merged | 144 # There are two output files: outMergedData contains the input data merged |
145 # into a single matrix, and outSourceMatrix is a two-column matrix | 145 # into a single matrix, and outSourceMatrix is a two-column matrix |
146 # indicating which file each sample (or column label) came from. This | 146 # indicating which file each sample (or column label) came from. This |
147 # assumes that each sample came from at most one file. | 147 # assumes that each sample came from at most one file. |
148 # | 148 # |
149 parser = argparse.ArgumentParser() | 149 parser = argparse.ArgumentParser() |
150 parser.add_argument("inFileA", type=str, help="First input file") | 150 parser.add_argument("outMergedData", type=str, |
151 parser.add_argument("inFileB", type=str, help="Second input file") | 151 help="Filename for the merged dataset") |
152 parser.add_argument("outMergedData", type=str, | 152 parser.add_argument("outSourceMatrix", type=str, |
153 help="Filename for the merged dataset") | 153 help="""Filename for a Nx2 matrix that indicates |
154 parser.add_argument("outSourceMatrix", type=str, | |
155 help="""Filename for a Nx2 matrix that indicates | |
156 the source file of each column""") | 154 the source file of each column""") |
157 parser.add_argument("errorLog", type=str, | 155 parser.add_argument("errorLog", type=str, |
158 help="""Error log""") | 156 help="""Error log""") |
159 parser.add_argument("--aLabel", type=str, default=None, | 157 parser.add_argument("inFileA", type=str, help="First input file") |
160 help="User-friendly label for the first input file") | 158 parser.add_argument("inFileB", type=str, help="Second input file") |
161 parser.add_argument("--bLabel", type=str, default=None, | 159 parser.add_argument("--aLabel", type=str, default=None, |
162 help="User-friendly label for the second input file") | 160 help="User-friendly label for the first input file") |
163 args = parser.parse_args() | 161 parser.add_argument("--bLabel", type=str, default=None, |
162 help="User-friendly label for the second input file") | |
163 args = parser.parse_args() | |
164 | 164 |
165 | 165 |
166 #inFiles = sys.argv[4:] | 166 #inFiles = sys.argv[4:] |
167 print inFiles | 167 inFiles = list() |
168 errofile = args.errorLog | 168 inFiles.append(args.inFileA) |
169 outfile = args.outMergedData | 169 inFiles.append(args.inFileB) |
170 print outfile | 170 errofile = args.errorLog |
171 outPhenotypeFile = args.outSourceMatrix | 171 outfile = args.outMergedData |
172 print outPhenotypeFile | 172 #print outfile |
173 outPhenotypeFile = args.outSourceMatrix | |
174 #print outPhenotypeFile | |
173 | 175 |
174 ferror = open(errofile,'w') | 176 ferror = open(errofile,'w') |
175 | 177 |
176 #get all the columns, build fileColumn dictionary | 178 #get all the columns, build fileColumn dictionary |
177 fileColumn={} | 179 fileColumn={} |
178 allCols =[] | 180 allCols =[] |
179 summarizeColumns(inFiles, fileColumn, allCols, ferror) | 181 summarizeColumns(inFiles, fileColumn, allCols, ferror) |
180 ferror.close() | 182 ferror.close() |
181 | 183 |
182 #output header line | 184 #output header line |
183 fout = open(outfile,'w') | 185 fout = open(outfile,'w') |
184 outputHeader (requiredCOLs,allCols,fout) | 186 outputHeader (requiredCOLs,allCols,fout) |
185 | 187 |
186 #process and output combined mutationXena file | 188 #process and output combined mutationXena file |
187 fout = open(outfile,'a') | 189 fout = open(outfile,'a') |
188 | 190 |
189 columnDic = fileColumn[args.inFileA] | 191 columnDic = fileColumn[args.inFileA] |
190 processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) | 192 processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) |
191 columnDic = fileColumn[args.inFileB] | 193 columnDic = fileColumn[args.inFileB] |
192 processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) | 194 processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) |
193 fout.close() | 195 fout.close() |
194 | 196 |
195 #collect sample from source information | 197 #collect sample from source information |
196 sampleDic ={} | 198 sampleDic ={} |
197 if args.aLabel is None: | 199 if args.aLabel is None: |
198 collectSource(args.inFileA, args.inFileA, sampleDic) | 200 collectSource(args.inFileA, args.inFileA, sampleDic) |
199 else: | 201 else: |
200 collectSource(args.inFileA, args.aLabel, sampleDic | 202 collectSource(args.inFileA, args.aLabel, sampleDic) |
201 if args.bLabel is None: | 203 if args.bLabel is None: |
202 collectSource(args.inFileB, args.inFileB, sampleDic) | 204 collectSource(args.inFileB, args.inFileB, sampleDic) |
203 else: | 205 else: |
204 collectSource(args.inFileB, args.bLabel, sampleDic | 206 collectSource(args.inFileB, args.bLabel, sampleDic) |
205 | 207 |
206 | 208 |
207 #output sample source information as phenotype matrix | 209 #output sample source information as phenotype matrix |
208 outputSampleDic (sampleDic, outPhenotypeFile) | 210 outputSampleDic (sampleDic, outPhenotypeFile) |
209 | 211 |
210 | 212 |
211 | 213 |
212 | 214 |