comparison mergeXenaMutation.py @ 20:914bc8ee6222

Debugged the merge mutation data tool
author melissacline
date Fri, 20 Mar 2015 15:50:22 -0700
parents 15cb5a49cdbc
children 9806198df91f
comparison
Legend: equal | deleted | inserted | replaced
19:371579dd9bc6 20:914bc8ee6222
62 if i not in requiredColsPos: 62 if i not in requiredColsPos:
63 columnDic [data[i]]=i 63 columnDic [data[i]]=i
64 64
65 return columnDic 65 return columnDic
66 66
67 def summarizeColumns(infiles, fileColumn, allCols, ferror): 67 def summarizeColumns(inFiles, fileColumn, allCols, ferror):
68 for infile in inFiles: 68 for infile in inFiles:
69 columnDic = header (infile, ferror) 69 columnDic = header (infile, ferror)
70 fileColumn [infile] = columnDic 70 fileColumn [infile] = columnDic
71 for col in columnDic: 71 for col in columnDic:
72 if col not in allCols: 72 if col not in allCols:
136 if len(sys.argv[:]) <6: 136 if len(sys.argv[:]) <6:
137 print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)" 137 print "python mergeMultipleXenaMutation.py outputXenaMutation outputPhenotypeMatrix errorLog inputfile(s)"
138 print "this is merging data A+B=C for mutation by position type of data\n" 138 print "this is merging data A+B=C for mutation by position type of data\n"
139 sys.exit(1) 139 sys.exit(1)
140 140
141 # 141 #
142 # The input files to this script are two or more matrices, in which 142 # The input files to this script are two or more matrices, in which
143 # columns represent samples and rows represent genes or measurements. 143 # columns represent samples and rows represent genes or measurements.
144 # There are two output files: outMergedData contains the input data merged 144 # There are two output files: outMergedData contains the input data merged
145 # into a single matrix, and outSourceMatrix is a two-column matrix 145 # into a single matrix, and outSourceMatrix is a two-column matrix
146 # indicating which file each sample (or column label) came from. This 146 # indicating which file each sample (or column label) came from. This
147 # assumes that each sample came from at most one file. 147 # assumes that each sample came from at most one file.
148 # 148 #
149 parser = argparse.ArgumentParser() 149 parser = argparse.ArgumentParser()
150 parser.add_argument("inFileA", type=str, help="First input file") 150 parser.add_argument("outMergedData", type=str,
151 parser.add_argument("inFileB", type=str, help="Second input file") 151 help="Filename for the merged dataset")
152 parser.add_argument("outMergedData", type=str, 152 parser.add_argument("outSourceMatrix", type=str,
153 help="Filename for the merged dataset") 153 help="""Filename for a Nx2 matrix that indicates
154 parser.add_argument("outSourceMatrix", type=str,
155 help="""Filename for a Nx2 matrix that indicates
156 the source file of each column""") 154 the source file of each column""")
157 parser.add_argument("errorLog", type=str, 155 parser.add_argument("errorLog", type=str,
158 help="""Error log""") 156 help="""Error log""")
159 parser.add_argument("--aLabel", type=str, default=None, 157 parser.add_argument("inFileA", type=str, help="First input file")
160 help="User-friendly label for the first input file") 158 parser.add_argument("inFileB", type=str, help="Second input file")
161 parser.add_argument("--bLabel", type=str, default=None, 159 parser.add_argument("--aLabel", type=str, default=None,
162 help="User-friendly label for the second input file") 160 help="User-friendly label for the first input file")
163 args = parser.parse_args() 161 parser.add_argument("--bLabel", type=str, default=None,
162 help="User-friendly label for the second input file")
163 args = parser.parse_args()
164 164
165 165
166 #inFiles = sys.argv[4:] 166 #inFiles = sys.argv[4:]
167 print inFiles 167 inFiles = list()
168 errofile = args.errorLog 168 inFiles.append(args.inFileA)
169 outfile = args.outMergedData 169 inFiles.append(args.inFileB)
170 print outfile 170 errofile = args.errorLog
171 outPhenotypeFile = args.outSourceMatrix 171 outfile = args.outMergedData
172 print outPhenotypeFile 172 #print outfile
173 outPhenotypeFile = args.outSourceMatrix
174 #print outPhenotypeFile
173 175
174 ferror = open(errofile,'w') 176 ferror = open(errofile,'w')
175 177
176 #get all the columns, build fileColumn dictionary 178 #get all the columns, build fileColumn dictionary
177 fileColumn={} 179 fileColumn={}
178 allCols =[] 180 allCols =[]
179 summarizeColumns(inFiles, fileColumn, allCols, ferror) 181 summarizeColumns(inFiles, fileColumn, allCols, ferror)
180 ferror.close() 182 ferror.close()
181 183
182 #output header line 184 #output header line
183 fout = open(outfile,'w') 185 fout = open(outfile,'w')
184 outputHeader (requiredCOLs,allCols,fout) 186 outputHeader (requiredCOLs,allCols,fout)
185 187
186 #process and output combined mutationXena file 188 #process and output combined mutationXena file
187 fout = open(outfile,'a') 189 fout = open(outfile,'a')
188 190
189 columnDic = fileColumn[args.inFileA] 191 columnDic = fileColumn[args.inFileA]
190 processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) 192 processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout)
191 columnDic = fileColumn[args.inFileB] 193 columnDic = fileColumn[args.inFileB]
192 processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) 194 processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout)
193 fout.close() 195 fout.close()
194 196
195 #collect sample from source information 197 #collect sample from source information
196 sampleDic ={} 198 sampleDic ={}
197 if args.aLabel is None: 199 if args.aLabel is None:
198 collectSource(args.inFileA, args.inFileA, sampleDic) 200 collectSource(args.inFileA, args.inFileA, sampleDic)
199 else: 201 else:
200 collectSource(args.inFileA, args.aLabel, sampleDic 202 collectSource(args.inFileA, args.aLabel, sampleDic)
201 if args.bLabel is None: 203 if args.bLabel is None:
202 collectSource(args.inFileB, args.inFileB, sampleDic) 204 collectSource(args.inFileB, args.inFileB, sampleDic)
203 else: 205 else:
204 collectSource(args.inFileB, args.bLabel, sampleDic 206 collectSource(args.inFileB, args.bLabel, sampleDic)
205 207
206 208
207 #output sample source information as phenotype matrix 209 #output sample source information as phenotype matrix
208 outputSampleDic (sampleDic, outPhenotypeFile) 210 outputSampleDic (sampleDic, outPhenotypeFile)
209 211
210 212
211 213
212 214