Mercurial > repos > melissacline > ucsc_cancer_utilities
changeset 20:914bc8ee6222
Debugged the merge mutation data tool
author | melissacline |
---|---|
date | Fri, 20 Mar 2015 15:50:22 -0700 |
parents | 371579dd9bc6 |
children | 3a259686f0fc |
files | mergeGenomicFiles.xml mergeMutationDatasets.xml mergeXenaMutation.py |
diffstat | 3 files changed, 67 insertions(+), 65 deletions(-) [+] |
line wrap: on
line diff
--- a/mergeGenomicFiles.xml Fri Mar 20 18:09:15 2015 -0400 +++ b/mergeGenomicFiles.xml Fri Mar 20 15:50:22 2015 -0700 @@ -1,4 +1,4 @@ -<tool id="mergeGenomicFiles" description="Merge two genomic datasets into a new dataset" name="mergeGenomicFiles" version="0.0.1"> +<tool id="mergeGenomicFiles" description="Merge two genomic datasets into a new dataset" name="Merge Genomic Datasets" version="0.0.1"> <description> Given two genomic datasets, merge them to create a third dataset with the row and column identifiers from both datasets. </description>
--- a/mergeMutationDatasets.xml Fri Mar 20 18:09:15 2015 -0400 +++ b/mergeMutationDatasets.xml Fri Mar 20 15:50:22 2015 -0700 @@ -1,4 +1,4 @@ -<tool id="mergeMutationDatasets" description="Merge two mutation datasets into a new dataset" name="mergeMutationData" version="0.0.1"> +<tool id="mergeMutationDatasets" description="Merge two mutation datasets into a new dataset" name="Merge Mutation Data" version="0.0.1"> <description> Given two mutation datasets, merge them to create a larger dataset with the mutations from both datasets. Output this larger dataset, along with a 2-column matrix indicating the source of each mutation </description> @@ -18,9 +18,9 @@ <param type="text" name="labelForDatasetB" label="Dataset B Label (optional)" optional="true"/> </inputs> <outputs> - <data name="outputC" format="tabular" label="Merged Mutation Data"/> + <data name="errorLog" format="data" label="Execution Log"/> <data name="outputSourceMatrix" format="tabular" label="Mutation Data Sources"/> - <data name="errorLog" format="data" label="Execution Log"/> + <data name="outputC" format="tabular" label="Merged Mutation Data"/> </outputs> <help> ***Merge Xena Mutation Datasets***
--- a/mergeXenaMutation.py Fri Mar 20 18:09:15 2015 -0400 +++ b/mergeXenaMutation.py Fri Mar 20 15:50:22 2015 -0700 @@ -64,7 +64,7 @@ return columnDic -def summarizeColumns(infiles, fileColumn, allCols, ferror): +def summarizeColumns(inFiles, fileColumn, allCols, ferror): for infile in inFiles: columnDic = header (infile, ferror) fileColumn [infile] = columnDic @@ -138,74 +138,76 @@ print "this is merging data A+B=C for mutation by position type of data\n" sys.exit(1) - # - # The input files to this script are two or more matrices, in which - # columns represent samples and rows represent genes or measurements. - # There are two output files: outMergedData contains the input data merged - # into a single matrix, and outSourceMatrix is a two-column matrix - # indicating which file each sample (or column label) came from. This - # assumes that each sample came from at most one file. - # - parser = argparse.ArgumentParser() - parser.add_argument("inFileA", type=str, help="First input file") - parser.add_argument("inFileB", type=str, help="Second input file") - parser.add_argument("outMergedData", type=str, - help="Filename for the merged dataset") - parser.add_argument("outSourceMatrix", type=str, - help="""Filename for a Nx2 matrix that indicates + # + # The input files to this script are two or more matrices, in which + # columns represent samples and rows represent genes or measurements. + # There are two output files: outMergedData contains the input data merged + # into a single matrix, and outSourceMatrix is a two-column matrix + # indicating which file each sample (or column label) came from. This + # assumes that each sample came from at most one file. + # + parser = argparse.ArgumentParser() + parser.add_argument("outMergedData", type=str, + help="Filename for the merged dataset") + parser.add_argument("outSourceMatrix", type=str, + help="""Filename for a Nx2 matrix that indicates the source file of each column""") - parser.add_argument("errorLog", type=str, - help="""Error log""") - parser.add_argument("--aLabel", type=str, default=None, - help="User-friendly label for the first input file") - parser.add_argument("--bLabel", type=str, default=None, - help="User-friendly label for the second input file") - args = parser.parse_args() + parser.add_argument("errorLog", type=str, + help="""Error log""") + parser.add_argument("inFileA", type=str, help="First input file") + parser.add_argument("inFileB", type=str, help="Second input file") + parser.add_argument("--aLabel", type=str, default=None, + help="User-friendly label for the first input file") + parser.add_argument("--bLabel", type=str, default=None, + help="User-friendly label for the second input file") + args = parser.parse_args() - #inFiles = sys.argv[4:] - print inFiles - errofile = args.errorLog - outfile = args.outMergedData - print outfile - outPhenotypeFile = args.outSourceMatrix - print outPhenotypeFile - - ferror = open(errofile,'w') + #inFiles = sys.argv[4:] + inFiles = list() + inFiles.append(args.inFileA) + inFiles.append(args.inFileB) + errofile = args.errorLog + outfile = args.outMergedData + #print outfile + outPhenotypeFile = args.outSourceMatrix + #print outPhenotypeFile - #get all the columns, build fileColumn dictionary - fileColumn={} - allCols =[] - summarizeColumns(inFiles, fileColumn, allCols, ferror) - ferror.close() - - #output header line - fout = open(outfile,'w') - outputHeader (requiredCOLs,allCols,fout) + ferror = open(errofile,'w') + + #get all the columns, build fileColumn dictionary + fileColumn={} + allCols =[] + summarizeColumns(inFiles, fileColumn, allCols, ferror) + ferror.close() - #process and output combined mutationXena file - fout = open(outfile,'a') - - columnDic = fileColumn[args.inFileA] - processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) - columnDic = fileColumn[args.inFileB] - processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) - fout.close() + #output header line + fout = open(outfile,'w') + outputHeader (requiredCOLs,allCols,fout) + + #process and output combined mutationXena file + fout = open(outfile,'a') - #collect sample from source information - sampleDic ={} - if args.aLabel is None: - collectSource(args.inFileA, args.inFileA, sampleDic) - else: - collectSource(args.inFileA, args.aLabel, sampleDic - if args.bLabel is None: - collectSource(args.inFileB, args.inFileB, sampleDic) - else: - collectSource(args.inFileB, args.bLabel, sampleDic + columnDic = fileColumn[args.inFileA] + processAndOutput(args.inFileA,requiredCOLs,allCols,columnDic,fout) + columnDic = fileColumn[args.inFileB] + processAndOutput(args.inFileB,requiredCOLs,allCols,columnDic,fout) + fout.close() + + #collect sample from source information + sampleDic ={} + if args.aLabel is None: + collectSource(args.inFileA, args.inFileA, sampleDic) + else: + collectSource(args.inFileA, args.aLabel, sampleDic) + if args.bLabel is None: + collectSource(args.inFileB, args.inFileB, sampleDic) + else: + collectSource(args.inFileB, args.bLabel, sampleDic) - #output sample source information as phenotype matrix - outputSampleDic (sampleDic, outPhenotypeFile) + #output sample source information as phenotype matrix + outputSampleDic (sampleDic, outPhenotypeFile)