# HG changeset patch # User melissacline # Date 1426041134 25200 # Node ID d0b8c8eee9d5cd2ad55a28a8bed18a59edd32de3 # Parent dd93e7d1bf0161c54cc83785fe71397528415234# Parent eba1f6b111c44e5e70e18513b5a295eb35c40393 Committing changes that hadn't made it in yet, merge hell diff -r dd93e7d1bf01 -r d0b8c8eee9d5 mergeGenomicFiles.xml --- a/mergeGenomicFiles.xml Tue Mar 10 19:29:10 2015 -0700 +++ b/mergeGenomicFiles.xml Tue Mar 10 19:32:14 2015 -0700 @@ -1,20 +1,31 @@ - Given two genomic datasets, merge them to create a third dataset with the row and column identifiers from both datasets. + Given two genomic datasets, merge them to create a larger dataset with the row and column identifiers from both datasets. Output this larger dataset, along with a 2-column matrix indicating the source file of each sample - mergeGenomicMatrixFiles.py $outputC $inputA $inputB + mergeGenomicMatrixFiles.py $inputA $inputB $outputC $outputSourceMatrix + #if $labelForDatasetA + --aLabel "${labelForDatasetA}" + #end if + #if $labelForDatasetB + --bLabel "${labelForDatasetB}" + #end if - - + + + + + ***Merge Genomic Datasets*** - Given two genomic datasets, merge them to produce a third dataset that is the union of the first two. The new dataset will contain all column labels from either dataset, and all row labels from either dataset. If a row label appears in both datasets, the output dataset will contain, for that row, all values for the first set of columns, plus all values for the second set of columns. If a row label appears in the first dataset only, the output dataset will contain the values for the columns of the first dataset, and blanks (indicating missing values) for the columns of the second da + Given two genomic datasets, merge them to produce a third dataset that is the union of the first two. The new dataset will contain all column labels from either dataset, and all row labels from either dataset. If a row label appears in both datasets, the output dataset will contain, for that row, all values for the first set of columns, plus all values for the second set of columns. If a row label appears in the first dataset only, the output dataset will contain the values for the columns of the first dataset, and blanks (indicating missing values) for the columns of the second dataset. + + To maintain provenance, this script also outputs a second matrix, with one row for each column in the output dataset, and two columns per row indicating which input dataset that column came from. By default, the input dataset name is used to indicate which input file each column came from. Optionally, the user can specify descriptive labels to be used in place of the filenames. This all assumes that each column exists in only one input dataset. diff -r dd93e7d1bf01 -r d0b8c8eee9d5 mergeGenomicMatrixFiles.py --- a/mergeGenomicMatrixFiles.py Tue Mar 10 19:29:10 2015 -0700 +++ b/mergeGenomicMatrixFiles.py Tue Mar 10 19:32:14 2015 -0700 @@ -1,21 +1,27 @@ #!/usr/bin/env python +import argparse import string,os,sys -def header (samples, infile): - fin= open(infile,'r') +def header (samples, sourceFiles, infile, labelThisFile): + if labelThisFile == None: + labelToUse = infile + else: + labelToUse = labelThisFile + fin= open(infile, 'U') #header, samples newSamples = string.split(string.strip(fin.readline()),'\t')[1:] for sample in newSamples: if sample not in samples: samples[sample]= len(samples) + sourceFiles[sample] = labelToUse fin.close() return def process(genes, samples, dataMatrix, infile): maxLength= len(samples) - fin= open(infile,'r') + fin= open(infile,'U') #header newSamples = string.split(string.strip(fin.readline()),'\t') @@ -41,7 +47,17 @@ fin.close() return -def outputMatrix(dataMatrix, samples, genes, outfile): + +def outputSourceMatrix(sourceData, outputFileName): + fout = open(outputFileName, "w") + fout.write("Sample\tSource\n") + for thisSample in sourceData.keys(): + fout.write("%s\t%s\n" % (thisSample, sourceData[thisSample])) + fout.close() + return + + +def outputMergedMatrix(dataMatrix, samples, genes, outfile): fout = open(outfile,"w") maxLength= len(samples) sList=[] @@ -66,23 +82,38 @@ return if __name__ == '__main__' : - if len(sys.argv[:]) <4: - print "python mergeFilesByColumn.py output inputfile(s)" - print "**********memory intensive, not for very genomic data with hugo number of probes" - print "this is merging data A+B=C\n" - sys.exit() - - inFiles = sys.argv[2:] - outfile = sys.argv[1] + # + # The input files to this script are two or more matrices, in which + # columns represent samples and rows represent genes or measurements. + # There are two output files: outMergedData contains the input data merged + # into a single matrix, and outSourceMatrix is a two-column matrix + # indicating which file each sample (or column label) came from. This + # assumes that each sample came from at most one file. + # + parser = argparse.ArgumentParser() + parser.add_argument("inFileA", type=str, help="First input file") + parser.add_argument("inFileB", type=str, help="Second input file") + parser.add_argument("outMergedData", type=str, + help="Filename for the merged dataset") + parser.add_argument("outSourceMatrix", type=str, + help="""Filename for a Nx2 matrix that indicates + the source file of each column""") + parser.add_argument("--aLabel", type=str, default=None, + help="User-friendly label for the first input file") + parser.add_argument("--bLabel", type=str, default=None, + help="User-friendly label for the second input file") + args = parser.parse_args() genes={} samples={} + sourceFiles = {} dataMatrix=[] - for infile in inFiles: - header (samples, infile) + header(samples, sourceFiles, args.inFileA, args.aLabel) + header(samples, sourceFiles, args.inFileB, args.bLabel) - for infile in inFiles: - process(genes, samples, dataMatrix, infile) + process(genes, samples, dataMatrix, args.inFileA) + process(genes, samples, dataMatrix, args.inFileB) - outputMatrix(dataMatrix, samples, genes, outfile) + outputSourceMatrix(sourceFiles, args.outSourceMatrix) + outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData) diff -r dd93e7d1bf01 -r d0b8c8eee9d5 tool_dependencies.xml --- a/tool_dependencies.xml Tue Mar 10 19:29:10 2015 -0700 +++ b/tool_dependencies.xml Tue Mar 10 19:32:14 2015 -0700 @@ -1,5 +1,6 @@ +