diff mergeGenomicMatrixFiles.py @ 7:1d150e860c4d

Expanded the functionality of the merge genomic datasets tool to also generate an output dataset indicating which file (or label) each column came from.
author melissacline
date Mon, 09 Mar 2015 19:58:03 -0700
parents 2035405538b4
children 5d4538cb38db
line wrap: on
line diff
--- a/mergeGenomicMatrixFiles.py	Thu Feb 12 01:15:58 2015 -0500
+++ b/mergeGenomicMatrixFiles.py	Mon Mar 09 19:58:03 2015 -0700
@@ -1,14 +1,20 @@
 #!/usr/bin/env python
 
+import argparse
 import string,os,sys
 
-def header (samples, infile):
+def header (samples, sourceFiles, infile, labelThisFile):
+    if labelThisFile == None:
+        labelToUse = infile
+    else:
+        labelToUse = labelThisFile
     fin= open(infile,'r')
     #header, samples
     newSamples = string.split(string.strip(fin.readline()),'\t')[1:]
     for sample in newSamples:
         if sample not in samples:
             samples[sample]= len(samples)
+            sourceFiles[sample] = labelToUse
     fin.close()
     return
 
@@ -41,7 +47,17 @@
     fin.close()
     return
 
-def outputMatrix(dataMatrix, samples, genes, outfile):
+
+def outputSourceMatrix(sourceData, outputFileName):
+    fout = open(outputFileName, "w")
+    fout.write("Sample\tSource\n")
+    for thisSample in sourceData.keys():
+        fout.write("%s\t%s\n" % (thisSample, sourceData[thisSample]))
+    fout.close()
+    return
+
+
+def outputMergedMatrix(dataMatrix, samples, genes, outfile):
     fout = open(outfile,"w")
     maxLength= len(samples)
     sList=[]
@@ -66,23 +82,39 @@
     return
 
 if __name__ == '__main__' :
-    if len(sys.argv[:]) <4:
-        print "python mergeFilesByColumn.py output inputfile(s)"
-        print "**********memory intensive, not for very genomic data with hugo number of probes"
-        print "this is merging data A+B=C\n"
-        sys.exit()
-
-    inFiles = sys.argv[2:]
-    outfile = sys.argv[1]
+    #
+    # The input files to this script are two matrices, in which
+    # columns represent samples and rows represent genes or measurements.
+    # There are two output files: outMergedData contains the input data merged
+    # into a single matrix, and outSourceMatrix is a two-column matrix
+    # indicating which file each sample (or column label) came from.  This
+    # assumes each sample came from at most one file; if not, the first wins.
+    #
+    parser = argparse.ArgumentParser()
+    parser.add_argument("inFileA", type=str, help="First input file")
+    parser.add_argument("inFileB", type=str, help="Second input file")
+    parser.add_argument("outMergedData", type=str, 
+                        help="Filename for the merged dataset")
+    parser.add_argument("outSourceMatrix", type=str,
+                        help="""Filename for a Nx2 matrix that indicates
+                                the source file of each column""")
+    parser.add_argument("--aLabel", type=str, default=None,
+                        help="User-friendly label for the first input file")
+    parser.add_argument("--bLabel", type=str, default=None,
+                        help="User-friendly label for the second input file")
+    args = parser.parse_args()
+    outSourceMatrix = sys.argv[2]  
 
     genes={}
     samples={}
+    sourceFiles = {}
     dataMatrix=[]
 
-    for infile in inFiles:
-        header (samples, infile)
+    header(samples, sourceFiles, args.inFileA, args.aLabel)
+    header(samples, sourceFiles, args.inFileB, args.bLabel)
 
-    for infile in inFiles:
-        process(genes, samples, dataMatrix, infile)
+    process(genes, samples, dataMatrix, args.inFileA)
+    process(genes, samples, dataMatrix, args.inFileB)
 
-    outputMatrix(dataMatrix, samples, genes, outfile)
+    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
+    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)