diff mergeGenomicMatrixFiles.py @ 21:3a259686f0fc

Merged with head, tweaked labels on merge mutation data tool
author melissacline
date Fri, 20 Mar 2015 16:38:46 -0700
parents 30aab34424a9
children 1d83dbbee373
line wrap: on
line diff
--- a/mergeGenomicMatrixFiles.py	Fri Mar 20 15:50:22 2015 -0700
+++ b/mergeGenomicMatrixFiles.py	Fri Mar 20 16:38:46 2015 -0700
@@ -1,21 +1,27 @@
 #!/usr/bin/env python
 
+import argparse
 import string,os,sys
 
-def header (samples, infile):
-    fin= open(infile,'r')
+def header (samples, sourceFiles, infile, labelThisFile):
+    if labelThisFile == None:
+        labelToUse = infile
+    else:
+        labelToUse = labelThisFile
+    fin= open(infile, 'U')
     #header, samples
     newSamples = string.split(string.strip(fin.readline()),'\t')[1:]
     for sample in newSamples:
         if sample not in samples:
             samples[sample]= len(samples)
+            sourceFiles[sample] = labelToUse
     fin.close()
     return
 
 def process(genes, samples, dataMatrix, infile):
     maxLength= len(samples)
 
-    fin= open(infile,'r')
+    fin= open(infile,'U')
     #header 
     newSamples = string.split(string.strip(fin.readline()),'\t')
     
@@ -41,7 +47,17 @@
     fin.close()
     return
 
-def outputMatrix(dataMatrix, samples, genes, outfile):
+
+def outputSourceMatrix(sourceData, outputFileName):
+    fout = open(outputFileName, "w")
+    fout.write("Sample\tSource\n")
+    for thisSample in sourceData.keys():
+        fout.write("%s\t%s\n" % (thisSample, sourceData[thisSample]))
+    fout.close()
+    return
+
+
+def outputMergedMatrix(dataMatrix, samples, genes, outfile):
     fout = open(outfile,"w")
     maxLength= len(samples)
     sList=[]
@@ -66,23 +82,38 @@
     return
 
 if __name__ == '__main__' :
-    if len(sys.argv[:]) <4:
-        print "python mergeFilesByColumn.py output inputfile(s)"
-        print "**********memory intensive, not for very genomic data with hugo number of probes"
-        print "this is merging data A+B=C\n"
-        sys.exit()
-
-    inFiles = sys.argv[2:]
-    outfile = sys.argv[1]
+    #
+    # The input files to this script are exactly two matrices, in which
+    # columns represent samples and rows represent genes or measurements.
+    # There are two output files: outMergedData contains the input data merged
+    # into a single matrix, and outSourceMatrix is a two-column matrix 
+    # indicating which file each sample (or column label) came from.  This
+    # assumes that each sample came from at most one file.
+    #
+    parser = argparse.ArgumentParser()
+    parser.add_argument("inFileA", type=str, help="First input file")
+    parser.add_argument("inFileB", type=str, help="Second input file")
+    parser.add_argument("outMergedData", type=str, 
+                        help="Filename for the merged dataset")
+    parser.add_argument("outSourceMatrix", type=str,
+                        help="""Filename for a Nx2 matrix that indicates
+                                the source file of each column""")
+    parser.add_argument("--aLabel", type=str, default=None,
+                        help="User-friendly label for the first input file")
+    parser.add_argument("--bLabel", type=str, default=None,
+                        help="User-friendly label for the second input file")
+    args = parser.parse_args()
 
     genes={}
     samples={}
+    sourceFiles = {}
     dataMatrix=[]
 
-    for infile in inFiles:
-        header (samples, infile)
+    header(samples, sourceFiles, args.inFileA, args.aLabel)
+    header(samples, sourceFiles, args.inFileB, args.bLabel)
 
-    for infile in inFiles:
-        process(genes, samples, dataMatrix, infile)
+    process(genes, samples, dataMatrix, args.inFileA)
+    process(genes, samples, dataMatrix, args.inFileB)
 
-    outputMatrix(dataMatrix, samples, genes, outfile)
+    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
+    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)