Mercurial > repos > melissacline > ucsc_cancer_utilities

diff mergeGenomicMatrixFiles.py @ 7:1d150e860c4d
Expanded the functionality of the merge genomic datasets tool, to generate an output dataset with the file (or label) indicating where each column came from
author: melissacline
date: Mon, 09 Mar 2015 19:58:03 -0700
parents: 2035405538b4
children: 5d4538cb38db
--- a/mergeGenomicMatrixFiles.py	Thu Feb 12 01:15:58 2015 -0500
+++ b/mergeGenomicMatrixFiles.py	Mon Mar 09 19:58:03 2015 -0700
@@ -1,14 +1,20 @@
 #!/usr/bin/env python
 
+import argparse
 import string,os,sys
 
-def header (samples, infile):
+def header (samples, sourceFiles, infile, labelThisFile):
+    if labelThisFile == None:
+        labelToUse = infile
+    else:
+        labelToUse = labelThisFile
     fin= open(infile,'r')
     #header, samples
     newSamples = string.split(string.strip(fin.readline()),'\t')[1:]
     for sample in newSamples:
         if sample not in samples:
             samples[sample]= len(samples)
+            sourceFiles[sample] = labelToUse
     fin.close()
     return
 
@@ -41,7 +47,17 @@
     fin.close()
     return
 
-def outputMatrix(dataMatrix, samples, genes, outfile):
+
+def outputSourceMatrix(sourceData, outputFileName):
+    fout = open(outputFileName, "w")
+    fout.write("Sample\tSource\n")
+    for thisSample in sourceData.keys():
+        fout.write("%s\t%s\n" % (thisSample, sourceData[thisSample]))
+    fout.close()
+    return
+
+
+def outputMergedMatrix(dataMatrix, samples, genes, outfile):
     fout = open(outfile,"w")
     maxLength= len(samples)
     sList=[]
@@ -66,23 +82,39 @@
     return
 
 if __name__ == '__main__' :
-    if len(sys.argv[:]) <4:
-        print "python mergeFilesByColumn.py output inputfile(s)"
-        print "**********memory intensive, not for very genomic data with hugo number of probes"
-        print "this is merging data A+B=C\n"
-        sys.exit()
-
-    inFiles = sys.argv[2:]
-    outfile = sys.argv[1]
+    #
+    # The inputs to this script are exactly two matrices (inFileA, inFileB), in
+    # which columns represent samples and rows represent genes or measurements.
+    # There are two output files: outMergedData contains the input data merged
+    # into a single matrix, and outSourceMatrix is a two-column matrix 
+    # indicating which file each sample (or column label) came from.  This
+    # assumes that each sample came from at most one file.
+    #
+    parser = argparse.ArgumentParser()
+    parser.add_argument("inFileA", type=str, help="First input file")
+    parser.add_argument("inFileB", type=str, help="Second input file")
+    parser.add_argument("outMergedData", type=str, 
+                        help="Filename for the merged dataset")
+    parser.add_argument("outSourceMatrix", type=str,
+                        help="""Filename for a Nx2 matrix that indicates
+                                the source file of each column""")
+    parser.add_argument("--aLabel", type=str, default=None,
+                        help="User-friendly label for the first input file")
+    parser.add_argument("--bLabel", type=str, default=None,
+                        help="User-friendly label for the second input file")
+    args = parser.parse_args()
+    outSourceMatrix = sys.argv[2]  
 
     genes={}
     samples={}
+    sourceFiles = {}
     dataMatrix=[]
 
-    for infile in inFiles:
-        header (samples, infile)
+    header(samples, sourceFiles, args.inFileA, args.aLabel)
+    header(samples, sourceFiles, args.inFileB, args.bLabel)
 
-    for infile in inFiles:
-        process(genes, samples, dataMatrix, infile)
+    process(genes, samples, dataMatrix, args.inFileA)
+    process(genes, samples, dataMatrix, args.inFileB)
 
-    outputMatrix(dataMatrix, samples, genes, outfile)
+    outputSourceMatrix(sourceFiles, args.outSourceMatrix)
+    outputMergedMatrix(dataMatrix, samples, genes, args.outMergedData)
author	melissacline
date	Mon, 09 Mar 2015 19:58:03 -0700
parents	2035405538b4
children	5d4538cb38db