changeset 6:2035405538b4

Uploaded
author melissacline
date Thu, 12 Feb 2015 01:15:58 -0500
parents 6c23a3b58eb8
children 1d150e860c4d 4fa8700d179d 7e4fa558fa1b dd93e7d1bf01 d9c36f0aa1d3 15cb5a49cdbc
files mergeGenomicMatrixFiles.py
diffstat 1 files changed, 88 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mergeGenomicMatrixFiles.py	Thu Feb 12 01:15:58 2015 -0500
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+import string,os,sys
+
+def header(samples, infile):
+    """Register the sample names found in the header row of *infile*.
+
+    The first line of the file is split on tabs.  The first field (the
+    row-label column) is skipped; every remaining field that is not yet a
+    key of *samples* is added, mapped to the next free index (the current
+    size of the dict).
+
+    samples -- dict of sample name -> column index, updated in place
+    infile  -- path of a tab-separated matrix file
+
+    Returns None.
+    """
+    with open(infile, 'r') as fin:
+        firstLine = fin.readline()
+
+    # Strip trailing whitespace only.  Stripping the front as well would
+    # swallow the tab that follows an empty first cell, shifting every name
+    # one column left and silently dropping the first sample.
+    newSamples = firstLine.rstrip().split('\t')[1:]
+    for sample in newSamples:
+        if sample not in samples:
+            samples[sample] = len(samples)
+
+def process(genes, samples, dataMatrix, infile):
+    """Copy the values of one tab-separated matrix file into *dataMatrix*.
+
+    The first line of *infile* is the header; field i (i >= 1) of each
+    later line belongs to the sample named in header field i.  The first
+    field of each data line is the gene (row) identifier.
+
+    genes      -- dict of gene name -> row index in dataMatrix; a gene seen
+                  for the first time gets the next free index and a new row
+    samples    -- dict of sample name -> column index; must already contain
+                  every sample named in this file's header (KeyError
+                  otherwise)
+    dataMatrix -- list of rows (lists of strings), updated in place; new
+                  rows are filled with "" for len(samples) columns
+    infile     -- path of the file to read
+
+    A value for a (gene, sample) pair that was already set is overwritten.
+    A data line with more fields than the header raises IndexError.
+    Returns None.
+    """
+    maxLength = len(samples)
+
+    with open(infile, 'r') as fin:
+        # Strip trailing whitespace only, so that an empty first header
+        # cell keeps its tab and the sample names stay aligned with the
+        # data columns.
+        newSamples = fin.readline().rstrip().split('\t')
+
+        for rawLine in fin:
+            # Remove just the line terminator: slicing off the last
+            # character would damage a final line that has no newline, and
+            # would leave a '\r' in the last value of CRLF files.
+            line = rawLine.rstrip('\r\n')
+            if not line:
+                # A blank line is not end-of-file; skip it rather than
+                # discarding the rest of the input.
+                continue
+
+            data = line.split('\t')
+            gene = data[0]
+            if gene not in genes:
+                genes[gene] = len(genes)
+                dataMatrix.append([""] * maxLength)
+
+            row = dataMatrix[genes[gene]]
+            for i in range(1, len(data)):
+                row[samples[newSamples[i]]] = data[i]
+
+def outputMatrix(dataMatrix, samples, genes, outfile):
+    """Write the merged matrix to *outfile* as tab-separated text.
+
+    The first line is "sample" followed by the sample names in column-index
+    order.  Each later line is a gene name followed by that gene's values,
+    one per sample column.  Rows are written in ascending row-index order,
+    so the output does not depend on dict iteration order.
+
+    dataMatrix -- list of rows (lists of strings), indexed by genes[...]
+    samples    -- dict of sample name -> column index; the indices are
+                  expected to be exactly 0 .. len(samples)-1
+    genes      -- dict of gene name -> row index in dataMatrix
+    outfile    -- path of the file to create or overwrite
+
+    Returns None.
+    """
+    maxLength = len(samples)
+
+    # Invert the name -> index mapping into a list ordered by column.
+    sList = [""] * maxLength
+    for sample, pos in samples.items():
+        sList[pos] = sample
+
+    with open(outfile, "w") as fout:
+        fout.write("\t".join(["sample"] + sList) + "\n")
+
+        for gene in sorted(genes, key=genes.get):
+            row = dataMatrix[genes[gene]]
+            # sList[pos] is the sample stored at column pos, so walking the
+            # positions in order yields the values in header order.
+            values = [row[pos] for pos in range(maxLength)]
+            fout.write("\t".join([gene] + values) + "\n")
+
+# Command line: <script> output inputfile inputfile [inputfile ...]
+# Merges the input matrices into one matrix holding the union of their rows
+# and columns.
+if __name__ == '__main__':
+    if len(sys.argv) < 4:
+        # Single-argument print() calls behave the same on Python 2 and 3.
+        print("python mergeGenomicMatrixFiles.py output inputfile(s)")
+        print("**********memory intensive, not for genomic data with a huge number of probes")
+        print("this is merging data A+B=C\n")
+        sys.exit()
+
+    inFiles = sys.argv[2:]
+    outfile = sys.argv[1]
+
+    genes = {}
+    samples = {}
+    dataMatrix = []
+
+    # First pass: collect every sample name, so the column count is final
+    # before any row is allocated.
+    for infile in inFiles:
+        header(samples, infile)
+
+    # Second pass: fill in the values file by file.
+    for infile in inFiles:
+        process(genes, samples, dataMatrix, infile)
+
+    outputMatrix(dataMatrix, samples, genes, outfile)