changeset 3:12a1ea920524

Creating a tool to merge genomic datasets
author melissacline
date Wed, 11 Feb 2015 16:44:33 -0800
parents d1104ad3646a
children d0674221a6ae
files mergeGenomicFiles.xml mergeGenomicMatrixFiles.py
diffstat 2 files changed, 106 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mergeGenomicFiles.xml	Wed Feb 11 16:44:33 2015 -0800
@@ -0,0 +1,20 @@
+<tool id="mergeGenomicFiles" description="Merge two genomic datasets into a new dataset" name="mergeGenomicFiles" version="0.0.1">
+  <description>
+    Given two genomic datasets, merge them to create a third dataset with the row and column identifiers from both datasets.
+  </description>
+  <command interpreter="python">
+      mergeGenomicMatrixFiles.py $outputC $inputA $inputB
+  </command>
+  <inputs>
+    <param name="inputA" format="tabular" type="data" label="Genomic Dataset A"/>
+    <param name="inputB" format="tabular" type="data" label="Genomic Dataset B"/>
+  </inputs>
+  <outputs>
+    <data name="outputC" format="tabular"/>
+  </outputs>
+  <help>
+    ***Merge Genomic Datasets***
+
+    Given two genomic datasets, merge them to produce a third dataset that is the union of the first two.  The new dataset will contain all column labels from either dataset, and all row labels from either dataset.  If a row label appears in both datasets, the output dataset will contain, for that row, all values for the first set of columns, plus all values for the second set of columns.  If a row label appears in the first dataset only, the output dataset will contain the values for the columns of the first dataset, and blanks (indicating missing values) for the columns of the second dataset.  Likewise, if a row label appears in the second dataset only, the output dataset will contain blanks for the columns of the first dataset, and the values for the columns of the second dataset.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mergeGenomicMatrixFiles.py	Wed Feb 11 16:44:33 2015 -0800
@@ -0,0 +1,86 @@
+import string,os,sys
+
+def header(samples, infile):
+    """Register the sample (column) names from the header row of infile.
+
+    The first line of infile is read as a tab-separated header.  Its first
+    cell labels the row-identifier column and is skipped; every remaining
+    cell that is not already a key of `samples` is added, mapped to the
+    next free column index (the size of the dict at that moment).
+
+    Arguments:
+        samples -- dict of sample name -> output column index; updated in
+                   place.
+        infile  -- path of a tab-delimited matrix file.
+
+    Returns None.
+    """
+    with open(infile, 'r') as fin:
+        # Strip trailing whitespace only.  A blanket strip() would also eat
+        # a leading tab, so a header whose first cell is empty would lose
+        # its first sample name to the [1:] slice.
+        newSamples = fin.readline().rstrip().split('\t')[1:]
+
+    for sample in newSamples:
+        if sample not in samples:
+            samples[sample] = len(samples)
+
+def process(genes, samples, dataMatrix, infile):
+    """Load the values of one matrix file into the shared merged matrix.
+
+    The first line of infile is the tab-separated header; each later line
+    is a row identifier (gene) followed by one value per header column.
+    A gene seen for the first time is given the next free row index and a
+    new row of empty strings, one per entry of `samples`.  Each value is
+    then stored at [row index of the gene][column index of its sample],
+    overwriting whatever an earlier row or file put there.
+
+    Arguments:
+        genes      -- dict of gene name -> row index; updated in place.
+        samples    -- dict of sample name -> column index.  It must already
+                      hold every sample named in the header of infile.
+        dataMatrix -- list of rows (lists of strings); updated in place.
+        infile     -- path of a tab-delimited matrix file.
+
+    Returns None.
+
+    Raises:
+        KeyError   -- a header name is missing from `samples`.
+        ValueError -- a row has non-empty fields beyond the header width.
+    """
+    maxLength = len(samples)
+
+    with open(infile, 'r') as fin:
+        # Keep the first header cell so newSamples[i] lines up with data[i].
+        # Only trailing whitespace is stripped: strip() would also remove a
+        # leading tab and shift every column when the first cell is empty.
+        newSamples = fin.readline().rstrip().split('\t')
+
+        for line in fin:
+            # Remove only the line terminator.  Slicing off the last
+            # character would damage a final line that has no newline, and
+            # trailing tabs must survive because they mark empty values.
+            line = line.rstrip('\r\n')
+            if not line.strip():
+                # A blank line is skipped rather than treated as end of
+                # data, so rows after it are not silently dropped.
+                continue
+
+            data = line.split('\t')
+            if any(data[len(newSamples):]):
+                raise ValueError(
+                    "%s: row '%s' has more values than the header has columns"
+                    % (infile, data[0]))
+
+            gene = data[0]
+            if gene not in genes:
+                genes[gene] = len(genes)
+                dataMatrix.append([""] * maxLength)
+
+            row = dataMatrix[genes[gene]]
+            for sample, value in zip(newSamples[1:], data[1:]):
+                row[samples[sample]] = value
+
+def outputMatrix(dataMatrix, samples, genes, outfile):
+    """Write the merged matrix to outfile as tab-delimited text.
+
+    The first line is the literal label "sample" followed by the sample
+    names in column-index order.  Each following line is a gene name
+    followed by that gene's values in the same column order.
+
+    Arguments:
+        dataMatrix -- list of rows (lists of strings), indexed by the
+                      values of `genes` and then the values of `samples`.
+        samples    -- dict of sample name -> column index.
+        genes      -- dict of gene name -> row index.
+        outfile    -- path of the file to create or overwrite.
+
+    Returns None.
+    """
+    # Order both axes by their stored index.  Iterating the dicts directly
+    # would emit rows in hash order on interpreters whose dicts are
+    # unordered, scrambling them relative to the input files.
+    sList = sorted(samples, key=samples.get)
+    gList = sorted(genes, key=genes.get)
+
+    with open(outfile, "w") as fout:
+        fout.write("\t".join(["sample"] + sList) + "\n")
+        for gene in gList:
+            row = dataMatrix[genes[gene]]
+            values = [row[samples[sample]] for sample in sList]
+            fout.write("\t".join([gene] + values) + "\n")
+
+if __name__ == '__main__':
+    # Command line: the output path first, then two or more input files.
+    # This is the argument order mergeGenomicFiles.xml passes
+    # ($outputC $inputA $inputB).
+    if len(sys.argv) < 4:
+        # Report the problem on stderr and exit non-zero so that a caller
+        # does not mistake a usage error for a successful run.
+        sys.stderr.write("usage: python mergeGenomicMatrixFiles.py output inputfile inputfile [inputfile ...]\n")
+        sys.stderr.write("**********memory intensive, not for genomic data with a huge number of probes\n")
+        sys.stderr.write("this is merging data A+B=C\n")
+        sys.exit(1)
+
+    outfile = sys.argv[1]
+    inFiles = sys.argv[2:]
+
+    genes = {}
+    samples = {}
+    dataMatrix = []
+
+    # First pass: collect every sample name, because process() sizes each
+    # new row from the complete sample count.
+    for infile in inFiles:
+        header(samples, infile)
+
+    # Second pass: fill in the values file by file.
+    for infile in inFiles:
+        process(genes, samples, dataMatrix, infile)
+
+    outputMatrix(dataMatrix, samples, genes, outfile)