changeset 39:61f03b481b0d

new tool
author jingchunzhu
date Fri, 31 Jul 2015 20:38:27 -0700
parents 84eb11adc22f
children 72dc9215623d
files seg2matrix/mapSegToGeneMatrix.py seg2matrix/segToProbeMap.py segToGeneMatrix.xml segToMatrix.xml
diffstat 4 files changed, 198 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seg2matrix/mapSegToGeneMatrix.py	Fri Jul 31 20:38:27 2015 -0700
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import sys,string,copy
+import CGData.RefGene
+import CGData.GeneMap
+import segToProbeMap
+
+if __name__ == "__main__":
+    
+    if len(sys.argv[:])!=4:
+        print "python mapSegToGeneMatrix.py genomicsSegmentIn refGene GeneLevelMatrixOut\n"
+        sys.exit()
+    refgene = CGData.RefGene.RefGene()
+    refgene.load( sys.argv[2] )
+
+    #* b for cnv 
+    probeMapper = CGData.GeneMap.ProbeMapper('b')
+
+    fin =open(sys.argv[1],'r')
+    genes= {}
+    samples={}
+    matrix=[]  #sample then gene
+    for gene in refgene.get_gene_list():
+        genes[gene]=len(genes)
+
+    Ngene = len(genes.keys())
+    oneSample=[]    
+    for i in range(0, Ngene):
+        oneSample.append([]);
+
+    print "genes: ", len(genes)
+
+    count =0
+
+    while 1:
+        count = count+1
+        #print count
+        line =string.strip(fin.readline())
+        if line =="":
+            break
+        if line[0]=="#":
+            continue
+        tmp = string.split(line,"\t")
+        if len(tmp)!= 6:
+            continue
+        seg = segToProbeMap.probeseg("", tmp[1], int(tmp[2]), int(tmp[3]),tmp[4])
+        sample = tmp[0]
+        value = float(tmp[5])
+        if sample not in samples:
+            samples[sample]=len(samples)
+            matrix.append(copy.deepcopy(oneSample))
+            
+        hits={}
+        for hit in probeMapper.find_overlap( seg, refgene ):
+            gene = hit.name
+            if gene in hits:
+                continue
+            hits[gene]=0
+            matrix[samples[sample]][genes[gene]].append(value)
+    fin.close()
+
+    print "segments: ", count
+
+    fout =open(sys.argv[3],'w')
+    sample_list =samples.keys()
+    fout.write("sample\t"+string.join(sample_list,"\t")+"\n")
+    for gene in genes.keys():
+        fout.write(gene)
+        for sample in sample_list:
+            list = matrix[samples[sample]][genes[gene]]
+            if len(list)==0:
+                average =0
+            elif len(list)==1:
+                average = list[0]
+                average =round(average,3)
+            else:
+                total=0.0
+                for value in list:
+                    total = total +value
+                average = total/len(list)
+                average =round(average,3)
+            fout.write("\t"+str(average))
+        fout.write("\n")
+    fout.close()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seg2matrix/segToProbeMap.py	Fri Jul 31 20:38:27 2015 -0700
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+
+import sys,string
+sys.path.insert(0,"../CGDataNew")
+
+import CGData.GenomicSegment
+import CGData.SegToMatrix
+import CGData.RefGene
+import CGData.GeneMap
+import optparse
+
+class segs:
+    def __init__(self):
+        self.probes = []
+
+    def load (self, handle): #handle bed6
+        fin =open(handle,'r')
+        while 1:
+            line =string.strip(fin.readline())
+            if line =="":
+                break
+            if line[0]=="#":
+                continue
+            tmp = string.split(line,"\t")
+            if len(tmp)== 5:
+                p = probeseg(tmp[0], tmp[1], int(tmp[3]), int(tmp[4]),tmp[2])
+                self.probes.append(p)
+        fin.close()
+
+
+class probeseg:
+    def __init__(self, name, chrom, chrom_start, chrom_end, strand):
+        self.name = name
+        self.chrom = chrom
+        self.chrom_start = chrom_start
+        self.chrom_end = chrom_end
+        self.strand = strand
+
+
+if __name__ == "__main__":
+    def printUsage():
+        print "python segToProbeMap.py segInput(name,chr,strand,start,end) refGene(eg hg18) probeMapOut --mode=cnv|exp\n"
+
+    if len(sys.argv) != 5:
+        printUsage()
+        sys.exit()
+
+    parser = optparse.OptionParser()
+    parser.add_option("--mode", action="store", type="string", dest="mode")
+    (options, args) = parser.parse_args()
+
+    if (not options.mode) or (options.mode not in ["cnv","exp"]):
+        printUsage()
+        sys.exit()
+
+    probes=segs()
+    probes.load(sys.argv[1])
+
+    refgene = CGData.RefGene.RefGene()
+    refgene.load(sys.argv[2])
+
+    handle = open(sys.argv[3], "w")
+    if options.mode=="cnv":
+        probeMapper = CGData.GeneMap.ProbeMapper('b')
+    if options.mode=="exp":
+        probeMapper = CGData.GeneMap.ProbeMapper('g')
+
+    handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" % ("#id", "gene","chrom","chromStart","chromEnd","strand"))
+    for probe in probes.probes:
+        hits = []
+        for hit in probeMapper.find_overlap( probe, refgene ):
+            if hit.name not in hits:
+                hits.append(hit.name)
+        handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" % (probe.name, ",".join(hits), probe.chrom, probe.chrom_start, probe.chrom_end, probe.strand))
+    handle.close()
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/segToGeneMatrix.xml	Fri Jul 31 20:38:27 2015 -0700
@@ -0,0 +1,35 @@
+<tool id="segToGeneMatrix" name="segToGeneMatrix" version="0.0.1">
+  <description>
+    Convert segmented copy number data to gene based matrix data
+  </description>
+  <command interpreter="python">
+    seg2matrix/mapSegToGeneMatrix.py $input $__tool_directory__/seg2matrix/$refGene.assembly $outputMatrix
+  </command>
+  <inputs>
+    <param name="input" format="tabular" type="data" multiple="false" label="Input segmented copy number data" />
+    <conditional name="refGene">
+      <param name="assembly_select" type="select" label="Which human genome assembly is your input segmented copy number data?">
+	<option value="hg19">hg19</option>
+	<option value="hg18">hg18</option>
+      </param>
+      <when value="hg19">
+        <param name="assembly" type="hidden" value="refGene_hg19" />
+      </when>
+      <when value="hg18">
+        <param name="assembly" type="hidden" value="refGene_hg18" />
+      </when>
+    </conditional>  
+  </inputs>
+  <outputs>
+    <data name="outputMatrix" format="tabular" label="gene-level copy number matrix" />
+  </outputs>
+  <help>
+    ***Convert segmented copy number data for input into xena***
+
+    Given a segmented copy number data file, convert it into gene based matrix data
+
+    Output File no 1. matrix file
+
+  </help>
+</tool>
+
--- a/segToMatrix.xml	Fri Jul 31 15:29:53 2015 -0700
+++ b/segToMatrix.xml	Fri Jul 31 20:38:27 2015 -0700
@@ -3,7 +3,7 @@
     Prep segmented copy number data for Xena
   </description>
   <command interpreter="python">
-    seg2matrix/segToMatrixGalaxy.py $input seg2matrix/$refGene.assembly $outputMatrix $outputProbeMap
+    seg2matrix/segToMatrixGalaxy.py $input $__tool_directory__/seg2matrix/$refGene.assembly $outputMatrix $outputProbeMap
   </command>
   <inputs>
     <param name="input" format="tabular" type="data" multiple="false" label="Input segmented copy number data" />