diff seg2matrix/mapSegToGeneMatrix.py @ 39:61f03b481b0d

new tool
author jingchunzhu
date Fri, 31 Jul 2015 20:38:27 -0700
parents
children 728eda331f07
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seg2matrix/mapSegToGeneMatrix.py	Fri Jul 31 20:38:27 2015 -0700
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import sys,string,copy
+import CGData.RefGene
+import CGData.GeneMap
+import segToProbeMap
+
+if __name__ == "__main__":
+    
+    if len(sys.argv[:])!=4:
+        print "python mapSegToGeneMatrix.py genomicsSegmentIn refGene GeneLevelMatrixOut\n"
+        sys.exit()
+    refgene = CGData.RefGene.RefGene()
+    refgene.load( sys.argv[2] )
+
+    #* b for cnv 
+    probeMapper = CGData.GeneMap.ProbeMapper('b')
+
+    fin =open(sys.argv[1],'r')
+    genes= {}
+    samples={}
+    matrix=[]  #sample then gene
+    for gene in refgene.get_gene_list():
+        genes[gene]=len(genes)
+
+    Ngene = len(genes.keys())
+    oneSample=[]    
+    for i in range(0, Ngene):
+        oneSample.append([]);
+
+    print "genes: ", len(genes)
+
+    count =0
+
+    while 1:
+        count = count+1
+        #print count
+        line =string.strip(fin.readline())
+        if line =="":
+            break
+        if line[0]=="#":
+            continue
+        tmp = string.split(line,"\t")
+        if len(tmp)!= 6:
+            continue
+        seg = segToProbeMap.probeseg("", tmp[1], int(tmp[2]), int(tmp[3]),tmp[4])
+        sample = tmp[0]
+        value = float(tmp[5])
+        if sample not in samples:
+            samples[sample]=len(samples)
+            matrix.append(copy.deepcopy(oneSample))
+            
+        hits={}
+        for hit in probeMapper.find_overlap( seg, refgene ):
+            gene = hit.name
+            if gene in hits:
+                continue
+            hits[gene]=0
+            matrix[samples[sample]][genes[gene]].append(value)
+    fin.close()
+
+    print "segments: ", count
+
+    fout =open(sys.argv[3],'w')
+    sample_list =samples.keys()
+    fout.write("sample\t"+string.join(sample_list,"\t")+"\n")
+    for gene in genes.keys():
+        fout.write(gene)
+        for sample in sample_list:
+            list = matrix[samples[sample]][genes[gene]]
+            if len(list)==0:
+                average =0
+            elif len(list)==1:
+                average = list[0]
+                average =round(average,3)
+            else:
+                total=0.0
+                for value in list:
+                    total = total +value
+                average = total/len(list)
+                average =round(average,3)
+            fout.write("\t"+str(average))
+        fout.write("\n")
+    fout.close()
+