annotate seg2matrix/mapSegToGeneMatrix.py @ 47:23d98125d20c

parse snpEff output
author jingchunzhu
date Thu, 13 Aug 2015 23:26:33 -0700
parents 61f03b481b0d
children 728eda331f07
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
39
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
1 #!/usr/bin/env python
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
2
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
3 import sys,string,copy
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
4 import CGData.RefGene
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
5 import CGData.GeneMap
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
6 import segToProbeMap
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
7
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
8 if __name__ == "__main__":
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
9
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
10 if len(sys.argv[:])!=4:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
11 print "python mapSegToGeneMatrix.py genomicsSegmentIn refGene GeneLevelMatrixOut\n"
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
12 sys.exit()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
13 refgene = CGData.RefGene.RefGene()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
14 refgene.load( sys.argv[2] )
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
15
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
16 #* b for cnv
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
17 probeMapper = CGData.GeneMap.ProbeMapper('b')
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
18
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
19 fin =open(sys.argv[1],'r')
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
20 genes= {}
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
21 samples={}
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
22 matrix=[] #sample then gene
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
23 for gene in refgene.get_gene_list():
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
24 genes[gene]=len(genes)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
25
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
26 Ngene = len(genes.keys())
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
27 oneSample=[]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
28 for i in range(0, Ngene):
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
29 oneSample.append([]);
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
30
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
31 print "genes: ", len(genes)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
32
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
33 count =0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
34
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
35 while 1:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
36 count = count+1
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
37 #print count
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
38 line =string.strip(fin.readline())
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
39 if line =="":
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
40 break
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
41 if line[0]=="#":
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
42 continue
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
43 tmp = string.split(line,"\t")
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
44 if len(tmp)!= 6:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
45 continue
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
46 seg = segToProbeMap.probeseg("", tmp[1], int(tmp[2]), int(tmp[3]),tmp[4])
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
47 sample = tmp[0]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
48 value = float(tmp[5])
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
49 if sample not in samples:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
50 samples[sample]=len(samples)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
51 matrix.append(copy.deepcopy(oneSample))
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
52
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
53 hits={}
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
54 for hit in probeMapper.find_overlap( seg, refgene ):
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
55 gene = hit.name
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
56 if gene in hits:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
57 continue
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
58 hits[gene]=0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
59 matrix[samples[sample]][genes[gene]].append(value)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
60 fin.close()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
61
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
62 print "segments: ", count
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
63
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
64 fout =open(sys.argv[3],'w')
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
65 sample_list =samples.keys()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
66 fout.write("sample\t"+string.join(sample_list,"\t")+"\n")
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
67 for gene in genes.keys():
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
68 fout.write(gene)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
69 for sample in sample_list:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
70 list = matrix[samples[sample]][genes[gene]]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
71 if len(list)==0:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
72 average =0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
73 elif len(list)==1:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
74 average = list[0]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
75 average =round(average,3)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
76 else:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
77 total=0.0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
78 for value in list:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
79 total = total +value
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
80 average = total/len(list)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
81 average =round(average,3)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
82 fout.write("\t"+str(average))
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
83 fout.write("\n")
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
84 fout.close()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
85