annotate seg2matrix/mapSegToGeneMatrix.py @ 51:728eda331f07

better handle of input
author jingchunzhu@gmail.com
date Thu, 17 Sep 2015 14:33:21 -0700
parents 61f03b481b0d
children 59dbe857f5d4
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
39
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
1 #!/usr/bin/env python
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
2
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
3 import sys,string,copy
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
4 import CGData.RefGene
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
5 import CGData.GeneMap
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
6 import segToProbeMap
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
7
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
8 if __name__ == "__main__":
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
9
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
10 if len(sys.argv[:])!=4:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
11 print "python mapSegToGeneMatrix.py genomicsSegmentIn refGene GeneLevelMatrixOut\n"
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
12 sys.exit()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
13 refgene = CGData.RefGene.RefGene()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
14 refgene.load( sys.argv[2] )
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
15
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
16 #* b for cnv
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
17 probeMapper = CGData.GeneMap.ProbeMapper('b')
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
18
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
19 fin =open(sys.argv[1],'r')
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
20 genes= {}
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
21 samples={}
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
22 matrix=[] #sample then gene
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
23 for gene in refgene.get_gene_list():
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
24 genes[gene]=len(genes)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
25
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
26 Ngene = len(genes.keys())
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
27 oneSample=[]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
28 for i in range(0, Ngene):
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
29 oneSample.append([]);
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
30
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
31 print "genes: ", len(genes)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
32
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
33 count =0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
34
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
35 while 1:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
36 count = count+1
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
37 #print count
51
728eda331f07 better handle of input
jingchunzhu@gmail.com
parents: 39
diff changeset
38 line =fin.readline()
728eda331f07 better handle of input
jingchunzhu@gmail.com
parents: 39
diff changeset
39 if line =="": # end of file
39
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
40 break
51
728eda331f07 better handle of input
jingchunzhu@gmail.com
parents: 39
diff changeset
41 if count ==1:
728eda331f07 better handle of input
jingchunzhu@gmail.com
parents: 39
diff changeset
42 continue #ignore the first line
728eda331f07 better handle of input
jingchunzhu@gmail.com
parents: 39
diff changeset
43 line = string.strip(line)
728eda331f07 better handle of input
jingchunzhu@gmail.com
parents: 39
diff changeset
44 if line == "": # empty line
728eda331f07 better handle of input
jingchunzhu@gmail.com
parents: 39
diff changeset
45 continue
39
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
46 if line[0]=="#":
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
47 continue
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
48 tmp = string.split(line,"\t")
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
49 if len(tmp)!= 6:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
50 continue
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
51 seg = segToProbeMap.probeseg("", tmp[1], int(tmp[2]), int(tmp[3]),tmp[4])
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
52 sample = tmp[0]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
53 value = float(tmp[5])
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
54 if sample not in samples:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
55 samples[sample]=len(samples)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
56 matrix.append(copy.deepcopy(oneSample))
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
57
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
58 hits={}
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
59 for hit in probeMapper.find_overlap( seg, refgene ):
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
60 gene = hit.name
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
61 if gene in hits:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
62 continue
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
63 hits[gene]=0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
64 matrix[samples[sample]][genes[gene]].append(value)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
65 fin.close()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
66
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
67 print "segments: ", count
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
68
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
69 fout =open(sys.argv[3],'w')
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
70 sample_list =samples.keys()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
71 fout.write("sample\t"+string.join(sample_list,"\t")+"\n")
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
72 for gene in genes.keys():
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
73 fout.write(gene)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
74 for sample in sample_list:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
75 list = matrix[samples[sample]][genes[gene]]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
76 if len(list)==0:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
77 average =0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
78 elif len(list)==1:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
79 average = list[0]
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
80 average =round(average,3)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
81 else:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
82 total=0.0
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
83 for value in list:
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
84 total = total +value
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
85 average = total/len(list)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
86 average =round(average,3)
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
87 fout.write("\t"+str(average))
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
88 fout.write("\n")
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
89 fout.close()
61f03b481b0d new tool
jingchunzhu
parents:
diff changeset
90