39
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys,string,copy
|
|
4 import CGData.RefGene
|
|
5 import CGData.GeneMap
|
|
6 import segToProbeMap
|
|
7
|
|
8 if __name__ == "__main__":
|
|
9
|
|
10 if len(sys.argv[:])!=4:
|
|
11 print "python mapSegToGeneMatrix.py genomicsSegmentIn refGene GeneLevelMatrixOut\n"
|
|
12 sys.exit()
|
|
13 refgene = CGData.RefGene.RefGene()
|
|
14 refgene.load( sys.argv[2] )
|
|
15
|
|
16 #* b for cnv
|
|
17 probeMapper = CGData.GeneMap.ProbeMapper('b')
|
|
18
|
|
19 fin =open(sys.argv[1],'r')
|
|
20 genes= {}
|
|
21 samples={}
|
|
22 matrix=[] #sample then gene
|
|
23 for gene in refgene.get_gene_list():
|
|
24 genes[gene]=len(genes)
|
|
25
|
|
26 Ngene = len(genes.keys())
|
|
27 oneSample=[]
|
|
28 for i in range(0, Ngene):
|
|
29 oneSample.append([]);
|
|
30
|
|
31 print "genes: ", len(genes)
|
|
32
|
|
33 count =0
|
|
34
|
|
35 while 1:
|
|
36 count = count+1
|
|
37 #print count
|
51
|
38 line =fin.readline()
|
|
39 if line =="": # end of file
|
39
|
40 break
|
51
|
41 if count ==1:
|
|
42 continue #ignore the first line
|
|
43 line = string.strip(line)
|
|
44 if line == "": # empty line
|
|
45 continue
|
39
|
46 if line[0]=="#":
|
|
47 continue
|
|
48 tmp = string.split(line,"\t")
|
|
49 if len(tmp)!= 6:
|
|
50 continue
|
|
51 seg = segToProbeMap.probeseg("", tmp[1], int(tmp[2]), int(tmp[3]),tmp[4])
|
|
52 sample = tmp[0]
|
|
53 value = float(tmp[5])
|
|
54 if sample not in samples:
|
|
55 samples[sample]=len(samples)
|
|
56 matrix.append(copy.deepcopy(oneSample))
|
|
57
|
|
58 hits={}
|
|
59 for hit in probeMapper.find_overlap( seg, refgene ):
|
|
60 gene = hit.name
|
|
61 if gene in hits:
|
|
62 continue
|
|
63 hits[gene]=0
|
|
64 matrix[samples[sample]][genes[gene]].append(value)
|
|
65 fin.close()
|
|
66
|
|
67 print "segments: ", count
|
|
68
|
|
69 fout =open(sys.argv[3],'w')
|
|
70 sample_list =samples.keys()
|
|
71 fout.write("sample\t"+string.join(sample_list,"\t")+"\n")
|
|
72 for gene in genes.keys():
|
|
73 fout.write(gene)
|
|
74 for sample in sample_list:
|
|
75 list = matrix[samples[sample]][genes[gene]]
|
|
76 if len(list)==0:
|
|
77 average =0
|
|
78 elif len(list)==1:
|
|
79 average = list[0]
|
|
80 average =round(average,3)
|
|
81 else:
|
|
82 total=0.0
|
|
83 for value in list:
|
|
84 total = total +value
|
|
85 average = total/len(list)
|
|
86 average =round(average,3)
|
|
87 fout.write("\t"+str(average))
|
|
88 fout.write("\n")
|
|
89 fout.close()
|
|
90
|