annotate seg2matrix/CGData/GeneMap.py @ 60:bf57076e27b9 default tip

change genomicSegment input data
author jingchunzhu@gmail.com
date Tue, 27 Oct 2015 16:07:09 -0700
parents 3e5680fecd7a
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
31
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
1 #!/usr/bin/env python
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
2
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
3 import csv
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
4 import sys
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
5 import re
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
6
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
7 import CGData.ProbeMap
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
8 import CGData.GenomicMatrix
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
9
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
10 class ProbeMapper(object):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
11 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
12 Class to map the probes. Expects handle to the refGene_hg18.table file
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
13 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
14
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
15 def __init__(self, mode='g'):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
16 self.cmp_func = optionMap[mode]
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
17
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
18 def find_overlap(self, segment, ref_gene, cmp_func=None):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
19 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
20 Function to find overlaps for a given probe description.
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
21 the cmp_func arg is a function that returns a 'True' or 'False' for
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
22 a given probe description and a gene, examples include 'gene_overlap'
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
23 and 'gene_simple_meth_overlap'
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
24 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
25 if cmp_func is None:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
26 cmp_func = self.cmp_func
58
3e5680fecd7a accept chr1 and 1
jingchunzhu
parents: 31
diff changeset
27
31
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
28 if not ref_gene.has_chrom(segment.chrom):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
29 return []
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
30 chromList = ref_gene.get_chrom(segment.chrom)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
31
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
32 out = []
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
33 for gene in chromList:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
34 if cmp_func(segment.chrom_start,
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
35 segment.chrom_end, segment.strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
36 out.append(gene)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
37 return out
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
38
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
39
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
40 #
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
41 # The set of functions that can be used to do comparisons
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
42 #
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
43
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
44
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
45 def block_both_strand(start, end, strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
46 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
47 Check is segment is between gene start and end, either strand
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
48
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
49 **Code 'b'**
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
50 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
51 if gene.chrom_end >= start and gene.chrom_start <= end:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
52 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
53 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
54
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
55 def block_same_strand(start, end, strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
56 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
57 Check is segment is on same strand, between gene start and end
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
58
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
59 **Code 's'**
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
60 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
61 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
62 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
63 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
64
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
65
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
66 def exon_same_strand(start, end, strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
67 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
68 Check is segment is on same strand, and occurs on an exon
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
69
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
70 **Code 'g'**
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
71 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
72
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
73 if gene.strand != strand:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
74 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
75 for i in range(int(gene.ex_count)):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
76 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
77 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
78 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
79
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
80
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
81 def exon_both_strand(start, end, strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
82 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
83 Check is segment occurs on an exon, on either stand
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
84
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
85 **Code 'e'**
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
86 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
87 for i in range(int(gene.ex_count)):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
88 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
89 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
90 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
91
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
92 def block_same_strand_coverage75(start, end, strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
93 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
94 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
95 if float(cov) / float(end - start) > 0.75:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
96 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
97 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
98
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
99
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
100 def exon_same_strand_coverage75(start, end, strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
101 if strand != gene.strand:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
102 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
103 cov = 0
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
104 for i in range(int(gene.ex_count)):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
105 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
106 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
107 if float(cov) / float(end - start) > 0.75:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
108 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
109 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
110
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
111
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
112 class Intersector:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
113 def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
114 self.same_strand = same_strand
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
115 self.exons = exons
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
116 self.coverage = coverage
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
117 self.start_rel_tss = start_rel_tss
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
118 self.end_rel_tss = end_rel_tss
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
119 self.start_rel_cdsStart = start_rel_cdsStart
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
120 self.end_rel_cdsStart = end_rel_cdsStart
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
121
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
122 def hit(self, start, end, strand, gene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
123 if self.same_strand and strand != gene.strand:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
124 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
125
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
126 if self.exons:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
127 cov = 0
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
128 for i in range(int(gene.ex_count)):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
129 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
130 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
131 if self.coverage is None and cov > 0:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
132 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
133 else:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
134 if float(cov) / float(end - start) > self.coverage:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
135 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
136 else:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
137 wStart = gene.chrom_start
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
138 wEnd = gene.chrom_end
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
139
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
140 if self.start_rel_cdsStart is not None:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
141 if gene.strand == "+":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
142 wStart = gene.cds_start + self.start_rel_cdsStart
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
143 if gene.strand == "-":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
144 wStart = gene.cds_end - self.start_rel_cdsStart
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
145 if self.end_rel_cdsStart is not None:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
146 if gene.strand == "+":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
147 wEnd = gene.cds_start + self.end_rel_cdsStart
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
148 if gene.strand == "-":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
149 wEnd = gene.cds_end - self.end_rel_cdsStart
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
150
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
151 if self.start_rel_tss is not None:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
152 if gene.strand == "+":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
153 wStart = gene.chrom_start + self.start_rel_tss
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
154 if gene.strand == "-":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
155 wStart = gene.chrom_end - self.start_rel_tss
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
156 if self.end_rel_tss is not None:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
157 if gene.strand == "+":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
158 wEnd = gene.chrom_start + self.end_rel_tss
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
159 if gene.strand == "-":
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
160 wEnd = gene.chrom_end - self.end_rel_tss
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
161
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
162 cstart = min(wEnd, wStart)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
163 cend = max(wEnd, wStart)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
164 if cend >= start and cstart <= end:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
165 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
166 if self.coverage is None and cov > 0:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
167 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
168 else:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
169 if float(cov) / float(end - start) > 0.75:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
170 return True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
171
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
172 return False
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
173
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
174
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
175
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
176 ###ADD MORE FUNCTIONS HERE
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
177
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
178
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
179 ####
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
180
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
181 ###To add options to the command line, map the option character to a function
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
182 ###for example '-m' maps to gene_simple_meth_overlap
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
183
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
184 optionMap = {
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
185 "b": block_both_strand,
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
186 "s": block_same_strand,
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
187 "g": exon_same_strand,
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
188 "e": exon_both_strand
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
189 }
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
190
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
191
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
192
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
193 def genomicSegment2Matrix(genomicSegment, refGene, probeMapper):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
194 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
195 Take a genomicSegment map, compare it against a refGene table,
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
196 and contruct a genomicMatrix
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
197 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
198 out = CGData.GenomicMatrix.GenomicMatrix()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
199 out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
200 for id in genomicSegment.get_key_list():
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
201 for segment in genomicSegment.get_by(id):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
202 for hit in probeMapper.find_overlap( segment, refGene ):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
203 out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
204 return out
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
205
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
206
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
207 def filter_longest_form(refgene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
208 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
209 take a refgene table and filter multiple gene isoforms down to the longest
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
210 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
211 ng = CGData.RefGene.RefGene()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
212 for g in refgene.get_gene_list():
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
213 longest = None
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
214 length = 0
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
215 for elem in refgene.get_gene(g):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
216 newLength = elem.chrom_end - elem.chrom_start
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
217 if newLength > length:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
218 length = newLength
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
219 longest = elem
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
220 ng.add(longest)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
221 ng.loaded = True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
222 return ng
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
223
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
224
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
225 def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
226 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
227 Given
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
228 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
229 ng = filter_longest_form(refGene)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
230 #enumerate the col order of the sample ids
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
231 idList = genomicSegment.get_key_list()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
232
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
233 geneList = ng.get_gene_list()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
234
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
235 tmp = CGData.GenomicMatrix.GenomicMatrix()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
236 tmp.init_blank( rows=geneList, cols=idList )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
237 geneHits = {}
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
238 #read through the segment one sample id at a time
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
239 for id in idList:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
240 segmentMap = {}
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
241 for segment in genomicSegment.get_by(id):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
242 for hit in probeMapper.find_overlap( segment, ng ):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
243 span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
244 #if hit.name not in segmentMap:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
245 # segmentMap[hit.name] = []
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
246 try:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
247 segmentMap[hit.name].append(
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
248 ( span, segment.value )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
249 )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
250 except KeyError:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
251 segmentMap[hit.name] = [
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
252 ( span, segment.value )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
253 ]
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
254
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
255 for gene in segmentMap:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
256 geneHits[gene] = True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
257 mapInfo = segmentMap[gene]
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
258 coverage = sum( i[0] for i in mapInfo )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
259 assert coverage <= 1.0
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
260 value = sum( i[0]*i[1] for i in mapInfo )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
261 #print coverage, value, value/coverage, segmentMap[gene]
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
262 tmp.set_val(row_name=gene, col_name=id, value=value/coverage)
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
263
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
264 #now remove the blanks
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
265 out = CGData.GenomicMatrix.GenomicMatrix()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
266 out.init_blank( rows=geneHits, cols=idList )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
267 for gene in geneHits:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
268 for sample in idList:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
269 out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample))
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
270 return out
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
271
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
272
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
273
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
274 def aliasRemap(genomicMatrix, aliasMap):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
275 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
276 Given a genomicMatrix and an alias map, create a new genomic matrix
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
277 with the probes from the original matrix remapped to the connected aliases
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
278 from the map
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
279 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
280
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
281 am = {}
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
282 for probe in aliasMap.get_key_list():
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
283 for alias in aliasMap.get_by(probe):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
284 if alias not in am:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
285 am[alias.alias] = {}
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
286 am[alias.alias][probe] = True
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
287
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
288 out = CGData.GenomicMatrix.GenomicMatrix()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
289 out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
290 probeMap = genomicMatrix.get_row_map()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
291 for a in am:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
292 for sample in genomicMatrix.get_col_list():
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
293 o = []
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
294 for p in am[a]:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
295 if p in probeMap:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
296 o.append( genomicMatrix.get_val( col_name=sample, row_name=p) )
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
297 if len(o):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
298 out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o)))
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
299
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
300 return out
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
301
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
302 def refGeneLink2ProbeLoc(aliasMap, refGene):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
303 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
304 given an alias map, and a refGene produce a probeMap by connecting
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
305 alias symbols. Returns the coordinates of the longest form
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
306 """
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
307 out = CGData.ProbeLoc.ProbeLoc()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
308 out.init_blank()
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
309
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
310 for probe in aliasMap.get_key_list():
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
311 for link in aliasMap.get_by(probe):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
312 probe = link.probe
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
313 geneName = link.alias
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
314 sGene = None
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
315 try:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
316 for gene in refGene.get_gene(geneName):
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
317 if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
318 sGene = gene
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
319 except KeyError:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
320 pass
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
321 if sGene is not None:
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
322 out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end })
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
323 return out
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
324
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
325
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
326
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
327
ab20c0d04f4a add seg2matrix tool
jingchunzhu
parents:
diff changeset
328