31
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import csv
|
|
4 import sys
|
|
5 import re
|
|
6
|
|
7 import CGData.ProbeMap
|
|
8 import CGData.GenomicMatrix
|
|
9
|
|
10 class ProbeMapper(object):
|
|
11 """
|
|
12 Class to map the probes. Expects handle to the refGene_hg18.table file
|
|
13 """
|
|
14
|
|
15 def __init__(self, mode='g'):
|
|
16 self.cmp_func = optionMap[mode]
|
|
17
|
|
18 def find_overlap(self, segment, ref_gene, cmp_func=None):
|
|
19 """
|
|
20 Function to find overlaps for a given probe description.
|
|
21 the cmp_func arg is a function that returns a 'True' or 'False' for
|
|
22 a given probe description and a gene, examples include 'gene_overlap'
|
|
23 and 'gene_simple_meth_overlap'
|
|
24 """
|
|
25 if cmp_func is None:
|
|
26 cmp_func = self.cmp_func
|
58
|
27
|
31
|
28 if not ref_gene.has_chrom(segment.chrom):
|
|
29 return []
|
|
30 chromList = ref_gene.get_chrom(segment.chrom)
|
|
31
|
|
32 out = []
|
|
33 for gene in chromList:
|
|
34 if cmp_func(segment.chrom_start,
|
|
35 segment.chrom_end, segment.strand, gene):
|
|
36 out.append(gene)
|
|
37 return out
|
|
38
|
|
39
|
|
40 #
|
|
41 # The set of functions that can be used to do comparisons
|
|
42 #
|
|
43
|
|
44
|
|
45 def block_both_strand(start, end, strand, gene):
|
|
46 """
|
|
47 Check is segment is between gene start and end, either strand
|
|
48
|
|
49 **Code 'b'**
|
|
50 """
|
|
51 if gene.chrom_end >= start and gene.chrom_start <= end:
|
|
52 return True
|
|
53 return False
|
|
54
|
|
55 def block_same_strand(start, end, strand, gene):
|
|
56 """
|
|
57 Check is segment is on same strand, between gene start and end
|
|
58
|
|
59 **Code 's'**
|
|
60 """
|
|
61 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
|
|
62 return True
|
|
63 return False
|
|
64
|
|
65
|
|
66 def exon_same_strand(start, end, strand, gene):
|
|
67 """
|
|
68 Check is segment is on same strand, and occurs on an exon
|
|
69
|
|
70 **Code 'g'**
|
|
71 """
|
|
72
|
|
73 if gene.strand != strand:
|
|
74 return False
|
|
75 for i in range(int(gene.ex_count)):
|
|
76 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
|
|
77 return True
|
|
78 return False
|
|
79
|
|
80
|
|
81 def exon_both_strand(start, end, strand, gene):
|
|
82 """
|
|
83 Check is segment occurs on an exon, on either stand
|
|
84
|
|
85 **Code 'e'**
|
|
86 """
|
|
87 for i in range(int(gene.ex_count)):
|
|
88 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
|
|
89 return True
|
|
90 return False
|
|
91
|
|
92 def block_same_strand_coverage75(start, end, strand, gene):
|
|
93 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
|
|
94 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
|
|
95 if float(cov) / float(end - start) > 0.75:
|
|
96 return True
|
|
97 return False
|
|
98
|
|
99
|
|
100 def exon_same_strand_coverage75(start, end, strand, gene):
|
|
101 if strand != gene.strand:
|
|
102 return False
|
|
103 cov = 0
|
|
104 for i in range(int(gene.ex_count)):
|
|
105 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
|
|
106 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
|
|
107 if float(cov) / float(end - start) > 0.75:
|
|
108 return True
|
|
109 return False
|
|
110
|
|
111
|
|
112 class Intersector:
|
|
113 def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None):
|
|
114 self.same_strand = same_strand
|
|
115 self.exons = exons
|
|
116 self.coverage = coverage
|
|
117 self.start_rel_tss = start_rel_tss
|
|
118 self.end_rel_tss = end_rel_tss
|
|
119 self.start_rel_cdsStart = start_rel_cdsStart
|
|
120 self.end_rel_cdsStart = end_rel_cdsStart
|
|
121
|
|
122 def hit(self, start, end, strand, gene):
|
|
123 if self.same_strand and strand != gene.strand:
|
|
124 return False
|
|
125
|
|
126 if self.exons:
|
|
127 cov = 0
|
|
128 for i in range(int(gene.ex_count)):
|
|
129 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
|
|
130 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
|
|
131 if self.coverage is None and cov > 0:
|
|
132 return True
|
|
133 else:
|
|
134 if float(cov) / float(end - start) > self.coverage:
|
|
135 return True
|
|
136 else:
|
|
137 wStart = gene.chrom_start
|
|
138 wEnd = gene.chrom_end
|
|
139
|
|
140 if self.start_rel_cdsStart is not None:
|
|
141 if gene.strand == "+":
|
|
142 wStart = gene.cds_start + self.start_rel_cdsStart
|
|
143 if gene.strand == "-":
|
|
144 wStart = gene.cds_end - self.start_rel_cdsStart
|
|
145 if self.end_rel_cdsStart is not None:
|
|
146 if gene.strand == "+":
|
|
147 wEnd = gene.cds_start + self.end_rel_cdsStart
|
|
148 if gene.strand == "-":
|
|
149 wEnd = gene.cds_end - self.end_rel_cdsStart
|
|
150
|
|
151 if self.start_rel_tss is not None:
|
|
152 if gene.strand == "+":
|
|
153 wStart = gene.chrom_start + self.start_rel_tss
|
|
154 if gene.strand == "-":
|
|
155 wStart = gene.chrom_end - self.start_rel_tss
|
|
156 if self.end_rel_tss is not None:
|
|
157 if gene.strand == "+":
|
|
158 wEnd = gene.chrom_start + self.end_rel_tss
|
|
159 if gene.strand == "-":
|
|
160 wEnd = gene.chrom_end - self.end_rel_tss
|
|
161
|
|
162 cstart = min(wEnd, wStart)
|
|
163 cend = max(wEnd, wStart)
|
|
164 if cend >= start and cstart <= end:
|
|
165 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
|
|
166 if self.coverage is None and cov > 0:
|
|
167 return True
|
|
168 else:
|
|
169 if float(cov) / float(end - start) > 0.75:
|
|
170 return True
|
|
171
|
|
172 return False
|
|
173
|
|
174
|
|
175
|
|
176 ###ADD MORE FUNCTIONS HERE
|
|
177
|
|
178
|
|
179 ####
|
|
180
|
|
181 ###To add options to the command line, map the option character to a function
|
|
182 ###for example '-m' maps to gene_simple_meth_overlap
|
|
183
|
|
184 optionMap = {
|
|
185 "b": block_both_strand,
|
|
186 "s": block_same_strand,
|
|
187 "g": exon_same_strand,
|
|
188 "e": exon_both_strand
|
|
189 }
|
|
190
|
|
191
|
|
192
|
|
193 def genomicSegment2Matrix(genomicSegment, refGene, probeMapper):
|
|
194 """
|
|
195 Take a genomicSegment map, compare it against a refGene table,
|
|
196 and contruct a genomicMatrix
|
|
197 """
|
|
198 out = CGData.GenomicMatrix.GenomicMatrix()
|
|
199 out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() )
|
|
200 for id in genomicSegment.get_key_list():
|
|
201 for segment in genomicSegment.get_by(id):
|
|
202 for hit in probeMapper.find_overlap( segment, refGene ):
|
|
203 out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value )
|
|
204 return out
|
|
205
|
|
206
|
|
207 def filter_longest_form(refgene):
|
|
208 """
|
|
209 take a refgene table and filter multiple gene isoforms down to the longest
|
|
210 """
|
|
211 ng = CGData.RefGene.RefGene()
|
|
212 for g in refgene.get_gene_list():
|
|
213 longest = None
|
|
214 length = 0
|
|
215 for elem in refgene.get_gene(g):
|
|
216 newLength = elem.chrom_end - elem.chrom_start
|
|
217 if newLength > length:
|
|
218 length = newLength
|
|
219 longest = elem
|
|
220 ng.add(longest)
|
|
221 ng.loaded = True
|
|
222 return ng
|
|
223
|
|
224
|
|
225 def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper):
|
|
226 """
|
|
227 Given
|
|
228 """
|
|
229 ng = filter_longest_form(refGene)
|
|
230 #enumerate the col order of the sample ids
|
|
231 idList = genomicSegment.get_key_list()
|
|
232
|
|
233 geneList = ng.get_gene_list()
|
|
234
|
|
235 tmp = CGData.GenomicMatrix.GenomicMatrix()
|
|
236 tmp.init_blank( rows=geneList, cols=idList )
|
|
237 geneHits = {}
|
|
238 #read through the segment one sample id at a time
|
|
239 for id in idList:
|
|
240 segmentMap = {}
|
|
241 for segment in genomicSegment.get_by(id):
|
|
242 for hit in probeMapper.find_overlap( segment, ng ):
|
|
243 span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start)
|
|
244 #if hit.name not in segmentMap:
|
|
245 # segmentMap[hit.name] = []
|
|
246 try:
|
|
247 segmentMap[hit.name].append(
|
|
248 ( span, segment.value )
|
|
249 )
|
|
250 except KeyError:
|
|
251 segmentMap[hit.name] = [
|
|
252 ( span, segment.value )
|
|
253 ]
|
|
254
|
|
255 for gene in segmentMap:
|
|
256 geneHits[gene] = True
|
|
257 mapInfo = segmentMap[gene]
|
|
258 coverage = sum( i[0] for i in mapInfo )
|
|
259 assert coverage <= 1.0
|
|
260 value = sum( i[0]*i[1] for i in mapInfo )
|
|
261 #print coverage, value, value/coverage, segmentMap[gene]
|
|
262 tmp.set_val(row_name=gene, col_name=id, value=value/coverage)
|
|
263
|
|
264 #now remove the blanks
|
|
265 out = CGData.GenomicMatrix.GenomicMatrix()
|
|
266 out.init_blank( rows=geneHits, cols=idList )
|
|
267 for gene in geneHits:
|
|
268 for sample in idList:
|
|
269 out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample))
|
|
270 return out
|
|
271
|
|
272
|
|
273
|
|
274 def aliasRemap(genomicMatrix, aliasMap):
|
|
275 """
|
|
276 Given a genomicMatrix and an alias map, create a new genomic matrix
|
|
277 with the probes from the original matrix remapped to the connected aliases
|
|
278 from the map
|
|
279 """
|
|
280
|
|
281 am = {}
|
|
282 for probe in aliasMap.get_key_list():
|
|
283 for alias in aliasMap.get_by(probe):
|
|
284 if alias not in am:
|
|
285 am[alias.alias] = {}
|
|
286 am[alias.alias][probe] = True
|
|
287
|
|
288 out = CGData.GenomicMatrix.GenomicMatrix()
|
|
289 out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() )
|
|
290 probeMap = genomicMatrix.get_row_map()
|
|
291 for a in am:
|
|
292 for sample in genomicMatrix.get_col_list():
|
|
293 o = []
|
|
294 for p in am[a]:
|
|
295 if p in probeMap:
|
|
296 o.append( genomicMatrix.get_val( col_name=sample, row_name=p) )
|
|
297 if len(o):
|
|
298 out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o)))
|
|
299
|
|
300 return out
|
|
301
|
|
302 def refGeneLink2ProbeLoc(aliasMap, refGene):
|
|
303 """
|
|
304 given an alias map, and a refGene produce a probeMap by connecting
|
|
305 alias symbols. Returns the coordinates of the longest form
|
|
306 """
|
|
307 out = CGData.ProbeLoc.ProbeLoc()
|
|
308 out.init_blank()
|
|
309
|
|
310 for probe in aliasMap.get_key_list():
|
|
311 for link in aliasMap.get_by(probe):
|
|
312 probe = link.probe
|
|
313 geneName = link.alias
|
|
314 sGene = None
|
|
315 try:
|
|
316 for gene in refGene.get_gene(geneName):
|
|
317 if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start:
|
|
318 sGene = gene
|
|
319 except KeyError:
|
|
320 pass
|
|
321 if sGene is not None:
|
|
322 out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end })
|
|
323 return out
|
|
324
|
|
325
|
|
326
|
|
327
|
|
328
|