comparison seg2matrix/CGData/GeneMap.py @ 31:ab20c0d04f4a

add seg2matrix tool
author jingchunzhu
date Fri, 24 Jul 2015 13:10:11 -0700
parents
children 3e5680fecd7a
comparison
equal deleted inserted replaced
30:7a7a52e9b019 31:ab20c0d04f4a
1 #!/usr/bin/env python
2
3 import csv
4 import sys
5 import re
6
7 import CGData.ProbeMap
8 import CGData.GenomicMatrix
9
10 class ProbeMapper(object):
11 """
12 Class to map the probes. Expects handle to the refGene_hg18.table file
13 """
14
15 def __init__(self, mode='g'):
16 self.cmp_func = optionMap[mode]
17
18 def find_overlap(self, segment, ref_gene, cmp_func=None):
19 """
20 Function to find overlaps for a given probe description.
21 the cmp_func arg is a function that returns a 'True' or 'False' for
22 a given probe description and a gene, examples include 'gene_overlap'
23 and 'gene_simple_meth_overlap'
24 """
25 if cmp_func is None:
26 cmp_func = self.cmp_func
27 if not ref_gene.has_chrom(segment.chrom):
28 return []
29 chromList = ref_gene.get_chrom(segment.chrom)
30
31 out = []
32 for gene in chromList:
33 if cmp_func(segment.chrom_start,
34 segment.chrom_end, segment.strand, gene):
35 out.append(gene)
36 return out
37
38
39 #
40 # The set of functions that can be used to do comparisons
41 #
42
43
44 def block_both_strand(start, end, strand, gene):
45 """
46 Check is segment is between gene start and end, either strand
47
48 **Code 'b'**
49 """
50 if gene.chrom_end >= start and gene.chrom_start <= end:
51 return True
52 return False
53
54 def block_same_strand(start, end, strand, gene):
55 """
56 Check is segment is on same strand, between gene start and end
57
58 **Code 's'**
59 """
60 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
61 return True
62 return False
63
64
65 def exon_same_strand(start, end, strand, gene):
66 """
67 Check is segment is on same strand, and occurs on an exon
68
69 **Code 'g'**
70 """
71
72 if gene.strand != strand:
73 return False
74 for i in range(int(gene.ex_count)):
75 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
76 return True
77 return False
78
79
80 def exon_both_strand(start, end, strand, gene):
81 """
82 Check is segment occurs on an exon, on either stand
83
84 **Code 'e'**
85 """
86 for i in range(int(gene.ex_count)):
87 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
88 return True
89 return False
90
91 def block_same_strand_coverage75(start, end, strand, gene):
92 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand:
93 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
94 if float(cov) / float(end - start) > 0.75:
95 return True
96 return False
97
98
99 def exon_same_strand_coverage75(start, end, strand, gene):
100 if strand != gene.strand:
101 return False
102 cov = 0
103 for i in range(int(gene.ex_count)):
104 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
105 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
106 if float(cov) / float(end - start) > 0.75:
107 return True
108 return False
109
110
111 class Intersector:
112 def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None):
113 self.same_strand = same_strand
114 self.exons = exons
115 self.coverage = coverage
116 self.start_rel_tss = start_rel_tss
117 self.end_rel_tss = end_rel_tss
118 self.start_rel_cdsStart = start_rel_cdsStart
119 self.end_rel_cdsStart = end_rel_cdsStart
120
121 def hit(self, start, end, strand, gene):
122 if self.same_strand and strand != gene.strand:
123 return False
124
125 if self.exons:
126 cov = 0
127 for i in range(int(gene.ex_count)):
128 if gene.ex_end[i] >= start and gene.ex_start[i] <= end:
129 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start)
130 if self.coverage is None and cov > 0:
131 return True
132 else:
133 if float(cov) / float(end - start) > self.coverage:
134 return True
135 else:
136 wStart = gene.chrom_start
137 wEnd = gene.chrom_end
138
139 if self.start_rel_cdsStart is not None:
140 if gene.strand == "+":
141 wStart = gene.cds_start + self.start_rel_cdsStart
142 if gene.strand == "-":
143 wStart = gene.cds_end - self.start_rel_cdsStart
144 if self.end_rel_cdsStart is not None:
145 if gene.strand == "+":
146 wEnd = gene.cds_start + self.end_rel_cdsStart
147 if gene.strand == "-":
148 wEnd = gene.cds_end - self.end_rel_cdsStart
149
150 if self.start_rel_tss is not None:
151 if gene.strand == "+":
152 wStart = gene.chrom_start + self.start_rel_tss
153 if gene.strand == "-":
154 wStart = gene.chrom_end - self.start_rel_tss
155 if self.end_rel_tss is not None:
156 if gene.strand == "+":
157 wEnd = gene.chrom_start + self.end_rel_tss
158 if gene.strand == "-":
159 wEnd = gene.chrom_end - self.end_rel_tss
160
161 cstart = min(wEnd, wStart)
162 cend = max(wEnd, wStart)
163 if cend >= start and cstart <= end:
164 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start)
165 if self.coverage is None and cov > 0:
166 return True
167 else:
168 if float(cov) / float(end - start) > 0.75:
169 return True
170
171 return False
172
173
174
175 ###ADD MORE FUNCTIONS HERE
176
177
178 ####
179
180 ###To add options to the command line, map the option character to a function
181 ###for example '-m' maps to gene_simple_meth_overlap
182
183 optionMap = {
184 "b": block_both_strand,
185 "s": block_same_strand,
186 "g": exon_same_strand,
187 "e": exon_both_strand
188 }
189
190
191
192 def genomicSegment2Matrix(genomicSegment, refGene, probeMapper):
193 """
194 Take a genomicSegment map, compare it against a refGene table,
195 and contruct a genomicMatrix
196 """
197 out = CGData.GenomicMatrix.GenomicMatrix()
198 out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() )
199 for id in genomicSegment.get_key_list():
200 for segment in genomicSegment.get_by(id):
201 for hit in probeMapper.find_overlap( segment, refGene ):
202 out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value )
203 return out
204
205
206 def filter_longest_form(refgene):
207 """
208 take a refgene table and filter multiple gene isoforms down to the longest
209 """
210 ng = CGData.RefGene.RefGene()
211 for g in refgene.get_gene_list():
212 longest = None
213 length = 0
214 for elem in refgene.get_gene(g):
215 newLength = elem.chrom_end - elem.chrom_start
216 if newLength > length:
217 length = newLength
218 longest = elem
219 ng.add(longest)
220 ng.loaded = True
221 return ng
222
223
224 def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper):
225 """
226 Given
227 """
228 ng = filter_longest_form(refGene)
229 #enumerate the col order of the sample ids
230 idList = genomicSegment.get_key_list()
231
232 geneList = ng.get_gene_list()
233
234 tmp = CGData.GenomicMatrix.GenomicMatrix()
235 tmp.init_blank( rows=geneList, cols=idList )
236 geneHits = {}
237 #read through the segment one sample id at a time
238 for id in idList:
239 segmentMap = {}
240 for segment in genomicSegment.get_by(id):
241 for hit in probeMapper.find_overlap( segment, ng ):
242 span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start)
243 #if hit.name not in segmentMap:
244 # segmentMap[hit.name] = []
245 try:
246 segmentMap[hit.name].append(
247 ( span, segment.value )
248 )
249 except KeyError:
250 segmentMap[hit.name] = [
251 ( span, segment.value )
252 ]
253
254 for gene in segmentMap:
255 geneHits[gene] = True
256 mapInfo = segmentMap[gene]
257 coverage = sum( i[0] for i in mapInfo )
258 assert coverage <= 1.0
259 value = sum( i[0]*i[1] for i in mapInfo )
260 #print coverage, value, value/coverage, segmentMap[gene]
261 tmp.set_val(row_name=gene, col_name=id, value=value/coverage)
262
263 #now remove the blanks
264 out = CGData.GenomicMatrix.GenomicMatrix()
265 out.init_blank( rows=geneHits, cols=idList )
266 for gene in geneHits:
267 for sample in idList:
268 out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample))
269 return out
270
271
272
273 def aliasRemap(genomicMatrix, aliasMap):
274 """
275 Given a genomicMatrix and an alias map, create a new genomic matrix
276 with the probes from the original matrix remapped to the connected aliases
277 from the map
278 """
279
280 am = {}
281 for probe in aliasMap.get_key_list():
282 for alias in aliasMap.get_by(probe):
283 if alias not in am:
284 am[alias.alias] = {}
285 am[alias.alias][probe] = True
286
287 out = CGData.GenomicMatrix.GenomicMatrix()
288 out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() )
289 probeMap = genomicMatrix.get_row_map()
290 for a in am:
291 for sample in genomicMatrix.get_col_list():
292 o = []
293 for p in am[a]:
294 if p in probeMap:
295 o.append( genomicMatrix.get_val( col_name=sample, row_name=p) )
296 if len(o):
297 out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o)))
298
299 return out
300
301 def refGeneLink2ProbeLoc(aliasMap, refGene):
302 """
303 given an alias map, and a refGene produce a probeMap by connecting
304 alias symbols. Returns the coordinates of the longest form
305 """
306 out = CGData.ProbeLoc.ProbeLoc()
307 out.init_blank()
308
309 for probe in aliasMap.get_key_list():
310 for link in aliasMap.get_by(probe):
311 probe = link.probe
312 geneName = link.alias
313 sGene = None
314 try:
315 for gene in refGene.get_gene(geneName):
316 if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start:
317 sGene = gene
318 except KeyError:
319 pass
320 if sGene is not None:
321 out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end })
322 return out
323
324
325
326
327