Mercurial > repos > melissacline > ucsc_cancer_utilities
comparison seg2matrix/CGData/GeneMap.py @ 31:ab20c0d04f4a
add seg2matrix tool
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 13:10:11 -0700 |
parents | |
children | 3e5680fecd7a |
comparison
equal
deleted
inserted
replaced
30:7a7a52e9b019 | 31:ab20c0d04f4a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import csv | |
4 import sys | |
5 import re | |
6 | |
7 import CGData.ProbeMap | |
8 import CGData.GenomicMatrix | |
9 | |
10 class ProbeMapper(object): | |
11 """ | |
12 Class to map the probes. Expects handle to the refGene_hg18.table file | |
13 """ | |
14 | |
15 def __init__(self, mode='g'): | |
16 self.cmp_func = optionMap[mode] | |
17 | |
18 def find_overlap(self, segment, ref_gene, cmp_func=None): | |
19 """ | |
20 Function to find overlaps for a given probe description. | |
21 the cmp_func arg is a function that returns a 'True' or 'False' for | |
22 a given probe description and a gene, examples include 'gene_overlap' | |
23 and 'gene_simple_meth_overlap' | |
24 """ | |
25 if cmp_func is None: | |
26 cmp_func = self.cmp_func | |
27 if not ref_gene.has_chrom(segment.chrom): | |
28 return [] | |
29 chromList = ref_gene.get_chrom(segment.chrom) | |
30 | |
31 out = [] | |
32 for gene in chromList: | |
33 if cmp_func(segment.chrom_start, | |
34 segment.chrom_end, segment.strand, gene): | |
35 out.append(gene) | |
36 return out | |
37 | |
38 | |
39 # | |
40 # The set of functions that can be used to do comparisons | |
41 # | |
42 | |
43 | |
44 def block_both_strand(start, end, strand, gene): | |
45 """ | |
46 Check is segment is between gene start and end, either strand | |
47 | |
48 **Code 'b'** | |
49 """ | |
50 if gene.chrom_end >= start and gene.chrom_start <= end: | |
51 return True | |
52 return False | |
53 | |
54 def block_same_strand(start, end, strand, gene): | |
55 """ | |
56 Check is segment is on same strand, between gene start and end | |
57 | |
58 **Code 's'** | |
59 """ | |
60 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: | |
61 return True | |
62 return False | |
63 | |
64 | |
65 def exon_same_strand(start, end, strand, gene): | |
66 """ | |
67 Check is segment is on same strand, and occurs on an exon | |
68 | |
69 **Code 'g'** | |
70 """ | |
71 | |
72 if gene.strand != strand: | |
73 return False | |
74 for i in range(int(gene.ex_count)): | |
75 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
76 return True | |
77 return False | |
78 | |
79 | |
80 def exon_both_strand(start, end, strand, gene): | |
81 """ | |
82 Check is segment occurs on an exon, on either stand | |
83 | |
84 **Code 'e'** | |
85 """ | |
86 for i in range(int(gene.ex_count)): | |
87 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
88 return True | |
89 return False | |
90 | |
91 def block_same_strand_coverage75(start, end, strand, gene): | |
92 if gene.chrom_end >= start and gene.chrom_start <= end and strand == gene.strand: | |
93 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) | |
94 if float(cov) / float(end - start) > 0.75: | |
95 return True | |
96 return False | |
97 | |
98 | |
99 def exon_same_strand_coverage75(start, end, strand, gene): | |
100 if strand != gene.strand: | |
101 return False | |
102 cov = 0 | |
103 for i in range(int(gene.ex_count)): | |
104 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
105 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) | |
106 if float(cov) / float(end - start) > 0.75: | |
107 return True | |
108 return False | |
109 | |
110 | |
111 class Intersector: | |
112 def __init__(self, same_strand=True, exons=False, coverage=None, start_rel_cdsStart=None, end_rel_cdsStart=None, start_rel_tss=None, end_rel_tss=None): | |
113 self.same_strand = same_strand | |
114 self.exons = exons | |
115 self.coverage = coverage | |
116 self.start_rel_tss = start_rel_tss | |
117 self.end_rel_tss = end_rel_tss | |
118 self.start_rel_cdsStart = start_rel_cdsStart | |
119 self.end_rel_cdsStart = end_rel_cdsStart | |
120 | |
121 def hit(self, start, end, strand, gene): | |
122 if self.same_strand and strand != gene.strand: | |
123 return False | |
124 | |
125 if self.exons: | |
126 cov = 0 | |
127 for i in range(int(gene.ex_count)): | |
128 if gene.ex_end[i] >= start and gene.ex_start[i] <= end: | |
129 cov += min(gene.ex_end[i],end) - max(gene.ex_start[i], start) | |
130 if self.coverage is None and cov > 0: | |
131 return True | |
132 else: | |
133 if float(cov) / float(end - start) > self.coverage: | |
134 return True | |
135 else: | |
136 wStart = gene.chrom_start | |
137 wEnd = gene.chrom_end | |
138 | |
139 if self.start_rel_cdsStart is not None: | |
140 if gene.strand == "+": | |
141 wStart = gene.cds_start + self.start_rel_cdsStart | |
142 if gene.strand == "-": | |
143 wStart = gene.cds_end - self.start_rel_cdsStart | |
144 if self.end_rel_cdsStart is not None: | |
145 if gene.strand == "+": | |
146 wEnd = gene.cds_start + self.end_rel_cdsStart | |
147 if gene.strand == "-": | |
148 wEnd = gene.cds_end - self.end_rel_cdsStart | |
149 | |
150 if self.start_rel_tss is not None: | |
151 if gene.strand == "+": | |
152 wStart = gene.chrom_start + self.start_rel_tss | |
153 if gene.strand == "-": | |
154 wStart = gene.chrom_end - self.start_rel_tss | |
155 if self.end_rel_tss is not None: | |
156 if gene.strand == "+": | |
157 wEnd = gene.chrom_start + self.end_rel_tss | |
158 if gene.strand == "-": | |
159 wEnd = gene.chrom_end - self.end_rel_tss | |
160 | |
161 cstart = min(wEnd, wStart) | |
162 cend = max(wEnd, wStart) | |
163 if cend >= start and cstart <= end: | |
164 cov = min(gene.chrom_end,end) - max(gene.chrom_start, start) | |
165 if self.coverage is None and cov > 0: | |
166 return True | |
167 else: | |
168 if float(cov) / float(end - start) > 0.75: | |
169 return True | |
170 | |
171 return False | |
172 | |
173 | |
174 | |
175 ###ADD MORE FUNCTIONS HERE | |
176 | |
177 | |
178 #### | |
179 | |
180 ###To add options to the command line, map the option character to a function | |
181 ###for example '-m' maps to gene_simple_meth_overlap | |
182 | |
183 optionMap = { | |
184 "b": block_both_strand, | |
185 "s": block_same_strand, | |
186 "g": exon_same_strand, | |
187 "e": exon_both_strand | |
188 } | |
189 | |
190 | |
191 | |
192 def genomicSegment2Matrix(genomicSegment, refGene, probeMapper): | |
193 """ | |
194 Take a genomicSegment map, compare it against a refGene table, | |
195 and contruct a genomicMatrix | |
196 """ | |
197 out = CGData.GenomicMatrix.GenomicMatrix() | |
198 out.init_blank( rows=refGene.get_gene_list(), cols=genomicSegment.get_key_list() ) | |
199 for id in genomicSegment.get_key_list(): | |
200 for segment in genomicSegment.get_by(id): | |
201 for hit in probeMapper.find_overlap( segment, refGene ): | |
202 out.set_val(row_name=hit.name, col_name=segment.id, value=segment.value ) | |
203 return out | |
204 | |
205 | |
206 def filter_longest_form(refgene): | |
207 """ | |
208 take a refgene table and filter multiple gene isoforms down to the longest | |
209 """ | |
210 ng = CGData.RefGene.RefGene() | |
211 for g in refgene.get_gene_list(): | |
212 longest = None | |
213 length = 0 | |
214 for elem in refgene.get_gene(g): | |
215 newLength = elem.chrom_end - elem.chrom_start | |
216 if newLength > length: | |
217 length = newLength | |
218 longest = elem | |
219 ng.add(longest) | |
220 ng.loaded = True | |
221 return ng | |
222 | |
223 | |
224 def genomicSegment2MatrixNorm(genomicSegment, refGene, probeMapper): | |
225 """ | |
226 Given | |
227 """ | |
228 ng = filter_longest_form(refGene) | |
229 #enumerate the col order of the sample ids | |
230 idList = genomicSegment.get_key_list() | |
231 | |
232 geneList = ng.get_gene_list() | |
233 | |
234 tmp = CGData.GenomicMatrix.GenomicMatrix() | |
235 tmp.init_blank( rows=geneList, cols=idList ) | |
236 geneHits = {} | |
237 #read through the segment one sample id at a time | |
238 for id in idList: | |
239 segmentMap = {} | |
240 for segment in genomicSegment.get_by(id): | |
241 for hit in probeMapper.find_overlap( segment, ng ): | |
242 span = float(min(segment.chrom_end, hit.chrom_end) - max(segment.chrom_start, hit.chrom_start)) / float(hit.chrom_end - hit.chrom_start) | |
243 #if hit.name not in segmentMap: | |
244 # segmentMap[hit.name] = [] | |
245 try: | |
246 segmentMap[hit.name].append( | |
247 ( span, segment.value ) | |
248 ) | |
249 except KeyError: | |
250 segmentMap[hit.name] = [ | |
251 ( span, segment.value ) | |
252 ] | |
253 | |
254 for gene in segmentMap: | |
255 geneHits[gene] = True | |
256 mapInfo = segmentMap[gene] | |
257 coverage = sum( i[0] for i in mapInfo ) | |
258 assert coverage <= 1.0 | |
259 value = sum( i[0]*i[1] for i in mapInfo ) | |
260 #print coverage, value, value/coverage, segmentMap[gene] | |
261 tmp.set_val(row_name=gene, col_name=id, value=value/coverage) | |
262 | |
263 #now remove the blanks | |
264 out = CGData.GenomicMatrix.GenomicMatrix() | |
265 out.init_blank( rows=geneHits, cols=idList ) | |
266 for gene in geneHits: | |
267 for sample in idList: | |
268 out.set_val( row_name=gene, col_name=sample, value=tmp.get_val(row_name=gene, col_name=sample)) | |
269 return out | |
270 | |
271 | |
272 | |
273 def aliasRemap(genomicMatrix, aliasMap): | |
274 """ | |
275 Given a genomicMatrix and an alias map, create a new genomic matrix | |
276 with the probes from the original matrix remapped to the connected aliases | |
277 from the map | |
278 """ | |
279 | |
280 am = {} | |
281 for probe in aliasMap.get_key_list(): | |
282 for alias in aliasMap.get_by(probe): | |
283 if alias not in am: | |
284 am[alias.alias] = {} | |
285 am[alias.alias][probe] = True | |
286 | |
287 out = CGData.GenomicMatrix.GenomicMatrix() | |
288 out.init_blank( rows=am.keys(), cols=genomicMatrix.get_col_list() ) | |
289 probeMap = genomicMatrix.get_row_map() | |
290 for a in am: | |
291 for sample in genomicMatrix.get_col_list(): | |
292 o = [] | |
293 for p in am[a]: | |
294 if p in probeMap: | |
295 o.append( genomicMatrix.get_val( col_name=sample, row_name=p) ) | |
296 if len(o): | |
297 out.set_val(col_name=sample, row_name=a, value=sum(o) / float(len(o))) | |
298 | |
299 return out | |
300 | |
301 def refGeneLink2ProbeLoc(aliasMap, refGene): | |
302 """ | |
303 given an alias map, and a refGene produce a probeMap by connecting | |
304 alias symbols. Returns the coordinates of the longest form | |
305 """ | |
306 out = CGData.ProbeLoc.ProbeLoc() | |
307 out.init_blank() | |
308 | |
309 for probe in aliasMap.get_key_list(): | |
310 for link in aliasMap.get_by(probe): | |
311 probe = link.probe | |
312 geneName = link.alias | |
313 sGene = None | |
314 try: | |
315 for gene in refGene.get_gene(geneName): | |
316 if sGene is None or gene.chrom_end - gene.chrom_start > sGene.chrom_end - sGene.chrom_start: | |
317 sGene = gene | |
318 except KeyError: | |
319 pass | |
320 if sGene is not None: | |
321 out.insert(probe, { 'probe' : probe, 'chrom' : sGene.chrom, 'strand' : sGene.strand, 'chrom_start' : sGene.chrom_start, 'chrom_end' : sGene.chrom_end }) | |
322 return out | |
323 | |
324 | |
325 | |
326 | |
327 |