Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/EnsEMBL/TopLevelAssemblyMapper.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::TopLevelAssemblyMapper - | |
24 Handles mapping between a given coordinate system and the toplevel | |
25 pseudo coordinate system. | |
26 | |
27 =head1 SYNOPSIS | |
28 | |
29 $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); | |
30 $asma = $db->get_AssemblyMapperAdaptor(); | |
31 $csa = $db->get_CoordSystemAdaptor(); | |
32 | |
33 my $toplevel = $cs_adaptor->fetch_by_name('toplevel'); | |
34 my $ctg_cs = $cs_adaptor->fetch_by_name('contig'); | |
35 | |
36 $asm_mapper = $map_adaptor->fetch_by_CoordSystems( $cs1, $cs2 ); | |
37 | |
38 # map to toplevel coord system for this region | |
39 @chr_coords = | |
40 $asm_mapper->map( 'AL30421.1.200.92341', 100, 10000, -1, $ctg_cs ); | |
41 | |
42 # list toplevel seq_region_ids for this region | |
43 @chr_ids = | |
44 $asm_mapper->list_ids( 'AL30421.1.200.92341', 1, 1000, -1, | |
45 $ctg_cs ); | |
46 | |
47 =head1 DESCRIPTION | |
48 | |
49 The TopLevelAssemblyMapper performs mapping between a provided | |
50 coordinate system and the toplevel pseudo cooordinate system. The | |
51 toplevel coordinate system is not a real coordinate system, but | |
52 represents the highest coordinate system that can be mapped to in a | |
53 given region. It is only possible to perform unidirectional mapping | |
54 using this mapper, because it does not make sense to map from the | |
55 toplevel coordinate system to another coordinate system. | |
56 | |
57 =head1 METHODS | |
58 | |
59 =cut | |
60 | |
61 | |
62 use strict; | |
63 use warnings; | |
64 | |
65 package Bio::EnsEMBL::TopLevelAssemblyMapper; | |
66 | |
67 use Bio::EnsEMBL::Utils::Exception qw(throw); | |
68 use Bio::EnsEMBL::Mapper; | |
69 use Bio::EnsEMBL::CoordSystem; | |
70 use Scalar::Util qw(weaken); | |
71 | |
72 =head2 new | |
73 | |
74 Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for | |
75 the database this mapper is using. | |
76 Arg [2] : Toplevel CoordSystem | |
77 Arg [3] : Other CoordSystem | |
78 Description: Creates a new TopLevelAssemblyMapper object | |
79 Returntype : Bio::EnsEMBL::DBSQL::TopLevelAssemblyMapper | |
80 Exceptions : throws if any of the 3 arguments are missing/ not | |
81 : of the correct type. | |
82 Caller : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor | |
83 Status : Stable | |
84 | |
85 =cut | |
86 | |
87 | |
88 sub new { | |
89 my ($caller, $adaptor, $toplevel_cs, $other_cs) = @_; | |
90 | |
91 my $class = ref($caller) || $caller; | |
92 | |
93 if(!ref($toplevel_cs)) { | |
94 throw('Toplevel CoordSystem argument expected.'); | |
95 } | |
96 if(!ref($other_cs)) { | |
97 throw('Other CoordSystem argument expected.'); | |
98 } | |
99 | |
100 if(!$toplevel_cs->is_top_level()) { | |
101 throw($toplevel_cs->name() . " is not the toplevel CoordSystem."); | |
102 } | |
103 if($other_cs->is_top_level()) { | |
104 throw("Other CoordSystem argument should not be toplevel CoordSystem."); | |
105 } | |
106 | |
107 my $cs_adaptor = $adaptor->db()->get_CoordSystemAdaptor(); | |
108 my $coord_systems = $cs_adaptor->fetch_all(); | |
109 | |
110 my $self = bless {'coord_systems' => $coord_systems, | |
111 'toplevel_cs' => $toplevel_cs, | |
112 'other_cs' => $other_cs}, $class; | |
113 | |
114 $self->adaptor($adaptor); | |
115 return $self; | |
116 } | |
117 | |
118 sub adaptor { | |
119 my $self = shift; | |
120 | |
121 weaken($self->{'adaptor'} = shift) if(@_); | |
122 | |
123 return $self->{'adaptor'}; | |
124 } | |
125 | |
126 =head2 map | |
127 | |
128 Arg [1] : string $frm_seq_region | |
129 The name of the sequence region to transform FROM | |
130 Arg [2] : int $frm_start | |
131 The start of the region to transform FROM | |
132 Arg [3] : int $frm_end | |
133 The end of the region to transform FROM | |
134 Arg [4] : int $strand | |
135 The strand of the region to transform FROM | |
136 Arg [5] : Bio::EnsEMBL::CoordSystem | |
137 The coordinate system to transform FROM | |
138 Arg [6] : if set will do a fastmap | |
139 Example : @coords = $mapper->map('X', 1_000_000, 2_000_000, | |
140 1, $chr_cs); | |
141 Description: Transforms coordinates from one coordinate system | |
142 to another. | |
143 Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or | |
144 Bio::EnsEMBL::Mapper:Gap objects | |
145 Exceptions : thrown if if the specified TO coordinate system is not one | |
146 of the coordinate systems associated with this mapper | |
147 Caller : general | |
148 Status : Stable | |
149 | |
150 =cut | |
151 | |
152 | |
153 sub map { | |
154 throw('Incorrect number of arguments.') if(@_ != 6 && @_ != 7); | |
155 | |
156 my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_strand, $frm_cs, | |
157 $fastmap) = @_; | |
158 | |
159 if($frm_cs->is_top_level()) { | |
160 throw("The toplevel CoordSystem can only be mapped TO, not FROM."); | |
161 } | |
162 | |
163 my @tmp; | |
164 push @tmp, $frm_seq_region_name; | |
165 my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; | |
166 | |
167 my $mapper = $self->{'mapper'}; | |
168 my $toplevel_cs = $self->{'toplevel_cs'}; | |
169 my $other_cs = $self->{'other_cs'}; | |
170 my $adaptor = $self->adaptor; | |
171 | |
172 if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { | |
173 throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . | |
174 " is neither the assembled nor the component coordinate system " . | |
175 " of this AssemblyMapper"); | |
176 } | |
177 | |
178 my $coord_systems = $self->{'coord_systems'}; | |
179 | |
180 my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); | |
181 | |
182 # | |
183 # TBD try to make this more efficient | |
184 # | |
185 my $from_rank = $other_cs->rank(); | |
186 foreach my $cs (@$coord_systems) { | |
187 last if($cs->rank >= $from_rank); | |
188 | |
189 #check if a mapping path even exists to this coordinate system | |
190 my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; | |
191 | |
192 if(@mapping_path) { | |
193 | |
194 # Try to map to this coord system. If we get back any coordinates then | |
195 # it is our 'toplevel' that we were looking for | |
196 my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); | |
197 | |
198 if($fastmap) { | |
199 my @result = $mapper->fastmap($frm_seq_region_name, $frm_start, $frm_end, | |
200 $frm_strand, $frm_cs); | |
201 return @result if(@result); | |
202 } else { | |
203 my @coords = $mapper->map($frm_seq_region_name, $frm_start, $frm_end, | |
204 $frm_strand, $frm_cs); | |
205 | |
206 if(@coords > 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { | |
207 return @coords; | |
208 } | |
209 } | |
210 } | |
211 } | |
212 | |
213 # the toplevel coordinate system for the region requested *is* the | |
214 # requested region. | |
215 if($fastmap) { | |
216 return ($seq_region_id,$frm_start, $frm_end, $frm_strand, $other_cs); | |
217 } | |
218 return Bio::EnsEMBL::Mapper::Coordinate->new | |
219 ($seq_region_id,$frm_start,$frm_end, $frm_strand, $other_cs); | |
220 } | |
221 | |
222 # | |
223 # for polymorphism with AssemblyMapper | |
224 # | |
225 =head2 flush | |
226 | |
227 Args : none | |
228 Example : none | |
229 Description: polymorphism with AssemblyMapper, does nothing | |
230 Returntype : none | |
231 Exceptions : none | |
232 Status : Stable | |
233 | |
234 =cut | |
235 | |
236 sub flush {} | |
237 | |
238 =head2 fastmap | |
239 | |
240 Arg [1] : string $frm_seq_region | |
241 The name of the sequence region to transform FROM | |
242 Arg [2] : int $frm_start | |
243 The start of the region to transform FROM | |
244 Arg [3] : int $frm_end | |
245 The end of the region to transform FROM | |
246 Arg [4] : int $strand | |
247 The strand of the region to transform FROM | |
248 Arg [5] : Bio::EnsEMBL::CoordSystem | |
249 The coordinate system to transform FROM | |
250 Example : @coords = $mapper->fastmap('X', 1_000_000, 2_000_000, | |
251 1, $chr_cs); | |
252 Description: Transforms coordinates from one coordinate system | |
253 to another. | |
254 Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or | |
255 Bio::EnsEMBL::Mapper:Gap objects | |
256 Exceptions : thrown if if the specified TO coordinate system is not one | |
257 of the coordinate systems associated with this mapper | |
258 Caller : general | |
259 Status : Stable | |
260 | |
261 =cut | |
262 | |
263 sub fastmap { | |
264 my $self = shift; | |
265 return $self->map(@_,1); | |
266 } | |
267 | |
268 =head2 assembled_CoordSystem | |
269 | |
270 Arg [1] : none | |
271 Example : $cs = $mapper->assembled_CoordSystem | |
272 Description: Retrieves the assembled CoordSystem from this mapper | |
273 Returntype : Bio::EnsEMBL::CoordSystem | |
274 Exceptions : none | |
275 Caller : internal, AssemblyMapperAdaptor | |
276 Status : Stable | |
277 | |
278 =cut | |
279 | |
280 sub assembled_CoordSystem { | |
281 my $self = shift; | |
282 return $self->{'toplevel_cs'}; | |
283 } | |
284 | |
285 =head2 component_CoordSystem | |
286 | |
287 Arg [1] : none | |
288 Example : $cs = $mapper->component_CoordSystem | |
289 Description: Retrieves the component CoordSystem from this mapper | |
290 Returntype : Bio::EnsEMBL::CoordSystem | |
291 Exceptions : none | |
292 Caller : internal, AssemblyMapperAdaptor | |
293 Status : Stable | |
294 | |
295 =cut | |
296 | |
297 sub component_CoordSystem { | |
298 my $self = shift; | |
299 return $self->{'other_cs'}; | |
300 } | |
301 | |
302 | |
303 sub _list { | |
304 my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs, $seq_regions) = @_; | |
305 | |
306 my $mapper = $self->{'mapper'}; | |
307 my $toplevel_cs = $self->{'toplevel_cs'}; | |
308 my $other_cs = $self->{'other_cs'}; | |
309 my $adaptor = $self->adaptor; | |
310 | |
311 if($frm_cs->is_top_level()) { | |
312 throw("The toplevel CoordSystem can only be mapped TO, not FROM."); | |
313 } | |
314 if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { | |
315 throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . | |
316 " is neither the assembled nor the component coordinate system " . | |
317 " of this AssemblyMapper"); | |
318 } | |
319 | |
320 my $coord_systems = $self->{'coord_systems'}; | |
321 my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); | |
322 | |
323 # | |
324 # TBD try to make this more efficient | |
325 # | |
326 my $from_rank = $other_cs->rank(); | |
327 foreach my $cs (@$coord_systems) { | |
328 last if($cs->rank >= $from_rank); | |
329 | |
330 #check if a mapping path even exists to this coordinate system | |
331 my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; | |
332 | |
333 if(@mapping_path) { | |
334 | |
335 # Try to map to this coord system. If we get back any coordinates then | |
336 # it is our 'toplevel' that we were looking for | |
337 my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); | |
338 | |
339 my @result; | |
340 | |
341 my @tmp; | |
342 push @tmp, $frm_seq_region_name; | |
343 my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; | |
344 | |
345 if($seq_regions) { | |
346 @result = $mapper->list_seq_regions($frm_seq_region_name, $frm_start, | |
347 $frm_end, $frm_cs); | |
348 } else { | |
349 @result = $mapper->list_ids($frm_seq_region_name, $frm_start, | |
350 $frm_end, $frm_cs); | |
351 } | |
352 | |
353 return @result if(@result); | |
354 } | |
355 } | |
356 | |
357 # the toplevel coordinate system for the region requested *is* the | |
358 return ($frm_seq_region_name); | |
359 | |
360 | |
361 # requested region. | |
362 if($seq_regions) { | |
363 return ($frm_seq_region_name); | |
364 } | |
365 | |
366 #this seems a bit silly and inefficient, but it is probably never | |
367 #called anyway. | |
368 my $slice_adaptor = $adaptor->db()->get_SliceAdaptor(); | |
369 my $slice = $slice_adaptor->fetch_by_region($other_cs->name(), | |
370 $frm_seq_region_name, | |
371 undef,undef,undef,$other_cs); | |
372 return ($slice_adaptor->get_seq_region_id($slice)); | |
373 } | |
374 | |
375 | |
376 | |
377 =head2 list_seq_regions | |
378 | |
379 Arg [1] : string $frm_seq_region | |
380 The name of the sequence region of interest | |
381 Arg [2] : int $frm_start | |
382 The start of the region of interest | |
383 Arg [3] : int $frm_end | |
384 The end of the region to transform of interest | |
385 Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs | |
386 The coordinate system to obtain overlapping ids of | |
387 Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} | |
388 Description: Retrieves a list of overlapping seq_region names | |
389 of another coordinate system. This is the same as the | |
390 list_ids method but uses seq_region names rather internal ids | |
391 Returntype : List of strings | |
392 Exceptions : none | |
393 Caller : general | |
394 Status : Stable | |
395 | |
396 =cut | |
397 | |
398 sub list_seq_regions { | |
399 throw('Incorrect number of arguments.') if(@_ != 5); | |
400 return _list(@_,1); | |
401 } | |
402 | |
403 | |
404 =head2 list_ids | |
405 | |
406 Arg [1] : string $frm_seq_region | |
407 The name of the sequence region of interest. | |
408 Arg [2] : int $frm_start | |
409 The start of the region of interest | |
410 Arg [3] : int $frm_end | |
411 The end of the region to transform of interest | |
412 Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs | |
413 The coordinate system to obtain overlapping ids of | |
414 Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$chr_cs)) {...} | |
415 Description: Retrieves a list of overlapping seq_region internal identifiers | |
416 of another coordinate system. This is the same as the | |
417 list_seq_regions method but uses internal identfiers rather | |
418 than seq_region strings | |
419 Returntype : List of ints | |
420 Exceptions : thrown if the from CoordSystem is the toplevel coord system | |
421 thrown if the from CoordSystem is not the one used in the mapper | |
422 Caller : general | |
423 Status : Stable | |
424 | |
425 =cut | |
426 | |
427 sub list_ids { | |
428 throw('Incorrect number of arguments.') if(@_ != 5); | |
429 return _list(@_,0); | |
430 } | |
431 | |
432 | |
433 | |
434 | |
435 | |
436 1; |