Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Mapper.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Mapper | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' ); | |
28 | |
29 # add a coodinate mapping - supply two pairs or coordinates | |
30 $map->add_map_coordinates( | |
31 $contig_id, $contig_start, $contig_end, $contig_ori, | |
32 $chr_name, chr_start, $chr_end | |
33 ); | |
34 | |
35 # map from one coordinate system to another | |
36 my @coordlist = | |
37 $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" ); | |
38 | |
39 =head1 DESCRIPTION | |
40 | |
41 Generic mapper to provide coordinate transforms between two disjoint | |
42 coordinate systems. This mapper is intended to be 'context neutral' - in | |
43 that it does not contain any code relating to any particular coordinate | |
44 system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper. | |
45 | |
46 =head1 METHODS | |
47 | |
48 =cut | |
49 | |
50 package Bio::EnsEMBL::Mapper; | |
51 use strict; | |
52 use integer; | |
53 | |
54 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); | |
55 use Bio::EnsEMBL::Mapper::Pair; | |
56 use Bio::EnsEMBL::Mapper::IndelPair; | |
57 use Bio::EnsEMBL::Mapper::Unit; | |
58 use Bio::EnsEMBL::Mapper::Coordinate; | |
59 use Bio::EnsEMBL::Mapper::IndelCoordinate; | |
60 use Bio::EnsEMBL::Mapper::Gap; | |
61 | |
62 use Bio::EnsEMBL::Utils::Exception qw(throw); | |
63 | |
64 # use Data::Dumper; | |
65 | |
66 =head2 new | |
67 | |
68 Arg [1] : string $from | |
69 The name of the 'from' coordinate system | |
70 Arg [2] : string $to | |
71 The name of the 'to' coordinate system | |
72 Arg [3] : (optional) Bio::EnsEMBL::CoordSystem $from_cs | |
73 The 'from' coordinate system | |
74 Arg [4] : (optional) Bio::EnsEMBL::CoordSystem $to_cs | |
75 Example : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO'); | |
76 Description: Constructor. Creates a new Bio::EnsEMBL::Mapper object. | |
77 Returntype : Bio::EnsEMBL::Mapper | |
78 Exceptions : none | |
79 Caller : general | |
80 | |
81 =cut | |
82 | |
83 sub new { | |
84 my ( $proto, $from, $to, $from_cs, $to_cs ) = @_; | |
85 | |
86 if ( !defined($to) || !defined($from) ) { | |
87 throw("Must supply 'to' and 'from' tags"); | |
88 } | |
89 | |
90 my $class = ref($proto) || $proto; | |
91 | |
92 my $self = bless( { "_pair_$from" => {}, | |
93 "_pair_$to" => {}, | |
94 'pair_count' => 0, | |
95 'to' => $to, | |
96 'from' => $from, | |
97 'to_cs' => $to_cs, | |
98 'from_cs' => $from_cs | |
99 }, | |
100 $class ); | |
101 | |
102 # do sql to get any componente with muliple assemblys. | |
103 | |
104 return $self; | |
105 } | |
106 | |
107 =head2 flush | |
108 | |
109 Args : none | |
110 Example : none | |
111 Description: removes all cached information out of this mapper | |
112 Returntype : none | |
113 Exceptions : none | |
114 Caller : AssemblyMapper, ChainedAssemblyMapper | |
115 | |
116 =cut | |
117 | |
118 sub flush { | |
119 my $self = shift; | |
120 my $from = $self->from(); | |
121 my $to = $self->to(); | |
122 | |
123 $self->{"_pair_$from"} = {}; | |
124 $self->{"_pair_$to"} = {}; | |
125 | |
126 $self->{'pair_count'} = 0; | |
127 } | |
128 | |
129 | |
130 | |
131 =head2 map_coordinates | |
132 | |
133 Arg 1 string $id | |
134 id of 'source' sequence | |
135 Arg 2 int $start | |
136 start coordinate of 'source' sequence | |
137 Arg 3 int $end | |
138 end coordinate of 'source' sequence | |
139 Arg 4 int $strand | |
140 raw contig orientation (+/- 1) | |
141 Arg 5 string $type | |
142 nature of transform - gives the type of | |
143 coordinates to be transformed *from* | |
144 Function generic map method | |
145 Returntype array of Bio::EnsEMBL::Mapper::Coordinate | |
146 and/or Bio::EnsEMBL::Mapper::Gap | |
147 Exceptions none | |
148 Caller Bio::EnsEMBL::Mapper | |
149 | |
150 =cut | |
151 | |
152 sub map_coordinates { | |
153 my ( $self, $id, $start, $end, $strand, $type ) = @_; | |
154 | |
155 unless ( defined($id) | |
156 && defined($start) | |
157 && defined($end) | |
158 && defined($strand) | |
159 && defined($type) ) | |
160 { | |
161 throw("Expecting 5 arguments"); | |
162 } | |
163 | |
164 # special case for handling inserts: | |
165 if ( $start == $end + 1 ) { | |
166 return $self->map_insert( $id, $start, $end, $strand, $type ); | |
167 } | |
168 | |
169 if ( !$self->{'_is_sorted'} ) { $self->_sort() } | |
170 | |
171 my $hash = $self->{"_pair_$type"}; | |
172 | |
173 my ( $from, $to, $cs ); | |
174 | |
175 if ( $type eq $self->{'to'} ) { | |
176 $from = 'to'; | |
177 $to = 'from'; | |
178 $cs = $self->{'from_cs'}; | |
179 } else { | |
180 $from = 'from'; | |
181 $to = 'to'; | |
182 $cs = $self->{'to_cs'}; | |
183 } | |
184 | |
185 unless ( defined $hash ) { | |
186 throw("Type $type is neither to or from coordinate systems"); | |
187 } | |
188 | |
189 if ( !defined $hash->{ uc($id) } ) { | |
190 # one big gap! | |
191 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end ); | |
192 return $gap; | |
193 } | |
194 | |
195 my $last_used_pair; | |
196 my @result; | |
197 | |
198 my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord ); | |
199 my $lr = $hash->{ uc($id) }; | |
200 | |
201 $start_idx = 0; | |
202 $end_idx = $#{$lr}; | |
203 | |
204 # binary search the relevant pairs | |
205 # helps if the list is big | |
206 while ( ( $end_idx - $start_idx ) > 1 ) { | |
207 $mid_idx = ( $start_idx + $end_idx ) >> 1; | |
208 $pair = $lr->[$mid_idx]; | |
209 $self_coord = $pair->{$from}; | |
210 if ( $self_coord->{'end'} < $start ) { | |
211 $start_idx = $mid_idx; | |
212 } else { | |
213 $end_idx = $mid_idx; | |
214 } | |
215 } | |
216 | |
217 my $rank = 0; | |
218 my $orig_start = $start; | |
219 my $last_target_coord = undef; | |
220 for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) { | |
221 $pair = $lr->[$i]; | |
222 my $self_coord = $pair->{$from}; | |
223 my $target_coord = $pair->{$to}; | |
224 | |
225 # | |
226 # But not the case for haplotypes!! need to test for this case??? | |
227 # so removing this till a better solution is found | |
228 # | |
229 # | |
230 # if($self_coord->{'start'} < $start){ | |
231 # $start = $orig_start; | |
232 # $rank++; | |
233 # } | |
234 | |
235 if ( defined($last_target_coord) | |
236 and $target_coord->{'id'} ne $last_target_coord ) | |
237 { | |
238 if ( $self_coord->{'start'} < $start ) | |
239 { # i.e. the same bit is being mapped to another assembled bit | |
240 $start = $orig_start; | |
241 } | |
242 } else { | |
243 $last_target_coord = $target_coord->{'id'}; | |
244 } | |
245 | |
246 # if we haven't even reached the start, move on | |
247 if ( $self_coord->{'end'} < $orig_start ) { | |
248 next; | |
249 } | |
250 | |
251 # if we have over run, break | |
252 if ( $self_coord->{'start'} > $end ) { | |
253 last; | |
254 } | |
255 | |
256 if ( $start < $self_coord->{'start'} ) { | |
257 # gap detected | |
258 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, | |
259 $self_coord->{'start'} - 1, $rank ); | |
260 | |
261 push( @result, $gap ); | |
262 $start = $gap->{'end'} + 1; | |
263 } | |
264 my ( $target_start, $target_end, $target_ori ); | |
265 my $res; | |
266 if ( exists $pair->{'indel'} ) { | |
267 # When next pair is an IndelPair and not a Coordinate, create the | |
268 # new mapping Coordinate, the IndelCoordinate. | |
269 $target_start = $target_coord->{'start'}; | |
270 $target_end = $target_coord->{'end'}; | |
271 | |
272 #create a Gap object | |
273 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, | |
274 ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) ); | |
275 #create the Coordinate object | |
276 my $coord = | |
277 Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'}, | |
278 $target_start, $target_end, $pair->{'ori'}*$strand, $cs ); | |
279 #and finally, the IndelCoordinate object with | |
280 $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord ); | |
281 } else { | |
282 # start is somewhere inside the region | |
283 if ( $pair->{'ori'} == 1 ) { | |
284 $target_start = | |
285 $target_coord->{'start'} + | |
286 ( $start - $self_coord->{'start'} ); | |
287 } else { | |
288 $target_end = | |
289 $target_coord->{'end'} - ( $start - $self_coord->{'start'} ); | |
290 } | |
291 | |
292 # Either we are enveloping this map or not. If yes, then end | |
293 # point (self perspective) is determined solely by target. If | |
294 # not we need to adjust. | |
295 | |
296 if ( $end > $self_coord->{'end'} ) { | |
297 # enveloped | |
298 if ( $pair->{'ori'} == 1 ) { | |
299 $target_end = $target_coord->{'end'}; | |
300 } else { | |
301 $target_start = $target_coord->{'start'}; | |
302 } | |
303 } else { | |
304 # need to adjust end | |
305 if ( $pair->{'ori'} == 1 ) { | |
306 $target_end = | |
307 $target_coord->{'start'} + | |
308 ( $end - $self_coord->{'start'} ); | |
309 } else { | |
310 $target_start = | |
311 $target_coord->{'end'} - ( $end - $self_coord->{'start'} ); | |
312 } | |
313 } | |
314 | |
315 $res = | |
316 Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'}, | |
317 $target_start, $target_end, $pair->{'ori'}*$strand, | |
318 $cs, $rank ); | |
319 | |
320 } ## end else [ if ( exists $pair->{'indel'...})] | |
321 | |
322 push( @result, $res ); | |
323 | |
324 $last_used_pair = $pair; | |
325 $start = $self_coord->{'end'} + 1; | |
326 | |
327 } ## end for ( my $i = $start_idx...) | |
328 | |
329 if ( !defined $last_used_pair ) { | |
330 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end ); | |
331 push( @result, $gap ); | |
332 | |
333 } elsif ( $last_used_pair->{$from}->{'end'} < $end ) { | |
334 # gap at the end | |
335 my $gap = | |
336 Bio::EnsEMBL::Mapper::Gap->new( | |
337 $last_used_pair->{$from}->{'end'} + 1, | |
338 $end, $rank ); | |
339 push( @result, $gap ); | |
340 } | |
341 | |
342 if ( $strand == -1 ) { | |
343 @result = reverse(@result); | |
344 } | |
345 | |
346 return @result; | |
347 } ## end sub map_coordinates | |
348 | |
349 | |
350 | |
351 =head2 map_insert | |
352 | |
353 Arg [1] : string $id | |
354 Arg [2] : int $start - start coord. Since this is an insert should always | |
355 be one greater than end. | |
356 Arg [3] : int $end - end coord. Since this is an insert should always | |
357 be one less than start. | |
358 Arg [4] : int $strand (0, 1, -1) | |
359 Arg [5] : string $type - the coordinate system name the coords are from. | |
360 Arg [6] : boolean $fastmap - if specified, this is being called from | |
361 the fastmap call. The mapping done is not any faster for | |
362 inserts, but the return value is different. | |
363 Example : | |
364 Description: This is in internal function which handles the special mapping | |
365 case for inserts (start = end +1). This function will be called | |
366 automatically by the map function so there is no reason to | |
367 call it directly. | |
368 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects | |
369 Exceptions : none | |
370 Caller : map_coordinates() | |
371 | |
372 =cut | |
373 | |
374 sub map_insert { | |
375 my ($self, $id, $start, $end, $strand, $type, $fastmap) = @_; | |
376 | |
377 # swap start/end and map the resultant 2bp coordinate | |
378 ($start, $end) =($end,$start); | |
379 | |
380 my @coords = $self->map_coordinates($id, $start, $end, $strand, $type); | |
381 | |
382 if(@coords == 1) { | |
383 my $c = $coords[0]; | |
384 # swap start and end to convert back into insert | |
385 ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'}); | |
386 } else { | |
387 throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2); | |
388 | |
389 # adjust coordinates, remove gaps | |
390 my ($c1, $c2); | |
391 if($strand == -1) { | |
392 ($c2,$c1) = @coords; | |
393 } else { | |
394 ($c1, $c2) = @coords; | |
395 } | |
396 @coords = (); | |
397 | |
398 if(ref($c1) eq 'Bio::EnsEMBL::Mapper::Coordinate') { | |
399 # insert is after first coord | |
400 if($c1->{'strand'} * $strand == -1) { | |
401 $c1->{'end'}--; | |
402 } else { | |
403 $c1->{'start'}++; | |
404 } | |
405 @coords = ($c1); | |
406 } | |
407 if(ref($c2) eq 'Bio::EnsEMBL::Mapper::Coordinate') { | |
408 # insert is before second coord | |
409 if($c2->{'strand'} * $strand == -1) { | |
410 $c2->{'start'}++; | |
411 } else { | |
412 $c2->{'end'}--; | |
413 } | |
414 if($strand == -1) { | |
415 unshift @coords, $c2; | |
416 } else { | |
417 push @coords, $c2; | |
418 } | |
419 } | |
420 } | |
421 | |
422 if($fastmap) { | |
423 return undef if(@coords != 1); | |
424 my $c = $coords[0]; | |
425 return ($c->{'id'}, $c->{'start'}, $c->{'end'}, | |
426 $c->{'strand'}, $c->{'coord_system'}); | |
427 } | |
428 | |
429 return @coords; | |
430 } | |
431 | |
432 | |
433 | |
434 | |
435 | |
436 | |
437 =head2 fastmap | |
438 | |
439 Arg 1 string $id | |
440 id of 'source' sequence | |
441 Arg 2 int $start | |
442 start coordinate of 'source' sequence | |
443 Arg 3 int $end | |
444 end coordinate of 'source' sequence | |
445 Arg 4 int $strand | |
446 raw contig orientation (+/- 1) | |
447 Arg 5 int $type | |
448 nature of transform - gives the type of | |
449 coordinates to be transformed *from* | |
450 Function inferior map method. Will only do ungapped unsplit mapping. | |
451 Will return id, start, end strand in a list. | |
452 Returntype list of results | |
453 Exceptions none | |
454 Caller Bio::EnsEMBL::AssemblyMapper | |
455 | |
456 =cut | |
457 | |
458 sub fastmap { | |
459 my ($self, $id, $start, $end, $strand, $type) = @_; | |
460 | |
461 my ($from, $to, $cs); | |
462 | |
463 if($end+1 == $start) { | |
464 return $self->map_insert($id, $start, $end, $strand, $type, 1); | |
465 } | |
466 | |
467 if( ! $self->{'_is_sorted'} ) { $self->_sort() } | |
468 | |
469 if($type eq $self->{'to'}) { | |
470 $from = 'to'; | |
471 $to = 'from'; | |
472 $cs = $self->{'from_cs'}; | |
473 } else { | |
474 $from = 'from'; | |
475 $to = 'to'; | |
476 $cs = $self->{'to_cs'}; | |
477 } | |
478 | |
479 my $hash = $self->{"_pair_$type"} or | |
480 throw("Type $type is neither to or from coordinate systems"); | |
481 | |
482 | |
483 my $pairs = $hash->{uc($id)}; | |
484 | |
485 foreach my $pair (@$pairs) { | |
486 my $self_coord = $pair->{$from}; | |
487 my $target_coord = $pair->{$to}; | |
488 | |
489 # only super easy mapping is done | |
490 if( $start < $self_coord->{'start'} || | |
491 $end > $self_coord->{'end'} ) { | |
492 next; | |
493 } | |
494 | |
495 if( $pair->{'ori'} == 1 ) { | |
496 return ( $target_coord->{'id'}, | |
497 $target_coord->{'start'}+$start-$self_coord->{'start'}, | |
498 $target_coord->{'start'}+$end-$self_coord->{'start'}, | |
499 $strand, $cs ); | |
500 } else { | |
501 return ( $target_coord->{'id'}, | |
502 $target_coord->{'end'} - ($end - $self_coord->{'start'}), | |
503 $target_coord->{'end'} - ($start - $self_coord->{'start'}), | |
504 -$strand, $cs ); | |
505 } | |
506 } | |
507 | |
508 return (); | |
509 } | |
510 | |
511 | |
512 | |
513 =head2 add_map_coordinates | |
514 | |
515 Arg 1 int $id | |
516 id of 'source' sequence | |
517 Arg 2 int $start | |
518 start coordinate of 'source' sequence | |
519 Arg 3 int $end | |
520 end coordinate of 'source' sequence | |
521 Arg 4 int $strand | |
522 relative orientation of source and target (+/- 1) | |
523 Arg 5 int $id | |
524 id of 'target' sequence | |
525 Arg 6 int $start | |
526 start coordinate of 'target' sequence | |
527 Arg 7 int $end | |
528 end coordinate of 'target' sequence | |
529 Function Stores details of mapping between | |
530 'source' and 'target' regions. | |
531 Returntype none | |
532 Exceptions none | |
533 Caller Bio::EnsEMBL::Mapper | |
534 | |
535 =cut | |
536 | |
537 sub add_map_coordinates { | |
538 my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori, | |
539 $chr_name, $chr_start, $chr_end ) | |
540 = @_; | |
541 | |
542 unless ( defined($contig_id) | |
543 && defined($contig_start) | |
544 && defined($contig_end) | |
545 && defined($contig_ori) | |
546 && defined($chr_name) | |
547 && defined($chr_start) | |
548 && defined($chr_end) ) | |
549 { | |
550 throw("7 arguments expected"); | |
551 } | |
552 | |
553 if ( ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) { | |
554 throw("Cannot deal with mis-lengthed mappings so far"); | |
555 } | |
556 | |
557 my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start, | |
558 $contig_end ); | |
559 my $to = | |
560 Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end ); | |
561 | |
562 my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori ); | |
563 | |
564 # place into hash on both ids | |
565 my $map_to = $self->{'to'}; | |
566 my $map_from = $self->{'from'}; | |
567 | |
568 push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } }, $pair ); | |
569 push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair ); | |
570 | |
571 $self->{'pair_count'}++; | |
572 $self->{'_is_sorted'} = 0; | |
573 } ## end sub add_map_coordinates | |
574 | |
575 | |
576 =head2 add_indel_coordinates | |
577 | |
578 Arg 1 int $id | |
579 id of 'source' sequence | |
580 Arg 2 int $start | |
581 start coordinate of 'source' sequence | |
582 Arg 3 int $end | |
583 end coordinate of 'source' sequence | |
584 Arg 4 int $strand | |
585 relative orientation of source and target (+/- 1) | |
586 Arg 5 int $id | |
587 id of 'targe' sequence | |
588 Arg 6 int $start | |
589 start coordinate of 'targe' sequence | |
590 Arg 7 int $end | |
591 end coordinate of 'targe' sequence | |
592 Function stores details of mapping between two regions: | |
593 'source' and 'target'. Returns 1 if the pair was added, 0 if it | |
594 was already in. Used when adding an indel | |
595 Returntype int 0,1 | |
596 Exceptions none | |
597 Caller Bio::EnsEMBL::Mapper | |
598 | |
599 =cut | |
600 | |
601 sub add_indel_coordinates{ | |
602 my ($self, $contig_id, $contig_start, $contig_end, | |
603 $contig_ori, $chr_name, $chr_start, $chr_end) = @_; | |
604 | |
605 unless(defined($contig_id) && defined($contig_start) && defined($contig_end) | |
606 && defined($contig_ori) && defined($chr_name) && defined($chr_start) | |
607 && defined($chr_end)) { | |
608 throw("7 arguments expected"); | |
609 } | |
610 | |
611 #we need to create the IndelPair object to add to both lists, to and from | |
612 my $from = | |
613 Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end); | |
614 my $to = | |
615 Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end); | |
616 | |
617 my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori); | |
618 | |
619 # place into hash on both ids | |
620 my $map_to = $self->{'to'}; | |
621 my $map_from = $self->{'from'}; | |
622 | |
623 push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair ); | |
624 push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair ); | |
625 | |
626 $self->{'pair_count'}++; | |
627 | |
628 $self->{'_is_sorted'} = 0; | |
629 return 1; | |
630 } | |
631 | |
632 | |
633 =head2 map_indel | |
634 | |
635 Arg [1] : string $id | |
636 Arg [2] : int $start - start coord. Since this is an indel should always | |
637 be one greater than end. | |
638 Arg [3] : int $end - end coord. Since this is an indel should always | |
639 be one less than start. | |
640 Arg [4] : int $strand (0, 1, -1) | |
641 Arg [5] : string $type - the coordinate system name the coords are from. | |
642 Example : @coords = $mapper->map_indel(); | |
643 Description: This is in internal function which handles the special mapping | |
644 case for indels (start = end +1). It will be used to map from | |
645 a coordinate system with a gap to another that contains an | |
646 insertion. It will be mainly used by the Variation API. | |
647 Returntype : Bio::EnsEMBL::Mapper::Unit objects | |
648 Exceptions : none | |
649 Caller : general | |
650 | |
651 =cut | |
652 | |
653 sub map_indel { | |
654 my ( $self, $id, $start, $end, $strand, $type ) = @_; | |
655 | |
656 # swap start/end and map the resultant 2bp coordinate | |
657 ( $start, $end ) = ( $end, $start ); | |
658 | |
659 if ( !$self->{'_is_sorted'} ) { $self->_sort() } | |
660 | |
661 my $hash = $self->{"_pair_$type"}; | |
662 | |
663 my ( $from, $to, $cs ); | |
664 | |
665 if ( $type eq $self->{'to'} ) { | |
666 $from = 'to'; | |
667 $to = 'from'; | |
668 $cs = $self->{'from_cs'}; | |
669 } else { | |
670 $from = 'from'; | |
671 $to = 'to'; | |
672 $cs = $self->{'to_cs'}; | |
673 } | |
674 | |
675 unless ( defined $hash ) { | |
676 throw("Type $type is neither to or from coordinate systems"); | |
677 } | |
678 my @indel_coordinates; | |
679 | |
680 my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord ); | |
681 my $lr = $hash->{ uc($id) }; | |
682 | |
683 $start_idx = 0; | |
684 $end_idx = $#{$lr}; | |
685 | |
686 # binary search the relevant pairs | |
687 # helps if the list is big | |
688 while ( ( $end_idx - $start_idx ) > 1 ) { | |
689 $mid_idx = ( $start_idx + $end_idx ) >> 1; | |
690 $pair = $lr->[$mid_idx]; | |
691 $self_coord = $pair->{$from}; | |
692 if ( $self_coord->{'end'} <= $start ) { | |
693 $start_idx = $mid_idx; | |
694 } else { | |
695 $end_idx = $mid_idx; | |
696 } | |
697 } | |
698 | |
699 for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) { | |
700 $pair = $lr->[$i]; | |
701 my $self_coord = $pair->{$from}; | |
702 my $target_coord = $pair->{$to}; | |
703 | |
704 if ( exists $pair->{'indel'} ) { | |
705 #need to return unit coordinate | |
706 my $to = | |
707 Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'}, | |
708 $target_coord->{'start'}, | |
709 $target_coord->{'end'}, ); | |
710 push @indel_coordinates, $to; | |
711 last; | |
712 } | |
713 } | |
714 | |
715 return @indel_coordinates; | |
716 } ## end sub map_indel | |
717 | |
718 | |
719 =head2 add_Mapper | |
720 | |
721 Arg 1 Bio::EnsEMBL::Mapper $mapper2 | |
722 Example $mapper->add_Mapper($mapper2) | |
723 Function add all the map coordinates from $mapper to this mapper. | |
724 This object will contain mapping pairs from both the old | |
725 object and $mapper2. | |
726 Returntype int 0,1 | |
727 Exceptions throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers | |
728 are incompatible | |
729 Caller $mapper->methodname() | |
730 | |
731 =cut | |
732 | |
733 sub add_Mapper{ | |
734 my ($self, $mapper) = @_; | |
735 | |
736 my $mapper_to = $mapper->{'to'}; | |
737 my $mapper_from = $mapper->{'from'}; | |
738 if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) { | |
739 throw("Trying to add an incompatible Mapper"); | |
740 } | |
741 | |
742 my $count_a = 0; | |
743 foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) { | |
744 push(@{$self->{"_pair_$mapper_to"}->{$seq_name}}, | |
745 @{$mapper->{"_pair_$mapper_to"}->{$seq_name}}); | |
746 $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}}); | |
747 } | |
748 my $count_b = 0; | |
749 foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) { | |
750 push(@{$self->{"_pair_$mapper_from"}->{$seq_name}}, | |
751 @{$mapper->{"_pair_$mapper_from"}->{$seq_name}}); | |
752 $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}}); | |
753 } | |
754 | |
755 if ($count_a == $count_b) { | |
756 $self->{'pair_count'} += $count_a; | |
757 } else { | |
758 throw("Trying to add a funny Mapper"); | |
759 } | |
760 | |
761 $self->{'_is_sorted'} = 0; | |
762 return 1; | |
763 } | |
764 | |
765 | |
766 | |
767 =head2 list_pairs | |
768 | |
769 Arg 1 int $id | |
770 id of 'source' sequence | |
771 Arg 2 int $start | |
772 start coordinate of 'source' sequence | |
773 Arg 3 int $end | |
774 end coordinate of 'source' sequence | |
775 Arg 4 string $type | |
776 nature of transform - gives the type of | |
777 coordinates to be transformed *from* | |
778 Function list all pairs of mappings in a region | |
779 Returntype list of Bio::EnsEMBL::Mapper::Pair | |
780 Exceptions none | |
781 Caller Bio::EnsEMBL::Mapper | |
782 | |
783 =cut | |
784 | |
785 sub list_pairs { | |
786 my ( $self, $id, $start, $end, $type ) = @_; | |
787 | |
788 if ( !$self->{'_is_sorted'} ) { $self->_sort() } | |
789 | |
790 if ( !defined $type ) { | |
791 throw("Expected 4 arguments"); | |
792 } | |
793 | |
794 if ( $start > $end ) { | |
795 throw( "Start is greater than end " | |
796 . "for id $id, start $start, end $end\n" ); | |
797 } | |
798 | |
799 my $hash = $self->{"_pair_$type"}; | |
800 | |
801 my ( $from, $to ); | |
802 | |
803 if ( $type eq $self->{'to'} ) { | |
804 $from = 'to'; | |
805 $to = 'from'; | |
806 } else { | |
807 $from = 'from'; | |
808 $to = 'to'; | |
809 } | |
810 | |
811 unless ( defined $hash ) { | |
812 throw("Type $type is neither to or from coordinate systems"); | |
813 } | |
814 | |
815 my @list; | |
816 | |
817 unless ( exists $hash->{ uc($id) } ) { | |
818 return (); | |
819 } | |
820 | |
821 @list = @{ $hash->{ uc($id) } }; | |
822 | |
823 my @output; | |
824 if ( $start == -1 && $end == -1 ) { | |
825 return @list; | |
826 } else { | |
827 | |
828 foreach my $p (@list) { | |
829 | |
830 if ( $p->{$from}->{'end'} < $start ) { | |
831 next; | |
832 } | |
833 if ( $p->{$from}->{'start'} > $end ) { | |
834 last; | |
835 } | |
836 push( @output, $p ); | |
837 } | |
838 return @output; | |
839 } | |
840 } ## end sub list_pairs | |
841 | |
842 | |
843 =head2 to | |
844 | |
845 Arg 1 Bio::EnsEMBL::Mapper::Unit $id | |
846 id of 'source' sequence | |
847 Function accessor method form the 'source' | |
848 and 'target' in a Mapper::Pair | |
849 Returntype Bio::EnsEMBL::Mapper::Unit | |
850 Exceptions none | |
851 Caller Bio::EnsEMBL::Mapper | |
852 | |
853 =cut | |
854 | |
855 sub to { | |
856 my ( $self, $value ) = @_; | |
857 | |
858 if ( defined($value) ) { | |
859 $self->{'to'} = $value; | |
860 } | |
861 | |
862 return $self->{'to'}; | |
863 } | |
864 | |
865 =head2 from | |
866 | |
867 Arg 1 Bio::EnsEMBL::Mapper::Unit $id | |
868 id of 'source' sequence | |
869 Function accessor method form the 'source' | |
870 and 'target' in a Mapper::Pair | |
871 Returntype Bio::EnsEMBL::Mapper::Unit | |
872 Exceptions none | |
873 Caller Bio::EnsEMBL::Mapper | |
874 | |
875 =cut | |
876 sub from { | |
877 my ( $self, $value ) = @_; | |
878 | |
879 if ( defined($value) ) { | |
880 $self->{'from'} = $value; | |
881 } | |
882 | |
883 return $self->{'from'}; | |
884 } | |
885 | |
886 | |
887 # _dump | |
888 # | |
889 # Arg 1 *FileHandle $fh | |
890 # Function convenience dump function | |
891 # possibly useful for debugging | |
892 # Returntype none | |
893 # Exceptions none | |
894 # Caller internal | |
895 # | |
896 | |
897 sub _dump{ | |
898 my ($self,$fh) = @_; | |
899 | |
900 if( !defined $fh ) { | |
901 $fh = \*STDERR; | |
902 } | |
903 | |
904 foreach my $id ( keys %{$self->{'_pair_hash_from'}} ) { | |
905 print $fh "From Hash $id\n"; | |
906 foreach my $pair ( @{$self->{'_pair_hash_from'}->{uc($id)}} ) { | |
907 print $fh " ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n"; | |
908 } | |
909 } | |
910 | |
911 } | |
912 | |
913 | |
914 # _sort | |
915 # | |
916 # Function sort function so that all | |
917 # mappings are sorted by | |
918 # chromosome start | |
919 # Returntype none | |
920 # Exceptions none | |
921 # Caller internal | |
922 # | |
923 | |
924 sub _sort { | |
925 my ($self) = @_; | |
926 | |
927 my $to = $self->{'to'}; | |
928 my $from = $self->{'from'}; | |
929 | |
930 foreach my $id ( keys %{ $self->{"_pair_$from"} } ) { | |
931 @{ $self->{"_pair_$from"}->{$id} } = | |
932 sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} } | |
933 @{ $self->{"_pair_$from"}->{$id} }; | |
934 } | |
935 | |
936 foreach my $id ( keys %{ $self->{"_pair_$to"} } ) { | |
937 @{ $self->{"_pair_$to"}->{$id} } = | |
938 sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} } | |
939 @{ $self->{"_pair_$to"}->{$id} }; | |
940 } | |
941 | |
942 $self->_merge_pairs(); | |
943 $self->_is_sorted(1); | |
944 } | |
945 | |
946 # this function merges pairs that are adjacent into one | |
947 sub _merge_pairs { | |
948 my $self = shift; | |
949 | |
950 my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair ); | |
951 | |
952 my $map_to = $self->{'to'}; | |
953 my $map_from = $self->{'from'}; | |
954 $self->{'pair_count'} = 0; | |
955 | |
956 for my $key ( keys %{$self->{"_pair_$map_to"}} ) { | |
957 $lr = $self->{"_pair_$map_to"}->{$key}; | |
958 | |
959 my $i = 0; | |
960 my $next = 1; | |
961 my $length = $#{$lr}; | |
962 while( $next <= $length ) { | |
963 $current_pair = $lr->[$i]; | |
964 $next_pair = $lr->[$next]; | |
965 $del_pair = undef; | |
966 | |
967 if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){ | |
968 #necessary to modify the merge function to not merge indels | |
969 $next++; | |
970 $i++; | |
971 } | |
972 else{ | |
973 # duplicate filter | |
974 if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'} | |
975 and $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'} ) { | |
976 $del_pair = $next_pair; | |
977 } elsif(( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) && | |
978 ( $next_pair->{'ori'} == $current_pair->{'ori'} ) && | |
979 ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) { | |
980 | |
981 if( $current_pair->{'ori'} == 1 ) { | |
982 # check forward strand merge | |
983 if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) { | |
984 # normal merge with previous element | |
985 $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'}; | |
986 $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'}; | |
987 $del_pair = $next_pair; | |
988 } | |
989 } else { | |
990 # check backward strand merge | |
991 if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) { | |
992 # yes its a merge | |
993 $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'}; | |
994 $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'}; | |
995 $del_pair = $next_pair; | |
996 } | |
997 } | |
998 } | |
999 | |
1000 if( defined $del_pair ) { | |
1001 splice( @$lr, $next, 1 ); | |
1002 $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})}; | |
1003 for( my $j=0; $j <= $#{$lr_from}; $j++ ) { | |
1004 if( $lr_from->[$j] == $del_pair ) { | |
1005 splice( @$lr_from, $j, 1 ); | |
1006 last; | |
1007 } | |
1008 } | |
1009 $length--; | |
1010 if( $length < $next ) { last; } | |
1011 } | |
1012 else { | |
1013 $next++; | |
1014 $i++; | |
1015 } | |
1016 } | |
1017 | |
1018 } | |
1019 $self->{'pair_count'} += scalar( @$lr ); | |
1020 } | |
1021 } | |
1022 | |
1023 | |
1024 # _is_sorted | |
1025 # | |
1026 # Arg 1 int $sorted | |
1027 # Function toggle for whether the (internal) | |
1028 # map data are sorted | |
1029 # Returntype int | |
1030 # Exceptions none | |
1031 # Caller internal | |
1032 # | |
1033 | |
1034 sub _is_sorted { | |
1035 my ($self, $value) = @_; | |
1036 $self->{'_is_sorted'} = $value if (defined($value)); | |
1037 return $self->{'_is_sorted'}; | |
1038 } | |
1039 | |
1040 | |
1041 1; |