comparison variant_effect_predictor/Bio/EnsEMBL/Mapper.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Mapper
24
25 =head1 SYNOPSIS
26
27 $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' );
28
29 # add a coodinate mapping - supply two pairs or coordinates
30 $map->add_map_coordinates(
31 $contig_id, $contig_start, $contig_end, $contig_ori,
32 $chr_name, chr_start, $chr_end
33 );
34
35 # map from one coordinate system to another
36 my @coordlist =
37 $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" );
38
39 =head1 DESCRIPTION
40
41 Generic mapper to provide coordinate transforms between two disjoint
42 coordinate systems. This mapper is intended to be 'context neutral' - in
43 that it does not contain any code relating to any particular coordinate
44 system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper.
45
46 =head1 METHODS
47
48 =cut
49
50 package Bio::EnsEMBL::Mapper;
51 use strict;
52 use integer;
53
54 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump);
55 use Bio::EnsEMBL::Mapper::Pair;
56 use Bio::EnsEMBL::Mapper::IndelPair;
57 use Bio::EnsEMBL::Mapper::Unit;
58 use Bio::EnsEMBL::Mapper::Coordinate;
59 use Bio::EnsEMBL::Mapper::IndelCoordinate;
60 use Bio::EnsEMBL::Mapper::Gap;
61
62 use Bio::EnsEMBL::Utils::Exception qw(throw);
63
64 # use Data::Dumper;
65
66 =head2 new
67
68 Arg [1] : string $from
69 The name of the 'from' coordinate system
70 Arg [2] : string $to
71 The name of the 'to' coordinate system
72 Arg [3] : (optional) Bio::EnsEMBL::CoordSystem $from_cs
73 The 'from' coordinate system
74 Arg [4] : (optional) Bio::EnsEMBL::CoordSystem $to_cs
75 Example : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO');
76 Description: Constructor. Creates a new Bio::EnsEMBL::Mapper object.
77 Returntype : Bio::EnsEMBL::Mapper
78 Exceptions : none
79 Caller : general
80
81 =cut
82
83 sub new {
84 my ( $proto, $from, $to, $from_cs, $to_cs ) = @_;
85
86 if ( !defined($to) || !defined($from) ) {
87 throw("Must supply 'to' and 'from' tags");
88 }
89
90 my $class = ref($proto) || $proto;
91
92 my $self = bless( { "_pair_$from" => {},
93 "_pair_$to" => {},
94 'pair_count' => 0,
95 'to' => $to,
96 'from' => $from,
97 'to_cs' => $to_cs,
98 'from_cs' => $from_cs
99 },
100 $class );
101
102 # do sql to get any componente with muliple assemblys.
103
104 return $self;
105 }
106
107 =head2 flush
108
109 Args : none
110 Example : none
111 Description: removes all cached information out of this mapper
112 Returntype : none
113 Exceptions : none
114 Caller : AssemblyMapper, ChainedAssemblyMapper
115
116 =cut
117
118 sub flush {
119 my $self = shift;
120 my $from = $self->from();
121 my $to = $self->to();
122
123 $self->{"_pair_$from"} = {};
124 $self->{"_pair_$to"} = {};
125
126 $self->{'pair_count'} = 0;
127 }
128
129
130
131 =head2 map_coordinates
132
133 Arg 1 string $id
134 id of 'source' sequence
135 Arg 2 int $start
136 start coordinate of 'source' sequence
137 Arg 3 int $end
138 end coordinate of 'source' sequence
139 Arg 4 int $strand
140 raw contig orientation (+/- 1)
141 Arg 5 string $type
142 nature of transform - gives the type of
143 coordinates to be transformed *from*
144 Function generic map method
145 Returntype array of Bio::EnsEMBL::Mapper::Coordinate
146 and/or Bio::EnsEMBL::Mapper::Gap
147 Exceptions none
148 Caller Bio::EnsEMBL::Mapper
149
150 =cut
151
152 sub map_coordinates {
153 my ( $self, $id, $start, $end, $strand, $type ) = @_;
154
155 unless ( defined($id)
156 && defined($start)
157 && defined($end)
158 && defined($strand)
159 && defined($type) )
160 {
161 throw("Expecting 5 arguments");
162 }
163
164 # special case for handling inserts:
165 if ( $start == $end + 1 ) {
166 return $self->map_insert( $id, $start, $end, $strand, $type );
167 }
168
169 if ( !$self->{'_is_sorted'} ) { $self->_sort() }
170
171 my $hash = $self->{"_pair_$type"};
172
173 my ( $from, $to, $cs );
174
175 if ( $type eq $self->{'to'} ) {
176 $from = 'to';
177 $to = 'from';
178 $cs = $self->{'from_cs'};
179 } else {
180 $from = 'from';
181 $to = 'to';
182 $cs = $self->{'to_cs'};
183 }
184
185 unless ( defined $hash ) {
186 throw("Type $type is neither to or from coordinate systems");
187 }
188
189 if ( !defined $hash->{ uc($id) } ) {
190 # one big gap!
191 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
192 return $gap;
193 }
194
195 my $last_used_pair;
196 my @result;
197
198 my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
199 my $lr = $hash->{ uc($id) };
200
201 $start_idx = 0;
202 $end_idx = $#{$lr};
203
204 # binary search the relevant pairs
205 # helps if the list is big
206 while ( ( $end_idx - $start_idx ) > 1 ) {
207 $mid_idx = ( $start_idx + $end_idx ) >> 1;
208 $pair = $lr->[$mid_idx];
209 $self_coord = $pair->{$from};
210 if ( $self_coord->{'end'} < $start ) {
211 $start_idx = $mid_idx;
212 } else {
213 $end_idx = $mid_idx;
214 }
215 }
216
217 my $rank = 0;
218 my $orig_start = $start;
219 my $last_target_coord = undef;
220 for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
221 $pair = $lr->[$i];
222 my $self_coord = $pair->{$from};
223 my $target_coord = $pair->{$to};
224
225 #
226 # But not the case for haplotypes!! need to test for this case???
227 # so removing this till a better solution is found
228 #
229 #
230 # if($self_coord->{'start'} < $start){
231 # $start = $orig_start;
232 # $rank++;
233 # }
234
235 if ( defined($last_target_coord)
236 and $target_coord->{'id'} ne $last_target_coord )
237 {
238 if ( $self_coord->{'start'} < $start )
239 { # i.e. the same bit is being mapped to another assembled bit
240 $start = $orig_start;
241 }
242 } else {
243 $last_target_coord = $target_coord->{'id'};
244 }
245
246 # if we haven't even reached the start, move on
247 if ( $self_coord->{'end'} < $orig_start ) {
248 next;
249 }
250
251 # if we have over run, break
252 if ( $self_coord->{'start'} > $end ) {
253 last;
254 }
255
256 if ( $start < $self_coord->{'start'} ) {
257 # gap detected
258 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
259 $self_coord->{'start'} - 1, $rank );
260
261 push( @result, $gap );
262 $start = $gap->{'end'} + 1;
263 }
264 my ( $target_start, $target_end, $target_ori );
265 my $res;
266 if ( exists $pair->{'indel'} ) {
267 # When next pair is an IndelPair and not a Coordinate, create the
268 # new mapping Coordinate, the IndelCoordinate.
269 $target_start = $target_coord->{'start'};
270 $target_end = $target_coord->{'end'};
271
272 #create a Gap object
273 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
274 ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) );
275 #create the Coordinate object
276 my $coord =
277 Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
278 $target_start, $target_end, $pair->{'ori'}*$strand, $cs );
279 #and finally, the IndelCoordinate object with
280 $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord );
281 } else {
282 # start is somewhere inside the region
283 if ( $pair->{'ori'} == 1 ) {
284 $target_start =
285 $target_coord->{'start'} +
286 ( $start - $self_coord->{'start'} );
287 } else {
288 $target_end =
289 $target_coord->{'end'} - ( $start - $self_coord->{'start'} );
290 }
291
292 # Either we are enveloping this map or not. If yes, then end
293 # point (self perspective) is determined solely by target. If
294 # not we need to adjust.
295
296 if ( $end > $self_coord->{'end'} ) {
297 # enveloped
298 if ( $pair->{'ori'} == 1 ) {
299 $target_end = $target_coord->{'end'};
300 } else {
301 $target_start = $target_coord->{'start'};
302 }
303 } else {
304 # need to adjust end
305 if ( $pair->{'ori'} == 1 ) {
306 $target_end =
307 $target_coord->{'start'} +
308 ( $end - $self_coord->{'start'} );
309 } else {
310 $target_start =
311 $target_coord->{'end'} - ( $end - $self_coord->{'start'} );
312 }
313 }
314
315 $res =
316 Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
317 $target_start, $target_end, $pair->{'ori'}*$strand,
318 $cs, $rank );
319
320 } ## end else [ if ( exists $pair->{'indel'...})]
321
322 push( @result, $res );
323
324 $last_used_pair = $pair;
325 $start = $self_coord->{'end'} + 1;
326
327 } ## end for ( my $i = $start_idx...)
328
329 if ( !defined $last_used_pair ) {
330 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
331 push( @result, $gap );
332
333 } elsif ( $last_used_pair->{$from}->{'end'} < $end ) {
334 # gap at the end
335 my $gap =
336 Bio::EnsEMBL::Mapper::Gap->new(
337 $last_used_pair->{$from}->{'end'} + 1,
338 $end, $rank );
339 push( @result, $gap );
340 }
341
342 if ( $strand == -1 ) {
343 @result = reverse(@result);
344 }
345
346 return @result;
347 } ## end sub map_coordinates
348
349
350
351 =head2 map_insert
352
353 Arg [1] : string $id
354 Arg [2] : int $start - start coord. Since this is an insert should always
355 be one greater than end.
356 Arg [3] : int $end - end coord. Since this is an insert should always
357 be one less than start.
358 Arg [4] : int $strand (0, 1, -1)
359 Arg [5] : string $type - the coordinate system name the coords are from.
360 Arg [6] : boolean $fastmap - if specified, this is being called from
361 the fastmap call. The mapping done is not any faster for
362 inserts, but the return value is different.
363 Example :
364 Description: This is in internal function which handles the special mapping
365 case for inserts (start = end +1). This function will be called
366 automatically by the map function so there is no reason to
367 call it directly.
368 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects
369 Exceptions : none
370 Caller : map_coordinates()
371
372 =cut
373
374 sub map_insert {
375 my ($self, $id, $start, $end, $strand, $type, $fastmap) = @_;
376
377 # swap start/end and map the resultant 2bp coordinate
378 ($start, $end) =($end,$start);
379
380 my @coords = $self->map_coordinates($id, $start, $end, $strand, $type);
381
382 if(@coords == 1) {
383 my $c = $coords[0];
384 # swap start and end to convert back into insert
385 ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'});
386 } else {
387 throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2);
388
389 # adjust coordinates, remove gaps
390 my ($c1, $c2);
391 if($strand == -1) {
392 ($c2,$c1) = @coords;
393 } else {
394 ($c1, $c2) = @coords;
395 }
396 @coords = ();
397
398 if(ref($c1) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
399 # insert is after first coord
400 if($c1->{'strand'} * $strand == -1) {
401 $c1->{'end'}--;
402 } else {
403 $c1->{'start'}++;
404 }
405 @coords = ($c1);
406 }
407 if(ref($c2) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
408 # insert is before second coord
409 if($c2->{'strand'} * $strand == -1) {
410 $c2->{'start'}++;
411 } else {
412 $c2->{'end'}--;
413 }
414 if($strand == -1) {
415 unshift @coords, $c2;
416 } else {
417 push @coords, $c2;
418 }
419 }
420 }
421
422 if($fastmap) {
423 return undef if(@coords != 1);
424 my $c = $coords[0];
425 return ($c->{'id'}, $c->{'start'}, $c->{'end'},
426 $c->{'strand'}, $c->{'coord_system'});
427 }
428
429 return @coords;
430 }
431
432
433
434
435
436
437 =head2 fastmap
438
439 Arg 1 string $id
440 id of 'source' sequence
441 Arg 2 int $start
442 start coordinate of 'source' sequence
443 Arg 3 int $end
444 end coordinate of 'source' sequence
445 Arg 4 int $strand
446 raw contig orientation (+/- 1)
447 Arg 5 int $type
448 nature of transform - gives the type of
449 coordinates to be transformed *from*
450 Function inferior map method. Will only do ungapped unsplit mapping.
451 Will return id, start, end strand in a list.
452 Returntype list of results
453 Exceptions none
454 Caller Bio::EnsEMBL::AssemblyMapper
455
456 =cut
457
458 sub fastmap {
459 my ($self, $id, $start, $end, $strand, $type) = @_;
460
461 my ($from, $to, $cs);
462
463 if($end+1 == $start) {
464 return $self->map_insert($id, $start, $end, $strand, $type, 1);
465 }
466
467 if( ! $self->{'_is_sorted'} ) { $self->_sort() }
468
469 if($type eq $self->{'to'}) {
470 $from = 'to';
471 $to = 'from';
472 $cs = $self->{'from_cs'};
473 } else {
474 $from = 'from';
475 $to = 'to';
476 $cs = $self->{'to_cs'};
477 }
478
479 my $hash = $self->{"_pair_$type"} or
480 throw("Type $type is neither to or from coordinate systems");
481
482
483 my $pairs = $hash->{uc($id)};
484
485 foreach my $pair (@$pairs) {
486 my $self_coord = $pair->{$from};
487 my $target_coord = $pair->{$to};
488
489 # only super easy mapping is done
490 if( $start < $self_coord->{'start'} ||
491 $end > $self_coord->{'end'} ) {
492 next;
493 }
494
495 if( $pair->{'ori'} == 1 ) {
496 return ( $target_coord->{'id'},
497 $target_coord->{'start'}+$start-$self_coord->{'start'},
498 $target_coord->{'start'}+$end-$self_coord->{'start'},
499 $strand, $cs );
500 } else {
501 return ( $target_coord->{'id'},
502 $target_coord->{'end'} - ($end - $self_coord->{'start'}),
503 $target_coord->{'end'} - ($start - $self_coord->{'start'}),
504 -$strand, $cs );
505 }
506 }
507
508 return ();
509 }
510
511
512
513 =head2 add_map_coordinates
514
515 Arg 1 int $id
516 id of 'source' sequence
517 Arg 2 int $start
518 start coordinate of 'source' sequence
519 Arg 3 int $end
520 end coordinate of 'source' sequence
521 Arg 4 int $strand
522 relative orientation of source and target (+/- 1)
523 Arg 5 int $id
524 id of 'target' sequence
525 Arg 6 int $start
526 start coordinate of 'target' sequence
527 Arg 7 int $end
528 end coordinate of 'target' sequence
529 Function Stores details of mapping between
530 'source' and 'target' regions.
531 Returntype none
532 Exceptions none
533 Caller Bio::EnsEMBL::Mapper
534
535 =cut
536
537 sub add_map_coordinates {
538 my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori,
539 $chr_name, $chr_start, $chr_end )
540 = @_;
541
542 unless ( defined($contig_id)
543 && defined($contig_start)
544 && defined($contig_end)
545 && defined($contig_ori)
546 && defined($chr_name)
547 && defined($chr_start)
548 && defined($chr_end) )
549 {
550 throw("7 arguments expected");
551 }
552
553 if ( ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) {
554 throw("Cannot deal with mis-lengthed mappings so far");
555 }
556
557 my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start,
558 $contig_end );
559 my $to =
560 Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end );
561
562 my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori );
563
564 # place into hash on both ids
565 my $map_to = $self->{'to'};
566 my $map_from = $self->{'from'};
567
568 push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } }, $pair );
569 push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair );
570
571 $self->{'pair_count'}++;
572 $self->{'_is_sorted'} = 0;
573 } ## end sub add_map_coordinates
574
575
576 =head2 add_indel_coordinates
577
578 Arg 1 int $id
579 id of 'source' sequence
580 Arg 2 int $start
581 start coordinate of 'source' sequence
582 Arg 3 int $end
583 end coordinate of 'source' sequence
584 Arg 4 int $strand
585 relative orientation of source and target (+/- 1)
586 Arg 5 int $id
587 id of 'targe' sequence
588 Arg 6 int $start
589 start coordinate of 'targe' sequence
590 Arg 7 int $end
591 end coordinate of 'targe' sequence
592 Function stores details of mapping between two regions:
593 'source' and 'target'. Returns 1 if the pair was added, 0 if it
594 was already in. Used when adding an indel
595 Returntype int 0,1
596 Exceptions none
597 Caller Bio::EnsEMBL::Mapper
598
599 =cut
600
601 sub add_indel_coordinates{
602 my ($self, $contig_id, $contig_start, $contig_end,
603 $contig_ori, $chr_name, $chr_start, $chr_end) = @_;
604
605 unless(defined($contig_id) && defined($contig_start) && defined($contig_end)
606 && defined($contig_ori) && defined($chr_name) && defined($chr_start)
607 && defined($chr_end)) {
608 throw("7 arguments expected");
609 }
610
611 #we need to create the IndelPair object to add to both lists, to and from
612 my $from =
613 Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end);
614 my $to =
615 Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end);
616
617 my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori);
618
619 # place into hash on both ids
620 my $map_to = $self->{'to'};
621 my $map_from = $self->{'from'};
622
623 push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair );
624 push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair );
625
626 $self->{'pair_count'}++;
627
628 $self->{'_is_sorted'} = 0;
629 return 1;
630 }
631
632
633 =head2 map_indel
634
635 Arg [1] : string $id
636 Arg [2] : int $start - start coord. Since this is an indel should always
637 be one greater than end.
638 Arg [3] : int $end - end coord. Since this is an indel should always
639 be one less than start.
640 Arg [4] : int $strand (0, 1, -1)
641 Arg [5] : string $type - the coordinate system name the coords are from.
642 Example : @coords = $mapper->map_indel();
643 Description: This is in internal function which handles the special mapping
644 case for indels (start = end +1). It will be used to map from
645 a coordinate system with a gap to another that contains an
646 insertion. It will be mainly used by the Variation API.
647 Returntype : Bio::EnsEMBL::Mapper::Unit objects
648 Exceptions : none
649 Caller : general
650
651 =cut
652
653 sub map_indel {
654 my ( $self, $id, $start, $end, $strand, $type ) = @_;
655
656 # swap start/end and map the resultant 2bp coordinate
657 ( $start, $end ) = ( $end, $start );
658
659 if ( !$self->{'_is_sorted'} ) { $self->_sort() }
660
661 my $hash = $self->{"_pair_$type"};
662
663 my ( $from, $to, $cs );
664
665 if ( $type eq $self->{'to'} ) {
666 $from = 'to';
667 $to = 'from';
668 $cs = $self->{'from_cs'};
669 } else {
670 $from = 'from';
671 $to = 'to';
672 $cs = $self->{'to_cs'};
673 }
674
675 unless ( defined $hash ) {
676 throw("Type $type is neither to or from coordinate systems");
677 }
678 my @indel_coordinates;
679
680 my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
681 my $lr = $hash->{ uc($id) };
682
683 $start_idx = 0;
684 $end_idx = $#{$lr};
685
686 # binary search the relevant pairs
687 # helps if the list is big
688 while ( ( $end_idx - $start_idx ) > 1 ) {
689 $mid_idx = ( $start_idx + $end_idx ) >> 1;
690 $pair = $lr->[$mid_idx];
691 $self_coord = $pair->{$from};
692 if ( $self_coord->{'end'} <= $start ) {
693 $start_idx = $mid_idx;
694 } else {
695 $end_idx = $mid_idx;
696 }
697 }
698
699 for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
700 $pair = $lr->[$i];
701 my $self_coord = $pair->{$from};
702 my $target_coord = $pair->{$to};
703
704 if ( exists $pair->{'indel'} ) {
705 #need to return unit coordinate
706 my $to =
707 Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'},
708 $target_coord->{'start'},
709 $target_coord->{'end'}, );
710 push @indel_coordinates, $to;
711 last;
712 }
713 }
714
715 return @indel_coordinates;
716 } ## end sub map_indel
717
718
719 =head2 add_Mapper
720
721 Arg 1 Bio::EnsEMBL::Mapper $mapper2
722 Example $mapper->add_Mapper($mapper2)
723 Function add all the map coordinates from $mapper to this mapper.
724 This object will contain mapping pairs from both the old
725 object and $mapper2.
726 Returntype int 0,1
727 Exceptions throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers
728 are incompatible
729 Caller $mapper->methodname()
730
731 =cut
732
733 sub add_Mapper{
734 my ($self, $mapper) = @_;
735
736 my $mapper_to = $mapper->{'to'};
737 my $mapper_from = $mapper->{'from'};
738 if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) {
739 throw("Trying to add an incompatible Mapper");
740 }
741
742 my $count_a = 0;
743 foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) {
744 push(@{$self->{"_pair_$mapper_to"}->{$seq_name}},
745 @{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
746 $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
747 }
748 my $count_b = 0;
749 foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) {
750 push(@{$self->{"_pair_$mapper_from"}->{$seq_name}},
751 @{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
752 $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
753 }
754
755 if ($count_a == $count_b) {
756 $self->{'pair_count'} += $count_a;
757 } else {
758 throw("Trying to add a funny Mapper");
759 }
760
761 $self->{'_is_sorted'} = 0;
762 return 1;
763 }
764
765
766
767 =head2 list_pairs
768
769 Arg 1 int $id
770 id of 'source' sequence
771 Arg 2 int $start
772 start coordinate of 'source' sequence
773 Arg 3 int $end
774 end coordinate of 'source' sequence
775 Arg 4 string $type
776 nature of transform - gives the type of
777 coordinates to be transformed *from*
778 Function list all pairs of mappings in a region
779 Returntype list of Bio::EnsEMBL::Mapper::Pair
780 Exceptions none
781 Caller Bio::EnsEMBL::Mapper
782
783 =cut
784
785 sub list_pairs {
786 my ( $self, $id, $start, $end, $type ) = @_;
787
788 if ( !$self->{'_is_sorted'} ) { $self->_sort() }
789
790 if ( !defined $type ) {
791 throw("Expected 4 arguments");
792 }
793
794 if ( $start > $end ) {
795 throw( "Start is greater than end "
796 . "for id $id, start $start, end $end\n" );
797 }
798
799 my $hash = $self->{"_pair_$type"};
800
801 my ( $from, $to );
802
803 if ( $type eq $self->{'to'} ) {
804 $from = 'to';
805 $to = 'from';
806 } else {
807 $from = 'from';
808 $to = 'to';
809 }
810
811 unless ( defined $hash ) {
812 throw("Type $type is neither to or from coordinate systems");
813 }
814
815 my @list;
816
817 unless ( exists $hash->{ uc($id) } ) {
818 return ();
819 }
820
821 @list = @{ $hash->{ uc($id) } };
822
823 my @output;
824 if ( $start == -1 && $end == -1 ) {
825 return @list;
826 } else {
827
828 foreach my $p (@list) {
829
830 if ( $p->{$from}->{'end'} < $start ) {
831 next;
832 }
833 if ( $p->{$from}->{'start'} > $end ) {
834 last;
835 }
836 push( @output, $p );
837 }
838 return @output;
839 }
840 } ## end sub list_pairs
841
842
843 =head2 to
844
845 Arg 1 Bio::EnsEMBL::Mapper::Unit $id
846 id of 'source' sequence
847 Function accessor method form the 'source'
848 and 'target' in a Mapper::Pair
849 Returntype Bio::EnsEMBL::Mapper::Unit
850 Exceptions none
851 Caller Bio::EnsEMBL::Mapper
852
853 =cut
854
855 sub to {
856 my ( $self, $value ) = @_;
857
858 if ( defined($value) ) {
859 $self->{'to'} = $value;
860 }
861
862 return $self->{'to'};
863 }
864
865 =head2 from
866
867 Arg 1 Bio::EnsEMBL::Mapper::Unit $id
868 id of 'source' sequence
869 Function accessor method form the 'source'
870 and 'target' in a Mapper::Pair
871 Returntype Bio::EnsEMBL::Mapper::Unit
872 Exceptions none
873 Caller Bio::EnsEMBL::Mapper
874
875 =cut
876 sub from {
877 my ( $self, $value ) = @_;
878
879 if ( defined($value) ) {
880 $self->{'from'} = $value;
881 }
882
883 return $self->{'from'};
884 }
885
886
887 # _dump
888 #
889 # Arg 1 *FileHandle $fh
890 # Function convenience dump function
891 # possibly useful for debugging
892 # Returntype none
893 # Exceptions none
894 # Caller internal
895 #
896
897 sub _dump{
898 my ($self,$fh) = @_;
899
900 if( !defined $fh ) {
901 $fh = \*STDERR;
902 }
903
904 foreach my $id ( keys %{$self->{'_pair_hash_from'}} ) {
905 print $fh "From Hash $id\n";
906 foreach my $pair ( @{$self->{'_pair_hash_from'}->{uc($id)}} ) {
907 print $fh " ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n";
908 }
909 }
910
911 }
912
913
914 # _sort
915 #
916 # Function sort function so that all
917 # mappings are sorted by
918 # chromosome start
919 # Returntype none
920 # Exceptions none
921 # Caller internal
922 #
923
924 sub _sort {
925 my ($self) = @_;
926
927 my $to = $self->{'to'};
928 my $from = $self->{'from'};
929
930 foreach my $id ( keys %{ $self->{"_pair_$from"} } ) {
931 @{ $self->{"_pair_$from"}->{$id} } =
932 sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} }
933 @{ $self->{"_pair_$from"}->{$id} };
934 }
935
936 foreach my $id ( keys %{ $self->{"_pair_$to"} } ) {
937 @{ $self->{"_pair_$to"}->{$id} } =
938 sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} }
939 @{ $self->{"_pair_$to"}->{$id} };
940 }
941
942 $self->_merge_pairs();
943 $self->_is_sorted(1);
944 }
945
946 # this function merges pairs that are adjacent into one
947 sub _merge_pairs {
948 my $self = shift;
949
950 my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair );
951
952 my $map_to = $self->{'to'};
953 my $map_from = $self->{'from'};
954 $self->{'pair_count'} = 0;
955
956 for my $key ( keys %{$self->{"_pair_$map_to"}} ) {
957 $lr = $self->{"_pair_$map_to"}->{$key};
958
959 my $i = 0;
960 my $next = 1;
961 my $length = $#{$lr};
962 while( $next <= $length ) {
963 $current_pair = $lr->[$i];
964 $next_pair = $lr->[$next];
965 $del_pair = undef;
966
967 if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){
968 #necessary to modify the merge function to not merge indels
969 $next++;
970 $i++;
971 }
972 else{
973 # duplicate filter
974 if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'}
975 and $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'} ) {
976 $del_pair = $next_pair;
977 } elsif(( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) &&
978 ( $next_pair->{'ori'} == $current_pair->{'ori'} ) &&
979 ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) {
980
981 if( $current_pair->{'ori'} == 1 ) {
982 # check forward strand merge
983 if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) {
984 # normal merge with previous element
985 $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
986 $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'};
987 $del_pair = $next_pair;
988 }
989 } else {
990 # check backward strand merge
991 if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) {
992 # yes its a merge
993 $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
994 $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'};
995 $del_pair = $next_pair;
996 }
997 }
998 }
999
1000 if( defined $del_pair ) {
1001 splice( @$lr, $next, 1 );
1002 $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})};
1003 for( my $j=0; $j <= $#{$lr_from}; $j++ ) {
1004 if( $lr_from->[$j] == $del_pair ) {
1005 splice( @$lr_from, $j, 1 );
1006 last;
1007 }
1008 }
1009 $length--;
1010 if( $length < $next ) { last; }
1011 }
1012 else {
1013 $next++;
1014 $i++;
1015 }
1016 }
1017
1018 }
1019 $self->{'pair_count'} += scalar( @$lr );
1020 }
1021 }
1022
1023
1024 # _is_sorted
1025 #
1026 # Arg 1 int $sorted
1027 # Function toggle for whether the (internal)
1028 # map data are sorted
1029 # Returntype int
1030 # Exceptions none
1031 # Caller internal
1032 #
1033
1034 sub _is_sorted {
1035 my ($self, $value) = @_;
1036 $self->{'_is_sorted'} = $value if (defined($value));
1037 return $self->{'_is_sorted'};
1038 }
1039
1040
1041 1;