0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Mapper
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 $map = Bio::EnsEMBL::Mapper->new( 'rawcontig', 'chromosome' );
|
|
28
|
|
29 # add a coodinate mapping - supply two pairs or coordinates
|
|
30 $map->add_map_coordinates(
|
|
31 $contig_id, $contig_start, $contig_end, $contig_ori,
|
|
32 $chr_name, chr_start, $chr_end
|
|
33 );
|
|
34
|
|
35 # map from one coordinate system to another
|
|
36 my @coordlist =
|
|
37 $mapper->map_coordinates( 627012, 2, 5, -1, "rawcontig" );
|
|
38
|
|
39 =head1 DESCRIPTION
|
|
40
|
|
41 Generic mapper to provide coordinate transforms between two disjoint
|
|
42 coordinate systems. This mapper is intended to be 'context neutral' - in
|
|
43 that it does not contain any code relating to any particular coordinate
|
|
44 system. This is provided in, for example, Bio::EnsEMBL::AssemblyMapper.
|
|
45
|
|
46 =head1 METHODS
|
|
47
|
|
48 =cut
|
|
49
|
|
50 package Bio::EnsEMBL::Mapper;
|
|
51 use strict;
|
|
52 use integer;
|
|
53
|
|
54 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump);
|
|
55 use Bio::EnsEMBL::Mapper::Pair;
|
|
56 use Bio::EnsEMBL::Mapper::IndelPair;
|
|
57 use Bio::EnsEMBL::Mapper::Unit;
|
|
58 use Bio::EnsEMBL::Mapper::Coordinate;
|
|
59 use Bio::EnsEMBL::Mapper::IndelCoordinate;
|
|
60 use Bio::EnsEMBL::Mapper::Gap;
|
|
61
|
|
62 use Bio::EnsEMBL::Utils::Exception qw(throw);
|
|
63
|
|
64 # use Data::Dumper;
|
|
65
|
|
66 =head2 new
|
|
67
|
|
68 Arg [1] : string $from
|
|
69 The name of the 'from' coordinate system
|
|
70 Arg [2] : string $to
|
|
71 The name of the 'to' coordinate system
|
|
72 Arg [3] : (optional) Bio::EnsEMBL::CoordSystem $from_cs
|
|
73 The 'from' coordinate system
|
|
74 Arg [4] : (optional) Bio::EnsEMBL::CoordSystem $to_cs
|
|
75 Example : my $mapper = Bio::EnsEMBL::Mapper->new('FROM', 'TO');
|
|
76 Description: Constructor. Creates a new Bio::EnsEMBL::Mapper object.
|
|
77 Returntype : Bio::EnsEMBL::Mapper
|
|
78 Exceptions : none
|
|
79 Caller : general
|
|
80
|
|
81 =cut
|
|
82
|
|
83 sub new {
|
|
84 my ( $proto, $from, $to, $from_cs, $to_cs ) = @_;
|
|
85
|
|
86 if ( !defined($to) || !defined($from) ) {
|
|
87 throw("Must supply 'to' and 'from' tags");
|
|
88 }
|
|
89
|
|
90 my $class = ref($proto) || $proto;
|
|
91
|
|
92 my $self = bless( { "_pair_$from" => {},
|
|
93 "_pair_$to" => {},
|
|
94 'pair_count' => 0,
|
|
95 'to' => $to,
|
|
96 'from' => $from,
|
|
97 'to_cs' => $to_cs,
|
|
98 'from_cs' => $from_cs
|
|
99 },
|
|
100 $class );
|
|
101
|
|
102 # do sql to get any componente with muliple assemblys.
|
|
103
|
|
104 return $self;
|
|
105 }
|
|
106
|
|
107 =head2 flush
|
|
108
|
|
109 Args : none
|
|
110 Example : none
|
|
111 Description: removes all cached information out of this mapper
|
|
112 Returntype : none
|
|
113 Exceptions : none
|
|
114 Caller : AssemblyMapper, ChainedAssemblyMapper
|
|
115
|
|
116 =cut
|
|
117
|
|
118 sub flush {
|
|
119 my $self = shift;
|
|
120 my $from = $self->from();
|
|
121 my $to = $self->to();
|
|
122
|
|
123 $self->{"_pair_$from"} = {};
|
|
124 $self->{"_pair_$to"} = {};
|
|
125
|
|
126 $self->{'pair_count'} = 0;
|
|
127 }
|
|
128
|
|
129
|
|
130
|
|
131 =head2 map_coordinates
|
|
132
|
|
133 Arg 1 string $id
|
|
134 id of 'source' sequence
|
|
135 Arg 2 int $start
|
|
136 start coordinate of 'source' sequence
|
|
137 Arg 3 int $end
|
|
138 end coordinate of 'source' sequence
|
|
139 Arg 4 int $strand
|
|
140 raw contig orientation (+/- 1)
|
|
141 Arg 5 string $type
|
|
142 nature of transform - gives the type of
|
|
143 coordinates to be transformed *from*
|
|
144 Function generic map method
|
|
145 Returntype array of Bio::EnsEMBL::Mapper::Coordinate
|
|
146 and/or Bio::EnsEMBL::Mapper::Gap
|
|
147 Exceptions none
|
|
148 Caller Bio::EnsEMBL::Mapper
|
|
149
|
|
150 =cut
|
|
151
|
|
152 sub map_coordinates {
|
|
153 my ( $self, $id, $start, $end, $strand, $type ) = @_;
|
|
154
|
|
155 unless ( defined($id)
|
|
156 && defined($start)
|
|
157 && defined($end)
|
|
158 && defined($strand)
|
|
159 && defined($type) )
|
|
160 {
|
|
161 throw("Expecting 5 arguments");
|
|
162 }
|
|
163
|
|
164 # special case for handling inserts:
|
|
165 if ( $start == $end + 1 ) {
|
|
166 return $self->map_insert( $id, $start, $end, $strand, $type );
|
|
167 }
|
|
168
|
|
169 if ( !$self->{'_is_sorted'} ) { $self->_sort() }
|
|
170
|
|
171 my $hash = $self->{"_pair_$type"};
|
|
172
|
|
173 my ( $from, $to, $cs );
|
|
174
|
|
175 if ( $type eq $self->{'to'} ) {
|
|
176 $from = 'to';
|
|
177 $to = 'from';
|
|
178 $cs = $self->{'from_cs'};
|
|
179 } else {
|
|
180 $from = 'from';
|
|
181 $to = 'to';
|
|
182 $cs = $self->{'to_cs'};
|
|
183 }
|
|
184
|
|
185 unless ( defined $hash ) {
|
|
186 throw("Type $type is neither to or from coordinate systems");
|
|
187 }
|
|
188
|
|
189 if ( !defined $hash->{ uc($id) } ) {
|
|
190 # one big gap!
|
|
191 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
|
|
192 return $gap;
|
|
193 }
|
|
194
|
|
195 my $last_used_pair;
|
|
196 my @result;
|
|
197
|
|
198 my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
|
|
199 my $lr = $hash->{ uc($id) };
|
|
200
|
|
201 $start_idx = 0;
|
|
202 $end_idx = $#{$lr};
|
|
203
|
|
204 # binary search the relevant pairs
|
|
205 # helps if the list is big
|
|
206 while ( ( $end_idx - $start_idx ) > 1 ) {
|
|
207 $mid_idx = ( $start_idx + $end_idx ) >> 1;
|
|
208 $pair = $lr->[$mid_idx];
|
|
209 $self_coord = $pair->{$from};
|
|
210 if ( $self_coord->{'end'} < $start ) {
|
|
211 $start_idx = $mid_idx;
|
|
212 } else {
|
|
213 $end_idx = $mid_idx;
|
|
214 }
|
|
215 }
|
|
216
|
|
217 my $rank = 0;
|
|
218 my $orig_start = $start;
|
|
219 my $last_target_coord = undef;
|
|
220 for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
|
|
221 $pair = $lr->[$i];
|
|
222 my $self_coord = $pair->{$from};
|
|
223 my $target_coord = $pair->{$to};
|
|
224
|
|
225 #
|
|
226 # But not the case for haplotypes!! need to test for this case???
|
|
227 # so removing this till a better solution is found
|
|
228 #
|
|
229 #
|
|
230 # if($self_coord->{'start'} < $start){
|
|
231 # $start = $orig_start;
|
|
232 # $rank++;
|
|
233 # }
|
|
234
|
|
235 if ( defined($last_target_coord)
|
|
236 and $target_coord->{'id'} ne $last_target_coord )
|
|
237 {
|
|
238 if ( $self_coord->{'start'} < $start )
|
|
239 { # i.e. the same bit is being mapped to another assembled bit
|
|
240 $start = $orig_start;
|
|
241 }
|
|
242 } else {
|
|
243 $last_target_coord = $target_coord->{'id'};
|
|
244 }
|
|
245
|
|
246 # if we haven't even reached the start, move on
|
|
247 if ( $self_coord->{'end'} < $orig_start ) {
|
|
248 next;
|
|
249 }
|
|
250
|
|
251 # if we have over run, break
|
|
252 if ( $self_coord->{'start'} > $end ) {
|
|
253 last;
|
|
254 }
|
|
255
|
|
256 if ( $start < $self_coord->{'start'} ) {
|
|
257 # gap detected
|
|
258 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
|
|
259 $self_coord->{'start'} - 1, $rank );
|
|
260
|
|
261 push( @result, $gap );
|
|
262 $start = $gap->{'end'} + 1;
|
|
263 }
|
|
264 my ( $target_start, $target_end, $target_ori );
|
|
265 my $res;
|
|
266 if ( exists $pair->{'indel'} ) {
|
|
267 # When next pair is an IndelPair and not a Coordinate, create the
|
|
268 # new mapping Coordinate, the IndelCoordinate.
|
|
269 $target_start = $target_coord->{'start'};
|
|
270 $target_end = $target_coord->{'end'};
|
|
271
|
|
272 #create a Gap object
|
|
273 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start,
|
|
274 ( $self_coord->{'end'} < $end ? $self_coord->{'end'} : $end ) );
|
|
275 #create the Coordinate object
|
|
276 my $coord =
|
|
277 Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
|
|
278 $target_start, $target_end, $pair->{'ori'}*$strand, $cs );
|
|
279 #and finally, the IndelCoordinate object with
|
|
280 $res = Bio::EnsEMBL::Mapper::IndelCoordinate->new( $gap, $coord );
|
|
281 } else {
|
|
282 # start is somewhere inside the region
|
|
283 if ( $pair->{'ori'} == 1 ) {
|
|
284 $target_start =
|
|
285 $target_coord->{'start'} +
|
|
286 ( $start - $self_coord->{'start'} );
|
|
287 } else {
|
|
288 $target_end =
|
|
289 $target_coord->{'end'} - ( $start - $self_coord->{'start'} );
|
|
290 }
|
|
291
|
|
292 # Either we are enveloping this map or not. If yes, then end
|
|
293 # point (self perspective) is determined solely by target. If
|
|
294 # not we need to adjust.
|
|
295
|
|
296 if ( $end > $self_coord->{'end'} ) {
|
|
297 # enveloped
|
|
298 if ( $pair->{'ori'} == 1 ) {
|
|
299 $target_end = $target_coord->{'end'};
|
|
300 } else {
|
|
301 $target_start = $target_coord->{'start'};
|
|
302 }
|
|
303 } else {
|
|
304 # need to adjust end
|
|
305 if ( $pair->{'ori'} == 1 ) {
|
|
306 $target_end =
|
|
307 $target_coord->{'start'} +
|
|
308 ( $end - $self_coord->{'start'} );
|
|
309 } else {
|
|
310 $target_start =
|
|
311 $target_coord->{'end'} - ( $end - $self_coord->{'start'} );
|
|
312 }
|
|
313 }
|
|
314
|
|
315 $res =
|
|
316 Bio::EnsEMBL::Mapper::Coordinate->new( $target_coord->{'id'},
|
|
317 $target_start, $target_end, $pair->{'ori'}*$strand,
|
|
318 $cs, $rank );
|
|
319
|
|
320 } ## end else [ if ( exists $pair->{'indel'...})]
|
|
321
|
|
322 push( @result, $res );
|
|
323
|
|
324 $last_used_pair = $pair;
|
|
325 $start = $self_coord->{'end'} + 1;
|
|
326
|
|
327 } ## end for ( my $i = $start_idx...)
|
|
328
|
|
329 if ( !defined $last_used_pair ) {
|
|
330 my $gap = Bio::EnsEMBL::Mapper::Gap->new( $start, $end );
|
|
331 push( @result, $gap );
|
|
332
|
|
333 } elsif ( $last_used_pair->{$from}->{'end'} < $end ) {
|
|
334 # gap at the end
|
|
335 my $gap =
|
|
336 Bio::EnsEMBL::Mapper::Gap->new(
|
|
337 $last_used_pair->{$from}->{'end'} + 1,
|
|
338 $end, $rank );
|
|
339 push( @result, $gap );
|
|
340 }
|
|
341
|
|
342 if ( $strand == -1 ) {
|
|
343 @result = reverse(@result);
|
|
344 }
|
|
345
|
|
346 return @result;
|
|
347 } ## end sub map_coordinates
|
|
348
|
|
349
|
|
350
|
|
351 =head2 map_insert
|
|
352
|
|
353 Arg [1] : string $id
|
|
354 Arg [2] : int $start - start coord. Since this is an insert should always
|
|
355 be one greater than end.
|
|
356 Arg [3] : int $end - end coord. Since this is an insert should always
|
|
357 be one less than start.
|
|
358 Arg [4] : int $strand (0, 1, -1)
|
|
359 Arg [5] : string $type - the coordinate system name the coords are from.
|
|
360 Arg [6] : boolean $fastmap - if specified, this is being called from
|
|
361 the fastmap call. The mapping done is not any faster for
|
|
362 inserts, but the return value is different.
|
|
363 Example :
|
|
364 Description: This is in internal function which handles the special mapping
|
|
365 case for inserts (start = end +1). This function will be called
|
|
366 automatically by the map function so there is no reason to
|
|
367 call it directly.
|
|
368 Returntype : list of Bio::EnsEMBL::Mapper::Coordinate and/or Gap objects
|
|
369 Exceptions : none
|
|
370 Caller : map_coordinates()
|
|
371
|
|
372 =cut
|
|
373
|
|
374 sub map_insert {
|
|
375 my ($self, $id, $start, $end, $strand, $type, $fastmap) = @_;
|
|
376
|
|
377 # swap start/end and map the resultant 2bp coordinate
|
|
378 ($start, $end) =($end,$start);
|
|
379
|
|
380 my @coords = $self->map_coordinates($id, $start, $end, $strand, $type);
|
|
381
|
|
382 if(@coords == 1) {
|
|
383 my $c = $coords[0];
|
|
384 # swap start and end to convert back into insert
|
|
385 ($c->{'start'}, $c->{'end'}) = ($c->{'end'}, $c->{'start'});
|
|
386 } else {
|
|
387 throw("Unexpected: Got ",scalar(@coords)," expected 2.") if(@coords != 2);
|
|
388
|
|
389 # adjust coordinates, remove gaps
|
|
390 my ($c1, $c2);
|
|
391 if($strand == -1) {
|
|
392 ($c2,$c1) = @coords;
|
|
393 } else {
|
|
394 ($c1, $c2) = @coords;
|
|
395 }
|
|
396 @coords = ();
|
|
397
|
|
398 if(ref($c1) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
|
|
399 # insert is after first coord
|
|
400 if($c1->{'strand'} * $strand == -1) {
|
|
401 $c1->{'end'}--;
|
|
402 } else {
|
|
403 $c1->{'start'}++;
|
|
404 }
|
|
405 @coords = ($c1);
|
|
406 }
|
|
407 if(ref($c2) eq 'Bio::EnsEMBL::Mapper::Coordinate') {
|
|
408 # insert is before second coord
|
|
409 if($c2->{'strand'} * $strand == -1) {
|
|
410 $c2->{'start'}++;
|
|
411 } else {
|
|
412 $c2->{'end'}--;
|
|
413 }
|
|
414 if($strand == -1) {
|
|
415 unshift @coords, $c2;
|
|
416 } else {
|
|
417 push @coords, $c2;
|
|
418 }
|
|
419 }
|
|
420 }
|
|
421
|
|
422 if($fastmap) {
|
|
423 return undef if(@coords != 1);
|
|
424 my $c = $coords[0];
|
|
425 return ($c->{'id'}, $c->{'start'}, $c->{'end'},
|
|
426 $c->{'strand'}, $c->{'coord_system'});
|
|
427 }
|
|
428
|
|
429 return @coords;
|
|
430 }
|
|
431
|
|
432
|
|
433
|
|
434
|
|
435
|
|
436
|
|
437 =head2 fastmap
|
|
438
|
|
439 Arg 1 string $id
|
|
440 id of 'source' sequence
|
|
441 Arg 2 int $start
|
|
442 start coordinate of 'source' sequence
|
|
443 Arg 3 int $end
|
|
444 end coordinate of 'source' sequence
|
|
445 Arg 4 int $strand
|
|
446 raw contig orientation (+/- 1)
|
|
447 Arg 5 int $type
|
|
448 nature of transform - gives the type of
|
|
449 coordinates to be transformed *from*
|
|
450 Function inferior map method. Will only do ungapped unsplit mapping.
|
|
451 Will return id, start, end strand in a list.
|
|
452 Returntype list of results
|
|
453 Exceptions none
|
|
454 Caller Bio::EnsEMBL::AssemblyMapper
|
|
455
|
|
456 =cut
|
|
457
|
|
458 sub fastmap {
|
|
459 my ($self, $id, $start, $end, $strand, $type) = @_;
|
|
460
|
|
461 my ($from, $to, $cs);
|
|
462
|
|
463 if($end+1 == $start) {
|
|
464 return $self->map_insert($id, $start, $end, $strand, $type, 1);
|
|
465 }
|
|
466
|
|
467 if( ! $self->{'_is_sorted'} ) { $self->_sort() }
|
|
468
|
|
469 if($type eq $self->{'to'}) {
|
|
470 $from = 'to';
|
|
471 $to = 'from';
|
|
472 $cs = $self->{'from_cs'};
|
|
473 } else {
|
|
474 $from = 'from';
|
|
475 $to = 'to';
|
|
476 $cs = $self->{'to_cs'};
|
|
477 }
|
|
478
|
|
479 my $hash = $self->{"_pair_$type"} or
|
|
480 throw("Type $type is neither to or from coordinate systems");
|
|
481
|
|
482
|
|
483 my $pairs = $hash->{uc($id)};
|
|
484
|
|
485 foreach my $pair (@$pairs) {
|
|
486 my $self_coord = $pair->{$from};
|
|
487 my $target_coord = $pair->{$to};
|
|
488
|
|
489 # only super easy mapping is done
|
|
490 if( $start < $self_coord->{'start'} ||
|
|
491 $end > $self_coord->{'end'} ) {
|
|
492 next;
|
|
493 }
|
|
494
|
|
495 if( $pair->{'ori'} == 1 ) {
|
|
496 return ( $target_coord->{'id'},
|
|
497 $target_coord->{'start'}+$start-$self_coord->{'start'},
|
|
498 $target_coord->{'start'}+$end-$self_coord->{'start'},
|
|
499 $strand, $cs );
|
|
500 } else {
|
|
501 return ( $target_coord->{'id'},
|
|
502 $target_coord->{'end'} - ($end - $self_coord->{'start'}),
|
|
503 $target_coord->{'end'} - ($start - $self_coord->{'start'}),
|
|
504 -$strand, $cs );
|
|
505 }
|
|
506 }
|
|
507
|
|
508 return ();
|
|
509 }
|
|
510
|
|
511
|
|
512
|
|
513 =head2 add_map_coordinates
|
|
514
|
|
515 Arg 1 int $id
|
|
516 id of 'source' sequence
|
|
517 Arg 2 int $start
|
|
518 start coordinate of 'source' sequence
|
|
519 Arg 3 int $end
|
|
520 end coordinate of 'source' sequence
|
|
521 Arg 4 int $strand
|
|
522 relative orientation of source and target (+/- 1)
|
|
523 Arg 5 int $id
|
|
524 id of 'target' sequence
|
|
525 Arg 6 int $start
|
|
526 start coordinate of 'target' sequence
|
|
527 Arg 7 int $end
|
|
528 end coordinate of 'target' sequence
|
|
529 Function Stores details of mapping between
|
|
530 'source' and 'target' regions.
|
|
531 Returntype none
|
|
532 Exceptions none
|
|
533 Caller Bio::EnsEMBL::Mapper
|
|
534
|
|
535 =cut
|
|
536
|
|
537 sub add_map_coordinates {
|
|
538 my ( $self, $contig_id, $contig_start, $contig_end, $contig_ori,
|
|
539 $chr_name, $chr_start, $chr_end )
|
|
540 = @_;
|
|
541
|
|
542 unless ( defined($contig_id)
|
|
543 && defined($contig_start)
|
|
544 && defined($contig_end)
|
|
545 && defined($contig_ori)
|
|
546 && defined($chr_name)
|
|
547 && defined($chr_start)
|
|
548 && defined($chr_end) )
|
|
549 {
|
|
550 throw("7 arguments expected");
|
|
551 }
|
|
552
|
|
553 if ( ( $contig_end - $contig_start ) != ( $chr_end - $chr_start ) ) {
|
|
554 throw("Cannot deal with mis-lengthed mappings so far");
|
|
555 }
|
|
556
|
|
557 my $from = Bio::EnsEMBL::Mapper::Unit->new( $contig_id, $contig_start,
|
|
558 $contig_end );
|
|
559 my $to =
|
|
560 Bio::EnsEMBL::Mapper::Unit->new( $chr_name, $chr_start, $chr_end );
|
|
561
|
|
562 my $pair = Bio::EnsEMBL::Mapper::Pair->new( $from, $to, $contig_ori );
|
|
563
|
|
564 # place into hash on both ids
|
|
565 my $map_to = $self->{'to'};
|
|
566 my $map_from = $self->{'from'};
|
|
567
|
|
568 push( @{ $self->{"_pair_$map_to"}->{ uc($chr_name) } }, $pair );
|
|
569 push( @{ $self->{"_pair_$map_from"}->{ uc($contig_id) } }, $pair );
|
|
570
|
|
571 $self->{'pair_count'}++;
|
|
572 $self->{'_is_sorted'} = 0;
|
|
573 } ## end sub add_map_coordinates
|
|
574
|
|
575
|
|
576 =head2 add_indel_coordinates
|
|
577
|
|
578 Arg 1 int $id
|
|
579 id of 'source' sequence
|
|
580 Arg 2 int $start
|
|
581 start coordinate of 'source' sequence
|
|
582 Arg 3 int $end
|
|
583 end coordinate of 'source' sequence
|
|
584 Arg 4 int $strand
|
|
585 relative orientation of source and target (+/- 1)
|
|
586 Arg 5 int $id
|
|
587 id of 'targe' sequence
|
|
588 Arg 6 int $start
|
|
589 start coordinate of 'targe' sequence
|
|
590 Arg 7 int $end
|
|
591 end coordinate of 'targe' sequence
|
|
592 Function stores details of mapping between two regions:
|
|
593 'source' and 'target'. Returns 1 if the pair was added, 0 if it
|
|
594 was already in. Used when adding an indel
|
|
595 Returntype int 0,1
|
|
596 Exceptions none
|
|
597 Caller Bio::EnsEMBL::Mapper
|
|
598
|
|
599 =cut
|
|
600
|
|
601 sub add_indel_coordinates{
|
|
602 my ($self, $contig_id, $contig_start, $contig_end,
|
|
603 $contig_ori, $chr_name, $chr_start, $chr_end) = @_;
|
|
604
|
|
605 unless(defined($contig_id) && defined($contig_start) && defined($contig_end)
|
|
606 && defined($contig_ori) && defined($chr_name) && defined($chr_start)
|
|
607 && defined($chr_end)) {
|
|
608 throw("7 arguments expected");
|
|
609 }
|
|
610
|
|
611 #we need to create the IndelPair object to add to both lists, to and from
|
|
612 my $from =
|
|
613 Bio::EnsEMBL::Mapper::Unit->new($contig_id, $contig_start, $contig_end);
|
|
614 my $to =
|
|
615 Bio::EnsEMBL::Mapper::Unit->new($chr_name, $chr_start, $chr_end);
|
|
616
|
|
617 my $pair = Bio::EnsEMBL::Mapper::IndelPair->new($from, $to, $contig_ori);
|
|
618
|
|
619 # place into hash on both ids
|
|
620 my $map_to = $self->{'to'};
|
|
621 my $map_from = $self->{'from'};
|
|
622
|
|
623 push( @{$self->{"_pair_$map_to"}->{uc($chr_name)}}, $pair );
|
|
624 push( @{$self->{"_pair_$map_from"}->{uc($contig_id)}}, $pair );
|
|
625
|
|
626 $self->{'pair_count'}++;
|
|
627
|
|
628 $self->{'_is_sorted'} = 0;
|
|
629 return 1;
|
|
630 }
|
|
631
|
|
632
|
|
633 =head2 map_indel
|
|
634
|
|
635 Arg [1] : string $id
|
|
636 Arg [2] : int $start - start coord. Since this is an indel should always
|
|
637 be one greater than end.
|
|
638 Arg [3] : int $end - end coord. Since this is an indel should always
|
|
639 be one less than start.
|
|
640 Arg [4] : int $strand (0, 1, -1)
|
|
641 Arg [5] : string $type - the coordinate system name the coords are from.
|
|
642 Example : @coords = $mapper->map_indel();
|
|
643 Description: This is in internal function which handles the special mapping
|
|
644 case for indels (start = end +1). It will be used to map from
|
|
645 a coordinate system with a gap to another that contains an
|
|
646 insertion. It will be mainly used by the Variation API.
|
|
647 Returntype : Bio::EnsEMBL::Mapper::Unit objects
|
|
648 Exceptions : none
|
|
649 Caller : general
|
|
650
|
|
651 =cut
|
|
652
|
|
653 sub map_indel {
|
|
654 my ( $self, $id, $start, $end, $strand, $type ) = @_;
|
|
655
|
|
656 # swap start/end and map the resultant 2bp coordinate
|
|
657 ( $start, $end ) = ( $end, $start );
|
|
658
|
|
659 if ( !$self->{'_is_sorted'} ) { $self->_sort() }
|
|
660
|
|
661 my $hash = $self->{"_pair_$type"};
|
|
662
|
|
663 my ( $from, $to, $cs );
|
|
664
|
|
665 if ( $type eq $self->{'to'} ) {
|
|
666 $from = 'to';
|
|
667 $to = 'from';
|
|
668 $cs = $self->{'from_cs'};
|
|
669 } else {
|
|
670 $from = 'from';
|
|
671 $to = 'to';
|
|
672 $cs = $self->{'to_cs'};
|
|
673 }
|
|
674
|
|
675 unless ( defined $hash ) {
|
|
676 throw("Type $type is neither to or from coordinate systems");
|
|
677 }
|
|
678 my @indel_coordinates;
|
|
679
|
|
680 my ( $start_idx, $end_idx, $mid_idx, $pair, $self_coord );
|
|
681 my $lr = $hash->{ uc($id) };
|
|
682
|
|
683 $start_idx = 0;
|
|
684 $end_idx = $#{$lr};
|
|
685
|
|
686 # binary search the relevant pairs
|
|
687 # helps if the list is big
|
|
688 while ( ( $end_idx - $start_idx ) > 1 ) {
|
|
689 $mid_idx = ( $start_idx + $end_idx ) >> 1;
|
|
690 $pair = $lr->[$mid_idx];
|
|
691 $self_coord = $pair->{$from};
|
|
692 if ( $self_coord->{'end'} <= $start ) {
|
|
693 $start_idx = $mid_idx;
|
|
694 } else {
|
|
695 $end_idx = $mid_idx;
|
|
696 }
|
|
697 }
|
|
698
|
|
699 for ( my $i = $start_idx; $i <= $#{$lr}; $i++ ) {
|
|
700 $pair = $lr->[$i];
|
|
701 my $self_coord = $pair->{$from};
|
|
702 my $target_coord = $pair->{$to};
|
|
703
|
|
704 if ( exists $pair->{'indel'} ) {
|
|
705 #need to return unit coordinate
|
|
706 my $to =
|
|
707 Bio::EnsEMBL::Mapper::Unit->new( $target_coord->{'id'},
|
|
708 $target_coord->{'start'},
|
|
709 $target_coord->{'end'}, );
|
|
710 push @indel_coordinates, $to;
|
|
711 last;
|
|
712 }
|
|
713 }
|
|
714
|
|
715 return @indel_coordinates;
|
|
716 } ## end sub map_indel
|
|
717
|
|
718
|
|
719 =head2 add_Mapper
|
|
720
|
|
721 Arg 1 Bio::EnsEMBL::Mapper $mapper2
|
|
722 Example $mapper->add_Mapper($mapper2)
|
|
723 Function add all the map coordinates from $mapper to this mapper.
|
|
724 This object will contain mapping pairs from both the old
|
|
725 object and $mapper2.
|
|
726 Returntype int 0,1
|
|
727 Exceptions throw if 'to' and 'from' from both Bio::EnsEMBL::Mappers
|
|
728 are incompatible
|
|
729 Caller $mapper->methodname()
|
|
730
|
|
731 =cut
|
|
732
|
|
733 sub add_Mapper{
|
|
734 my ($self, $mapper) = @_;
|
|
735
|
|
736 my $mapper_to = $mapper->{'to'};
|
|
737 my $mapper_from = $mapper->{'from'};
|
|
738 if ($mapper_to ne $self->{'to'} or $mapper_from ne $self->{'from'}) {
|
|
739 throw("Trying to add an incompatible Mapper");
|
|
740 }
|
|
741
|
|
742 my $count_a = 0;
|
|
743 foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_to"}}) {
|
|
744 push(@{$self->{"_pair_$mapper_to"}->{$seq_name}},
|
|
745 @{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
|
|
746 $count_a += scalar(@{$mapper->{"_pair_$mapper_to"}->{$seq_name}});
|
|
747 }
|
|
748 my $count_b = 0;
|
|
749 foreach my $seq_name (keys %{$mapper->{"_pair_$mapper_from"}}) {
|
|
750 push(@{$self->{"_pair_$mapper_from"}->{$seq_name}},
|
|
751 @{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
|
|
752 $count_b += scalar(@{$mapper->{"_pair_$mapper_from"}->{$seq_name}});
|
|
753 }
|
|
754
|
|
755 if ($count_a == $count_b) {
|
|
756 $self->{'pair_count'} += $count_a;
|
|
757 } else {
|
|
758 throw("Trying to add a funny Mapper");
|
|
759 }
|
|
760
|
|
761 $self->{'_is_sorted'} = 0;
|
|
762 return 1;
|
|
763 }
|
|
764
|
|
765
|
|
766
|
|
767 =head2 list_pairs
|
|
768
|
|
769 Arg 1 int $id
|
|
770 id of 'source' sequence
|
|
771 Arg 2 int $start
|
|
772 start coordinate of 'source' sequence
|
|
773 Arg 3 int $end
|
|
774 end coordinate of 'source' sequence
|
|
775 Arg 4 string $type
|
|
776 nature of transform - gives the type of
|
|
777 coordinates to be transformed *from*
|
|
778 Function list all pairs of mappings in a region
|
|
779 Returntype list of Bio::EnsEMBL::Mapper::Pair
|
|
780 Exceptions none
|
|
781 Caller Bio::EnsEMBL::Mapper
|
|
782
|
|
783 =cut
|
|
784
|
|
785 sub list_pairs {
|
|
786 my ( $self, $id, $start, $end, $type ) = @_;
|
|
787
|
|
788 if ( !$self->{'_is_sorted'} ) { $self->_sort() }
|
|
789
|
|
790 if ( !defined $type ) {
|
|
791 throw("Expected 4 arguments");
|
|
792 }
|
|
793
|
|
794 if ( $start > $end ) {
|
|
795 throw( "Start is greater than end "
|
|
796 . "for id $id, start $start, end $end\n" );
|
|
797 }
|
|
798
|
|
799 my $hash = $self->{"_pair_$type"};
|
|
800
|
|
801 my ( $from, $to );
|
|
802
|
|
803 if ( $type eq $self->{'to'} ) {
|
|
804 $from = 'to';
|
|
805 $to = 'from';
|
|
806 } else {
|
|
807 $from = 'from';
|
|
808 $to = 'to';
|
|
809 }
|
|
810
|
|
811 unless ( defined $hash ) {
|
|
812 throw("Type $type is neither to or from coordinate systems");
|
|
813 }
|
|
814
|
|
815 my @list;
|
|
816
|
|
817 unless ( exists $hash->{ uc($id) } ) {
|
|
818 return ();
|
|
819 }
|
|
820
|
|
821 @list = @{ $hash->{ uc($id) } };
|
|
822
|
|
823 my @output;
|
|
824 if ( $start == -1 && $end == -1 ) {
|
|
825 return @list;
|
|
826 } else {
|
|
827
|
|
828 foreach my $p (@list) {
|
|
829
|
|
830 if ( $p->{$from}->{'end'} < $start ) {
|
|
831 next;
|
|
832 }
|
|
833 if ( $p->{$from}->{'start'} > $end ) {
|
|
834 last;
|
|
835 }
|
|
836 push( @output, $p );
|
|
837 }
|
|
838 return @output;
|
|
839 }
|
|
840 } ## end sub list_pairs
|
|
841
|
|
842
|
|
843 =head2 to
|
|
844
|
|
845 Arg 1 Bio::EnsEMBL::Mapper::Unit $id
|
|
846 id of 'source' sequence
|
|
847 Function accessor method form the 'source'
|
|
848 and 'target' in a Mapper::Pair
|
|
849 Returntype Bio::EnsEMBL::Mapper::Unit
|
|
850 Exceptions none
|
|
851 Caller Bio::EnsEMBL::Mapper
|
|
852
|
|
853 =cut
|
|
854
|
|
855 sub to {
|
|
856 my ( $self, $value ) = @_;
|
|
857
|
|
858 if ( defined($value) ) {
|
|
859 $self->{'to'} = $value;
|
|
860 }
|
|
861
|
|
862 return $self->{'to'};
|
|
863 }
|
|
864
|
|
865 =head2 from
|
|
866
|
|
867 Arg 1 Bio::EnsEMBL::Mapper::Unit $id
|
|
868 id of 'source' sequence
|
|
869 Function accessor method form the 'source'
|
|
870 and 'target' in a Mapper::Pair
|
|
871 Returntype Bio::EnsEMBL::Mapper::Unit
|
|
872 Exceptions none
|
|
873 Caller Bio::EnsEMBL::Mapper
|
|
874
|
|
875 =cut
|
|
876 sub from {
|
|
877 my ( $self, $value ) = @_;
|
|
878
|
|
879 if ( defined($value) ) {
|
|
880 $self->{'from'} = $value;
|
|
881 }
|
|
882
|
|
883 return $self->{'from'};
|
|
884 }
|
|
885
|
|
886
|
|
887 # _dump
|
|
888 #
|
|
889 # Arg 1 *FileHandle $fh
|
|
890 # Function convenience dump function
|
|
891 # possibly useful for debugging
|
|
892 # Returntype none
|
|
893 # Exceptions none
|
|
894 # Caller internal
|
|
895 #
|
|
896
|
|
897 sub _dump{
|
|
898 my ($self,$fh) = @_;
|
|
899
|
|
900 if( !defined $fh ) {
|
|
901 $fh = \*STDERR;
|
|
902 }
|
|
903
|
|
904 foreach my $id ( keys %{$self->{'_pair_hash_from'}} ) {
|
|
905 print $fh "From Hash $id\n";
|
|
906 foreach my $pair ( @{$self->{'_pair_hash_from'}->{uc($id)}} ) {
|
|
907 print $fh " ",$pair->from->start," ",$pair->from->end,":",$pair->to->start," ",$pair->to->end," ",$pair->to->id,"\n";
|
|
908 }
|
|
909 }
|
|
910
|
|
911 }
|
|
912
|
|
913
|
|
914 # _sort
|
|
915 #
|
|
916 # Function sort function so that all
|
|
917 # mappings are sorted by
|
|
918 # chromosome start
|
|
919 # Returntype none
|
|
920 # Exceptions none
|
|
921 # Caller internal
|
|
922 #
|
|
923
|
|
924 sub _sort {
|
|
925 my ($self) = @_;
|
|
926
|
|
927 my $to = $self->{'to'};
|
|
928 my $from = $self->{'from'};
|
|
929
|
|
930 foreach my $id ( keys %{ $self->{"_pair_$from"} } ) {
|
|
931 @{ $self->{"_pair_$from"}->{$id} } =
|
|
932 sort { $a->{'from'}->{'start'} <=> $b->{'from'}->{'start'} }
|
|
933 @{ $self->{"_pair_$from"}->{$id} };
|
|
934 }
|
|
935
|
|
936 foreach my $id ( keys %{ $self->{"_pair_$to"} } ) {
|
|
937 @{ $self->{"_pair_$to"}->{$id} } =
|
|
938 sort { $a->{'to'}->{'start'} <=> $b->{'to'}->{'start'} }
|
|
939 @{ $self->{"_pair_$to"}->{$id} };
|
|
940 }
|
|
941
|
|
942 $self->_merge_pairs();
|
|
943 $self->_is_sorted(1);
|
|
944 }
|
|
945
|
|
946 # this function merges pairs that are adjacent into one
|
|
947 sub _merge_pairs {
|
|
948 my $self = shift;
|
|
949
|
|
950 my ( $lr, $lr_from, $del_pair, $next_pair, $current_pair );
|
|
951
|
|
952 my $map_to = $self->{'to'};
|
|
953 my $map_from = $self->{'from'};
|
|
954 $self->{'pair_count'} = 0;
|
|
955
|
|
956 for my $key ( keys %{$self->{"_pair_$map_to"}} ) {
|
|
957 $lr = $self->{"_pair_$map_to"}->{$key};
|
|
958
|
|
959 my $i = 0;
|
|
960 my $next = 1;
|
|
961 my $length = $#{$lr};
|
|
962 while( $next <= $length ) {
|
|
963 $current_pair = $lr->[$i];
|
|
964 $next_pair = $lr->[$next];
|
|
965 $del_pair = undef;
|
|
966
|
|
967 if(exists $current_pair->{'indel'} || exists $next_pair->{'indel'}){
|
|
968 #necessary to modify the merge function to not merge indels
|
|
969 $next++;
|
|
970 $i++;
|
|
971 }
|
|
972 else{
|
|
973 # duplicate filter
|
|
974 if( $current_pair->{'to'}->{'start'} == $next_pair->{'to'}->{'start'}
|
|
975 and $current_pair->{'from'}->{'id'} == $next_pair->{'from'}->{'id'} ) {
|
|
976 $del_pair = $next_pair;
|
|
977 } elsif(( $current_pair->{'from'}->{'id'} eq $next_pair->{'from'}->{'id'} ) &&
|
|
978 ( $next_pair->{'ori'} == $current_pair->{'ori'} ) &&
|
|
979 ( $next_pair->{'to'}->{'start'} -1 == $current_pair->{'to'}->{'end'} )) {
|
|
980
|
|
981 if( $current_pair->{'ori'} == 1 ) {
|
|
982 # check forward strand merge
|
|
983 if( $next_pair->{'from'}->{'start'} - 1 == $current_pair->{'from'}->{'end'} ) {
|
|
984 # normal merge with previous element
|
|
985 $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
|
|
986 $current_pair->{'from'}->{'end'} = $next_pair->{'from'}->{'end'};
|
|
987 $del_pair = $next_pair;
|
|
988 }
|
|
989 } else {
|
|
990 # check backward strand merge
|
|
991 if( $next_pair->{'from'}->{'end'} + 1 == $current_pair->{'from'}->{'start'} ) {
|
|
992 # yes its a merge
|
|
993 $current_pair->{'to'}->{'end'} = $next_pair->{'to'}->{'end'};
|
|
994 $current_pair->{'from'}->{'start'} = $next_pair->{'from'}->{'start'};
|
|
995 $del_pair = $next_pair;
|
|
996 }
|
|
997 }
|
|
998 }
|
|
999
|
|
1000 if( defined $del_pair ) {
|
|
1001 splice( @$lr, $next, 1 );
|
|
1002 $lr_from = $self->{"_pair_$map_from"}->{uc($del_pair->{'from'}->{'id'})};
|
|
1003 for( my $j=0; $j <= $#{$lr_from}; $j++ ) {
|
|
1004 if( $lr_from->[$j] == $del_pair ) {
|
|
1005 splice( @$lr_from, $j, 1 );
|
|
1006 last;
|
|
1007 }
|
|
1008 }
|
|
1009 $length--;
|
|
1010 if( $length < $next ) { last; }
|
|
1011 }
|
|
1012 else {
|
|
1013 $next++;
|
|
1014 $i++;
|
|
1015 }
|
|
1016 }
|
|
1017
|
|
1018 }
|
|
1019 $self->{'pair_count'} += scalar( @$lr );
|
|
1020 }
|
|
1021 }
|
|
1022
|
|
1023
|
|
1024 # _is_sorted
|
|
1025 #
|
|
1026 # Arg 1 int $sorted
|
|
1027 # Function toggle for whether the (internal)
|
|
1028 # map data are sorted
|
|
1029 # Returntype int
|
|
1030 # Exceptions none
|
|
1031 # Caller internal
|
|
1032 #
|
|
1033
|
|
1034 sub _is_sorted {
|
|
1035 my ($self, $value) = @_;
|
|
1036 $self->{'_is_sorted'} = $value if (defined($value));
|
|
1037 return $self->{'_is_sorted'};
|
|
1038 }
|
|
1039
|
|
1040
|
|
1041 1;
|