comparison variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
24
25 =head1 SYNOPSIS
26
27 use Bio::EnsEMBL::Registry;
28
29 Bio::EnsEMBL::Registry->load_registry_from_db(
30 -host => 'ensembldb.ensembl.org',
31 -user => 'anonymous'
32 );
33
34 $asma = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
35 "assemblymapper" );
36
37 $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
38 "coordsystem" );
39
40 my $chr33_cs = $csa->fetch_by_name( 'chromosome', 'NCBI33' );
41 my $chr34_cs = $csa->fetch_by_name( 'chromosome', 'NCBI34' );
42 my $ctg_cs = $csa->fetch_by_name('contig');
43 my $clone_cs = $csa->fetch_by_name('clone');
44
45 my $chr_ctg_mapper =
46 $asma->fetch_by_CoordSystems( $chr33_cs, $ctg_cs );
47
48 my $ncbi33_ncbi34_mapper =
49 $asm_adptr->fetch_by_CoordSystems( $chr33, $chr34 );
50
51 my $ctg_clone_mapper =
52 $asm_adptr->fetch_by_CoordSystems( $ctg_cs, $clone_cs );
53
54
55 =head1 DESCRIPTION
56
57 Adaptor for handling Assembly mappers. This is a I<Singleton> class.
58 ie: There is only one per database (C<DBAdaptor>).
59
60 This is used to retrieve mappers between any two coordinate systems
61 whose makeup is described by the assembly table. Currently one step
62 (explicit) and two step (implicit) pairwise mapping is supported. In
63 one-step mapping an explicit relationship between the coordinate systems
64 is defined in the assembly table. In two-step 'chained' mapping no
65 explicit mapping is present but the coordinate systems must share a
66 common mapping to an intermediate coordinate system.
67
68 =head1 METHODS
69
70 =cut
71
72 package Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor;
73 use vars qw(@ISA);
74 use strict;
75
76 use Bio::EnsEMBL::DBSQL::BaseAdaptor;
77 use Bio::EnsEMBL::AssemblyMapper;
78 use Bio::EnsEMBL::ChainedAssemblyMapper;
79 use Bio::EnsEMBL::TopLevelAssemblyMapper;
80
81 use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache
82 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump);
83 #use Bio::EnsEMBL::Utils::Exception qw(deprecate throw);
84 use Bio::EnsEMBL::Utils::SeqRegionCache;
85
86 use integer; #do proper arithmetic bitshifts
87
88 @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
89
90
91 my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6
92
93 =head2 new
94
95 Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for
96 the database this assembly mapper is using.
97 Example : my $asma = new Bio::EnsEMBL::AssemblyMapperAdaptor($dbadaptor);
98 Description: Creates a new AssemblyMapperAdaptor object
99 Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
100 Exceptions : none
101 Caller : Bio::EnsEMBL::DBSQL::DBAdaptor
102 Status : Stable
103
104 =cut
105
106 sub new {
107 my($class, $dbadaptor) = @_;
108
109 my $self = $class->SUPER::new($dbadaptor);
110
111 $self->{'_asm_mapper_cache'} = {};
112
113 # use a shared cache (for this database) that contains info about
114 # seq regions
115 my $seq_region_cache = $self->db->get_SeqRegionCache();
116 $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'};
117 $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'};
118
119 return $self;
120 }
121
122
123
124 =head2 cache_seq_ids_with_mult_assemblys
125
126 Example : $self->adaptor->cache_seq_ids_with_mult_assemblys();
127 Description: Creates a hash of the component seq region ids that
128 map to more than one assembly from the assembly table.
129 Retruntype : none
130 Exceptions : none
131 Caller : AssemblyMapper, ChainedAssemblyMapper
132 Status : At Risk
133
134 =cut
135
136 sub cache_seq_ids_with_mult_assemblys{
137 my $self = shift;
138 my %multis;
139
140 return if (defined($self->{'multi_seq_ids'}));
141
142 $self->{'multi_seq_ids'} = {};
143
144 my $sql = qq(
145 SELECT sra.seq_region_id
146 FROM seq_region_attrib sra,
147 attrib_type at,
148 seq_region sr,
149 coord_system cs
150 WHERE sra.attrib_type_id = at.attrib_type_id
151 AND code = "MultAssem"
152 AND sra.seq_region_id = sr.seq_region_id
153 AND sr.coord_system_id = cs.coord_system_id
154 AND cs.species_id = ?);
155
156 my $sth = $self->prepare($sql);
157
158 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
159
160 $sth->execute();
161
162 my ($seq_region_id);
163
164 $sth->bind_columns(\$seq_region_id);
165
166 while($sth->fetch()) {
167 $self->{'multi_seq_ids'}->{$seq_region_id} = 1;
168 }
169 $sth->finish;
170 }
171
172
173 =head2 fetch_by_CoordSystems
174
175 Arg [1] : Bio::EnsEMBL::CoordSystem $cs1
176 One of the coordinate systems to retrieve the mapper
177 between
178 Arg [2] : Bio::EnsEMBL::CoordSystem $cs2
179 The other coordinate system to map between
180 Description: Retrieves an Assembly mapper for two coordinate
181 systems whose relationship is described in the
182 assembly table.
183
184 The ordering of the coodinate systems is arbitrary.
185 The following two statements are equivalent:
186 $mapper = $asma->fetch_by_CoordSystems($cs1,$cs2);
187 $mapper = $asma->fetch_by_CoordSystems($cs2,$cs1);
188 Returntype : Bio::EnsEMBL::AssemblyMapper
189 Exceptions : wrong argument types
190 Caller : general
191 Status : Stable
192
193 =cut
194
195 sub fetch_by_CoordSystems {
196 my $self = shift;
197 my $cs1 = shift;
198 my $cs2 = shift;
199
200 if(!ref($cs1) || !$cs1->isa('Bio::EnsEMBL::CoordSystem')) {
201 throw("cs1 argument must be a Bio::EnsEMBL::CoordSystem.");
202 }
203 if(!ref($cs2) || !$cs2->isa('Bio::EnsEMBL::CoordSystem')) {
204 throw("cs2 argument must be a Bio::EnsEMBL::CoordSystem.");
205 }
206
207 # if($cs1->equals($cs2)) {
208 # throw("Cannot create mapper between same coord systems: " .
209 # $cs1->name . " " . $cs1->version);
210 # }
211
212 if($cs1->is_top_level()) {
213 return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2);
214 }
215 if($cs2->is_top_level()) {
216 return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1);
217 }
218
219 my $csa = $self->db->get_CoordSystemAdaptor();
220
221 #retrieve the shortest possible mapping path between these systems
222 my @mapping_path = @{$csa->get_mapping_path($cs1,$cs2)};
223
224 if(!@mapping_path) {
225
226 # It is perfectly fine not to have a mapping. No warning needed really
227 # Just check the return code!!
228
229 # warning(
230 # "There is no mapping defined between these coord systems:\n" .
231 # $cs1->name() . " " . $cs1->version() . " and " . $cs2->name() . " " .
232 # $cs2->version()
233 # );
234 return undef;
235 }
236
237 my $key = join(':', map({defined($_)?$_->dbID():"-"} @mapping_path));
238
239 my $asm_mapper = $self->{'_asm_mapper_cache'}->{$key};
240
241 return $asm_mapper if($asm_mapper);
242
243 if(@mapping_path == 1) {
244 throw("Incorrect mapping path defined in meta table. " .
245 "0 step mapping encountered between:\n" .
246 $cs1->name() . " " . $cs1->version() . " and " . $cs2->name . " " .
247 $cs2->version());
248 }
249
250 if(@mapping_path == 2) {
251 #1 step regular mapping
252 $asm_mapper = Bio::EnsEMBL::AssemblyMapper->new($self, @mapping_path);
253
254 # If you want multiple pieces on two seqRegions to map to each other
255 # you need to make an assembly.mapping entry that is seperated with a #
256 # instead of an |.
257
258 $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
259 return $asm_mapper;
260 }
261
262 if(@mapping_path == 3) {
263 #two step chained mapping
264 $asm_mapper = Bio::EnsEMBL::ChainedAssemblyMapper->new($self,@mapping_path);
265 #in multi-step mapping it is possible get requests with the
266 #coordinate system ordering reversed since both mappings directions
267 #cache on both orderings just in case
268 #e.g. chr <-> contig <-> clone and clone <-> contig <-> chr
269
270 $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
271 $key = join(':', map({defined($_)?$_->dbID():"-"} reverse(@mapping_path)));
272 $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
273 return $asm_mapper;
274 }
275
276 throw("Only 1 and 2 step coordinate system mapping is currently\n" .
277 "supported. Mapping between " .
278 $cs1->name() . " " . $cs1->version() . " and " .
279 $cs2->name() . " " . $cs2->version() .
280 " requires ". (scalar(@mapping_path)-1) . " steps.");
281 }
282
283
284
285 =head2 register_assembled
286
287 Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
288 A valid AssemblyMapper object
289 Arg [2] : integer $asm_seq_region
290 The dbID of the seq_region to be registered
291 Arg [3] : int $asm_start
292 The start of the region to be registered
293 Arg [4] : int $asm_end
294 The end of the region to be registered
295 Description: Declares an assembled region to the AssemblyMapper.
296 This extracts the relevant data from the assembly
297 table and stores it in Mapper internal to the $asm_mapper.
298 It therefore must be called before any mapping is
299 attempted on that region. Otherwise only gaps will
300 be returned. Note that the AssemblyMapper automatically
301 calls this method when the need arises.
302 Returntype : none
303 Exceptions : throw if the seq_region to be registered does not exist
304 or if it associated with multiple assembled pieces (bad data
305 in assembly table)
306 Caller : Bio::EnsEMBL::AssemblyMapper
307 Status : Stable
308
309 =cut
310
311
312 sub register_assembled {
313 my $self = shift;
314 my $asm_mapper = shift;
315 my $asm_seq_region = shift;
316 my $asm_start = shift;
317 my $asm_end = shift;
318
319 if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
320 throw("Bio::EnsEMBL::AssemblyMapper argument expected");
321 }
322
323 throw("asm_seq_region argument expected") if(!defined($asm_seq_region));
324 throw("asm_start argument expected") if(!defined($asm_start));
325 throw("asm_end argument expected") if(!defined($asm_end));
326
327 my $asm_cs_id = $asm_mapper->assembled_CoordSystem->dbID();
328 my $cmp_cs_id = $asm_mapper->component_CoordSystem->dbID();
329
330 #split up the region to be registered into fixed chunks
331 #this allows us to keep track of regions that have already been
332 #registered and also works under the assumption that if a small region
333 #is requested it is likely that other requests will be made in the
334 #vicinity (the minimum size registered the chunksize (2^chunkfactor)
335
336 my @chunk_regions;
337 #determine span of chunks
338 #bitwise shift right is fast and easy integer division
339
340 my($start_chunk, $end_chunk);
341
342 $start_chunk = $asm_start >> $CHUNKFACTOR;
343 $end_chunk = $asm_end >> $CHUNKFACTOR;
344
345 # inserts have start = end + 1, on boundary condition start_chunk
346 # could be less than end chunk
347 if($asm_start == $asm_end + 1) {
348 ($start_chunk, $end_chunk) = ($end_chunk, $start_chunk);
349 }
350
351 #find regions of continuous unregistered chunks
352 my $i;
353 my ($begin_chunk_region,$end_chunk_region);
354 for ($i = $start_chunk; $i <= $end_chunk; $i++) {
355 if($asm_mapper->have_registered_assembled($asm_seq_region, $i)) {
356 if(defined($begin_chunk_region)) {
357 #this is the end of an unregistered region.
358 my $region = [($begin_chunk_region << $CHUNKFACTOR),
359 (($end_chunk_region+1) << $CHUNKFACTOR)-1];
360 push @chunk_regions, $region;
361 $begin_chunk_region = $end_chunk_region = undef;
362 }
363 } else {
364 $begin_chunk_region = $i if(!defined($begin_chunk_region));
365 $end_chunk_region = $i+1;
366 $asm_mapper->register_assembled($asm_seq_region,$i);
367 }
368 }
369
370 #the last part may have been an unregistered region too
371 if(defined($begin_chunk_region)) {
372 my $region = [($begin_chunk_region << $CHUNKFACTOR),
373 (($end_chunk_region+1) << $CHUNKFACTOR) -1];
374 push @chunk_regions, $region;
375 }
376
377 return if(!@chunk_regions);
378
379 # keep the Mapper to a reasonable size
380 if( $asm_mapper->size() > $asm_mapper->max_pair_count() ) {
381 $asm_mapper->flush();
382 #we now have to go and register the entire requested region since we
383 #just flushed everything
384
385 @chunk_regions = ( [ ( $start_chunk << $CHUNKFACTOR)
386 , (($end_chunk+1) << $CHUNKFACTOR)-1 ] );
387
388 for( my $i = $start_chunk; $i <= $end_chunk; $i++ ) {
389 $asm_mapper->register_assembled( $asm_seq_region, $i );
390 }
391 }
392
393 # my $asm_seq_region_id =
394 # $self->_seq_region_name_to_id($asm_seq_region,$asm_cs_id);
395
396 # Retrieve the description of how the assembled region is made from
397 # component regions for each of the continuous blocks of unregistered,
398 # chunked regions
399
400 my $q = qq{
401 SELECT
402 asm.cmp_start,
403 asm.cmp_end,
404 asm.cmp_seq_region_id,
405 sr.name,
406 sr.length,
407 asm.ori,
408 asm.asm_start,
409 asm.asm_end
410 FROM
411 assembly asm, seq_region sr
412 WHERE
413 asm.asm_seq_region_id = ? AND
414 ? <= asm.asm_end AND
415 ? >= asm.asm_start AND
416 asm.cmp_seq_region_id = sr.seq_region_id AND
417 sr.coord_system_id = ?
418 };
419
420 my $sth = $self->prepare($q);
421
422 foreach my $region (@chunk_regions) {
423 my($region_start, $region_end) = @$region;
424 $sth->bind_param(1,$asm_seq_region,SQL_INTEGER);
425 $sth->bind_param(2,$region_start,SQL_INTEGER);
426 $sth->bind_param(3,$region_end,SQL_INTEGER);
427 $sth->bind_param(4,$cmp_cs_id,SQL_INTEGER);
428
429 $sth->execute();
430
431 my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori,
432 $asm_start, $asm_end, $cmp_seq_region_length);
433
434 $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
435 \$cmp_seq_region, \$cmp_seq_region_length, \$ori,
436 \$asm_start, \$asm_end);
437
438 #
439 # Load the unregistered regions of the mapper
440 #
441 while($sth->fetch()) {
442 next if($asm_mapper->have_registered_component($cmp_seq_region_id)
443 and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region_id}));
444 $asm_mapper->register_component($cmp_seq_region_id);
445 $asm_mapper->mapper->add_map_coordinates(
446 $asm_seq_region, $asm_start, $asm_end,
447 $ori,
448 $cmp_seq_region_id, $cmp_start, $cmp_end);
449
450 my $arr = [ $cmp_seq_region_id, $cmp_seq_region,
451 $cmp_cs_id, $cmp_seq_region_length ];
452
453 $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
454 $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
455 }
456 }
457
458 $sth->finish();
459 }
460
461
462
463 sub _seq_region_name_to_id {
464 my $self = shift;
465 my $sr_name = shift;
466 my $cs_id = shift;
467
468 ($sr_name && $cs_id) || throw('seq_region_name and coord_system_id args ' .
469 'are required');
470
471 my $arr = $self->{'sr_name_cache'}->{"$sr_name:$cs_id"};
472 if( $arr ) {
473 return $arr->[0];
474 }
475
476 # Get the seq_region_id via the name. This would be quicker if we just
477 # used internal ids instead but stored but then we lose the ability
478 # the transform accross databases with different internal ids
479
480 my $sth = $self->prepare("SELECT seq_region_id, length " .
481 "FROM seq_region " .
482 "WHERE name = ? AND coord_system_id = ?");
483
484 $sth->bind_param(1,$sr_name,SQL_VARCHAR);
485 $sth->bind_param(2,$cs_id,SQL_INTEGER);
486 $sth->execute();
487
488 if(!$sth->rows() == 1) {
489 throw("Ambiguous or non-existant seq_region [$sr_name] " .
490 "in coord system $cs_id");
491 }
492
493 my ($sr_id, $sr_length) = $sth->fetchrow_array();
494 $sth->finish();
495
496 $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
497
498 $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
499 $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
500
501 return $sr_id;
502 }
503
504 sub _seq_region_id_to_name {
505 my $self = shift;
506 my $sr_id = shift;
507
508 ($sr_id) || throw('seq_region_is required');
509
510 my $arr = $self->{'sr_id_cache'}->{"$sr_id"};
511 if( $arr ) {
512 return $arr->[1];
513 }
514
515 # Get the seq_region name via the id. This would be quicker if we just
516 # used internal ids instead but stored but then we lose the ability
517 # the transform accross databases with different internal ids
518
519 my $sth = $self->prepare("SELECT name, length ,coord_system_id " .
520 "FROM seq_region " .
521 "WHERE seq_region_id = ? ");
522
523 $sth->bind_param(1,$sr_id,SQL_INTEGER);
524 $sth->execute();
525
526 if(!$sth->rows() == 1) {
527 throw("non-existant seq_region [$sr_id]");
528 }
529
530 my ($sr_name, $sr_length, $cs_id) = $sth->fetchrow_array();
531 $sth->finish();
532
533 $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
534
535 $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
536 $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
537
538 return $sr_name;
539 }
540
541
542 =head2 register_component
543
544 Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
545 A valid AssemblyMapper object
546 Arg [2] : integer $cmp_seq_region
547 The dbID of the seq_region to be registered
548 Description: Declares a component region to the AssemblyMapper.
549 This extracts the relevant data from the assembly
550 table and stores it in Mapper internal to the $asm_mapper.
551 It therefore must be called before any mapping is
552 attempted on that region. Otherwise only gaps will
553 be returned. Note that the AssemblyMapper automatically
554 calls this method when the need arises.
555 Returntype : none
556 Exceptions : throw if the seq_region to be registered does not exist
557 or if it associated with multiple assembled pieces (bad data
558 in assembly table)
559 Caller : Bio::EnsEMBL::AssemblyMapper
560 Status : Stable
561
562 =cut
563
564 sub register_component {
565 my $self = shift;
566 my $asm_mapper = shift;
567 my $cmp_seq_region = shift;
568
569 if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
570 throw("Bio::EnsEMBL::AssemblyMapper argument expected");
571 }
572
573 if(!defined($cmp_seq_region)) {
574 throw("cmp_seq_region argument expected");
575 }
576
577 my $cmp_cs_id = $asm_mapper->component_CoordSystem()->dbID();
578 my $asm_cs_id = $asm_mapper->assembled_CoordSystem()->dbID();
579
580 #do nothing if this region is already registered or special case
581 return if($asm_mapper->have_registered_component($cmp_seq_region)
582 and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region}));
583
584 # my $cmp_seq_region_id =
585 # $self->_seq_region_name_to_id($cmp_seq_region, $cmp_cs_id);
586
587 # Determine what part of the assembled region this component region makes up
588
589 my $q = qq{
590 SELECT
591 asm.asm_start,
592 asm.asm_end,
593 asm.asm_seq_region_id,
594 sr.name,
595 sr.length
596 FROM
597 assembly asm, seq_region sr
598 WHERE
599 asm.cmp_seq_region_id = ? AND
600 asm.asm_seq_region_id = sr.seq_region_id AND
601 sr.coord_system_id = ?
602 };
603
604 my $sth = $self->prepare($q);
605 $sth->bind_param(1,$cmp_seq_region,SQL_INTEGER);
606 $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
607 $sth->execute();
608
609 if($sth->rows() == 0) {
610 #this component is not used in the assembled part i.e. gap
611 $asm_mapper->register_component($cmp_seq_region);
612 return;
613 }
614
615 #we do not currently support components mapping to multiple assembled
616 # make sure that you've got the correct mapping in the meta-table :
617 # chromosome:EquCab2#contig ( use'#' for multiple mappings )
618 # chromosome:EquCab2|contig ( use '|' delimiter for 1-1 mappings )
619 #
620 if($sth->rows() != 1) {
621 throw("Multiple assembled regions for single " .
622 "component region cmp_seq_region_id=[$cmp_seq_region]\n".
623 "Remember that multiple mappings use the \#-operaator".
624 " in the meta-table (i.e. chromosome:EquCab2\#contig\n");
625 }
626
627 my ($asm_start, $asm_end, $asm_seq_region_id,
628 $asm_seq_region, $asm_seq_region_length) = $sth->fetchrow_array();
629
630 my $arr = [ $asm_seq_region_id, $asm_seq_region,
631 $asm_cs_id, $asm_seq_region_length ];
632
633 $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
634 $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
635
636 $sth->finish();
637
638 # Register the corresponding assembled region. This allows a us to
639 # register things in assembled chunks which allows us to:
640 # (1) Keep track of what assembled regions are registered
641 # (2) Use locality of reference (if they want something in same general
642 # region it will already be registered).
643
644 $self->register_assembled($asm_mapper,$asm_seq_region_id,$asm_start,$asm_end);
645 }
646
647
648
649 =head2 register_chained
650
651 Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
652 The chained assembly mapper to register regions on
653 Arg [2] : string $from ('first' or 'last')
654 The direction we are registering from, and the name of the
655 internal mapper.
656 Arg [3] : string $seq_region_name
657 The name of the seqregion we are registering on
658 Arg [4] : listref $ranges
659 A list of ranges to register (in [$start,$end] tuples).
660 Arg [5] : (optional) $to_slice
661 Only register those on this Slice.
662 Description: Registers a set of ranges on a chained assembly mapper.
663 This function is at the heart of the chained mapping process.
664 It retrieves information from the assembly table and
665 dynamically constructs the mappings between two coordinate
666 systems which are 2 mapping steps apart. It does this by using
667 two internal mappers to load up a third mapper which is
668 actually used by the ChainedAssemblyMapper to perform the
669 mapping.
670
671 This method must be called before any mapping is
672 attempted on regions of interest, otherwise only gaps will
673 be returned. Note that the ChainedAssemblyMapper automatically
674 calls this method when the need arises.
675 Returntype : none
676 Exceptions : throw if the seq_region to be registered does not exist
677 or if it associated with multiple assembled pieces (bad data
678 in assembly table)
679
680 throw if the mapping between the coordinate systems cannot
681 be performed in two steps, which means there is an internal
682 error in the data in the meta table or in the code that creates
683 the mapping paths.
684 Caller : Bio::EnsEMBL::AssemblyMapper
685 Status : Stable
686
687 =cut
688
689 sub register_chained {
690 my $self = shift;
691 my $casm_mapper = shift;
692 my $from = shift;
693 my $seq_region_id = shift;
694 my $ranges = shift;
695 my $to_slice = shift;
696
697 my $to_seq_region_id;
698 if(defined($to_slice)){
699 if($casm_mapper->first_CoordSystem()->equals($casm_mapper->last_CoordSystem())){
700 return $self->_register_chained_special($casm_mapper, $from, $seq_region_id, $ranges, $to_slice);
701 }
702 $to_seq_region_id = $to_slice->get_seq_region_id();
703 if(!defined($to_seq_region_id)){
704 die "Could not get seq_region_id for to_slice".$to_slice->seq_region_name."\n";
705 }
706 }
707
708 my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
709 $end_name, $end_mid_mapper, $end_cs, $end_registry);
710
711 if($from eq 'first') {
712 $start_name = 'first';
713 $start_mid_mapper = $casm_mapper->first_middle_mapper();
714 $start_cs = $casm_mapper->first_CoordSystem();
715 $start_registry = $casm_mapper->first_registry();
716 $end_mid_mapper = $casm_mapper->last_middle_mapper();
717 $end_cs = $casm_mapper->last_CoordSystem();
718 $end_registry = $casm_mapper->last_registry();
719 $end_name = 'last';
720 } elsif($from eq 'last') {
721 $start_name = 'last';
722 $start_mid_mapper = $casm_mapper->last_middle_mapper();
723 $start_cs = $casm_mapper->last_CoordSystem();
724 $start_registry = $casm_mapper->last_registry();
725 $end_mid_mapper = $casm_mapper->first_middle_mapper();
726 $end_cs = $casm_mapper->first_CoordSystem();
727 $end_registry = $casm_mapper->first_registry();
728 $end_name = 'first';
729 } else {
730 throw("Invalid from argument: [$from], must be 'first' or 'last'");
731 }
732
733 my $combined_mapper = $casm_mapper->first_last_mapper();
734 my $mid_cs = $casm_mapper->middle_CoordSystem();
735 my $mid_name = 'middle';
736 my $csa = $self->db->get_CoordSystemAdaptor();
737
738 # Check for the simple case where the ChainedMapper is short
739 if( ! defined $mid_cs ) {
740 $start_mid_mapper = $combined_mapper;
741 }
742
743
744 ##############
745 # obtain the first half of the mappings and load them into the start mapper
746 #
747
748 #ascertain which is component and which is actually assembled coord system
749 my @path;
750
751 # check for the simple case, where the ChainedMapper is short
752 if( defined $mid_cs ) {
753 @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
754 } else {
755 @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
756 }
757
758 if(@path != 2 && defined( $path[1] )) {
759 my $path = join(',', map({$_->name .' '. $_->version} @path));
760 my $len = scalar(@path) - 1;
761 throw("Unexpected mapping path between start and intermediate " .
762 "coord systems (". $start_cs->name . " " . $start_cs->version .
763 " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
764 "\nExpected path length 1, got $len. " .
765 "(path=$path)");
766 }
767
768 my $sth;
769 my ($asm_cs,$cmp_cs);
770 $asm_cs = $path[0];
771 $cmp_cs = $path[-1];
772
773 #the SQL varies depending on whether we are coming from assembled or
774 #component coordinate system
775
776 my $asm2cmp = (<<ASMCMP);
777 SELECT
778 asm.cmp_start,
779 asm.cmp_end,
780 asm.cmp_seq_region_id,
781 sr.name,
782 sr.length,
783 asm.ori,
784 asm.asm_start,
785 asm.asm_end
786 FROM
787 assembly asm, seq_region sr
788 WHERE
789 asm.asm_seq_region_id = ? AND
790 ? <= asm.asm_end AND
791 ? >= asm.asm_start AND
792 asm.cmp_seq_region_id = sr.seq_region_id AND
793 sr.coord_system_id = ?
794 ASMCMP
795
796
797 my $cmp2asm = (<<CMPASM);
798 SELECT
799 asm.asm_start,
800 asm.asm_end,
801 asm.asm_seq_region_id,
802 sr.name,
803 sr.length,
804 asm.ori,
805 asm.cmp_start,
806 asm.cmp_end
807 FROM
808 assembly asm, seq_region sr
809 WHERE
810 asm.cmp_seq_region_id = ? AND
811 ? <= asm.cmp_end AND
812 ? >= asm.cmp_start AND
813 asm.asm_seq_region_id = sr.seq_region_id AND
814 sr.coord_system_id = ?
815 CMPASM
816
817 my $asm2cmp_sth;
818 my $cmp2asm_sth;
819 if(defined($to_slice)){
820 my $to_cs = $to_slice->coord_system;
821 if($asm_cs->equals($to_cs)){
822 $asm2cmp_sth = $self->prepare($asm2cmp);
823 $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
824 }
825 elsif($cmp_cs->equals($to_cs)){
826 $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
827 $cmp2asm_sth = $self->prepare($cmp2asm);
828 }
829 else{
830 $asm2cmp_sth = $self->prepare($asm2cmp);
831 $cmp2asm_sth = $self->prepare($cmp2asm);
832 }
833 }
834 else{
835 $asm2cmp_sth = $self->prepare($asm2cmp);
836 $cmp2asm_sth = $self->prepare($cmp2asm);
837 }
838
839
840
841 $sth = ($asm_cs->equals($start_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
842
843 my $mid_cs_id;
844
845 # check for the simple case where the ChainedMapper is short
846 if( defined $mid_cs ) {
847 $mid_cs_id = $mid_cs->dbID();
848 } else {
849 $mid_cs_id = $end_cs->dbID();
850 }
851
852 my @mid_ranges;
853 my @start_ranges;
854
855 #need to perform the query for each unregistered range
856 foreach my $range (@$ranges) {
857 my ($start, $end) = @$range;
858 $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
859 $sth->bind_param(2,$start,SQL_INTEGER);
860 $sth->bind_param(3,$end,SQL_INTEGER);
861 $sth->bind_param(4,$mid_cs_id,SQL_INTEGER);
862 $sth->execute();
863
864 #load the start <-> mid mapper with the results and record the mid cs
865 #ranges we just added to the mapper
866
867 my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
868 $ori, $start_start, $start_end);
869
870 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
871 \$mid_seq_region, \$mid_length, \$ori, \$start_start,
872 \$start_end);
873
874 while($sth->fetch()) {
875
876 if( defined $mid_cs ) {
877 $start_mid_mapper->add_map_coordinates
878 (
879 $seq_region_id,$start_start, $start_end, $ori,
880 $mid_seq_region_id, $mid_start, $mid_end
881 );
882 } else {
883 if( $from eq "first" ) {
884 $combined_mapper->add_map_coordinates
885 (
886 $seq_region_id,$start_start, $start_end, $ori,
887 $mid_seq_region_id, $mid_start, $mid_end
888 );
889 } else {
890 $combined_mapper->add_map_coordinates
891 (
892 $mid_seq_region_id, $mid_start, $mid_end, $ori,
893 $seq_region_id,$start_start, $start_end
894 );
895 }
896 }
897
898 #update sr_name cache
899 my $arr = [ $mid_seq_region_id, $mid_seq_region,
900 $mid_cs_id, $mid_length ];
901
902 $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
903 $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
904
905 push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
906 $mid_start,$mid_end];
907 push @start_ranges, [ $seq_region_id, $start_start, $start_end ];
908
909 #the region that we actually register may actually be larger or smaller
910 #than the region that we wanted to register.
911 #register the intersection of the region so we do not end up doing
912 #extra work later
913
914 if($start_start < $start || $start_end > $end) {
915 $start_registry->check_and_register($seq_region_id,$start_start,
916 $start_end);
917 }
918 }
919 $sth->finish();
920 }
921
922 # in the one step case, we load the mid ranges in the
923 # last_registry and we are done
924 if( ! defined $mid_cs ) {
925 for my $range ( @mid_ranges ) {
926 $end_registry->check_and_register( $range->[0], $range->[2],
927 $range->[3] );
928 }
929
930 # and thats it for the simple case ...
931 return;
932 }
933
934
935 ###########
936 # now the second half of the mapping
937 # perform another query and load the mid <-> end mapper using the mid cs
938 # ranges
939 #
940
941 #ascertain which is component and which is actually assembled coord system
942 @path = @{$csa->get_mapping_path($mid_cs, $end_cs)};
943 if(@path == 2 || ( @path == 3 && !defined $path[1])) {
944
945 $asm_cs = $path[0];
946 $cmp_cs = $path[-1];
947 } else {
948 my $path = join(',', map({$_->name .' '. $_->version} @path));
949 my $len = scalar(@path)-1;
950 throw("Unexpected mapping path between intermediate and last" .
951 "coord systems (". $mid_cs->name . " " . $mid_cs->version .
952 " and " . $end_cs->name . " " . $end_cs->version . ")." .
953 "\nExpected path length 1, got $len. " .
954 "(path=$path)");
955 }
956
957 if(defined($to_slice)){
958 my $to_cs = $to_slice->coord_system;
959 if($asm_cs->equals($to_cs)){
960 $asm2cmp_sth = $self->prepare($asm2cmp);
961 $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
962 }
963 elsif($cmp_cs->equals($to_cs)){
964 $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
965 $cmp2asm_sth = $self->prepare($cmp2asm);
966 }
967 else{
968 $asm2cmp_sth = $self->prepare($asm2cmp);
969 $cmp2asm_sth = $self->prepare($cmp2asm);
970 }
971 }
972
973 $sth = ($asm_cs->equals($mid_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
974
975 my $end_cs_id = $end_cs->dbID();
976 foreach my $mid_range (@mid_ranges) {
977 my ($mid_seq_region_id, $mid_seq_region,$start, $end) = @$mid_range;
978 $sth->bind_param(1,$mid_seq_region_id,SQL_INTEGER);
979 $sth->bind_param(2,$start,SQL_INTEGER);
980 $sth->bind_param(3,$end,SQL_INTEGER);
981 $sth->bind_param(4,$end_cs_id,SQL_INTEGER);
982 $sth->execute();
983
984 #load the end <-> mid mapper with the results and record the mid cs
985 #ranges we just added to the mapper
986
987 my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length,
988 $ori, $mid_start, $mid_end);
989
990 $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
991 \$end_seq_region, \$end_length, \$ori, \$mid_start,
992 \$mid_end);
993
994 while($sth->fetch()) {
995 $end_mid_mapper->add_map_coordinates
996 (
997 $end_seq_region_id, $end_start, $end_end, $ori,
998 $mid_seq_region_id, $mid_start, $mid_end
999 );
1000
1001 #update sr_name cache
1002 my $arr = [ $end_seq_region_id,$end_seq_region,$end_cs_id,$end_length ];
1003
1004 $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
1005 $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
1006
1007 #register this region on the end coord system
1008 $end_registry->check_and_register($end_seq_region_id, $end_start, $end_end);
1009 }
1010 $sth->finish();
1011 }
1012
1013 #########
1014 # Now that both halves are loaded
1015 # Do stepwise mapping using both of the loaded mappers to load
1016 # the final start <-> end mapper
1017 #
1018
1019 _build_combined_mapper(\@start_ranges, $start_mid_mapper, $end_mid_mapper,
1020 $combined_mapper, $start_name);
1021 #all done!
1022 return;
1023 }
1024
1025
1026 =head2 _register_chained_special
1027
1028 Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
1029 The chained assembly mapper to register regions on
1030 Arg [2] : string $from ('first' or 'last')
1031 The direction we are registering from, and the name of the
1032 internal mapper.
1033 Arg [3] : string $seq_region_name
1034 The name of the seqregion we are registering on
1035 Arg [4] : listref $ranges
1036 A list of ranges to register (in [$start,$end] tuples).
1037 Arg [5] : (optional) $to_slice
1038 Only register those on this Slice.
1039 Description: Registers a set of ranges on a chained assembly mapper.
1040 This function is at the heart of the chained mapping process.
1041 It retrieves information from the assembly table and
1042 dynamically constructs the mappings between two coordinate
1043 systems which are 2 mapping steps apart. It does this by using
1044 two internal mappers to load up a third mapper which is
1045 actually used by the ChainedAssemblyMapper to perform the
1046 mapping.
1047
1048 This method must be called before any mapping is
1049 attempted on regions of interest, otherwise only gaps will
1050 be returned. Note that the ChainedAssemblyMapper automatically
1051 calls this method when the need arises.
1052 Returntype : none
1053 Exceptions : throw if the seq_region to be registered does not exist
1054 or if it associated with multiple assembled pieces (bad data
1055 in assembly table)
1056
1057 throw if the mapping between the coordinate systems cannot
1058 be performed in two steps, which means there is an internal
1059 error in the data in the meta table or in the code that creates
1060 the mapping paths.
1061 Caller : Bio::EnsEMBL::AssemblyMapper
1062 Status : Stable
1063
1064 =cut
1065
1066 sub _register_chained_special {
1067 my $self = shift;
1068 my $casm_mapper = shift;
1069 my $from = shift;
1070 my $seq_region_id = shift;
1071 my $ranges = shift;
1072 my $to_slice = shift;
1073 my $found = 0;
1074
1075 my $sth = $self->prepare("SELECT
1076 asm.cmp_start,
1077 asm.cmp_end,
1078 asm.cmp_seq_region_id,
1079 sr.name,
1080 sr.length,
1081 asm.ori,
1082 asm.asm_start,
1083 asm.asm_end
1084 FROM
1085 assembly asm, seq_region sr
1086 WHERE
1087 asm.asm_seq_region_id = ? AND
1088 ? <= asm.asm_end AND
1089 ? >= asm.asm_start AND
1090 asm.cmp_seq_region_id = sr.seq_region_id AND
1091 sr.coord_system_id = ? AND
1092 asm.cmp_seq_region_id = ?");
1093
1094
1095 my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
1096 $end_name, $end_mid_mapper, $end_cs, $end_registry);
1097
1098 if($from eq 'first') {
1099 $start_name = 'first';
1100 $start_mid_mapper = $casm_mapper->first_middle_mapper();
1101 $start_cs = $casm_mapper->first_CoordSystem();
1102 $start_registry = $casm_mapper->first_registry();
1103 $end_mid_mapper = $casm_mapper->last_middle_mapper();
1104 $end_cs = $casm_mapper->last_CoordSystem();
1105 $end_registry = $casm_mapper->last_registry();
1106 $end_name = 'last';
1107 } elsif($from eq 'last') {
1108 $start_name = 'last';
1109 $start_mid_mapper = $casm_mapper->last_middle_mapper();
1110 $start_cs = $casm_mapper->last_CoordSystem();
1111 $start_registry = $casm_mapper->last_registry();
1112 $end_mid_mapper = $casm_mapper->first_middle_mapper();
1113 $end_cs = $casm_mapper->first_CoordSystem();
1114 $end_registry = $casm_mapper->first_registry();
1115 $end_name = 'first';
1116 } else {
1117 throw("Invalid from argument: [$from], must be 'first' or 'last'");
1118 }
1119
1120 my $combined_mapper = $casm_mapper->first_last_mapper();
1121 my $mid_cs = $casm_mapper->middle_CoordSystem();
1122 my $mid_name = 'middle';
1123 my $csa = $self->db->get_CoordSystemAdaptor();
1124
1125 # Check for the simple case where the ChainedMapper is short
1126 if( ! defined $mid_cs ) {
1127 $start_mid_mapper = $combined_mapper;
1128 }
1129
1130
1131 my @path;
1132 if( defined $mid_cs ) {
1133 @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
1134 } else {
1135 @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
1136 }
1137 if( ! defined $mid_cs ) {
1138 $start_mid_mapper = $combined_mapper;
1139 }
1140
1141 if(@path != 2 && defined( $path[1] )) {
1142 my $path = join(',', map({$_->name .' '. $_->version} @path));
1143 my $len = scalar(@path) - 1;
1144 throw("Unexpected mapping path between start and intermediate " .
1145 "coord systems (". $start_cs->name . " " . $start_cs->version .
1146 " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
1147 "\nExpected path length 1, got $len. " .
1148 "(path=$path)");
1149 }
1150
1151 my ($asm_cs,$cmp_cs);
1152 $asm_cs = $path[0];
1153 $cmp_cs = $path[-1];
1154
1155 $combined_mapper = $casm_mapper->first_last_mapper();
1156 $mid_cs = $casm_mapper->middle_CoordSystem();
1157 $mid_name = 'middle';
1158 $csa = $self->db->get_CoordSystemAdaptor();
1159
1160 my $mid_cs_id;
1161
1162 # Check for the simple case where the ChainedMapper is short
1163 if ( !defined $mid_cs ) {
1164 $start_mid_mapper = $combined_mapper;
1165 } else {
1166 $mid_cs_id = $mid_cs->dbID();
1167 }
1168
1169 my @mid_ranges;
1170 my @start_ranges;
1171
1172 my $to_cs = $to_slice->coord_system;
1173 foreach my $direction (1, 0){
1174 my $id1;
1175 my $id2;
1176 if($direction){
1177 $id1 = $seq_region_id;
1178 $id2 = $to_slice->get_seq_region_id();
1179 }
1180 else{
1181 $id2 = $seq_region_id;
1182 $id1 = $to_slice->get_seq_region_id();
1183 }
1184
1185 foreach my $range (@$ranges) {
1186 my ($start, $end) = @$range;
1187 $sth->bind_param(1,$id1,SQL_INTEGER);
1188 $sth->bind_param(2,$start,SQL_INTEGER);
1189 $sth->bind_param(3,$end,SQL_INTEGER);
1190 $sth->bind_param(4,$to_cs->dbID,SQL_INTEGER);
1191 $sth->bind_param(5,$id2,SQL_INTEGER);
1192 $sth->execute();
1193
1194 my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
1195 $ori, $start_start, $start_end);
1196
1197 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1198 \$mid_seq_region, \$mid_length, \$ori, \$start_start,
1199 \$start_end);
1200
1201 while($sth->fetch()) {
1202 $found = 1;
1203
1204 if( defined $mid_cs ) {
1205 $start_mid_mapper->add_map_coordinates
1206 (
1207 $id1,$start_start, $start_end, $ori,
1208 $mid_seq_region_id, $mid_start, $mid_end
1209 );
1210 } else {
1211 if( $from eq "first") {
1212 if($direction){
1213 $combined_mapper->add_map_coordinates
1214 (
1215 $id1,$start_start, $start_end, $ori,
1216 $mid_seq_region_id, $mid_start, $mid_end
1217 );
1218 }
1219 else{
1220 $combined_mapper->add_map_coordinates
1221 (
1222 $mid_seq_region_id, $mid_start, $mid_end, $ori,
1223 $id1,$start_start, $start_end
1224 );
1225 }
1226 } else {
1227 if($direction){
1228 $combined_mapper->add_map_coordinates
1229 (
1230 $mid_seq_region_id, $mid_start, $mid_end, $ori,
1231 $id1,$start_start, $start_end
1232 );
1233 }
1234 else{
1235 $combined_mapper->add_map_coordinates
1236 (
1237 $id1,$start_start, $start_end, $ori,
1238 $mid_seq_region_id, $mid_start, $mid_end
1239 );
1240 }
1241 }
1242 }
1243
1244 #update sr_name cache
1245 my $arr = [ $mid_seq_region_id, $mid_seq_region,
1246 $mid_cs_id, $mid_length ];
1247
1248 $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
1249 $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
1250
1251 push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
1252 $mid_start,$mid_end];
1253 push @start_ranges, [ $id1, $start_start, $start_end ];
1254
1255 #the region that we actually register may actually be larger or smaller
1256 #than the region that we wanted to register.
1257 #register the intersection of the region so we do not end up doing
1258 #extra work later
1259
1260 if($start_start < $start || $start_end > $end) {
1261 $start_registry->check_and_register($id1,$start_start,
1262 $start_end);
1263 }
1264 }
1265 $sth->finish();
1266 }
1267 if($found){
1268 if( ! defined $mid_cs ) {
1269 for my $range ( @mid_ranges ) {
1270 $end_registry->check_and_register( $range->[0], $range->[2],
1271 $range->[3] );
1272 }
1273
1274 # and thats it for the simple case ...
1275 return;
1276 }
1277 }
1278 }
1279 }
1280
1281
1282 =head2 register_all
1283
1284 Arg [1] : Bio::EnsEMBL::AssemblyMapper $mapper
1285 Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
1286
1287 # make cache large enough to hold all of the mappings
1288 $mapper->max_pair_count(10e6);
1289 $asm_mapper_adaptor->register_all($mapper);
1290
1291 # perform mappings as normal
1292 $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
1293 $sr_strand, $cs1);
1294 ...
1295 Description: This function registers the entire set of mappings between
1296 two coordinate systems in an assembly mapper.
1297 This will use a lot of memory but will be much more efficient
1298 when doing a lot of mapping which is spread over the entire
1299 genome.
1300 Returntype : none
1301 Exceptions : none
1302 Caller : specialised prograhsm
1303 Status : Stable
1304
1305 =cut
1306
1307 sub register_all {
1308 my $self = shift;
1309 my $mapper = shift;
1310
1311 my $asm_cs_id = $mapper->assembled_CoordSystem()->dbID();
1312 my $cmp_cs_id = $mapper->component_CoordSystem()->dbID();
1313
1314 # retrieve every relevant assembled/component pair from the assembly table
1315
1316 my $q = qq{
1317 SELECT
1318 asm.cmp_start,
1319 asm.cmp_end,
1320 asm.cmp_seq_region_id,
1321 cmp_sr.name,
1322 cmp_sr.length,
1323 asm.ori,
1324 asm.asm_start,
1325 asm.asm_end,
1326 asm.asm_seq_region_id,
1327 asm_sr.name,
1328 asm_sr.length
1329 FROM
1330 assembly asm, seq_region asm_sr, seq_region cmp_sr
1331 WHERE
1332 asm.cmp_seq_region_id = cmp_sr.seq_region_id AND
1333 asm.asm_seq_region_id = asm_sr.seq_region_id AND
1334 cmp_sr.coord_system_id = ? AND
1335 asm_sr.coord_system_id = ?
1336 };
1337
1338 my $sth = $self->prepare($q);
1339
1340 $sth->bind_param(1,$cmp_cs_id,SQL_INTEGER);
1341 $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
1342 $sth->execute();
1343
1344 # load the mapper with the assembly information
1345
1346 my ($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $cmp_length,
1347 $ori,
1348 $asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_length);
1349
1350 $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
1351 \$cmp_seq_region, \$cmp_length, \$ori,
1352 \$asm_start, \$asm_end, \$asm_seq_region_id,
1353 \$asm_seq_region, \$asm_length);
1354
1355 my %asm_registered;
1356
1357 while($sth->fetch()) {
1358 $mapper->register_component($cmp_seq_region_id);
1359 $mapper->mapper->add_map_coordinates(
1360 $asm_seq_region_id, $asm_start, $asm_end,
1361 $ori,
1362 $cmp_seq_region_id, $cmp_start, $cmp_end);
1363
1364 my $arr = [$cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_length];
1365
1366 $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
1367 $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
1368
1369 # only register each asm seq_region once since it requires some work
1370 if(!$asm_registered{$asm_seq_region_id}) {
1371 $asm_registered{$asm_seq_region_id} = 1;
1372
1373 # register all chunks from start of seq region to end
1374 my $end_chunk = $asm_length >> $CHUNKFACTOR;
1375 for(my $i = 0; $i <= $end_chunk; $i++) {
1376 $mapper->register_assembled($asm_seq_region_id, $i);
1377 }
1378
1379 $arr = [$asm_seq_region_id, $asm_seq_region, $asm_cs_id, $asm_length];
1380
1381 $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
1382 $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
1383 }
1384 }
1385
1386 $sth->finish();
1387
1388 return;
1389 }
1390
1391
1392
1393 =head2 register_all_chained
1394
1395 Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
1396 Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
1397
1398 # make the cache large enough to hold all of the mappings
1399 $mapper->max_pair_count(10e6);
1400 # load all of the mapping data
1401 $asm_mapper_adaptor->register_all_chained($mapper);
1402
1403 # perform mappings as normal
1404 $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
1405 $sr_strand, $cs1);
1406 ...
1407 Description: This function registers the entire set of mappings between
1408 two coordinate systems in a chained mapper. This will use a lot
1409 of memory but will be much more efficient when doing a lot of
1410 mapping which is spread over the entire genome.
1411 Returntype : none
1412 Exceptions : throw if mapper is between coord systems with unexpected
1413 mapping paths
1414 Caller : specialised programs doing a lot of genome-wide mapping
1415 Status : Stable
1416
1417 =cut
1418
1419 sub register_all_chained {
1420 my $self = shift;
1421 my $casm_mapper = shift;
1422
1423 my $first_cs = $casm_mapper->first_CoordSystem();
1424 my $mid_cs = $casm_mapper->middle_CoordSystem();
1425 my $last_cs = $casm_mapper->last_CoordSystem();
1426
1427 my $start_mid_mapper = $casm_mapper->first_middle_mapper();
1428 my $end_mid_mapper = $casm_mapper->last_middle_mapper();
1429 my $combined_mapper = $casm_mapper->first_last_mapper();
1430
1431 my @ranges;
1432
1433 my $sth = $self->prepare(
1434 'SELECT
1435 asm.cmp_start,
1436 asm.cmp_end,
1437 asm.cmp_seq_region_id,
1438 sr_cmp.name,
1439 sr_cmp.length,
1440 asm.ori,
1441 asm.asm_start,
1442 asm.asm_end,
1443 asm.asm_seq_region_id,
1444 sr_asm.name,
1445 sr_asm.length
1446 FROM
1447 assembly asm, seq_region sr_asm, seq_region sr_cmp
1448 WHERE
1449 sr_asm.seq_region_id = asm.asm_seq_region_id AND
1450 sr_cmp.seq_region_id = asm.cmp_seq_region_id AND
1451 sr_asm.coord_system_id = ? AND
1452 sr_cmp.coord_system_id = ?');
1453
1454 my $csa = $self->db()->get_CoordSystemAdaptor();
1455
1456 my @path;
1457
1458 if ( !defined $mid_cs ) {
1459 @path = @{ $csa->get_mapping_path( $first_cs, $last_cs ) };
1460 if ( !defined( $path[1] ) ) {
1461 splice( @path, 1, 1 );
1462 }
1463 } else {
1464 @path = @{ $csa->get_mapping_path( $first_cs, $mid_cs ) };
1465 # fix for when we have something like supercontig#contig#chromosome
1466 if ( !defined( $path[1] ) ) {
1467 splice( @path, 1, 1 );
1468 }
1469 }
1470
1471 if ( @path != 2 ) {
1472 my $path =
1473 join( ',', map( { $_->name . ' ' . $_->version } @path ) );
1474 my $len = scalar(@path) - 1;
1475 throw( "Unexpected mapping path between start and intermediate "
1476 . "coord systems ("
1477 . $first_cs->name . " "
1478 . $first_cs->version . " and "
1479 . $mid_cs->name . " "
1480 . $mid_cs->version . ")."
1481 . "\nExpected path length 1, got $len. "
1482 . "(path=$path)" );
1483 }
1484
1485 my ($asm_cs,$cmp_cs) = @path;
1486
1487 $sth->{mysql_use_result} = 1;
1488 $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
1489 $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
1490 $sth->execute();
1491
1492
1493 my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
1494 $ori, $start_start, $start_end, $start_seq_region_id, $start_seq_region,
1495 $start_length);
1496
1497 if($asm_cs->equals($first_cs)) {
1498 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1499 \$mid_seq_region, \$mid_length, \$ori, \$start_start,
1500 \$start_end, \$start_seq_region_id, \$start_seq_region,
1501 \$start_length);
1502 } else {
1503 $sth->bind_columns(\$start_start, \$start_end, \$start_seq_region_id,
1504 \$start_seq_region, \$start_length, \$ori,
1505 \$mid_start, \$mid_end, \$mid_seq_region_id,
1506 \$mid_seq_region, \$mid_length);
1507
1508 }
1509
1510 my ( $mid_cs_id, $start_cs_id, $reg, $mapper );
1511 if ( !defined $mid_cs ) {
1512 $mid_cs_id = $last_cs->dbID();
1513 $start_cs_id = $first_cs->dbID();
1514 $mapper = $combined_mapper;
1515 } else {
1516 $mid_cs_id = $mid_cs->dbID();
1517 $start_cs_id = $first_cs->dbID();
1518 $mapper = $start_mid_mapper;
1519 }
1520
1521 $reg = $casm_mapper->first_registry();
1522
1523 while($sth->fetch()) {
1524 $mapper->add_map_coordinates
1525 (
1526 $start_seq_region_id, $start_start, $start_end, $ori,
1527 $mid_seq_region_id, $mid_start, $mid_end
1528 );
1529 push( @ranges, [$start_seq_region_id, $start_start, $start_end ] );
1530
1531 $reg->check_and_register( $start_seq_region_id, 1, $start_length );
1532 if( ! defined $mid_cs ) {
1533 $casm_mapper->last_registry()->check_and_register
1534 ( $mid_seq_region_id, $mid_start, $mid_end );
1535 }
1536
1537 my $arr = [ $mid_seq_region_id, $mid_seq_region,
1538 $mid_cs_id, $mid_length ];
1539
1540 $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
1541 $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
1542
1543 $arr = [ $start_seq_region_id, $start_seq_region,
1544 $start_cs_id, $start_length ];
1545
1546 $self->{'sr_name_cache'}->{"$start_seq_region:$start_cs_id"} = $arr;
1547 $self->{'sr_id_cache'}->{"$start_seq_region_id"} = $arr;
1548 }
1549
1550 if( ! defined $mid_cs ) {
1551 # thats it for the simple case
1552 return;
1553 }
1554
1555
1556 @path = @{ $csa->get_mapping_path( $last_cs, $mid_cs ) };
1557 if ( defined($mid_cs) ) {
1558 if ( !defined( $path[1] ) ) {
1559 splice( @path, 1, 1 );
1560 }
1561 }
1562
1563 if ( @path != 2 ) {
1564 my $path =
1565 join( ',', map( { $_->name . ' ' . $_->version } @path ) );
1566 my $len = scalar(@path) - 1;
1567 throw( "Unexpected mapping path between intermediate and last "
1568 . "coord systems ("
1569 . $last_cs->name . " "
1570 . $last_cs->version . " and "
1571 . $mid_cs->name . " "
1572 . $mid_cs->version . ")."
1573 . "\nExpected path length 1, got $len. "
1574 . "(path=$path)" );
1575 }
1576
1577 ($asm_cs,$cmp_cs) = @path;
1578
1579 $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
1580 $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
1581 $sth->execute();
1582
1583
1584 my ($end_start, $end_end, $end_seq_region_id, $end_seq_region,
1585 $end_length);
1586
1587 if($asm_cs->equals($mid_cs)) {
1588 $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
1589 \$end_seq_region, \$end_length, \$ori,
1590 \$mid_start, \$mid_end, \$mid_seq_region_id,
1591 \$mid_seq_region, \$mid_length);
1592 } else {
1593 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
1594 \$mid_seq_region, \$mid_length, \$ori, \$end_start,
1595 \$end_end, \$end_seq_region_id, \$end_seq_region,
1596 \$end_length);
1597 }
1598
1599 my $end_cs_id = $last_cs->dbID();
1600 $reg = $casm_mapper->last_registry();
1601
1602 while($sth->fetch()) {
1603 $end_mid_mapper->add_map_coordinates
1604 (
1605 $end_seq_region_id, $end_start, $end_end, $ori,
1606 $mid_seq_region_id, $mid_start, $mid_end
1607 );
1608
1609 $reg->check_and_register( $end_seq_region_id, 1, $end_length );
1610
1611 my $arr = [ $end_seq_region_id, $end_seq_region,
1612 $end_cs_id, $end_length ];
1613 $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
1614 $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
1615 }
1616
1617 _build_combined_mapper( \@ranges, $start_mid_mapper, $end_mid_mapper,
1618 $combined_mapper, "first" );
1619
1620 return;
1621 }
1622
1623
1624
1625 # after both halves of a chained mapper are loaded
1626 # this function maps all ranges in $ranges and loads the
1627 # results into the combined mapper
1628 sub _build_combined_mapper {
1629 my $ranges = shift;
1630 my $start_mid_mapper = shift;
1631 my $end_mid_mapper = shift;
1632 my $combined_mapper = shift;
1633 my $start_name = shift;
1634
1635 my $mid_name = "middle";
1636
1637 foreach my $range (@$ranges) {
1638 my ( $seq_region_id, $start, $end) = @$range;
1639
1640 my $sum = 0;
1641
1642 my @initial_coords = $start_mid_mapper->map_coordinates($seq_region_id,
1643 $start,$end,1,
1644 $start_name);
1645
1646 foreach my $icoord (@initial_coords) {
1647 #skip gaps
1648 if($icoord->isa('Bio::EnsEMBL::Mapper::Gap')) {
1649 $sum += $icoord->length();
1650 next;
1651 }
1652
1653
1654 #feed the results of the first mapping into the second mapper
1655 my @final_coords =
1656 $end_mid_mapper->map_coordinates($icoord->id, $icoord->start,
1657 $icoord->end,
1658 $icoord->strand, $mid_name);
1659
1660
1661 foreach my $fcoord (@final_coords) {
1662 #load up the final mapper
1663
1664 if($fcoord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
1665 my $total_start = $start + $sum;
1666 my $total_end = $total_start + $fcoord->length - 1;
1667 my $ori = $fcoord->strand();
1668
1669 if($start_name eq 'first') { # add coords in consistant order
1670 $combined_mapper->add_map_coordinates(
1671 $seq_region_id, $total_start, $total_end, $ori,
1672 $fcoord->id(), $fcoord->start(), $fcoord->end());
1673 } else {
1674 $combined_mapper->add_map_coordinates(
1675 $fcoord->id(), $fcoord->start(), $fcoord->end(),$ori,
1676 $seq_region_id, $total_start, $total_end);
1677 }
1678
1679 }
1680 $sum += $fcoord->length();
1681 }
1682 }
1683 }
1684 #all done!
1685 }
1686
1687
1688 =head2 seq_regions_to_ids
1689
1690 Arg [1] : Bio::EnsEMBL::CoordSystem $coord_system
1691 Arg [2] : listref of strings $seq_regions
1692 Example : my @ids = @{$asma->seq_regions_to_ids($coord_sys, \@seq_regs)};
1693 Description: Converts a list of seq_region names to internal identifiers
1694 using the internal cache that has accumulated while registering
1695 regions for AssemblyMappers. If any requested regions are
1696 not found in the cache an attempt is made to retrieve them
1697 from the database.
1698 Returntype : listref of ints
1699 Exceptions : throw if a non-existant seqregion is provided
1700 Caller : general
1701 Status : Stable
1702
1703 =cut
1704
1705 sub seq_regions_to_ids {
1706 my $self = shift;
1707 my $coord_system = shift;
1708 my $seq_regions = shift;
1709
1710 my $cs_id = $coord_system->dbID();
1711
1712 my @out;
1713
1714 foreach my $sr (@$seq_regions) {
1715 my $arr = $self->{'sr_name_cache'}->{"$sr:$cs_id"};
1716 if( $arr ) {
1717 push( @out, $arr->[0] );
1718 } else {
1719 push @out, $self->_seq_region_name_to_id($sr,$cs_id);
1720 }
1721 }
1722
1723 return \@out;
1724 }
1725
1726
1727 =head2 seq_ids_to_regions
1728
1729 Arg [1] : listref of seq_region ids
1730 Example : my @ids = @{$asma->ids_to_seq_regions(\@seq_ids)};
1731 Description: Converts a list of seq_region ids to seq region names
1732 using the internal cache that has accumulated while registering
1733 regions for AssemblyMappers. If any requested regions are
1734 not found in the cache an attempt is made to retrieve them
1735 from the database.
1736 Returntype : listref of strings
1737 Exceptions : throw if a non-existant seq_region_id is provided
1738 Caller : general
1739 Status : Stable
1740
1741 =cut
1742
1743 sub seq_ids_to_regions {
1744 my $self = shift;
1745 my $seq_region_ids = shift;
1746
1747 my @out;
1748
1749 foreach my $sr (@$seq_region_ids) {
1750 my $arr = $self->{'sr_id_cache'}->{"$sr"};
1751 if( $arr ) {
1752 push( @out, $arr->[1] );
1753 } else {
1754 push @out, $self->_seq_region_id_to_name($sr);
1755 }
1756 }
1757
1758 return \@out;
1759 }
1760
1761 =head2 delete_cache
1762
1763 Description: Delete all the caches for the mappings/seq_regions
1764 Returntype : none
1765 Exceptions : none
1766 Caller : General
1767 Status : At risk
1768
1769 =cut
1770
1771 sub delete_cache{
1772 my ($self) = @_;
1773
1774 %{$self->{'sr_name_cache'}} = ();
1775 %{$self->{'sr_id_cache'}} = ();
1776
1777 foreach my $key (keys %{$self->{'_asm_mapper_cache'}}){
1778 $self->{'_asm_mapper_cache'}->{$key}->flush();
1779 }
1780 %{$self->{'_asm_mapper_cache'}} = ();
1781 return;
1782 }
1783
1784
1785 =head2 register_region
1786
1787 Description: DEPRECATED use register_assembled instead
1788
1789 =cut
1790
1791 sub register_region{
1792 my ($self, $assmapper, $type, $chr_name, $start, $end) = @_;
1793
1794 deprecate('Use register_assembled instead');
1795
1796 $self->register_assembled($assmapper, $chr_name, $start, $end);
1797 }
1798
1799
1800 =head2 register_contig
1801
1802 Description: DEPRECATED use register_component instead
1803
1804 =cut
1805
1806 sub register_contig {
1807 my ($self, $assmapper, $type, $contig_id ) = @_;
1808
1809 deprecate('Use register_component instead');
1810
1811 #not sure if the use is passing in a seq_region_name or a
1812 #seq_region_id...
1813 register_component($assmapper, $contig_id);
1814 }
1815
1816
1817 =head2 fetch_by_type
1818
1819 Description: DEPRECATED use fetch_by_CoordSystems instead
1820
1821 =cut
1822
1823 sub fetch_by_type{
1824 my ($self,$type) = @_;
1825
1826 deprecate('Use fetch_by_CoordSystems instead');
1827
1828 #assume that what the user wanted was a mapper between the sequence coord
1829 #level and the top coord level
1830
1831 my $csa = $self->db()->get_CoordSystemAdaptor();
1832
1833 my $cs1 = $csa->fetch_top_level($type);
1834 my $cs2 = $csa->fetch_sequence_level();
1835
1836 return $self->fetch_by_CoordSystems($cs1,$cs2);
1837 }
1838
1839
1840
1841 1;