0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 use Bio::EnsEMBL::Registry;
|
|
28
|
|
29 Bio::EnsEMBL::Registry->load_registry_from_db(
|
|
30 -host => 'ensembldb.ensembl.org',
|
|
31 -user => 'anonymous'
|
|
32 );
|
|
33
|
|
34 $asma = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
|
|
35 "assemblymapper" );
|
|
36
|
|
37 $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core",
|
|
38 "coordsystem" );
|
|
39
|
|
40 my $chr33_cs = $csa->fetch_by_name( 'chromosome', 'NCBI33' );
|
|
41 my $chr34_cs = $csa->fetch_by_name( 'chromosome', 'NCBI34' );
|
|
42 my $ctg_cs = $csa->fetch_by_name('contig');
|
|
43 my $clone_cs = $csa->fetch_by_name('clone');
|
|
44
|
|
45 my $chr_ctg_mapper =
|
|
46 $asma->fetch_by_CoordSystems( $chr33_cs, $ctg_cs );
|
|
47
|
|
48 my $ncbi33_ncbi34_mapper =
|
|
49 $asm_adptr->fetch_by_CoordSystems( $chr33, $chr34 );
|
|
50
|
|
51 my $ctg_clone_mapper =
|
|
52 $asm_adptr->fetch_by_CoordSystems( $ctg_cs, $clone_cs );
|
|
53
|
|
54
|
|
55 =head1 DESCRIPTION
|
|
56
|
|
57 Adaptor for handling Assembly mappers. This is a I<Singleton> class.
|
|
58 ie: There is only one per database (C<DBAdaptor>).
|
|
59
|
|
60 This is used to retrieve mappers between any two coordinate systems
|
|
61 whose makeup is described by the assembly table. Currently one step
|
|
62 (explicit) and two step (implicit) pairwise mapping is supported. In
|
|
63 one-step mapping an explicit relationship between the coordinate systems
|
|
64 is defined in the assembly table. In two-step 'chained' mapping no
|
|
65 explicit mapping is present but the coordinate systems must share a
|
|
66 common mapping to an intermediate coordinate system.
|
|
67
|
|
68 =head1 METHODS
|
|
69
|
|
70 =cut
|
|
71
|
|
72 package Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor;
|
|
73 use vars qw(@ISA);
|
|
74 use strict;
|
|
75
|
|
76 use Bio::EnsEMBL::DBSQL::BaseAdaptor;
|
|
77 use Bio::EnsEMBL::AssemblyMapper;
|
|
78 use Bio::EnsEMBL::ChainedAssemblyMapper;
|
|
79 use Bio::EnsEMBL::TopLevelAssemblyMapper;
|
|
80
|
|
81 use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache
|
|
82 use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump);
|
|
83 #use Bio::EnsEMBL::Utils::Exception qw(deprecate throw);
|
|
84 use Bio::EnsEMBL::Utils::SeqRegionCache;
|
|
85
|
|
86 use integer; #do proper arithmetic bitshifts
|
|
87
|
|
88 @ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor);
|
|
89
|
|
90
|
|
91 my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6
|
|
92
|
|
93 =head2 new
|
|
94
|
|
95 Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for
|
|
96 the database this assembly mapper is using.
|
|
97 Example : my $asma = new Bio::EnsEMBL::AssemblyMapperAdaptor($dbadaptor);
|
|
98 Description: Creates a new AssemblyMapperAdaptor object
|
|
99 Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor
|
|
100 Exceptions : none
|
|
101 Caller : Bio::EnsEMBL::DBSQL::DBAdaptor
|
|
102 Status : Stable
|
|
103
|
|
104 =cut
|
|
105
|
|
106 sub new {
|
|
107 my($class, $dbadaptor) = @_;
|
|
108
|
|
109 my $self = $class->SUPER::new($dbadaptor);
|
|
110
|
|
111 $self->{'_asm_mapper_cache'} = {};
|
|
112
|
|
113 # use a shared cache (for this database) that contains info about
|
|
114 # seq regions
|
|
115 my $seq_region_cache = $self->db->get_SeqRegionCache();
|
|
116 $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'};
|
|
117 $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'};
|
|
118
|
|
119 return $self;
|
|
120 }
|
|
121
|
|
122
|
|
123
|
|
124 =head2 cache_seq_ids_with_mult_assemblys
|
|
125
|
|
126 Example : $self->adaptor->cache_seq_ids_with_mult_assemblys();
|
|
127 Description: Creates a hash of the component seq region ids that
|
|
128 map to more than one assembly from the assembly table.
|
|
129 Retruntype : none
|
|
130 Exceptions : none
|
|
131 Caller : AssemblyMapper, ChainedAssemblyMapper
|
|
132 Status : At Risk
|
|
133
|
|
134 =cut
|
|
135
|
|
136 sub cache_seq_ids_with_mult_assemblys{
|
|
137 my $self = shift;
|
|
138 my %multis;
|
|
139
|
|
140 return if (defined($self->{'multi_seq_ids'}));
|
|
141
|
|
142 $self->{'multi_seq_ids'} = {};
|
|
143
|
|
144 my $sql = qq(
|
|
145 SELECT sra.seq_region_id
|
|
146 FROM seq_region_attrib sra,
|
|
147 attrib_type at,
|
|
148 seq_region sr,
|
|
149 coord_system cs
|
|
150 WHERE sra.attrib_type_id = at.attrib_type_id
|
|
151 AND code = "MultAssem"
|
|
152 AND sra.seq_region_id = sr.seq_region_id
|
|
153 AND sr.coord_system_id = cs.coord_system_id
|
|
154 AND cs.species_id = ?);
|
|
155
|
|
156 my $sth = $self->prepare($sql);
|
|
157
|
|
158 $sth->bind_param( 1, $self->species_id(), SQL_INTEGER );
|
|
159
|
|
160 $sth->execute();
|
|
161
|
|
162 my ($seq_region_id);
|
|
163
|
|
164 $sth->bind_columns(\$seq_region_id);
|
|
165
|
|
166 while($sth->fetch()) {
|
|
167 $self->{'multi_seq_ids'}->{$seq_region_id} = 1;
|
|
168 }
|
|
169 $sth->finish;
|
|
170 }
|
|
171
|
|
172
|
|
173 =head2 fetch_by_CoordSystems
|
|
174
|
|
175 Arg [1] : Bio::EnsEMBL::CoordSystem $cs1
|
|
176 One of the coordinate systems to retrieve the mapper
|
|
177 between
|
|
178 Arg [2] : Bio::EnsEMBL::CoordSystem $cs2
|
|
179 The other coordinate system to map between
|
|
180 Description: Retrieves an Assembly mapper for two coordinate
|
|
181 systems whose relationship is described in the
|
|
182 assembly table.
|
|
183
|
|
184 The ordering of the coodinate systems is arbitrary.
|
|
185 The following two statements are equivalent:
|
|
186 $mapper = $asma->fetch_by_CoordSystems($cs1,$cs2);
|
|
187 $mapper = $asma->fetch_by_CoordSystems($cs2,$cs1);
|
|
188 Returntype : Bio::EnsEMBL::AssemblyMapper
|
|
189 Exceptions : wrong argument types
|
|
190 Caller : general
|
|
191 Status : Stable
|
|
192
|
|
193 =cut
|
|
194
|
|
195 sub fetch_by_CoordSystems {
|
|
196 my $self = shift;
|
|
197 my $cs1 = shift;
|
|
198 my $cs2 = shift;
|
|
199
|
|
200 if(!ref($cs1) || !$cs1->isa('Bio::EnsEMBL::CoordSystem')) {
|
|
201 throw("cs1 argument must be a Bio::EnsEMBL::CoordSystem.");
|
|
202 }
|
|
203 if(!ref($cs2) || !$cs2->isa('Bio::EnsEMBL::CoordSystem')) {
|
|
204 throw("cs2 argument must be a Bio::EnsEMBL::CoordSystem.");
|
|
205 }
|
|
206
|
|
207 # if($cs1->equals($cs2)) {
|
|
208 # throw("Cannot create mapper between same coord systems: " .
|
|
209 # $cs1->name . " " . $cs1->version);
|
|
210 # }
|
|
211
|
|
212 if($cs1->is_top_level()) {
|
|
213 return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2);
|
|
214 }
|
|
215 if($cs2->is_top_level()) {
|
|
216 return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1);
|
|
217 }
|
|
218
|
|
219 my $csa = $self->db->get_CoordSystemAdaptor();
|
|
220
|
|
221 #retrieve the shortest possible mapping path between these systems
|
|
222 my @mapping_path = @{$csa->get_mapping_path($cs1,$cs2)};
|
|
223
|
|
224 if(!@mapping_path) {
|
|
225
|
|
226 # It is perfectly fine not to have a mapping. No warning needed really
|
|
227 # Just check the return code!!
|
|
228
|
|
229 # warning(
|
|
230 # "There is no mapping defined between these coord systems:\n" .
|
|
231 # $cs1->name() . " " . $cs1->version() . " and " . $cs2->name() . " " .
|
|
232 # $cs2->version()
|
|
233 # );
|
|
234 return undef;
|
|
235 }
|
|
236
|
|
237 my $key = join(':', map({defined($_)?$_->dbID():"-"} @mapping_path));
|
|
238
|
|
239 my $asm_mapper = $self->{'_asm_mapper_cache'}->{$key};
|
|
240
|
|
241 return $asm_mapper if($asm_mapper);
|
|
242
|
|
243 if(@mapping_path == 1) {
|
|
244 throw("Incorrect mapping path defined in meta table. " .
|
|
245 "0 step mapping encountered between:\n" .
|
|
246 $cs1->name() . " " . $cs1->version() . " and " . $cs2->name . " " .
|
|
247 $cs2->version());
|
|
248 }
|
|
249
|
|
250 if(@mapping_path == 2) {
|
|
251 #1 step regular mapping
|
|
252 $asm_mapper = Bio::EnsEMBL::AssemblyMapper->new($self, @mapping_path);
|
|
253
|
|
254 # If you want multiple pieces on two seqRegions to map to each other
|
|
255 # you need to make an assembly.mapping entry that is seperated with a #
|
|
256 # instead of an |.
|
|
257
|
|
258 $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
|
|
259 return $asm_mapper;
|
|
260 }
|
|
261
|
|
262 if(@mapping_path == 3) {
|
|
263 #two step chained mapping
|
|
264 $asm_mapper = Bio::EnsEMBL::ChainedAssemblyMapper->new($self,@mapping_path);
|
|
265 #in multi-step mapping it is possible get requests with the
|
|
266 #coordinate system ordering reversed since both mappings directions
|
|
267 #cache on both orderings just in case
|
|
268 #e.g. chr <-> contig <-> clone and clone <-> contig <-> chr
|
|
269
|
|
270 $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
|
|
271 $key = join(':', map({defined($_)?$_->dbID():"-"} reverse(@mapping_path)));
|
|
272 $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper;
|
|
273 return $asm_mapper;
|
|
274 }
|
|
275
|
|
276 throw("Only 1 and 2 step coordinate system mapping is currently\n" .
|
|
277 "supported. Mapping between " .
|
|
278 $cs1->name() . " " . $cs1->version() . " and " .
|
|
279 $cs2->name() . " " . $cs2->version() .
|
|
280 " requires ". (scalar(@mapping_path)-1) . " steps.");
|
|
281 }
|
|
282
|
|
283
|
|
284
|
|
285 =head2 register_assembled
|
|
286
|
|
287 Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
|
|
288 A valid AssemblyMapper object
|
|
289 Arg [2] : integer $asm_seq_region
|
|
290 The dbID of the seq_region to be registered
|
|
291 Arg [3] : int $asm_start
|
|
292 The start of the region to be registered
|
|
293 Arg [4] : int $asm_end
|
|
294 The end of the region to be registered
|
|
295 Description: Declares an assembled region to the AssemblyMapper.
|
|
296 This extracts the relevant data from the assembly
|
|
297 table and stores it in Mapper internal to the $asm_mapper.
|
|
298 It therefore must be called before any mapping is
|
|
299 attempted on that region. Otherwise only gaps will
|
|
300 be returned. Note that the AssemblyMapper automatically
|
|
301 calls this method when the need arises.
|
|
302 Returntype : none
|
|
303 Exceptions : throw if the seq_region to be registered does not exist
|
|
304 or if it associated with multiple assembled pieces (bad data
|
|
305 in assembly table)
|
|
306 Caller : Bio::EnsEMBL::AssemblyMapper
|
|
307 Status : Stable
|
|
308
|
|
309 =cut
|
|
310
|
|
311
|
|
312 sub register_assembled {
|
|
313 my $self = shift;
|
|
314 my $asm_mapper = shift;
|
|
315 my $asm_seq_region = shift;
|
|
316 my $asm_start = shift;
|
|
317 my $asm_end = shift;
|
|
318
|
|
319 if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
|
|
320 throw("Bio::EnsEMBL::AssemblyMapper argument expected");
|
|
321 }
|
|
322
|
|
323 throw("asm_seq_region argument expected") if(!defined($asm_seq_region));
|
|
324 throw("asm_start argument expected") if(!defined($asm_start));
|
|
325 throw("asm_end argument expected") if(!defined($asm_end));
|
|
326
|
|
327 my $asm_cs_id = $asm_mapper->assembled_CoordSystem->dbID();
|
|
328 my $cmp_cs_id = $asm_mapper->component_CoordSystem->dbID();
|
|
329
|
|
330 #split up the region to be registered into fixed chunks
|
|
331 #this allows us to keep track of regions that have already been
|
|
332 #registered and also works under the assumption that if a small region
|
|
333 #is requested it is likely that other requests will be made in the
|
|
334 #vicinity (the minimum size registered the chunksize (2^chunkfactor)
|
|
335
|
|
336 my @chunk_regions;
|
|
337 #determine span of chunks
|
|
338 #bitwise shift right is fast and easy integer division
|
|
339
|
|
340 my($start_chunk, $end_chunk);
|
|
341
|
|
342 $start_chunk = $asm_start >> $CHUNKFACTOR;
|
|
343 $end_chunk = $asm_end >> $CHUNKFACTOR;
|
|
344
|
|
345 # inserts have start = end + 1, on boundary condition start_chunk
|
|
346 # could be less than end chunk
|
|
347 if($asm_start == $asm_end + 1) {
|
|
348 ($start_chunk, $end_chunk) = ($end_chunk, $start_chunk);
|
|
349 }
|
|
350
|
|
351 #find regions of continuous unregistered chunks
|
|
352 my $i;
|
|
353 my ($begin_chunk_region,$end_chunk_region);
|
|
354 for ($i = $start_chunk; $i <= $end_chunk; $i++) {
|
|
355 if($asm_mapper->have_registered_assembled($asm_seq_region, $i)) {
|
|
356 if(defined($begin_chunk_region)) {
|
|
357 #this is the end of an unregistered region.
|
|
358 my $region = [($begin_chunk_region << $CHUNKFACTOR),
|
|
359 (($end_chunk_region+1) << $CHUNKFACTOR)-1];
|
|
360 push @chunk_regions, $region;
|
|
361 $begin_chunk_region = $end_chunk_region = undef;
|
|
362 }
|
|
363 } else {
|
|
364 $begin_chunk_region = $i if(!defined($begin_chunk_region));
|
|
365 $end_chunk_region = $i+1;
|
|
366 $asm_mapper->register_assembled($asm_seq_region,$i);
|
|
367 }
|
|
368 }
|
|
369
|
|
370 #the last part may have been an unregistered region too
|
|
371 if(defined($begin_chunk_region)) {
|
|
372 my $region = [($begin_chunk_region << $CHUNKFACTOR),
|
|
373 (($end_chunk_region+1) << $CHUNKFACTOR) -1];
|
|
374 push @chunk_regions, $region;
|
|
375 }
|
|
376
|
|
377 return if(!@chunk_regions);
|
|
378
|
|
379 # keep the Mapper to a reasonable size
|
|
380 if( $asm_mapper->size() > $asm_mapper->max_pair_count() ) {
|
|
381 $asm_mapper->flush();
|
|
382 #we now have to go and register the entire requested region since we
|
|
383 #just flushed everything
|
|
384
|
|
385 @chunk_regions = ( [ ( $start_chunk << $CHUNKFACTOR)
|
|
386 , (($end_chunk+1) << $CHUNKFACTOR)-1 ] );
|
|
387
|
|
388 for( my $i = $start_chunk; $i <= $end_chunk; $i++ ) {
|
|
389 $asm_mapper->register_assembled( $asm_seq_region, $i );
|
|
390 }
|
|
391 }
|
|
392
|
|
393 # my $asm_seq_region_id =
|
|
394 # $self->_seq_region_name_to_id($asm_seq_region,$asm_cs_id);
|
|
395
|
|
396 # Retrieve the description of how the assembled region is made from
|
|
397 # component regions for each of the continuous blocks of unregistered,
|
|
398 # chunked regions
|
|
399
|
|
400 my $q = qq{
|
|
401 SELECT
|
|
402 asm.cmp_start,
|
|
403 asm.cmp_end,
|
|
404 asm.cmp_seq_region_id,
|
|
405 sr.name,
|
|
406 sr.length,
|
|
407 asm.ori,
|
|
408 asm.asm_start,
|
|
409 asm.asm_end
|
|
410 FROM
|
|
411 assembly asm, seq_region sr
|
|
412 WHERE
|
|
413 asm.asm_seq_region_id = ? AND
|
|
414 ? <= asm.asm_end AND
|
|
415 ? >= asm.asm_start AND
|
|
416 asm.cmp_seq_region_id = sr.seq_region_id AND
|
|
417 sr.coord_system_id = ?
|
|
418 };
|
|
419
|
|
420 my $sth = $self->prepare($q);
|
|
421
|
|
422 foreach my $region (@chunk_regions) {
|
|
423 my($region_start, $region_end) = @$region;
|
|
424 $sth->bind_param(1,$asm_seq_region,SQL_INTEGER);
|
|
425 $sth->bind_param(2,$region_start,SQL_INTEGER);
|
|
426 $sth->bind_param(3,$region_end,SQL_INTEGER);
|
|
427 $sth->bind_param(4,$cmp_cs_id,SQL_INTEGER);
|
|
428
|
|
429 $sth->execute();
|
|
430
|
|
431 my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori,
|
|
432 $asm_start, $asm_end, $cmp_seq_region_length);
|
|
433
|
|
434 $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
|
|
435 \$cmp_seq_region, \$cmp_seq_region_length, \$ori,
|
|
436 \$asm_start, \$asm_end);
|
|
437
|
|
438 #
|
|
439 # Load the unregistered regions of the mapper
|
|
440 #
|
|
441 while($sth->fetch()) {
|
|
442 next if($asm_mapper->have_registered_component($cmp_seq_region_id)
|
|
443 and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region_id}));
|
|
444 $asm_mapper->register_component($cmp_seq_region_id);
|
|
445 $asm_mapper->mapper->add_map_coordinates(
|
|
446 $asm_seq_region, $asm_start, $asm_end,
|
|
447 $ori,
|
|
448 $cmp_seq_region_id, $cmp_start, $cmp_end);
|
|
449
|
|
450 my $arr = [ $cmp_seq_region_id, $cmp_seq_region,
|
|
451 $cmp_cs_id, $cmp_seq_region_length ];
|
|
452
|
|
453 $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
|
|
454 $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
|
|
455 }
|
|
456 }
|
|
457
|
|
458 $sth->finish();
|
|
459 }
|
|
460
|
|
461
|
|
462
|
|
463 sub _seq_region_name_to_id {
|
|
464 my $self = shift;
|
|
465 my $sr_name = shift;
|
|
466 my $cs_id = shift;
|
|
467
|
|
468 ($sr_name && $cs_id) || throw('seq_region_name and coord_system_id args ' .
|
|
469 'are required');
|
|
470
|
|
471 my $arr = $self->{'sr_name_cache'}->{"$sr_name:$cs_id"};
|
|
472 if( $arr ) {
|
|
473 return $arr->[0];
|
|
474 }
|
|
475
|
|
476 # Get the seq_region_id via the name. This would be quicker if we just
|
|
477 # used internal ids instead but stored but then we lose the ability
|
|
478 # the transform accross databases with different internal ids
|
|
479
|
|
480 my $sth = $self->prepare("SELECT seq_region_id, length " .
|
|
481 "FROM seq_region " .
|
|
482 "WHERE name = ? AND coord_system_id = ?");
|
|
483
|
|
484 $sth->bind_param(1,$sr_name,SQL_VARCHAR);
|
|
485 $sth->bind_param(2,$cs_id,SQL_INTEGER);
|
|
486 $sth->execute();
|
|
487
|
|
488 if(!$sth->rows() == 1) {
|
|
489 throw("Ambiguous or non-existant seq_region [$sr_name] " .
|
|
490 "in coord system $cs_id");
|
|
491 }
|
|
492
|
|
493 my ($sr_id, $sr_length) = $sth->fetchrow_array();
|
|
494 $sth->finish();
|
|
495
|
|
496 $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
|
|
497
|
|
498 $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
|
|
499 $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
|
|
500
|
|
501 return $sr_id;
|
|
502 }
|
|
503
|
|
504 sub _seq_region_id_to_name {
|
|
505 my $self = shift;
|
|
506 my $sr_id = shift;
|
|
507
|
|
508 ($sr_id) || throw('seq_region_is required');
|
|
509
|
|
510 my $arr = $self->{'sr_id_cache'}->{"$sr_id"};
|
|
511 if( $arr ) {
|
|
512 return $arr->[1];
|
|
513 }
|
|
514
|
|
515 # Get the seq_region name via the id. This would be quicker if we just
|
|
516 # used internal ids instead but stored but then we lose the ability
|
|
517 # the transform accross databases with different internal ids
|
|
518
|
|
519 my $sth = $self->prepare("SELECT name, length ,coord_system_id " .
|
|
520 "FROM seq_region " .
|
|
521 "WHERE seq_region_id = ? ");
|
|
522
|
|
523 $sth->bind_param(1,$sr_id,SQL_INTEGER);
|
|
524 $sth->execute();
|
|
525
|
|
526 if(!$sth->rows() == 1) {
|
|
527 throw("non-existant seq_region [$sr_id]");
|
|
528 }
|
|
529
|
|
530 my ($sr_name, $sr_length, $cs_id) = $sth->fetchrow_array();
|
|
531 $sth->finish();
|
|
532
|
|
533 $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ];
|
|
534
|
|
535 $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr;
|
|
536 $self->{'sr_id_cache'}->{"$sr_id"} = $arr;
|
|
537
|
|
538 return $sr_name;
|
|
539 }
|
|
540
|
|
541
|
|
542 =head2 register_component
|
|
543
|
|
544 Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper
|
|
545 A valid AssemblyMapper object
|
|
546 Arg [2] : integer $cmp_seq_region
|
|
547 The dbID of the seq_region to be registered
|
|
548 Description: Declares a component region to the AssemblyMapper.
|
|
549 This extracts the relevant data from the assembly
|
|
550 table and stores it in Mapper internal to the $asm_mapper.
|
|
551 It therefore must be called before any mapping is
|
|
552 attempted on that region. Otherwise only gaps will
|
|
553 be returned. Note that the AssemblyMapper automatically
|
|
554 calls this method when the need arises.
|
|
555 Returntype : none
|
|
556 Exceptions : throw if the seq_region to be registered does not exist
|
|
557 or if it associated with multiple assembled pieces (bad data
|
|
558 in assembly table)
|
|
559 Caller : Bio::EnsEMBL::AssemblyMapper
|
|
560 Status : Stable
|
|
561
|
|
562 =cut
|
|
563
|
|
564 sub register_component {
|
|
565 my $self = shift;
|
|
566 my $asm_mapper = shift;
|
|
567 my $cmp_seq_region = shift;
|
|
568
|
|
569 if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) {
|
|
570 throw("Bio::EnsEMBL::AssemblyMapper argument expected");
|
|
571 }
|
|
572
|
|
573 if(!defined($cmp_seq_region)) {
|
|
574 throw("cmp_seq_region argument expected");
|
|
575 }
|
|
576
|
|
577 my $cmp_cs_id = $asm_mapper->component_CoordSystem()->dbID();
|
|
578 my $asm_cs_id = $asm_mapper->assembled_CoordSystem()->dbID();
|
|
579
|
|
580 #do nothing if this region is already registered or special case
|
|
581 return if($asm_mapper->have_registered_component($cmp_seq_region)
|
|
582 and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region}));
|
|
583
|
|
584 # my $cmp_seq_region_id =
|
|
585 # $self->_seq_region_name_to_id($cmp_seq_region, $cmp_cs_id);
|
|
586
|
|
587 # Determine what part of the assembled region this component region makes up
|
|
588
|
|
589 my $q = qq{
|
|
590 SELECT
|
|
591 asm.asm_start,
|
|
592 asm.asm_end,
|
|
593 asm.asm_seq_region_id,
|
|
594 sr.name,
|
|
595 sr.length
|
|
596 FROM
|
|
597 assembly asm, seq_region sr
|
|
598 WHERE
|
|
599 asm.cmp_seq_region_id = ? AND
|
|
600 asm.asm_seq_region_id = sr.seq_region_id AND
|
|
601 sr.coord_system_id = ?
|
|
602 };
|
|
603
|
|
604 my $sth = $self->prepare($q);
|
|
605 $sth->bind_param(1,$cmp_seq_region,SQL_INTEGER);
|
|
606 $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
|
|
607 $sth->execute();
|
|
608
|
|
609 if($sth->rows() == 0) {
|
|
610 #this component is not used in the assembled part i.e. gap
|
|
611 $asm_mapper->register_component($cmp_seq_region);
|
|
612 return;
|
|
613 }
|
|
614
|
|
615 #we do not currently support components mapping to multiple assembled
|
|
616 # make sure that you've got the correct mapping in the meta-table :
|
|
617 # chromosome:EquCab2#contig ( use'#' for multiple mappings )
|
|
618 # chromosome:EquCab2|contig ( use '|' delimiter for 1-1 mappings )
|
|
619 #
|
|
620 if($sth->rows() != 1) {
|
|
621 throw("Multiple assembled regions for single " .
|
|
622 "component region cmp_seq_region_id=[$cmp_seq_region]\n".
|
|
623 "Remember that multiple mappings use the \#-operaator".
|
|
624 " in the meta-table (i.e. chromosome:EquCab2\#contig\n");
|
|
625 }
|
|
626
|
|
627 my ($asm_start, $asm_end, $asm_seq_region_id,
|
|
628 $asm_seq_region, $asm_seq_region_length) = $sth->fetchrow_array();
|
|
629
|
|
630 my $arr = [ $asm_seq_region_id, $asm_seq_region,
|
|
631 $asm_cs_id, $asm_seq_region_length ];
|
|
632
|
|
633 $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
|
|
634 $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
|
|
635
|
|
636 $sth->finish();
|
|
637
|
|
638 # Register the corresponding assembled region. This allows a us to
|
|
639 # register things in assembled chunks which allows us to:
|
|
640 # (1) Keep track of what assembled regions are registered
|
|
641 # (2) Use locality of reference (if they want something in same general
|
|
642 # region it will already be registered).
|
|
643
|
|
644 $self->register_assembled($asm_mapper,$asm_seq_region_id,$asm_start,$asm_end);
|
|
645 }
|
|
646
|
|
647
|
|
648
|
|
649 =head2 register_chained
|
|
650
|
|
651 Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
|
|
652 The chained assembly mapper to register regions on
|
|
653 Arg [2] : string $from ('first' or 'last')
|
|
654 The direction we are registering from, and the name of the
|
|
655 internal mapper.
|
|
656 Arg [3] : string $seq_region_name
|
|
657 The name of the seqregion we are registering on
|
|
658 Arg [4] : listref $ranges
|
|
659 A list of ranges to register (in [$start,$end] tuples).
|
|
660 Arg [5] : (optional) $to_slice
|
|
661 Only register those on this Slice.
|
|
662 Description: Registers a set of ranges on a chained assembly mapper.
|
|
663 This function is at the heart of the chained mapping process.
|
|
664 It retrieves information from the assembly table and
|
|
665 dynamically constructs the mappings between two coordinate
|
|
666 systems which are 2 mapping steps apart. It does this by using
|
|
667 two internal mappers to load up a third mapper which is
|
|
668 actually used by the ChainedAssemblyMapper to perform the
|
|
669 mapping.
|
|
670
|
|
671 This method must be called before any mapping is
|
|
672 attempted on regions of interest, otherwise only gaps will
|
|
673 be returned. Note that the ChainedAssemblyMapper automatically
|
|
674 calls this method when the need arises.
|
|
675 Returntype : none
|
|
676 Exceptions : throw if the seq_region to be registered does not exist
|
|
677 or if it associated with multiple assembled pieces (bad data
|
|
678 in assembly table)
|
|
679
|
|
680 throw if the mapping between the coordinate systems cannot
|
|
681 be performed in two steps, which means there is an internal
|
|
682 error in the data in the meta table or in the code that creates
|
|
683 the mapping paths.
|
|
684 Caller : Bio::EnsEMBL::AssemblyMapper
|
|
685 Status : Stable
|
|
686
|
|
687 =cut
|
|
688
|
|
689 sub register_chained {
|
|
690 my $self = shift;
|
|
691 my $casm_mapper = shift;
|
|
692 my $from = shift;
|
|
693 my $seq_region_id = shift;
|
|
694 my $ranges = shift;
|
|
695 my $to_slice = shift;
|
|
696
|
|
697 my $to_seq_region_id;
|
|
698 if(defined($to_slice)){
|
|
699 if($casm_mapper->first_CoordSystem()->equals($casm_mapper->last_CoordSystem())){
|
|
700 return $self->_register_chained_special($casm_mapper, $from, $seq_region_id, $ranges, $to_slice);
|
|
701 }
|
|
702 $to_seq_region_id = $to_slice->get_seq_region_id();
|
|
703 if(!defined($to_seq_region_id)){
|
|
704 die "Could not get seq_region_id for to_slice".$to_slice->seq_region_name."\n";
|
|
705 }
|
|
706 }
|
|
707
|
|
708 my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
|
|
709 $end_name, $end_mid_mapper, $end_cs, $end_registry);
|
|
710
|
|
711 if($from eq 'first') {
|
|
712 $start_name = 'first';
|
|
713 $start_mid_mapper = $casm_mapper->first_middle_mapper();
|
|
714 $start_cs = $casm_mapper->first_CoordSystem();
|
|
715 $start_registry = $casm_mapper->first_registry();
|
|
716 $end_mid_mapper = $casm_mapper->last_middle_mapper();
|
|
717 $end_cs = $casm_mapper->last_CoordSystem();
|
|
718 $end_registry = $casm_mapper->last_registry();
|
|
719 $end_name = 'last';
|
|
720 } elsif($from eq 'last') {
|
|
721 $start_name = 'last';
|
|
722 $start_mid_mapper = $casm_mapper->last_middle_mapper();
|
|
723 $start_cs = $casm_mapper->last_CoordSystem();
|
|
724 $start_registry = $casm_mapper->last_registry();
|
|
725 $end_mid_mapper = $casm_mapper->first_middle_mapper();
|
|
726 $end_cs = $casm_mapper->first_CoordSystem();
|
|
727 $end_registry = $casm_mapper->first_registry();
|
|
728 $end_name = 'first';
|
|
729 } else {
|
|
730 throw("Invalid from argument: [$from], must be 'first' or 'last'");
|
|
731 }
|
|
732
|
|
733 my $combined_mapper = $casm_mapper->first_last_mapper();
|
|
734 my $mid_cs = $casm_mapper->middle_CoordSystem();
|
|
735 my $mid_name = 'middle';
|
|
736 my $csa = $self->db->get_CoordSystemAdaptor();
|
|
737
|
|
738 # Check for the simple case where the ChainedMapper is short
|
|
739 if( ! defined $mid_cs ) {
|
|
740 $start_mid_mapper = $combined_mapper;
|
|
741 }
|
|
742
|
|
743
|
|
744 ##############
|
|
745 # obtain the first half of the mappings and load them into the start mapper
|
|
746 #
|
|
747
|
|
748 #ascertain which is component and which is actually assembled coord system
|
|
749 my @path;
|
|
750
|
|
751 # check for the simple case, where the ChainedMapper is short
|
|
752 if( defined $mid_cs ) {
|
|
753 @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
|
|
754 } else {
|
|
755 @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
|
|
756 }
|
|
757
|
|
758 if(@path != 2 && defined( $path[1] )) {
|
|
759 my $path = join(',', map({$_->name .' '. $_->version} @path));
|
|
760 my $len = scalar(@path) - 1;
|
|
761 throw("Unexpected mapping path between start and intermediate " .
|
|
762 "coord systems (". $start_cs->name . " " . $start_cs->version .
|
|
763 " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
|
|
764 "\nExpected path length 1, got $len. " .
|
|
765 "(path=$path)");
|
|
766 }
|
|
767
|
|
768 my $sth;
|
|
769 my ($asm_cs,$cmp_cs);
|
|
770 $asm_cs = $path[0];
|
|
771 $cmp_cs = $path[-1];
|
|
772
|
|
773 #the SQL varies depending on whether we are coming from assembled or
|
|
774 #component coordinate system
|
|
775
|
|
776 my $asm2cmp = (<<ASMCMP);
|
|
777 SELECT
|
|
778 asm.cmp_start,
|
|
779 asm.cmp_end,
|
|
780 asm.cmp_seq_region_id,
|
|
781 sr.name,
|
|
782 sr.length,
|
|
783 asm.ori,
|
|
784 asm.asm_start,
|
|
785 asm.asm_end
|
|
786 FROM
|
|
787 assembly asm, seq_region sr
|
|
788 WHERE
|
|
789 asm.asm_seq_region_id = ? AND
|
|
790 ? <= asm.asm_end AND
|
|
791 ? >= asm.asm_start AND
|
|
792 asm.cmp_seq_region_id = sr.seq_region_id AND
|
|
793 sr.coord_system_id = ?
|
|
794 ASMCMP
|
|
795
|
|
796
|
|
797 my $cmp2asm = (<<CMPASM);
|
|
798 SELECT
|
|
799 asm.asm_start,
|
|
800 asm.asm_end,
|
|
801 asm.asm_seq_region_id,
|
|
802 sr.name,
|
|
803 sr.length,
|
|
804 asm.ori,
|
|
805 asm.cmp_start,
|
|
806 asm.cmp_end
|
|
807 FROM
|
|
808 assembly asm, seq_region sr
|
|
809 WHERE
|
|
810 asm.cmp_seq_region_id = ? AND
|
|
811 ? <= asm.cmp_end AND
|
|
812 ? >= asm.cmp_start AND
|
|
813 asm.asm_seq_region_id = sr.seq_region_id AND
|
|
814 sr.coord_system_id = ?
|
|
815 CMPASM
|
|
816
|
|
817 my $asm2cmp_sth;
|
|
818 my $cmp2asm_sth;
|
|
819 if(defined($to_slice)){
|
|
820 my $to_cs = $to_slice->coord_system;
|
|
821 if($asm_cs->equals($to_cs)){
|
|
822 $asm2cmp_sth = $self->prepare($asm2cmp);
|
|
823 $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
|
|
824 }
|
|
825 elsif($cmp_cs->equals($to_cs)){
|
|
826 $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
|
|
827 $cmp2asm_sth = $self->prepare($cmp2asm);
|
|
828 }
|
|
829 else{
|
|
830 $asm2cmp_sth = $self->prepare($asm2cmp);
|
|
831 $cmp2asm_sth = $self->prepare($cmp2asm);
|
|
832 }
|
|
833 }
|
|
834 else{
|
|
835 $asm2cmp_sth = $self->prepare($asm2cmp);
|
|
836 $cmp2asm_sth = $self->prepare($cmp2asm);
|
|
837 }
|
|
838
|
|
839
|
|
840
|
|
841 $sth = ($asm_cs->equals($start_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
|
|
842
|
|
843 my $mid_cs_id;
|
|
844
|
|
845 # check for the simple case where the ChainedMapper is short
|
|
846 if( defined $mid_cs ) {
|
|
847 $mid_cs_id = $mid_cs->dbID();
|
|
848 } else {
|
|
849 $mid_cs_id = $end_cs->dbID();
|
|
850 }
|
|
851
|
|
852 my @mid_ranges;
|
|
853 my @start_ranges;
|
|
854
|
|
855 #need to perform the query for each unregistered range
|
|
856 foreach my $range (@$ranges) {
|
|
857 my ($start, $end) = @$range;
|
|
858 $sth->bind_param(1,$seq_region_id,SQL_INTEGER);
|
|
859 $sth->bind_param(2,$start,SQL_INTEGER);
|
|
860 $sth->bind_param(3,$end,SQL_INTEGER);
|
|
861 $sth->bind_param(4,$mid_cs_id,SQL_INTEGER);
|
|
862 $sth->execute();
|
|
863
|
|
864 #load the start <-> mid mapper with the results and record the mid cs
|
|
865 #ranges we just added to the mapper
|
|
866
|
|
867 my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
|
|
868 $ori, $start_start, $start_end);
|
|
869
|
|
870 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
|
|
871 \$mid_seq_region, \$mid_length, \$ori, \$start_start,
|
|
872 \$start_end);
|
|
873
|
|
874 while($sth->fetch()) {
|
|
875
|
|
876 if( defined $mid_cs ) {
|
|
877 $start_mid_mapper->add_map_coordinates
|
|
878 (
|
|
879 $seq_region_id,$start_start, $start_end, $ori,
|
|
880 $mid_seq_region_id, $mid_start, $mid_end
|
|
881 );
|
|
882 } else {
|
|
883 if( $from eq "first" ) {
|
|
884 $combined_mapper->add_map_coordinates
|
|
885 (
|
|
886 $seq_region_id,$start_start, $start_end, $ori,
|
|
887 $mid_seq_region_id, $mid_start, $mid_end
|
|
888 );
|
|
889 } else {
|
|
890 $combined_mapper->add_map_coordinates
|
|
891 (
|
|
892 $mid_seq_region_id, $mid_start, $mid_end, $ori,
|
|
893 $seq_region_id,$start_start, $start_end
|
|
894 );
|
|
895 }
|
|
896 }
|
|
897
|
|
898 #update sr_name cache
|
|
899 my $arr = [ $mid_seq_region_id, $mid_seq_region,
|
|
900 $mid_cs_id, $mid_length ];
|
|
901
|
|
902 $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
|
|
903 $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
|
|
904
|
|
905 push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
|
|
906 $mid_start,$mid_end];
|
|
907 push @start_ranges, [ $seq_region_id, $start_start, $start_end ];
|
|
908
|
|
909 #the region that we actually register may actually be larger or smaller
|
|
910 #than the region that we wanted to register.
|
|
911 #register the intersection of the region so we do not end up doing
|
|
912 #extra work later
|
|
913
|
|
914 if($start_start < $start || $start_end > $end) {
|
|
915 $start_registry->check_and_register($seq_region_id,$start_start,
|
|
916 $start_end);
|
|
917 }
|
|
918 }
|
|
919 $sth->finish();
|
|
920 }
|
|
921
|
|
922 # in the one step case, we load the mid ranges in the
|
|
923 # last_registry and we are done
|
|
924 if( ! defined $mid_cs ) {
|
|
925 for my $range ( @mid_ranges ) {
|
|
926 $end_registry->check_and_register( $range->[0], $range->[2],
|
|
927 $range->[3] );
|
|
928 }
|
|
929
|
|
930 # and thats it for the simple case ...
|
|
931 return;
|
|
932 }
|
|
933
|
|
934
|
|
935 ###########
|
|
936 # now the second half of the mapping
|
|
937 # perform another query and load the mid <-> end mapper using the mid cs
|
|
938 # ranges
|
|
939 #
|
|
940
|
|
941 #ascertain which is component and which is actually assembled coord system
|
|
942 @path = @{$csa->get_mapping_path($mid_cs, $end_cs)};
|
|
943 if(@path == 2 || ( @path == 3 && !defined $path[1])) {
|
|
944
|
|
945 $asm_cs = $path[0];
|
|
946 $cmp_cs = $path[-1];
|
|
947 } else {
|
|
948 my $path = join(',', map({$_->name .' '. $_->version} @path));
|
|
949 my $len = scalar(@path)-1;
|
|
950 throw("Unexpected mapping path between intermediate and last" .
|
|
951 "coord systems (". $mid_cs->name . " " . $mid_cs->version .
|
|
952 " and " . $end_cs->name . " " . $end_cs->version . ")." .
|
|
953 "\nExpected path length 1, got $len. " .
|
|
954 "(path=$path)");
|
|
955 }
|
|
956
|
|
957 if(defined($to_slice)){
|
|
958 my $to_cs = $to_slice->coord_system;
|
|
959 if($asm_cs->equals($to_cs)){
|
|
960 $asm2cmp_sth = $self->prepare($asm2cmp);
|
|
961 $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id");
|
|
962 }
|
|
963 elsif($cmp_cs->equals($to_cs)){
|
|
964 $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id");
|
|
965 $cmp2asm_sth = $self->prepare($cmp2asm);
|
|
966 }
|
|
967 else{
|
|
968 $asm2cmp_sth = $self->prepare($asm2cmp);
|
|
969 $cmp2asm_sth = $self->prepare($cmp2asm);
|
|
970 }
|
|
971 }
|
|
972
|
|
973 $sth = ($asm_cs->equals($mid_cs)) ? $asm2cmp_sth : $cmp2asm_sth;
|
|
974
|
|
975 my $end_cs_id = $end_cs->dbID();
|
|
976 foreach my $mid_range (@mid_ranges) {
|
|
977 my ($mid_seq_region_id, $mid_seq_region,$start, $end) = @$mid_range;
|
|
978 $sth->bind_param(1,$mid_seq_region_id,SQL_INTEGER);
|
|
979 $sth->bind_param(2,$start,SQL_INTEGER);
|
|
980 $sth->bind_param(3,$end,SQL_INTEGER);
|
|
981 $sth->bind_param(4,$end_cs_id,SQL_INTEGER);
|
|
982 $sth->execute();
|
|
983
|
|
984 #load the end <-> mid mapper with the results and record the mid cs
|
|
985 #ranges we just added to the mapper
|
|
986
|
|
987 my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length,
|
|
988 $ori, $mid_start, $mid_end);
|
|
989
|
|
990 $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
|
|
991 \$end_seq_region, \$end_length, \$ori, \$mid_start,
|
|
992 \$mid_end);
|
|
993
|
|
994 while($sth->fetch()) {
|
|
995 $end_mid_mapper->add_map_coordinates
|
|
996 (
|
|
997 $end_seq_region_id, $end_start, $end_end, $ori,
|
|
998 $mid_seq_region_id, $mid_start, $mid_end
|
|
999 );
|
|
1000
|
|
1001 #update sr_name cache
|
|
1002 my $arr = [ $end_seq_region_id,$end_seq_region,$end_cs_id,$end_length ];
|
|
1003
|
|
1004 $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
|
|
1005 $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
|
|
1006
|
|
1007 #register this region on the end coord system
|
|
1008 $end_registry->check_and_register($end_seq_region_id, $end_start, $end_end);
|
|
1009 }
|
|
1010 $sth->finish();
|
|
1011 }
|
|
1012
|
|
1013 #########
|
|
1014 # Now that both halves are loaded
|
|
1015 # Do stepwise mapping using both of the loaded mappers to load
|
|
1016 # the final start <-> end mapper
|
|
1017 #
|
|
1018
|
|
1019 _build_combined_mapper(\@start_ranges, $start_mid_mapper, $end_mid_mapper,
|
|
1020 $combined_mapper, $start_name);
|
|
1021 #all done!
|
|
1022 return;
|
|
1023 }
|
|
1024
|
|
1025
|
|
1026 =head2 _register_chained_special
|
|
1027
|
|
1028 Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
|
|
1029 The chained assembly mapper to register regions on
|
|
1030 Arg [2] : string $from ('first' or 'last')
|
|
1031 The direction we are registering from, and the name of the
|
|
1032 internal mapper.
|
|
1033 Arg [3] : string $seq_region_name
|
|
1034 The name of the seqregion we are registering on
|
|
1035 Arg [4] : listref $ranges
|
|
1036 A list of ranges to register (in [$start,$end] tuples).
|
|
1037 Arg [5] : (optional) $to_slice
|
|
1038 Only register those on this Slice.
|
|
1039 Description: Registers a set of ranges on a chained assembly mapper.
|
|
1040 This function is at the heart of the chained mapping process.
|
|
1041 It retrieves information from the assembly table and
|
|
1042 dynamically constructs the mappings between two coordinate
|
|
1043 systems which are 2 mapping steps apart. It does this by using
|
|
1044 two internal mappers to load up a third mapper which is
|
|
1045 actually used by the ChainedAssemblyMapper to perform the
|
|
1046 mapping.
|
|
1047
|
|
1048 This method must be called before any mapping is
|
|
1049 attempted on regions of interest, otherwise only gaps will
|
|
1050 be returned. Note that the ChainedAssemblyMapper automatically
|
|
1051 calls this method when the need arises.
|
|
1052 Returntype : none
|
|
1053 Exceptions : throw if the seq_region to be registered does not exist
|
|
1054 or if it associated with multiple assembled pieces (bad data
|
|
1055 in assembly table)
|
|
1056
|
|
1057 throw if the mapping between the coordinate systems cannot
|
|
1058 be performed in two steps, which means there is an internal
|
|
1059 error in the data in the meta table or in the code that creates
|
|
1060 the mapping paths.
|
|
1061 Caller : Bio::EnsEMBL::AssemblyMapper
|
|
1062 Status : Stable
|
|
1063
|
|
1064 =cut
|
|
1065
|
|
1066 sub _register_chained_special {
|
|
1067 my $self = shift;
|
|
1068 my $casm_mapper = shift;
|
|
1069 my $from = shift;
|
|
1070 my $seq_region_id = shift;
|
|
1071 my $ranges = shift;
|
|
1072 my $to_slice = shift;
|
|
1073 my $found = 0;
|
|
1074
|
|
1075 my $sth = $self->prepare("SELECT
|
|
1076 asm.cmp_start,
|
|
1077 asm.cmp_end,
|
|
1078 asm.cmp_seq_region_id,
|
|
1079 sr.name,
|
|
1080 sr.length,
|
|
1081 asm.ori,
|
|
1082 asm.asm_start,
|
|
1083 asm.asm_end
|
|
1084 FROM
|
|
1085 assembly asm, seq_region sr
|
|
1086 WHERE
|
|
1087 asm.asm_seq_region_id = ? AND
|
|
1088 ? <= asm.asm_end AND
|
|
1089 ? >= asm.asm_start AND
|
|
1090 asm.cmp_seq_region_id = sr.seq_region_id AND
|
|
1091 sr.coord_system_id = ? AND
|
|
1092 asm.cmp_seq_region_id = ?");
|
|
1093
|
|
1094
|
|
1095 my ($start_name, $start_mid_mapper, $start_cs, $start_registry,
|
|
1096 $end_name, $end_mid_mapper, $end_cs, $end_registry);
|
|
1097
|
|
1098 if($from eq 'first') {
|
|
1099 $start_name = 'first';
|
|
1100 $start_mid_mapper = $casm_mapper->first_middle_mapper();
|
|
1101 $start_cs = $casm_mapper->first_CoordSystem();
|
|
1102 $start_registry = $casm_mapper->first_registry();
|
|
1103 $end_mid_mapper = $casm_mapper->last_middle_mapper();
|
|
1104 $end_cs = $casm_mapper->last_CoordSystem();
|
|
1105 $end_registry = $casm_mapper->last_registry();
|
|
1106 $end_name = 'last';
|
|
1107 } elsif($from eq 'last') {
|
|
1108 $start_name = 'last';
|
|
1109 $start_mid_mapper = $casm_mapper->last_middle_mapper();
|
|
1110 $start_cs = $casm_mapper->last_CoordSystem();
|
|
1111 $start_registry = $casm_mapper->last_registry();
|
|
1112 $end_mid_mapper = $casm_mapper->first_middle_mapper();
|
|
1113 $end_cs = $casm_mapper->first_CoordSystem();
|
|
1114 $end_registry = $casm_mapper->first_registry();
|
|
1115 $end_name = 'first';
|
|
1116 } else {
|
|
1117 throw("Invalid from argument: [$from], must be 'first' or 'last'");
|
|
1118 }
|
|
1119
|
|
1120 my $combined_mapper = $casm_mapper->first_last_mapper();
|
|
1121 my $mid_cs = $casm_mapper->middle_CoordSystem();
|
|
1122 my $mid_name = 'middle';
|
|
1123 my $csa = $self->db->get_CoordSystemAdaptor();
|
|
1124
|
|
1125 # Check for the simple case where the ChainedMapper is short
|
|
1126 if( ! defined $mid_cs ) {
|
|
1127 $start_mid_mapper = $combined_mapper;
|
|
1128 }
|
|
1129
|
|
1130
|
|
1131 my @path;
|
|
1132 if( defined $mid_cs ) {
|
|
1133 @path = @{$csa->get_mapping_path($start_cs, $mid_cs)};
|
|
1134 } else {
|
|
1135 @path = @{$csa->get_mapping_path( $start_cs, $end_cs )};
|
|
1136 }
|
|
1137 if( ! defined $mid_cs ) {
|
|
1138 $start_mid_mapper = $combined_mapper;
|
|
1139 }
|
|
1140
|
|
1141 if(@path != 2 && defined( $path[1] )) {
|
|
1142 my $path = join(',', map({$_->name .' '. $_->version} @path));
|
|
1143 my $len = scalar(@path) - 1;
|
|
1144 throw("Unexpected mapping path between start and intermediate " .
|
|
1145 "coord systems (". $start_cs->name . " " . $start_cs->version .
|
|
1146 " and " . $mid_cs->name . " " . $mid_cs->version . ")." .
|
|
1147 "\nExpected path length 1, got $len. " .
|
|
1148 "(path=$path)");
|
|
1149 }
|
|
1150
|
|
1151 my ($asm_cs,$cmp_cs);
|
|
1152 $asm_cs = $path[0];
|
|
1153 $cmp_cs = $path[-1];
|
|
1154
|
|
1155 $combined_mapper = $casm_mapper->first_last_mapper();
|
|
1156 $mid_cs = $casm_mapper->middle_CoordSystem();
|
|
1157 $mid_name = 'middle';
|
|
1158 $csa = $self->db->get_CoordSystemAdaptor();
|
|
1159
|
|
1160 my $mid_cs_id;
|
|
1161
|
|
1162 # Check for the simple case where the ChainedMapper is short
|
|
1163 if ( !defined $mid_cs ) {
|
|
1164 $start_mid_mapper = $combined_mapper;
|
|
1165 } else {
|
|
1166 $mid_cs_id = $mid_cs->dbID();
|
|
1167 }
|
|
1168
|
|
1169 my @mid_ranges;
|
|
1170 my @start_ranges;
|
|
1171
|
|
1172 my $to_cs = $to_slice->coord_system;
|
|
1173 foreach my $direction (1, 0){
|
|
1174 my $id1;
|
|
1175 my $id2;
|
|
1176 if($direction){
|
|
1177 $id1 = $seq_region_id;
|
|
1178 $id2 = $to_slice->get_seq_region_id();
|
|
1179 }
|
|
1180 else{
|
|
1181 $id2 = $seq_region_id;
|
|
1182 $id1 = $to_slice->get_seq_region_id();
|
|
1183 }
|
|
1184
|
|
1185 foreach my $range (@$ranges) {
|
|
1186 my ($start, $end) = @$range;
|
|
1187 $sth->bind_param(1,$id1,SQL_INTEGER);
|
|
1188 $sth->bind_param(2,$start,SQL_INTEGER);
|
|
1189 $sth->bind_param(3,$end,SQL_INTEGER);
|
|
1190 $sth->bind_param(4,$to_cs->dbID,SQL_INTEGER);
|
|
1191 $sth->bind_param(5,$id2,SQL_INTEGER);
|
|
1192 $sth->execute();
|
|
1193
|
|
1194 my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
|
|
1195 $ori, $start_start, $start_end);
|
|
1196
|
|
1197 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
|
|
1198 \$mid_seq_region, \$mid_length, \$ori, \$start_start,
|
|
1199 \$start_end);
|
|
1200
|
|
1201 while($sth->fetch()) {
|
|
1202 $found = 1;
|
|
1203
|
|
1204 if( defined $mid_cs ) {
|
|
1205 $start_mid_mapper->add_map_coordinates
|
|
1206 (
|
|
1207 $id1,$start_start, $start_end, $ori,
|
|
1208 $mid_seq_region_id, $mid_start, $mid_end
|
|
1209 );
|
|
1210 } else {
|
|
1211 if( $from eq "first") {
|
|
1212 if($direction){
|
|
1213 $combined_mapper->add_map_coordinates
|
|
1214 (
|
|
1215 $id1,$start_start, $start_end, $ori,
|
|
1216 $mid_seq_region_id, $mid_start, $mid_end
|
|
1217 );
|
|
1218 }
|
|
1219 else{
|
|
1220 $combined_mapper->add_map_coordinates
|
|
1221 (
|
|
1222 $mid_seq_region_id, $mid_start, $mid_end, $ori,
|
|
1223 $id1,$start_start, $start_end
|
|
1224 );
|
|
1225 }
|
|
1226 } else {
|
|
1227 if($direction){
|
|
1228 $combined_mapper->add_map_coordinates
|
|
1229 (
|
|
1230 $mid_seq_region_id, $mid_start, $mid_end, $ori,
|
|
1231 $id1,$start_start, $start_end
|
|
1232 );
|
|
1233 }
|
|
1234 else{
|
|
1235 $combined_mapper->add_map_coordinates
|
|
1236 (
|
|
1237 $id1,$start_start, $start_end, $ori,
|
|
1238 $mid_seq_region_id, $mid_start, $mid_end
|
|
1239 );
|
|
1240 }
|
|
1241 }
|
|
1242 }
|
|
1243
|
|
1244 #update sr_name cache
|
|
1245 my $arr = [ $mid_seq_region_id, $mid_seq_region,
|
|
1246 $mid_cs_id, $mid_length ];
|
|
1247
|
|
1248 $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
|
|
1249 $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
|
|
1250
|
|
1251 push @mid_ranges,[$mid_seq_region_id,$mid_seq_region,
|
|
1252 $mid_start,$mid_end];
|
|
1253 push @start_ranges, [ $id1, $start_start, $start_end ];
|
|
1254
|
|
1255 #the region that we actually register may actually be larger or smaller
|
|
1256 #than the region that we wanted to register.
|
|
1257 #register the intersection of the region so we do not end up doing
|
|
1258 #extra work later
|
|
1259
|
|
1260 if($start_start < $start || $start_end > $end) {
|
|
1261 $start_registry->check_and_register($id1,$start_start,
|
|
1262 $start_end);
|
|
1263 }
|
|
1264 }
|
|
1265 $sth->finish();
|
|
1266 }
|
|
1267 if($found){
|
|
1268 if( ! defined $mid_cs ) {
|
|
1269 for my $range ( @mid_ranges ) {
|
|
1270 $end_registry->check_and_register( $range->[0], $range->[2],
|
|
1271 $range->[3] );
|
|
1272 }
|
|
1273
|
|
1274 # and thats it for the simple case ...
|
|
1275 return;
|
|
1276 }
|
|
1277 }
|
|
1278 }
|
|
1279 }
|
|
1280
|
|
1281
|
|
1282 =head2 register_all
|
|
1283
|
|
1284 Arg [1] : Bio::EnsEMBL::AssemblyMapper $mapper
|
|
1285 Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
|
|
1286
|
|
1287 # make cache large enough to hold all of the mappings
|
|
1288 $mapper->max_pair_count(10e6);
|
|
1289 $asm_mapper_adaptor->register_all($mapper);
|
|
1290
|
|
1291 # perform mappings as normal
|
|
1292 $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
|
|
1293 $sr_strand, $cs1);
|
|
1294 ...
|
|
1295 Description: This function registers the entire set of mappings between
|
|
1296 two coordinate systems in an assembly mapper.
|
|
1297 This will use a lot of memory but will be much more efficient
|
|
1298 when doing a lot of mapping which is spread over the entire
|
|
1299 genome.
|
|
1300 Returntype : none
|
|
1301 Exceptions : none
|
|
1302 Caller : specialised prograhsm
|
|
1303 Status : Stable
|
|
1304
|
|
1305 =cut
|
|
1306
|
|
1307 sub register_all {
|
|
1308 my $self = shift;
|
|
1309 my $mapper = shift;
|
|
1310
|
|
1311 my $asm_cs_id = $mapper->assembled_CoordSystem()->dbID();
|
|
1312 my $cmp_cs_id = $mapper->component_CoordSystem()->dbID();
|
|
1313
|
|
1314 # retrieve every relevant assembled/component pair from the assembly table
|
|
1315
|
|
1316 my $q = qq{
|
|
1317 SELECT
|
|
1318 asm.cmp_start,
|
|
1319 asm.cmp_end,
|
|
1320 asm.cmp_seq_region_id,
|
|
1321 cmp_sr.name,
|
|
1322 cmp_sr.length,
|
|
1323 asm.ori,
|
|
1324 asm.asm_start,
|
|
1325 asm.asm_end,
|
|
1326 asm.asm_seq_region_id,
|
|
1327 asm_sr.name,
|
|
1328 asm_sr.length
|
|
1329 FROM
|
|
1330 assembly asm, seq_region asm_sr, seq_region cmp_sr
|
|
1331 WHERE
|
|
1332 asm.cmp_seq_region_id = cmp_sr.seq_region_id AND
|
|
1333 asm.asm_seq_region_id = asm_sr.seq_region_id AND
|
|
1334 cmp_sr.coord_system_id = ? AND
|
|
1335 asm_sr.coord_system_id = ?
|
|
1336 };
|
|
1337
|
|
1338 my $sth = $self->prepare($q);
|
|
1339
|
|
1340 $sth->bind_param(1,$cmp_cs_id,SQL_INTEGER);
|
|
1341 $sth->bind_param(2,$asm_cs_id,SQL_INTEGER);
|
|
1342 $sth->execute();
|
|
1343
|
|
1344 # load the mapper with the assembly information
|
|
1345
|
|
1346 my ($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $cmp_length,
|
|
1347 $ori,
|
|
1348 $asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_length);
|
|
1349
|
|
1350 $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id,
|
|
1351 \$cmp_seq_region, \$cmp_length, \$ori,
|
|
1352 \$asm_start, \$asm_end, \$asm_seq_region_id,
|
|
1353 \$asm_seq_region, \$asm_length);
|
|
1354
|
|
1355 my %asm_registered;
|
|
1356
|
|
1357 while($sth->fetch()) {
|
|
1358 $mapper->register_component($cmp_seq_region_id);
|
|
1359 $mapper->mapper->add_map_coordinates(
|
|
1360 $asm_seq_region_id, $asm_start, $asm_end,
|
|
1361 $ori,
|
|
1362 $cmp_seq_region_id, $cmp_start, $cmp_end);
|
|
1363
|
|
1364 my $arr = [$cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_length];
|
|
1365
|
|
1366 $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr;
|
|
1367 $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr;
|
|
1368
|
|
1369 # only register each asm seq_region once since it requires some work
|
|
1370 if(!$asm_registered{$asm_seq_region_id}) {
|
|
1371 $asm_registered{$asm_seq_region_id} = 1;
|
|
1372
|
|
1373 # register all chunks from start of seq region to end
|
|
1374 my $end_chunk = $asm_length >> $CHUNKFACTOR;
|
|
1375 for(my $i = 0; $i <= $end_chunk; $i++) {
|
|
1376 $mapper->register_assembled($asm_seq_region_id, $i);
|
|
1377 }
|
|
1378
|
|
1379 $arr = [$asm_seq_region_id, $asm_seq_region, $asm_cs_id, $asm_length];
|
|
1380
|
|
1381 $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr;
|
|
1382 $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr;
|
|
1383 }
|
|
1384 }
|
|
1385
|
|
1386 $sth->finish();
|
|
1387
|
|
1388 return;
|
|
1389 }
|
|
1390
|
|
1391
|
|
1392
|
|
1393 =head2 register_all_chained
|
|
1394
|
|
1395 Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper
|
|
1396 Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2);
|
|
1397
|
|
1398 # make the cache large enough to hold all of the mappings
|
|
1399 $mapper->max_pair_count(10e6);
|
|
1400 # load all of the mapping data
|
|
1401 $asm_mapper_adaptor->register_all_chained($mapper);
|
|
1402
|
|
1403 # perform mappings as normal
|
|
1404 $mapper->map($slice->seq_region_name(), $sr_start, $sr_end,
|
|
1405 $sr_strand, $cs1);
|
|
1406 ...
|
|
1407 Description: This function registers the entire set of mappings between
|
|
1408 two coordinate systems in a chained mapper. This will use a lot
|
|
1409 of memory but will be much more efficient when doing a lot of
|
|
1410 mapping which is spread over the entire genome.
|
|
1411 Returntype : none
|
|
1412 Exceptions : throw if mapper is between coord systems with unexpected
|
|
1413 mapping paths
|
|
1414 Caller : specialised programs doing a lot of genome-wide mapping
|
|
1415 Status : Stable
|
|
1416
|
|
1417 =cut
|
|
1418
|
|
1419 sub register_all_chained {
|
|
1420 my $self = shift;
|
|
1421 my $casm_mapper = shift;
|
|
1422
|
|
1423 my $first_cs = $casm_mapper->first_CoordSystem();
|
|
1424 my $mid_cs = $casm_mapper->middle_CoordSystem();
|
|
1425 my $last_cs = $casm_mapper->last_CoordSystem();
|
|
1426
|
|
1427 my $start_mid_mapper = $casm_mapper->first_middle_mapper();
|
|
1428 my $end_mid_mapper = $casm_mapper->last_middle_mapper();
|
|
1429 my $combined_mapper = $casm_mapper->first_last_mapper();
|
|
1430
|
|
1431 my @ranges;
|
|
1432
|
|
1433 my $sth = $self->prepare(
|
|
1434 'SELECT
|
|
1435 asm.cmp_start,
|
|
1436 asm.cmp_end,
|
|
1437 asm.cmp_seq_region_id,
|
|
1438 sr_cmp.name,
|
|
1439 sr_cmp.length,
|
|
1440 asm.ori,
|
|
1441 asm.asm_start,
|
|
1442 asm.asm_end,
|
|
1443 asm.asm_seq_region_id,
|
|
1444 sr_asm.name,
|
|
1445 sr_asm.length
|
|
1446 FROM
|
|
1447 assembly asm, seq_region sr_asm, seq_region sr_cmp
|
|
1448 WHERE
|
|
1449 sr_asm.seq_region_id = asm.asm_seq_region_id AND
|
|
1450 sr_cmp.seq_region_id = asm.cmp_seq_region_id AND
|
|
1451 sr_asm.coord_system_id = ? AND
|
|
1452 sr_cmp.coord_system_id = ?');
|
|
1453
|
|
1454 my $csa = $self->db()->get_CoordSystemAdaptor();
|
|
1455
|
|
1456 my @path;
|
|
1457
|
|
1458 if ( !defined $mid_cs ) {
|
|
1459 @path = @{ $csa->get_mapping_path( $first_cs, $last_cs ) };
|
|
1460 if ( !defined( $path[1] ) ) {
|
|
1461 splice( @path, 1, 1 );
|
|
1462 }
|
|
1463 } else {
|
|
1464 @path = @{ $csa->get_mapping_path( $first_cs, $mid_cs ) };
|
|
1465 # fix for when we have something like supercontig#contig#chromosome
|
|
1466 if ( !defined( $path[1] ) ) {
|
|
1467 splice( @path, 1, 1 );
|
|
1468 }
|
|
1469 }
|
|
1470
|
|
1471 if ( @path != 2 ) {
|
|
1472 my $path =
|
|
1473 join( ',', map( { $_->name . ' ' . $_->version } @path ) );
|
|
1474 my $len = scalar(@path) - 1;
|
|
1475 throw( "Unexpected mapping path between start and intermediate "
|
|
1476 . "coord systems ("
|
|
1477 . $first_cs->name . " "
|
|
1478 . $first_cs->version . " and "
|
|
1479 . $mid_cs->name . " "
|
|
1480 . $mid_cs->version . ")."
|
|
1481 . "\nExpected path length 1, got $len. "
|
|
1482 . "(path=$path)" );
|
|
1483 }
|
|
1484
|
|
1485 my ($asm_cs,$cmp_cs) = @path;
|
|
1486
|
|
1487 $sth->{mysql_use_result} = 1;
|
|
1488 $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
|
|
1489 $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
|
|
1490 $sth->execute();
|
|
1491
|
|
1492
|
|
1493 my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length,
|
|
1494 $ori, $start_start, $start_end, $start_seq_region_id, $start_seq_region,
|
|
1495 $start_length);
|
|
1496
|
|
1497 if($asm_cs->equals($first_cs)) {
|
|
1498 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
|
|
1499 \$mid_seq_region, \$mid_length, \$ori, \$start_start,
|
|
1500 \$start_end, \$start_seq_region_id, \$start_seq_region,
|
|
1501 \$start_length);
|
|
1502 } else {
|
|
1503 $sth->bind_columns(\$start_start, \$start_end, \$start_seq_region_id,
|
|
1504 \$start_seq_region, \$start_length, \$ori,
|
|
1505 \$mid_start, \$mid_end, \$mid_seq_region_id,
|
|
1506 \$mid_seq_region, \$mid_length);
|
|
1507
|
|
1508 }
|
|
1509
|
|
1510 my ( $mid_cs_id, $start_cs_id, $reg, $mapper );
|
|
1511 if ( !defined $mid_cs ) {
|
|
1512 $mid_cs_id = $last_cs->dbID();
|
|
1513 $start_cs_id = $first_cs->dbID();
|
|
1514 $mapper = $combined_mapper;
|
|
1515 } else {
|
|
1516 $mid_cs_id = $mid_cs->dbID();
|
|
1517 $start_cs_id = $first_cs->dbID();
|
|
1518 $mapper = $start_mid_mapper;
|
|
1519 }
|
|
1520
|
|
1521 $reg = $casm_mapper->first_registry();
|
|
1522
|
|
1523 while($sth->fetch()) {
|
|
1524 $mapper->add_map_coordinates
|
|
1525 (
|
|
1526 $start_seq_region_id, $start_start, $start_end, $ori,
|
|
1527 $mid_seq_region_id, $mid_start, $mid_end
|
|
1528 );
|
|
1529 push( @ranges, [$start_seq_region_id, $start_start, $start_end ] );
|
|
1530
|
|
1531 $reg->check_and_register( $start_seq_region_id, 1, $start_length );
|
|
1532 if( ! defined $mid_cs ) {
|
|
1533 $casm_mapper->last_registry()->check_and_register
|
|
1534 ( $mid_seq_region_id, $mid_start, $mid_end );
|
|
1535 }
|
|
1536
|
|
1537 my $arr = [ $mid_seq_region_id, $mid_seq_region,
|
|
1538 $mid_cs_id, $mid_length ];
|
|
1539
|
|
1540 $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr;
|
|
1541 $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr;
|
|
1542
|
|
1543 $arr = [ $start_seq_region_id, $start_seq_region,
|
|
1544 $start_cs_id, $start_length ];
|
|
1545
|
|
1546 $self->{'sr_name_cache'}->{"$start_seq_region:$start_cs_id"} = $arr;
|
|
1547 $self->{'sr_id_cache'}->{"$start_seq_region_id"} = $arr;
|
|
1548 }
|
|
1549
|
|
1550 if( ! defined $mid_cs ) {
|
|
1551 # thats it for the simple case
|
|
1552 return;
|
|
1553 }
|
|
1554
|
|
1555
|
|
1556 @path = @{ $csa->get_mapping_path( $last_cs, $mid_cs ) };
|
|
1557 if ( defined($mid_cs) ) {
|
|
1558 if ( !defined( $path[1] ) ) {
|
|
1559 splice( @path, 1, 1 );
|
|
1560 }
|
|
1561 }
|
|
1562
|
|
1563 if ( @path != 2 ) {
|
|
1564 my $path =
|
|
1565 join( ',', map( { $_->name . ' ' . $_->version } @path ) );
|
|
1566 my $len = scalar(@path) - 1;
|
|
1567 throw( "Unexpected mapping path between intermediate and last "
|
|
1568 . "coord systems ("
|
|
1569 . $last_cs->name . " "
|
|
1570 . $last_cs->version . " and "
|
|
1571 . $mid_cs->name . " "
|
|
1572 . $mid_cs->version . ")."
|
|
1573 . "\nExpected path length 1, got $len. "
|
|
1574 . "(path=$path)" );
|
|
1575 }
|
|
1576
|
|
1577 ($asm_cs,$cmp_cs) = @path;
|
|
1578
|
|
1579 $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER);
|
|
1580 $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER);
|
|
1581 $sth->execute();
|
|
1582
|
|
1583
|
|
1584 my ($end_start, $end_end, $end_seq_region_id, $end_seq_region,
|
|
1585 $end_length);
|
|
1586
|
|
1587 if($asm_cs->equals($mid_cs)) {
|
|
1588 $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id,
|
|
1589 \$end_seq_region, \$end_length, \$ori,
|
|
1590 \$mid_start, \$mid_end, \$mid_seq_region_id,
|
|
1591 \$mid_seq_region, \$mid_length);
|
|
1592 } else {
|
|
1593 $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id,
|
|
1594 \$mid_seq_region, \$mid_length, \$ori, \$end_start,
|
|
1595 \$end_end, \$end_seq_region_id, \$end_seq_region,
|
|
1596 \$end_length);
|
|
1597 }
|
|
1598
|
|
1599 my $end_cs_id = $last_cs->dbID();
|
|
1600 $reg = $casm_mapper->last_registry();
|
|
1601
|
|
1602 while($sth->fetch()) {
|
|
1603 $end_mid_mapper->add_map_coordinates
|
|
1604 (
|
|
1605 $end_seq_region_id, $end_start, $end_end, $ori,
|
|
1606 $mid_seq_region_id, $mid_start, $mid_end
|
|
1607 );
|
|
1608
|
|
1609 $reg->check_and_register( $end_seq_region_id, 1, $end_length );
|
|
1610
|
|
1611 my $arr = [ $end_seq_region_id, $end_seq_region,
|
|
1612 $end_cs_id, $end_length ];
|
|
1613 $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr;
|
|
1614 $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr;
|
|
1615 }
|
|
1616
|
|
1617 _build_combined_mapper( \@ranges, $start_mid_mapper, $end_mid_mapper,
|
|
1618 $combined_mapper, "first" );
|
|
1619
|
|
1620 return;
|
|
1621 }
|
|
1622
|
|
1623
|
|
1624
|
|
1625 # after both halves of a chained mapper are loaded
|
|
1626 # this function maps all ranges in $ranges and loads the
|
|
1627 # results into the combined mapper
|
|
1628 sub _build_combined_mapper {
|
|
1629 my $ranges = shift;
|
|
1630 my $start_mid_mapper = shift;
|
|
1631 my $end_mid_mapper = shift;
|
|
1632 my $combined_mapper = shift;
|
|
1633 my $start_name = shift;
|
|
1634
|
|
1635 my $mid_name = "middle";
|
|
1636
|
|
1637 foreach my $range (@$ranges) {
|
|
1638 my ( $seq_region_id, $start, $end) = @$range;
|
|
1639
|
|
1640 my $sum = 0;
|
|
1641
|
|
1642 my @initial_coords = $start_mid_mapper->map_coordinates($seq_region_id,
|
|
1643 $start,$end,1,
|
|
1644 $start_name);
|
|
1645
|
|
1646 foreach my $icoord (@initial_coords) {
|
|
1647 #skip gaps
|
|
1648 if($icoord->isa('Bio::EnsEMBL::Mapper::Gap')) {
|
|
1649 $sum += $icoord->length();
|
|
1650 next;
|
|
1651 }
|
|
1652
|
|
1653
|
|
1654 #feed the results of the first mapping into the second mapper
|
|
1655 my @final_coords =
|
|
1656 $end_mid_mapper->map_coordinates($icoord->id, $icoord->start,
|
|
1657 $icoord->end,
|
|
1658 $icoord->strand, $mid_name);
|
|
1659
|
|
1660
|
|
1661 foreach my $fcoord (@final_coords) {
|
|
1662 #load up the final mapper
|
|
1663
|
|
1664 if($fcoord->isa('Bio::EnsEMBL::Mapper::Coordinate')) {
|
|
1665 my $total_start = $start + $sum;
|
|
1666 my $total_end = $total_start + $fcoord->length - 1;
|
|
1667 my $ori = $fcoord->strand();
|
|
1668
|
|
1669 if($start_name eq 'first') { # add coords in consistant order
|
|
1670 $combined_mapper->add_map_coordinates(
|
|
1671 $seq_region_id, $total_start, $total_end, $ori,
|
|
1672 $fcoord->id(), $fcoord->start(), $fcoord->end());
|
|
1673 } else {
|
|
1674 $combined_mapper->add_map_coordinates(
|
|
1675 $fcoord->id(), $fcoord->start(), $fcoord->end(),$ori,
|
|
1676 $seq_region_id, $total_start, $total_end);
|
|
1677 }
|
|
1678
|
|
1679 }
|
|
1680 $sum += $fcoord->length();
|
|
1681 }
|
|
1682 }
|
|
1683 }
|
|
1684 #all done!
|
|
1685 }
|
|
1686
|
|
1687
|
|
1688 =head2 seq_regions_to_ids
|
|
1689
|
|
1690 Arg [1] : Bio::EnsEMBL::CoordSystem $coord_system
|
|
1691 Arg [2] : listref of strings $seq_regions
|
|
1692 Example : my @ids = @{$asma->seq_regions_to_ids($coord_sys, \@seq_regs)};
|
|
1693 Description: Converts a list of seq_region names to internal identifiers
|
|
1694 using the internal cache that has accumulated while registering
|
|
1695 regions for AssemblyMappers. If any requested regions are
|
|
1696 not found in the cache an attempt is made to retrieve them
|
|
1697 from the database.
|
|
1698 Returntype : listref of ints
|
|
1699 Exceptions : throw if a non-existant seqregion is provided
|
|
1700 Caller : general
|
|
1701 Status : Stable
|
|
1702
|
|
1703 =cut
|
|
1704
|
|
1705 sub seq_regions_to_ids {
|
|
1706 my $self = shift;
|
|
1707 my $coord_system = shift;
|
|
1708 my $seq_regions = shift;
|
|
1709
|
|
1710 my $cs_id = $coord_system->dbID();
|
|
1711
|
|
1712 my @out;
|
|
1713
|
|
1714 foreach my $sr (@$seq_regions) {
|
|
1715 my $arr = $self->{'sr_name_cache'}->{"$sr:$cs_id"};
|
|
1716 if( $arr ) {
|
|
1717 push( @out, $arr->[0] );
|
|
1718 } else {
|
|
1719 push @out, $self->_seq_region_name_to_id($sr,$cs_id);
|
|
1720 }
|
|
1721 }
|
|
1722
|
|
1723 return \@out;
|
|
1724 }
|
|
1725
|
|
1726
|
|
1727 =head2 seq_ids_to_regions
|
|
1728
|
|
1729 Arg [1] : listref of seq_region ids
|
|
1730 Example : my @ids = @{$asma->ids_to_seq_regions(\@seq_ids)};
|
|
1731 Description: Converts a list of seq_region ids to seq region names
|
|
1732 using the internal cache that has accumulated while registering
|
|
1733 regions for AssemblyMappers. If any requested regions are
|
|
1734 not found in the cache an attempt is made to retrieve them
|
|
1735 from the database.
|
|
1736 Returntype : listref of strings
|
|
1737 Exceptions : throw if a non-existant seq_region_id is provided
|
|
1738 Caller : general
|
|
1739 Status : Stable
|
|
1740
|
|
1741 =cut
|
|
1742
|
|
1743 sub seq_ids_to_regions {
|
|
1744 my $self = shift;
|
|
1745 my $seq_region_ids = shift;
|
|
1746
|
|
1747 my @out;
|
|
1748
|
|
1749 foreach my $sr (@$seq_region_ids) {
|
|
1750 my $arr = $self->{'sr_id_cache'}->{"$sr"};
|
|
1751 if( $arr ) {
|
|
1752 push( @out, $arr->[1] );
|
|
1753 } else {
|
|
1754 push @out, $self->_seq_region_id_to_name($sr);
|
|
1755 }
|
|
1756 }
|
|
1757
|
|
1758 return \@out;
|
|
1759 }
|
|
1760
|
|
1761 =head2 delete_cache
|
|
1762
|
|
1763 Description: Delete all the caches for the mappings/seq_regions
|
|
1764 Returntype : none
|
|
1765 Exceptions : none
|
|
1766 Caller : General
|
|
1767 Status : At risk
|
|
1768
|
|
1769 =cut
|
|
1770
|
|
1771 sub delete_cache{
|
|
1772 my ($self) = @_;
|
|
1773
|
|
1774 %{$self->{'sr_name_cache'}} = ();
|
|
1775 %{$self->{'sr_id_cache'}} = ();
|
|
1776
|
|
1777 foreach my $key (keys %{$self->{'_asm_mapper_cache'}}){
|
|
1778 $self->{'_asm_mapper_cache'}->{$key}->flush();
|
|
1779 }
|
|
1780 %{$self->{'_asm_mapper_cache'}} = ();
|
|
1781 return;
|
|
1782 }
|
|
1783
|
|
1784
|
|
1785 =head2 register_region
|
|
1786
|
|
1787 Description: DEPRECATED use register_assembled instead
|
|
1788
|
|
1789 =cut
|
|
1790
|
|
1791 sub register_region{
|
|
1792 my ($self, $assmapper, $type, $chr_name, $start, $end) = @_;
|
|
1793
|
|
1794 deprecate('Use register_assembled instead');
|
|
1795
|
|
1796 $self->register_assembled($assmapper, $chr_name, $start, $end);
|
|
1797 }
|
|
1798
|
|
1799
|
|
1800 =head2 register_contig
|
|
1801
|
|
1802 Description: DEPRECATED use register_component instead
|
|
1803
|
|
1804 =cut
|
|
1805
|
|
1806 sub register_contig {
|
|
1807 my ($self, $assmapper, $type, $contig_id ) = @_;
|
|
1808
|
|
1809 deprecate('Use register_component instead');
|
|
1810
|
|
1811 #not sure if the use is passing in a seq_region_name or a
|
|
1812 #seq_region_id...
|
|
1813 register_component($assmapper, $contig_id);
|
|
1814 }
|
|
1815
|
|
1816
|
|
1817 =head2 fetch_by_type
|
|
1818
|
|
1819 Description: DEPRECATED use fetch_by_CoordSystems instead
|
|
1820
|
|
1821 =cut
|
|
1822
|
|
1823 sub fetch_by_type{
|
|
1824 my ($self,$type) = @_;
|
|
1825
|
|
1826 deprecate('Use fetch_by_CoordSystems instead');
|
|
1827
|
|
1828 #assume that what the user wanted was a mapper between the sequence coord
|
|
1829 #level and the top coord level
|
|
1830
|
|
1831 my $csa = $self->db()->get_CoordSystemAdaptor();
|
|
1832
|
|
1833 my $cs1 = $csa->fetch_top_level($type);
|
|
1834 my $cs2 = $csa->fetch_sequence_level();
|
|
1835
|
|
1836 return $self->fetch_by_CoordSystems($cs1,$cs2);
|
|
1837 }
|
|
1838
|
|
1839
|
|
1840
|
|
1841 1;
|