Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/DBSQL/AssemblyMapperAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1841 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Registry; + + Bio::EnsEMBL::Registry->load_registry_from_db( + -host => 'ensembldb.ensembl.org', + -user => 'anonymous' + ); + + $asma = Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "assemblymapper" ); + + $csa = Bio::EnsEMBL::Registry->get_adaptor( "human", "core", + "coordsystem" ); + + my $chr33_cs = $csa->fetch_by_name( 'chromosome', 'NCBI33' ); + my $chr34_cs = $csa->fetch_by_name( 'chromosome', 'NCBI34' ); + my $ctg_cs = $csa->fetch_by_name('contig'); + my $clone_cs = $csa->fetch_by_name('clone'); + + my $chr_ctg_mapper = + $asma->fetch_by_CoordSystems( $chr33_cs, $ctg_cs ); + + my $ncbi33_ncbi34_mapper = + $asm_adptr->fetch_by_CoordSystems( $chr33, $chr34 ); + + my $ctg_clone_mapper = + $asm_adptr->fetch_by_CoordSystems( $ctg_cs, $clone_cs ); + + +=head1 DESCRIPTION + +Adaptor for handling Assembly mappers. This is a I<Singleton> class. +ie: There is only one per database (C<DBAdaptor>). + +This is used to retrieve mappers between any two coordinate systems +whose makeup is described by the assembly table. Currently one step +(explicit) and two step (implicit) pairwise mapping is supported. In +one-step mapping an explicit relationship between the coordinate systems +is defined in the assembly table. In two-step 'chained' mapping no +explicit mapping is present but the coordinate systems must share a +common mapping to an intermediate coordinate system. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor; +use vars qw(@ISA); +use strict; + +use Bio::EnsEMBL::DBSQL::BaseAdaptor; +use Bio::EnsEMBL::AssemblyMapper; +use Bio::EnsEMBL::ChainedAssemblyMapper; +use Bio::EnsEMBL::TopLevelAssemblyMapper; + +use Bio::EnsEMBL::Utils::Cache; #CPAN LRU cache +use Bio::EnsEMBL::Utils::Exception qw(throw deprecate warning stack_trace_dump); +#use Bio::EnsEMBL::Utils::Exception qw(deprecate throw); +use Bio::EnsEMBL::Utils::SeqRegionCache; + +use integer; #do proper arithmetic bitshifts + +@ISA = qw(Bio::EnsEMBL::DBSQL::BaseAdaptor); + + +my $CHUNKFACTOR = 20; # 2^20 = approx. 10^6 + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for + the database this assembly mapper is using. + Example : my $asma = new Bio::EnsEMBL::AssemblyMapperAdaptor($dbadaptor); + Description: Creates a new AssemblyMapperAdaptor object + Returntype : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::DBAdaptor + Status : Stable + +=cut + +sub new { + my($class, $dbadaptor) = @_; + + my $self = $class->SUPER::new($dbadaptor); + + $self->{'_asm_mapper_cache'} = {}; + + # use a shared cache (for this database) that contains info about + # seq regions + my $seq_region_cache = $self->db->get_SeqRegionCache(); + $self->{'sr_name_cache'} = $seq_region_cache->{'name_cache'}; + $self->{'sr_id_cache'} = $seq_region_cache->{'id_cache'}; + + return $self; +} + + + +=head2 cache_seq_ids_with_mult_assemblys + + Example : $self->adaptor->cache_seq_ids_with_mult_assemblys(); + Description: Creates a hash of the component seq region ids that + map to more than one assembly from the assembly table. + Retruntype : none + Exceptions : none + Caller : AssemblyMapper, ChainedAssemblyMapper + Status : At Risk + +=cut + +sub cache_seq_ids_with_mult_assemblys{ + my $self = shift; + my %multis; + + return if (defined($self->{'multi_seq_ids'})); + + $self->{'multi_seq_ids'} = {}; + + my $sql = qq( + SELECT sra.seq_region_id + FROM seq_region_attrib sra, + attrib_type at, + seq_region sr, + coord_system cs + WHERE sra.attrib_type_id = at.attrib_type_id + AND code = "MultAssem" + AND sra.seq_region_id = sr.seq_region_id + AND sr.coord_system_id = cs.coord_system_id + AND cs.species_id = ?); + + my $sth = $self->prepare($sql); + + $sth->bind_param( 1, $self->species_id(), SQL_INTEGER ); + + $sth->execute(); + + my ($seq_region_id); + + $sth->bind_columns(\$seq_region_id); + + while($sth->fetch()) { + $self->{'multi_seq_ids'}->{$seq_region_id} = 1; + } + $sth->finish; +} + + +=head2 fetch_by_CoordSystems + + Arg [1] : Bio::EnsEMBL::CoordSystem $cs1 + One of the coordinate systems to retrieve the mapper + between + Arg [2] : Bio::EnsEMBL::CoordSystem $cs2 + The other coordinate system to map between + Description: Retrieves an Assembly mapper for two coordinate + systems whose relationship is described in the + assembly table. + + The ordering of the coodinate systems is arbitrary. + The following two statements are equivalent: + $mapper = $asma->fetch_by_CoordSystems($cs1,$cs2); + $mapper = $asma->fetch_by_CoordSystems($cs2,$cs1); + Returntype : Bio::EnsEMBL::AssemblyMapper + Exceptions : wrong argument types + Caller : general + Status : Stable + +=cut + +sub fetch_by_CoordSystems { + my $self = shift; + my $cs1 = shift; + my $cs2 = shift; + + if(!ref($cs1) || !$cs1->isa('Bio::EnsEMBL::CoordSystem')) { + throw("cs1 argument must be a Bio::EnsEMBL::CoordSystem."); + } + if(!ref($cs2) || !$cs2->isa('Bio::EnsEMBL::CoordSystem')) { + throw("cs2 argument must be a Bio::EnsEMBL::CoordSystem."); + } + +# if($cs1->equals($cs2)) { +# throw("Cannot create mapper between same coord systems: " . +# $cs1->name . " " . $cs1->version); +# } + + if($cs1->is_top_level()) { + return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs1, $cs2); + } + if($cs2->is_top_level()) { + return Bio::EnsEMBL::TopLevelAssemblyMapper->new($self, $cs2, $cs1); + } + + my $csa = $self->db->get_CoordSystemAdaptor(); + + #retrieve the shortest possible mapping path between these systems + my @mapping_path = @{$csa->get_mapping_path($cs1,$cs2)}; + + if(!@mapping_path) { + + # It is perfectly fine not to have a mapping. No warning needed really + # Just check the return code!! + +# warning( +# "There is no mapping defined between these coord systems:\n" . +# $cs1->name() . " " . $cs1->version() . " and " . $cs2->name() . " " . +# $cs2->version() +# ); + return undef; + } + + my $key = join(':', map({defined($_)?$_->dbID():"-"} @mapping_path)); + + my $asm_mapper = $self->{'_asm_mapper_cache'}->{$key}; + + return $asm_mapper if($asm_mapper); + + if(@mapping_path == 1) { + throw("Incorrect mapping path defined in meta table. " . + "0 step mapping encountered between:\n" . + $cs1->name() . " " . $cs1->version() . " and " . $cs2->name . " " . + $cs2->version()); + } + + if(@mapping_path == 2) { + #1 step regular mapping + $asm_mapper = Bio::EnsEMBL::AssemblyMapper->new($self, @mapping_path); + +# If you want multiple pieces on two seqRegions to map to each other +# you need to make an assembly.mapping entry that is seperated with a # +# instead of an |. + + $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper; + return $asm_mapper; + } + + if(@mapping_path == 3) { + #two step chained mapping + $asm_mapper = Bio::EnsEMBL::ChainedAssemblyMapper->new($self,@mapping_path); + #in multi-step mapping it is possible get requests with the + #coordinate system ordering reversed since both mappings directions + #cache on both orderings just in case + #e.g. chr <-> contig <-> clone and clone <-> contig <-> chr + + $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper; + $key = join(':', map({defined($_)?$_->dbID():"-"} reverse(@mapping_path))); + $self->{'_asm_mapper_cache'}->{$key} = $asm_mapper; + return $asm_mapper; + } + + throw("Only 1 and 2 step coordinate system mapping is currently\n" . + "supported. Mapping between " . + $cs1->name() . " " . $cs1->version() . " and " . + $cs2->name() . " " . $cs2->version() . + " requires ". (scalar(@mapping_path)-1) . " steps."); +} + + + +=head2 register_assembled + + Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper + A valid AssemblyMapper object + Arg [2] : integer $asm_seq_region + The dbID of the seq_region to be registered + Arg [3] : int $asm_start + The start of the region to be registered + Arg [4] : int $asm_end + The end of the region to be registered + Description: Declares an assembled region to the AssemblyMapper. + This extracts the relevant data from the assembly + table and stores it in Mapper internal to the $asm_mapper. + It therefore must be called before any mapping is + attempted on that region. Otherwise only gaps will + be returned. Note that the AssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + + +sub register_assembled { + my $self = shift; + my $asm_mapper = shift; + my $asm_seq_region = shift; + my $asm_start = shift; + my $asm_end = shift; + + if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) { + throw("Bio::EnsEMBL::AssemblyMapper argument expected"); + } + + throw("asm_seq_region argument expected") if(!defined($asm_seq_region)); + throw("asm_start argument expected") if(!defined($asm_start)); + throw("asm_end argument expected") if(!defined($asm_end)); + + my $asm_cs_id = $asm_mapper->assembled_CoordSystem->dbID(); + my $cmp_cs_id = $asm_mapper->component_CoordSystem->dbID(); + + #split up the region to be registered into fixed chunks + #this allows us to keep track of regions that have already been + #registered and also works under the assumption that if a small region + #is requested it is likely that other requests will be made in the + #vicinity (the minimum size registered the chunksize (2^chunkfactor) + + my @chunk_regions; + #determine span of chunks + #bitwise shift right is fast and easy integer division + + my($start_chunk, $end_chunk); + + $start_chunk = $asm_start >> $CHUNKFACTOR; + $end_chunk = $asm_end >> $CHUNKFACTOR; + + # inserts have start = end + 1, on boundary condition start_chunk + # could be less than end chunk + if($asm_start == $asm_end + 1) { + ($start_chunk, $end_chunk) = ($end_chunk, $start_chunk); + } + + #find regions of continuous unregistered chunks + my $i; + my ($begin_chunk_region,$end_chunk_region); + for ($i = $start_chunk; $i <= $end_chunk; $i++) { + if($asm_mapper->have_registered_assembled($asm_seq_region, $i)) { + if(defined($begin_chunk_region)) { + #this is the end of an unregistered region. + my $region = [($begin_chunk_region << $CHUNKFACTOR), + (($end_chunk_region+1) << $CHUNKFACTOR)-1]; + push @chunk_regions, $region; + $begin_chunk_region = $end_chunk_region = undef; + } + } else { + $begin_chunk_region = $i if(!defined($begin_chunk_region)); + $end_chunk_region = $i+1; + $asm_mapper->register_assembled($asm_seq_region,$i); + } + } + + #the last part may have been an unregistered region too + if(defined($begin_chunk_region)) { + my $region = [($begin_chunk_region << $CHUNKFACTOR), + (($end_chunk_region+1) << $CHUNKFACTOR) -1]; + push @chunk_regions, $region; + } + + return if(!@chunk_regions); + + # keep the Mapper to a reasonable size + if( $asm_mapper->size() > $asm_mapper->max_pair_count() ) { + $asm_mapper->flush(); + #we now have to go and register the entire requested region since we + #just flushed everything + + @chunk_regions = ( [ ( $start_chunk << $CHUNKFACTOR) + , (($end_chunk+1) << $CHUNKFACTOR)-1 ] ); + + for( my $i = $start_chunk; $i <= $end_chunk; $i++ ) { + $asm_mapper->register_assembled( $asm_seq_region, $i ); + } + } + +# my $asm_seq_region_id = +# $self->_seq_region_name_to_id($asm_seq_region,$asm_cs_id); + + # Retrieve the description of how the assembled region is made from + # component regions for each of the continuous blocks of unregistered, + # chunked regions + + my $q = qq{ + SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + sr.name, + sr.length, + asm.ori, + asm.asm_start, + asm.asm_end + FROM + assembly asm, seq_region sr + WHERE + asm.asm_seq_region_id = ? AND + ? <= asm.asm_end AND + ? >= asm.asm_start AND + asm.cmp_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? + }; + + my $sth = $self->prepare($q); + + foreach my $region (@chunk_regions) { + my($region_start, $region_end) = @$region; + $sth->bind_param(1,$asm_seq_region,SQL_INTEGER); + $sth->bind_param(2,$region_start,SQL_INTEGER); + $sth->bind_param(3,$region_end,SQL_INTEGER); + $sth->bind_param(4,$cmp_cs_id,SQL_INTEGER); + + $sth->execute(); + + my($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $ori, + $asm_start, $asm_end, $cmp_seq_region_length); + + $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id, + \$cmp_seq_region, \$cmp_seq_region_length, \$ori, + \$asm_start, \$asm_end); + + # + # Load the unregistered regions of the mapper + # + while($sth->fetch()) { + next if($asm_mapper->have_registered_component($cmp_seq_region_id) + and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region_id})); + $asm_mapper->register_component($cmp_seq_region_id); + $asm_mapper->mapper->add_map_coordinates( + $asm_seq_region, $asm_start, $asm_end, + $ori, + $cmp_seq_region_id, $cmp_start, $cmp_end); + + my $arr = [ $cmp_seq_region_id, $cmp_seq_region, + $cmp_cs_id, $cmp_seq_region_length ]; + + $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr; + } + } + + $sth->finish(); +} + + + +sub _seq_region_name_to_id { + my $self = shift; + my $sr_name = shift; + my $cs_id = shift; + + ($sr_name && $cs_id) || throw('seq_region_name and coord_system_id args ' . + 'are required'); + + my $arr = $self->{'sr_name_cache'}->{"$sr_name:$cs_id"}; + if( $arr ) { + return $arr->[0]; + } + + # Get the seq_region_id via the name. This would be quicker if we just + # used internal ids instead but stored but then we lose the ability + # the transform accross databases with different internal ids + + my $sth = $self->prepare("SELECT seq_region_id, length " . + "FROM seq_region " . + "WHERE name = ? AND coord_system_id = ?"); + + $sth->bind_param(1,$sr_name,SQL_VARCHAR); + $sth->bind_param(2,$cs_id,SQL_INTEGER); + $sth->execute(); + + if(!$sth->rows() == 1) { + throw("Ambiguous or non-existant seq_region [$sr_name] " . + "in coord system $cs_id"); + } + + my ($sr_id, $sr_length) = $sth->fetchrow_array(); + $sth->finish(); + + $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ]; + + $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$sr_id"} = $arr; + + return $sr_id; +} + +sub _seq_region_id_to_name { + my $self = shift; + my $sr_id = shift; + + ($sr_id) || throw('seq_region_is required'); + + my $arr = $self->{'sr_id_cache'}->{"$sr_id"}; + if( $arr ) { + return $arr->[1]; + } + + # Get the seq_region name via the id. This would be quicker if we just + # used internal ids instead but stored but then we lose the ability + # the transform accross databases with different internal ids + + my $sth = $self->prepare("SELECT name, length ,coord_system_id " . + "FROM seq_region " . + "WHERE seq_region_id = ? "); + + $sth->bind_param(1,$sr_id,SQL_INTEGER); + $sth->execute(); + + if(!$sth->rows() == 1) { + throw("non-existant seq_region [$sr_id]"); + } + + my ($sr_name, $sr_length, $cs_id) = $sth->fetchrow_array(); + $sth->finish(); + + $arr = [ $sr_id, $sr_name, $cs_id, $sr_length ]; + + $self->{'sr_name_cache'}->{"$sr_name:$cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$sr_id"} = $arr; + + return $sr_name; +} + + +=head2 register_component + + Arg [1] : Bio::EnsEMBL::AssemblyMapper $asm_mapper + A valid AssemblyMapper object + Arg [2] : integer $cmp_seq_region + The dbID of the seq_region to be registered + Description: Declares a component region to the AssemblyMapper. + This extracts the relevant data from the assembly + table and stores it in Mapper internal to the $asm_mapper. + It therefore must be called before any mapping is + attempted on that region. Otherwise only gaps will + be returned. Note that the AssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + +sub register_component { + my $self = shift; + my $asm_mapper = shift; + my $cmp_seq_region = shift; + + if(!ref($asm_mapper) || !$asm_mapper->isa('Bio::EnsEMBL::AssemblyMapper')) { + throw("Bio::EnsEMBL::AssemblyMapper argument expected"); + } + + if(!defined($cmp_seq_region)) { + throw("cmp_seq_region argument expected"); + } + + my $cmp_cs_id = $asm_mapper->component_CoordSystem()->dbID(); + my $asm_cs_id = $asm_mapper->assembled_CoordSystem()->dbID(); + + #do nothing if this region is already registered or special case + return if($asm_mapper->have_registered_component($cmp_seq_region) + and !defined($self->{'multi_seq_ids'}->{$cmp_seq_region})); + +# my $cmp_seq_region_id = +# $self->_seq_region_name_to_id($cmp_seq_region, $cmp_cs_id); + + # Determine what part of the assembled region this component region makes up + + my $q = qq{ + SELECT + asm.asm_start, + asm.asm_end, + asm.asm_seq_region_id, + sr.name, + sr.length + FROM + assembly asm, seq_region sr + WHERE + asm.cmp_seq_region_id = ? AND + asm.asm_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? + }; + + my $sth = $self->prepare($q); + $sth->bind_param(1,$cmp_seq_region,SQL_INTEGER); + $sth->bind_param(2,$asm_cs_id,SQL_INTEGER); + $sth->execute(); + + if($sth->rows() == 0) { + #this component is not used in the assembled part i.e. gap + $asm_mapper->register_component($cmp_seq_region); + return; + } + + #we do not currently support components mapping to multiple assembled + # make sure that you've got the correct mapping in the meta-table : + # chromosome:EquCab2#contig ( use'#' for multiple mappings ) + # chromosome:EquCab2|contig ( use '|' delimiter for 1-1 mappings ) + # + if($sth->rows() != 1) { + throw("Multiple assembled regions for single " . + "component region cmp_seq_region_id=[$cmp_seq_region]\n". + "Remember that multiple mappings use the \#-operaator". + " in the meta-table (i.e. chromosome:EquCab2\#contig\n"); + } + + my ($asm_start, $asm_end, $asm_seq_region_id, + $asm_seq_region, $asm_seq_region_length) = $sth->fetchrow_array(); + + my $arr = [ $asm_seq_region_id, $asm_seq_region, + $asm_cs_id, $asm_seq_region_length ]; + + $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr; + + $sth->finish(); + + # Register the corresponding assembled region. This allows a us to + # register things in assembled chunks which allows us to: + # (1) Keep track of what assembled regions are registered + # (2) Use locality of reference (if they want something in same general + # region it will already be registered). + + $self->register_assembled($asm_mapper,$asm_seq_region_id,$asm_start,$asm_end); +} + + + +=head2 register_chained + + Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper + The chained assembly mapper to register regions on + Arg [2] : string $from ('first' or 'last') + The direction we are registering from, and the name of the + internal mapper. + Arg [3] : string $seq_region_name + The name of the seqregion we are registering on + Arg [4] : listref $ranges + A list of ranges to register (in [$start,$end] tuples). + Arg [5] : (optional) $to_slice + Only register those on this Slice. + Description: Registers a set of ranges on a chained assembly mapper. + This function is at the heart of the chained mapping process. + It retrieves information from the assembly table and + dynamically constructs the mappings between two coordinate + systems which are 2 mapping steps apart. It does this by using + two internal mappers to load up a third mapper which is + actually used by the ChainedAssemblyMapper to perform the + mapping. + + This method must be called before any mapping is + attempted on regions of interest, otherwise only gaps will + be returned. Note that the ChainedAssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + + throw if the mapping between the coordinate systems cannot + be performed in two steps, which means there is an internal + error in the data in the meta table or in the code that creates + the mapping paths. + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + +sub register_chained { + my $self = shift; + my $casm_mapper = shift; + my $from = shift; + my $seq_region_id = shift; + my $ranges = shift; + my $to_slice = shift; + + my $to_seq_region_id; + if(defined($to_slice)){ + if($casm_mapper->first_CoordSystem()->equals($casm_mapper->last_CoordSystem())){ + return $self->_register_chained_special($casm_mapper, $from, $seq_region_id, $ranges, $to_slice); + } + $to_seq_region_id = $to_slice->get_seq_region_id(); + if(!defined($to_seq_region_id)){ + die "Could not get seq_region_id for to_slice".$to_slice->seq_region_name."\n"; + } + } + + my ($start_name, $start_mid_mapper, $start_cs, $start_registry, + $end_name, $end_mid_mapper, $end_cs, $end_registry); + + if($from eq 'first') { + $start_name = 'first'; + $start_mid_mapper = $casm_mapper->first_middle_mapper(); + $start_cs = $casm_mapper->first_CoordSystem(); + $start_registry = $casm_mapper->first_registry(); + $end_mid_mapper = $casm_mapper->last_middle_mapper(); + $end_cs = $casm_mapper->last_CoordSystem(); + $end_registry = $casm_mapper->last_registry(); + $end_name = 'last'; + } elsif($from eq 'last') { + $start_name = 'last'; + $start_mid_mapper = $casm_mapper->last_middle_mapper(); + $start_cs = $casm_mapper->last_CoordSystem(); + $start_registry = $casm_mapper->last_registry(); + $end_mid_mapper = $casm_mapper->first_middle_mapper(); + $end_cs = $casm_mapper->first_CoordSystem(); + $end_registry = $casm_mapper->first_registry(); + $end_name = 'first'; + } else { + throw("Invalid from argument: [$from], must be 'first' or 'last'"); + } + + my $combined_mapper = $casm_mapper->first_last_mapper(); + my $mid_cs = $casm_mapper->middle_CoordSystem(); + my $mid_name = 'middle'; + my $csa = $self->db->get_CoordSystemAdaptor(); + + # Check for the simple case where the ChainedMapper is short + if( ! defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } + + + ############## + # obtain the first half of the mappings and load them into the start mapper + # + + #ascertain which is component and which is actually assembled coord system + my @path; + + # check for the simple case, where the ChainedMapper is short + if( defined $mid_cs ) { + @path = @{$csa->get_mapping_path($start_cs, $mid_cs)}; + } else { + @path = @{$csa->get_mapping_path( $start_cs, $end_cs )}; + } + + if(@path != 2 && defined( $path[1] )) { + my $path = join(',', map({$_->name .' '. $_->version} @path)); + my $len = scalar(@path) - 1; + throw("Unexpected mapping path between start and intermediate " . + "coord systems (". $start_cs->name . " " . $start_cs->version . + " and " . $mid_cs->name . " " . $mid_cs->version . ")." . + "\nExpected path length 1, got $len. " . + "(path=$path)"); + } + + my $sth; + my ($asm_cs,$cmp_cs); + $asm_cs = $path[0]; + $cmp_cs = $path[-1]; + + #the SQL varies depending on whether we are coming from assembled or + #component coordinate system + +my $asm2cmp = (<<ASMCMP); + SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + sr.name, + sr.length, + asm.ori, + asm.asm_start, + asm.asm_end + FROM + assembly asm, seq_region sr + WHERE + asm.asm_seq_region_id = ? AND + ? <= asm.asm_end AND + ? >= asm.asm_start AND + asm.cmp_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? +ASMCMP + + +my $cmp2asm = (<<CMPASM); + SELECT + asm.asm_start, + asm.asm_end, + asm.asm_seq_region_id, + sr.name, + sr.length, + asm.ori, + asm.cmp_start, + asm.cmp_end + FROM + assembly asm, seq_region sr + WHERE + asm.cmp_seq_region_id = ? AND + ? <= asm.cmp_end AND + ? >= asm.cmp_start AND + asm.asm_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? +CMPASM + + my $asm2cmp_sth; + my $cmp2asm_sth; + if(defined($to_slice)){ + my $to_cs = $to_slice->coord_system; + if($asm_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id"); + } + elsif($cmp_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id"); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + else{ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + } + else{ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + + + + $sth = ($asm_cs->equals($start_cs)) ? $asm2cmp_sth : $cmp2asm_sth; + + my $mid_cs_id; + + # check for the simple case where the ChainedMapper is short + if( defined $mid_cs ) { + $mid_cs_id = $mid_cs->dbID(); + } else { + $mid_cs_id = $end_cs->dbID(); + } + + my @mid_ranges; + my @start_ranges; + + #need to perform the query for each unregistered range + foreach my $range (@$ranges) { + my ($start, $end) = @$range; + $sth->bind_param(1,$seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$start,SQL_INTEGER); + $sth->bind_param(3,$end,SQL_INTEGER); + $sth->bind_param(4,$mid_cs_id,SQL_INTEGER); + $sth->execute(); + + #load the start <-> mid mapper with the results and record the mid cs + #ranges we just added to the mapper + + my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length, + $ori, $start_start, $start_end); + + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$start_start, + \$start_end); + + while($sth->fetch()) { + + if( defined $mid_cs ) { + $start_mid_mapper->add_map_coordinates + ( + $seq_region_id,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } else { + if( $from eq "first" ) { + $combined_mapper->add_map_coordinates + ( + $seq_region_id,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } else { + $combined_mapper->add_map_coordinates + ( + $mid_seq_region_id, $mid_start, $mid_end, $ori, + $seq_region_id,$start_start, $start_end + ); + } + } + + #update sr_name cache + my $arr = [ $mid_seq_region_id, $mid_seq_region, + $mid_cs_id, $mid_length ]; + + $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr; + + push @mid_ranges,[$mid_seq_region_id,$mid_seq_region, + $mid_start,$mid_end]; + push @start_ranges, [ $seq_region_id, $start_start, $start_end ]; + + #the region that we actually register may actually be larger or smaller + #than the region that we wanted to register. + #register the intersection of the region so we do not end up doing + #extra work later + + if($start_start < $start || $start_end > $end) { + $start_registry->check_and_register($seq_region_id,$start_start, + $start_end); + } + } + $sth->finish(); + } + + # in the one step case, we load the mid ranges in the + # last_registry and we are done + if( ! defined $mid_cs ) { + for my $range ( @mid_ranges ) { + $end_registry->check_and_register( $range->[0], $range->[2], + $range->[3] ); + } + + # and thats it for the simple case ... + return; + } + + + ########### + # now the second half of the mapping + # perform another query and load the mid <-> end mapper using the mid cs + # ranges + # + + #ascertain which is component and which is actually assembled coord system + @path = @{$csa->get_mapping_path($mid_cs, $end_cs)}; + if(@path == 2 || ( @path == 3 && !defined $path[1])) { + + $asm_cs = $path[0]; + $cmp_cs = $path[-1]; + } else { + my $path = join(',', map({$_->name .' '. $_->version} @path)); + my $len = scalar(@path)-1; + throw("Unexpected mapping path between intermediate and last" . + "coord systems (". $mid_cs->name . " " . $mid_cs->version . + " and " . $end_cs->name . " " . $end_cs->version . ")." . + "\nExpected path length 1, got $len. " . + "(path=$path)"); + } + + if(defined($to_slice)){ + my $to_cs = $to_slice->coord_system; + if($asm_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm." AND asm.asm_seq_region_id = $to_seq_region_id"); + } + elsif($cmp_cs->equals($to_cs)){ + $asm2cmp_sth = $self->prepare($asm2cmp." AND asm.cmp_seq_region_id = $to_seq_region_id"); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + else{ + $asm2cmp_sth = $self->prepare($asm2cmp); + $cmp2asm_sth = $self->prepare($cmp2asm); + } + } + + $sth = ($asm_cs->equals($mid_cs)) ? $asm2cmp_sth : $cmp2asm_sth; + + my $end_cs_id = $end_cs->dbID(); + foreach my $mid_range (@mid_ranges) { + my ($mid_seq_region_id, $mid_seq_region,$start, $end) = @$mid_range; + $sth->bind_param(1,$mid_seq_region_id,SQL_INTEGER); + $sth->bind_param(2,$start,SQL_INTEGER); + $sth->bind_param(3,$end,SQL_INTEGER); + $sth->bind_param(4,$end_cs_id,SQL_INTEGER); + $sth->execute(); + + #load the end <-> mid mapper with the results and record the mid cs + #ranges we just added to the mapper + + my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, $end_length, + $ori, $mid_start, $mid_end); + + $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id, + \$end_seq_region, \$end_length, \$ori, \$mid_start, + \$mid_end); + + while($sth->fetch()) { + $end_mid_mapper->add_map_coordinates + ( + $end_seq_region_id, $end_start, $end_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + + #update sr_name cache + my $arr = [ $end_seq_region_id,$end_seq_region,$end_cs_id,$end_length ]; + + $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr; + + #register this region on the end coord system + $end_registry->check_and_register($end_seq_region_id, $end_start, $end_end); + } + $sth->finish(); + } + + ######### + # Now that both halves are loaded + # Do stepwise mapping using both of the loaded mappers to load + # the final start <-> end mapper + # + + _build_combined_mapper(\@start_ranges, $start_mid_mapper, $end_mid_mapper, + $combined_mapper, $start_name); + #all done! + return; +} + + +=head2 _register_chained_special + + Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper + The chained assembly mapper to register regions on + Arg [2] : string $from ('first' or 'last') + The direction we are registering from, and the name of the + internal mapper. + Arg [3] : string $seq_region_name + The name of the seqregion we are registering on + Arg [4] : listref $ranges + A list of ranges to register (in [$start,$end] tuples). + Arg [5] : (optional) $to_slice + Only register those on this Slice. + Description: Registers a set of ranges on a chained assembly mapper. + This function is at the heart of the chained mapping process. + It retrieves information from the assembly table and + dynamically constructs the mappings between two coordinate + systems which are 2 mapping steps apart. It does this by using + two internal mappers to load up a third mapper which is + actually used by the ChainedAssemblyMapper to perform the + mapping. + + This method must be called before any mapping is + attempted on regions of interest, otherwise only gaps will + be returned. Note that the ChainedAssemblyMapper automatically + calls this method when the need arises. + Returntype : none + Exceptions : throw if the seq_region to be registered does not exist + or if it associated with multiple assembled pieces (bad data + in assembly table) + + throw if the mapping between the coordinate systems cannot + be performed in two steps, which means there is an internal + error in the data in the meta table or in the code that creates + the mapping paths. + Caller : Bio::EnsEMBL::AssemblyMapper + Status : Stable + +=cut + +sub _register_chained_special { + my $self = shift; + my $casm_mapper = shift; + my $from = shift; + my $seq_region_id = shift; + my $ranges = shift; + my $to_slice = shift; + my $found = 0; + + my $sth = $self->prepare("SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + sr.name, + sr.length, + asm.ori, + asm.asm_start, + asm.asm_end + FROM + assembly asm, seq_region sr + WHERE + asm.asm_seq_region_id = ? AND + ? <= asm.asm_end AND + ? >= asm.asm_start AND + asm.cmp_seq_region_id = sr.seq_region_id AND + sr.coord_system_id = ? AND + asm.cmp_seq_region_id = ?"); + + + my ($start_name, $start_mid_mapper, $start_cs, $start_registry, + $end_name, $end_mid_mapper, $end_cs, $end_registry); + + if($from eq 'first') { + $start_name = 'first'; + $start_mid_mapper = $casm_mapper->first_middle_mapper(); + $start_cs = $casm_mapper->first_CoordSystem(); + $start_registry = $casm_mapper->first_registry(); + $end_mid_mapper = $casm_mapper->last_middle_mapper(); + $end_cs = $casm_mapper->last_CoordSystem(); + $end_registry = $casm_mapper->last_registry(); + $end_name = 'last'; + } elsif($from eq 'last') { + $start_name = 'last'; + $start_mid_mapper = $casm_mapper->last_middle_mapper(); + $start_cs = $casm_mapper->last_CoordSystem(); + $start_registry = $casm_mapper->last_registry(); + $end_mid_mapper = $casm_mapper->first_middle_mapper(); + $end_cs = $casm_mapper->first_CoordSystem(); + $end_registry = $casm_mapper->first_registry(); + $end_name = 'first'; + } else { + throw("Invalid from argument: [$from], must be 'first' or 'last'"); + } + + my $combined_mapper = $casm_mapper->first_last_mapper(); + my $mid_cs = $casm_mapper->middle_CoordSystem(); + my $mid_name = 'middle'; + my $csa = $self->db->get_CoordSystemAdaptor(); + + # Check for the simple case where the ChainedMapper is short + if( ! defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } + + + my @path; + if( defined $mid_cs ) { + @path = @{$csa->get_mapping_path($start_cs, $mid_cs)}; + } else { + @path = @{$csa->get_mapping_path( $start_cs, $end_cs )}; + } + if( ! defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } + + if(@path != 2 && defined( $path[1] )) { + my $path = join(',', map({$_->name .' '. $_->version} @path)); + my $len = scalar(@path) - 1; + throw("Unexpected mapping path between start and intermediate " . + "coord systems (". $start_cs->name . " " . $start_cs->version . + " and " . $mid_cs->name . " " . $mid_cs->version . ")." . + "\nExpected path length 1, got $len. " . + "(path=$path)"); + } + + my ($asm_cs,$cmp_cs); + $asm_cs = $path[0]; + $cmp_cs = $path[-1]; + + $combined_mapper = $casm_mapper->first_last_mapper(); + $mid_cs = $casm_mapper->middle_CoordSystem(); + $mid_name = 'middle'; + $csa = $self->db->get_CoordSystemAdaptor(); + + my $mid_cs_id; + + # Check for the simple case where the ChainedMapper is short + if ( !defined $mid_cs ) { + $start_mid_mapper = $combined_mapper; + } else { + $mid_cs_id = $mid_cs->dbID(); + } + + my @mid_ranges; + my @start_ranges; + + my $to_cs = $to_slice->coord_system; + foreach my $direction (1, 0){ + my $id1; + my $id2; + if($direction){ + $id1 = $seq_region_id; + $id2 = $to_slice->get_seq_region_id(); + } + else{ + $id2 = $seq_region_id; + $id1 = $to_slice->get_seq_region_id(); + } + + foreach my $range (@$ranges) { + my ($start, $end) = @$range; + $sth->bind_param(1,$id1,SQL_INTEGER); + $sth->bind_param(2,$start,SQL_INTEGER); + $sth->bind_param(3,$end,SQL_INTEGER); + $sth->bind_param(4,$to_cs->dbID,SQL_INTEGER); + $sth->bind_param(5,$id2,SQL_INTEGER); + $sth->execute(); + + my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length, + $ori, $start_start, $start_end); + + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$start_start, + \$start_end); + + while($sth->fetch()) { + $found = 1; + + if( defined $mid_cs ) { + $start_mid_mapper->add_map_coordinates + ( + $id1,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } else { + if( $from eq "first") { + if($direction){ + $combined_mapper->add_map_coordinates + ( + $id1,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } + else{ + $combined_mapper->add_map_coordinates + ( + $mid_seq_region_id, $mid_start, $mid_end, $ori, + $id1,$start_start, $start_end + ); + } + } else { + if($direction){ + $combined_mapper->add_map_coordinates + ( + $mid_seq_region_id, $mid_start, $mid_end, $ori, + $id1,$start_start, $start_end + ); + } + else{ + $combined_mapper->add_map_coordinates + ( + $id1,$start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + } + } + } + + #update sr_name cache + my $arr = [ $mid_seq_region_id, $mid_seq_region, + $mid_cs_id, $mid_length ]; + + $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr; + + push @mid_ranges,[$mid_seq_region_id,$mid_seq_region, + $mid_start,$mid_end]; + push @start_ranges, [ $id1, $start_start, $start_end ]; + + #the region that we actually register may actually be larger or smaller + #than the region that we wanted to register. + #register the intersection of the region so we do not end up doing + #extra work later + + if($start_start < $start || $start_end > $end) { + $start_registry->check_and_register($id1,$start_start, + $start_end); + } + } + $sth->finish(); + } + if($found){ + if( ! defined $mid_cs ) { + for my $range ( @mid_ranges ) { + $end_registry->check_and_register( $range->[0], $range->[2], + $range->[3] ); + } + + # and thats it for the simple case ... + return; + } + } + } +} + + +=head2 register_all + + Arg [1] : Bio::EnsEMBL::AssemblyMapper $mapper + Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2); + + # make cache large enough to hold all of the mappings + $mapper->max_pair_count(10e6); + $asm_mapper_adaptor->register_all($mapper); + + # perform mappings as normal + $mapper->map($slice->seq_region_name(), $sr_start, $sr_end, + $sr_strand, $cs1); + ... + Description: This function registers the entire set of mappings between + two coordinate systems in an assembly mapper. + This will use a lot of memory but will be much more efficient + when doing a lot of mapping which is spread over the entire + genome. + Returntype : none + Exceptions : none + Caller : specialised prograhsm + Status : Stable + +=cut + +sub register_all { + my $self = shift; + my $mapper = shift; + + my $asm_cs_id = $mapper->assembled_CoordSystem()->dbID(); + my $cmp_cs_id = $mapper->component_CoordSystem()->dbID(); + + # retrieve every relevant assembled/component pair from the assembly table + + my $q = qq{ + SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + cmp_sr.name, + cmp_sr.length, + asm.ori, + asm.asm_start, + asm.asm_end, + asm.asm_seq_region_id, + asm_sr.name, + asm_sr.length + FROM + assembly asm, seq_region asm_sr, seq_region cmp_sr + WHERE + asm.cmp_seq_region_id = cmp_sr.seq_region_id AND + asm.asm_seq_region_id = asm_sr.seq_region_id AND + cmp_sr.coord_system_id = ? AND + asm_sr.coord_system_id = ? + }; + + my $sth = $self->prepare($q); + + $sth->bind_param(1,$cmp_cs_id,SQL_INTEGER); + $sth->bind_param(2,$asm_cs_id,SQL_INTEGER); + $sth->execute(); + + # load the mapper with the assembly information + + my ($cmp_start, $cmp_end, $cmp_seq_region_id, $cmp_seq_region, $cmp_length, + $ori, + $asm_start, $asm_end, $asm_seq_region_id, $asm_seq_region, $asm_length); + + $sth->bind_columns(\$cmp_start, \$cmp_end, \$cmp_seq_region_id, + \$cmp_seq_region, \$cmp_length, \$ori, + \$asm_start, \$asm_end, \$asm_seq_region_id, + \$asm_seq_region, \$asm_length); + + my %asm_registered; + + while($sth->fetch()) { + $mapper->register_component($cmp_seq_region_id); + $mapper->mapper->add_map_coordinates( + $asm_seq_region_id, $asm_start, $asm_end, + $ori, + $cmp_seq_region_id, $cmp_start, $cmp_end); + + my $arr = [$cmp_seq_region_id, $cmp_seq_region, $cmp_cs_id, $cmp_length]; + + $self->{'sr_name_cache'}->{"$cmp_seq_region:$cmp_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$cmp_seq_region_id"} = $arr; + + # only register each asm seq_region once since it requires some work + if(!$asm_registered{$asm_seq_region_id}) { + $asm_registered{$asm_seq_region_id} = 1; + + # register all chunks from start of seq region to end + my $end_chunk = $asm_length >> $CHUNKFACTOR; + for(my $i = 0; $i <= $end_chunk; $i++) { + $mapper->register_assembled($asm_seq_region_id, $i); + } + + $arr = [$asm_seq_region_id, $asm_seq_region, $asm_cs_id, $asm_length]; + + $self->{'sr_name_cache'}->{"$asm_seq_region:$asm_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$asm_seq_region_id"} = $arr; + } + } + + $sth->finish(); + + return; +} + + + +=head2 register_all_chained + + Arg [1] : Bio::EnsEMBL::ChainedAssemblyMapper $casm_mapper + Example : $mapper = $asm_mapper_adaptor->fetch_by_CoordSystems($cs1,$cs2); + + # make the cache large enough to hold all of the mappings + $mapper->max_pair_count(10e6); + # load all of the mapping data + $asm_mapper_adaptor->register_all_chained($mapper); + + # perform mappings as normal + $mapper->map($slice->seq_region_name(), $sr_start, $sr_end, + $sr_strand, $cs1); + ... + Description: This function registers the entire set of mappings between + two coordinate systems in a chained mapper. This will use a lot + of memory but will be much more efficient when doing a lot of + mapping which is spread over the entire genome. + Returntype : none + Exceptions : throw if mapper is between coord systems with unexpected + mapping paths + Caller : specialised programs doing a lot of genome-wide mapping + Status : Stable + +=cut + +sub register_all_chained { + my $self = shift; + my $casm_mapper = shift; + + my $first_cs = $casm_mapper->first_CoordSystem(); + my $mid_cs = $casm_mapper->middle_CoordSystem(); + my $last_cs = $casm_mapper->last_CoordSystem(); + + my $start_mid_mapper = $casm_mapper->first_middle_mapper(); + my $end_mid_mapper = $casm_mapper->last_middle_mapper(); + my $combined_mapper = $casm_mapper->first_last_mapper(); + + my @ranges; + + my $sth = $self->prepare( + 'SELECT + asm.cmp_start, + asm.cmp_end, + asm.cmp_seq_region_id, + sr_cmp.name, + sr_cmp.length, + asm.ori, + asm.asm_start, + asm.asm_end, + asm.asm_seq_region_id, + sr_asm.name, + sr_asm.length + FROM + assembly asm, seq_region sr_asm, seq_region sr_cmp + WHERE + sr_asm.seq_region_id = asm.asm_seq_region_id AND + sr_cmp.seq_region_id = asm.cmp_seq_region_id AND + sr_asm.coord_system_id = ? AND + sr_cmp.coord_system_id = ?'); + + my $csa = $self->db()->get_CoordSystemAdaptor(); + + my @path; + + if ( !defined $mid_cs ) { + @path = @{ $csa->get_mapping_path( $first_cs, $last_cs ) }; + if ( !defined( $path[1] ) ) { + splice( @path, 1, 1 ); + } + } else { + @path = @{ $csa->get_mapping_path( $first_cs, $mid_cs ) }; + # fix for when we have something like supercontig#contig#chromosome + if ( !defined( $path[1] ) ) { + splice( @path, 1, 1 ); + } + } + + if ( @path != 2 ) { + my $path = + join( ',', map( { $_->name . ' ' . $_->version } @path ) ); + my $len = scalar(@path) - 1; + throw( "Unexpected mapping path between start and intermediate " + . "coord systems (" + . $first_cs->name . " " + . $first_cs->version . " and " + . $mid_cs->name . " " + . $mid_cs->version . ")." + . "\nExpected path length 1, got $len. " + . "(path=$path)" ); + } + + my ($asm_cs,$cmp_cs) = @path; + + $sth->{mysql_use_result} = 1; + $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER); + $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER); + $sth->execute(); + + + my ($mid_start, $mid_end, $mid_seq_region_id, $mid_seq_region, $mid_length, + $ori, $start_start, $start_end, $start_seq_region_id, $start_seq_region, + $start_length); + + if($asm_cs->equals($first_cs)) { + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$start_start, + \$start_end, \$start_seq_region_id, \$start_seq_region, + \$start_length); + } else { + $sth->bind_columns(\$start_start, \$start_end, \$start_seq_region_id, + \$start_seq_region, \$start_length, \$ori, + \$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length); + + } + + my ( $mid_cs_id, $start_cs_id, $reg, $mapper ); + if ( !defined $mid_cs ) { + $mid_cs_id = $last_cs->dbID(); + $start_cs_id = $first_cs->dbID(); + $mapper = $combined_mapper; + } else { + $mid_cs_id = $mid_cs->dbID(); + $start_cs_id = $first_cs->dbID(); + $mapper = $start_mid_mapper; + } + + $reg = $casm_mapper->first_registry(); + + while($sth->fetch()) { + $mapper->add_map_coordinates + ( + $start_seq_region_id, $start_start, $start_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + push( @ranges, [$start_seq_region_id, $start_start, $start_end ] ); + + $reg->check_and_register( $start_seq_region_id, 1, $start_length ); + if( ! defined $mid_cs ) { + $casm_mapper->last_registry()->check_and_register + ( $mid_seq_region_id, $mid_start, $mid_end ); + } + + my $arr = [ $mid_seq_region_id, $mid_seq_region, + $mid_cs_id, $mid_length ]; + + $self->{'sr_name_cache'}->{"$mid_seq_region:$mid_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$mid_seq_region_id"} = $arr; + + $arr = [ $start_seq_region_id, $start_seq_region, + $start_cs_id, $start_length ]; + + $self->{'sr_name_cache'}->{"$start_seq_region:$start_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$start_seq_region_id"} = $arr; + } + + if( ! defined $mid_cs ) { + # thats it for the simple case + return; + } + + + @path = @{ $csa->get_mapping_path( $last_cs, $mid_cs ) }; + if ( defined($mid_cs) ) { + if ( !defined( $path[1] ) ) { + splice( @path, 1, 1 ); + } + } + + if ( @path != 2 ) { + my $path = + join( ',', map( { $_->name . ' ' . $_->version } @path ) ); + my $len = scalar(@path) - 1; + throw( "Unexpected mapping path between intermediate and last " + . "coord systems (" + . $last_cs->name . " " + . $last_cs->version . " and " + . $mid_cs->name . " " + . $mid_cs->version . ")." + . "\nExpected path length 1, got $len. " + . "(path=$path)" ); + } + + ($asm_cs,$cmp_cs) = @path; + + $sth->bind_param(1,$asm_cs->dbID,SQL_INTEGER); + $sth->bind_param(2,$cmp_cs->dbID,SQL_INTEGER); + $sth->execute(); + + + my ($end_start, $end_end, $end_seq_region_id, $end_seq_region, + $end_length); + + if($asm_cs->equals($mid_cs)) { + $sth->bind_columns(\$end_start, \$end_end, \$end_seq_region_id, + \$end_seq_region, \$end_length, \$ori, + \$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length); + } else { + $sth->bind_columns(\$mid_start, \$mid_end, \$mid_seq_region_id, + \$mid_seq_region, \$mid_length, \$ori, \$end_start, + \$end_end, \$end_seq_region_id, \$end_seq_region, + \$end_length); + } + + my $end_cs_id = $last_cs->dbID(); + $reg = $casm_mapper->last_registry(); + + while($sth->fetch()) { + $end_mid_mapper->add_map_coordinates + ( + $end_seq_region_id, $end_start, $end_end, $ori, + $mid_seq_region_id, $mid_start, $mid_end + ); + + $reg->check_and_register( $end_seq_region_id, 1, $end_length ); + + my $arr = [ $end_seq_region_id, $end_seq_region, + $end_cs_id, $end_length ]; + $self->{'sr_name_cache'}->{"$end_seq_region:$end_cs_id"} = $arr; + $self->{'sr_id_cache'}->{"$end_seq_region_id"} = $arr; + } + + _build_combined_mapper( \@ranges, $start_mid_mapper, $end_mid_mapper, + $combined_mapper, "first" ); + + return; +} + + + +# after both halves of a chained mapper are loaded +# this function maps all ranges in $ranges and loads the +# results into the combined mapper +sub _build_combined_mapper { + my $ranges = shift; + my $start_mid_mapper = shift; + my $end_mid_mapper = shift; + my $combined_mapper = shift; + my $start_name = shift; + + my $mid_name = "middle"; + + foreach my $range (@$ranges) { + my ( $seq_region_id, $start, $end) = @$range; + + my $sum = 0; + + my @initial_coords = $start_mid_mapper->map_coordinates($seq_region_id, + $start,$end,1, + $start_name); + + foreach my $icoord (@initial_coords) { + #skip gaps + if($icoord->isa('Bio::EnsEMBL::Mapper::Gap')) { + $sum += $icoord->length(); + next; + } + + + #feed the results of the first mapping into the second mapper + my @final_coords = + $end_mid_mapper->map_coordinates($icoord->id, $icoord->start, + $icoord->end, + $icoord->strand, $mid_name); + + + foreach my $fcoord (@final_coords) { + #load up the final mapper + + if($fcoord->isa('Bio::EnsEMBL::Mapper::Coordinate')) { + my $total_start = $start + $sum; + my $total_end = $total_start + $fcoord->length - 1; + my $ori = $fcoord->strand(); + + if($start_name eq 'first') { # add coords in consistant order + $combined_mapper->add_map_coordinates( + $seq_region_id, $total_start, $total_end, $ori, + $fcoord->id(), $fcoord->start(), $fcoord->end()); + } else { + $combined_mapper->add_map_coordinates( + $fcoord->id(), $fcoord->start(), $fcoord->end(),$ori, + $seq_region_id, $total_start, $total_end); + } + + } + $sum += $fcoord->length(); + } + } + } + #all done! +} + + +=head2 seq_regions_to_ids + + Arg [1] : Bio::EnsEMBL::CoordSystem $coord_system + Arg [2] : listref of strings $seq_regions + Example : my @ids = @{$asma->seq_regions_to_ids($coord_sys, \@seq_regs)}; + Description: Converts a list of seq_region names to internal identifiers + using the internal cache that has accumulated while registering + regions for AssemblyMappers. If any requested regions are + not found in the cache an attempt is made to retrieve them + from the database. + Returntype : listref of ints + Exceptions : throw if a non-existant seqregion is provided + Caller : general + Status : Stable + +=cut + +sub seq_regions_to_ids { + my $self = shift; + my $coord_system = shift; + my $seq_regions = shift; + + my $cs_id = $coord_system->dbID(); + + my @out; + + foreach my $sr (@$seq_regions) { + my $arr = $self->{'sr_name_cache'}->{"$sr:$cs_id"}; + if( $arr ) { + push( @out, $arr->[0] ); + } else { + push @out, $self->_seq_region_name_to_id($sr,$cs_id); + } + } + + return \@out; +} + + +=head2 seq_ids_to_regions + + Arg [1] : listref of seq_region ids + Example : my @ids = @{$asma->ids_to_seq_regions(\@seq_ids)}; + Description: Converts a list of seq_region ids to seq region names + using the internal cache that has accumulated while registering + regions for AssemblyMappers. If any requested regions are + not found in the cache an attempt is made to retrieve them + from the database. + Returntype : listref of strings + Exceptions : throw if a non-existant seq_region_id is provided + Caller : general + Status : Stable + +=cut + +sub seq_ids_to_regions { + my $self = shift; + my $seq_region_ids = shift; + + my @out; + + foreach my $sr (@$seq_region_ids) { + my $arr = $self->{'sr_id_cache'}->{"$sr"}; + if( $arr ) { + push( @out, $arr->[1] ); + } else { + push @out, $self->_seq_region_id_to_name($sr); + } + } + + return \@out; +} + +=head2 delete_cache + + Description: Delete all the caches for the mappings/seq_regions + Returntype : none + Exceptions : none + Caller : General + Status : At risk + +=cut + +sub delete_cache{ + my ($self) = @_; + + %{$self->{'sr_name_cache'}} = (); + %{$self->{'sr_id_cache'}} = (); + + foreach my $key (keys %{$self->{'_asm_mapper_cache'}}){ + $self->{'_asm_mapper_cache'}->{$key}->flush(); + } + %{$self->{'_asm_mapper_cache'}} = (); + return; +} + + +=head2 register_region + + Description: DEPRECATED use register_assembled instead + +=cut + +sub register_region{ + my ($self, $assmapper, $type, $chr_name, $start, $end) = @_; + + deprecate('Use register_assembled instead'); + + $self->register_assembled($assmapper, $chr_name, $start, $end); +} + + +=head2 register_contig + + Description: DEPRECATED use register_component instead + +=cut + +sub register_contig { + my ($self, $assmapper, $type, $contig_id ) = @_; + + deprecate('Use register_component instead'); + + #not sure if the use is passing in a seq_region_name or a + #seq_region_id... + register_component($assmapper, $contig_id); +} + + +=head2 fetch_by_type + + Description: DEPRECATED use fetch_by_CoordSystems instead + +=cut + +sub fetch_by_type{ + my ($self,$type) = @_; + + deprecate('Use fetch_by_CoordSystems instead'); + + #assume that what the user wanted was a mapper between the sequence coord + #level and the top coord level + + my $csa = $self->db()->get_CoordSystemAdaptor(); + + my $cs1 = $csa->fetch_top_level($type); + my $cs2 = $csa->fetch_sequence_level(); + + return $self->fetch_by_CoordSystems($cs1,$cs2); +} + + + +1;