Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/TopLevelAssemblyMapper.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/TopLevelAssemblyMapper.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,436 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::TopLevelAssemblyMapper - +Handles mapping between a given coordinate system and the toplevel +pseudo coordinate system. + +=head1 SYNOPSIS + + $db = Bio::EnsEMBL::DBSQL::DBAdaptor->new(...); + $asma = $db->get_AssemblyMapperAdaptor(); + $csa = $db->get_CoordSystemAdaptor(); + + my $toplevel = $cs_adaptor->fetch_by_name('toplevel'); + my $ctg_cs = $cs_adaptor->fetch_by_name('contig'); + + $asm_mapper = $map_adaptor->fetch_by_CoordSystems( $cs1, $cs2 ); + + # map to toplevel coord system for this region + @chr_coords = + $asm_mapper->map( 'AL30421.1.200.92341', 100, 10000, -1, $ctg_cs ); + + # list toplevel seq_region_ids for this region + @chr_ids = + $asm_mapper->list_ids( 'AL30421.1.200.92341', 1, 1000, -1, + $ctg_cs ); + +=head1 DESCRIPTION + +The TopLevelAssemblyMapper performs mapping between a provided +coordinate system and the toplevel pseudo cooordinate system. The +toplevel coordinate system is not a real coordinate system, but +represents the highest coordinate system that can be mapped to in a +given region. It is only possible to perform unidirectional mapping +using this mapper, because it does not make sense to map from the +toplevel coordinate system to another coordinate system. + +=head1 METHODS + +=cut + + +use strict; +use warnings; + +package Bio::EnsEMBL::TopLevelAssemblyMapper; + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use Bio::EnsEMBL::Mapper; +use Bio::EnsEMBL::CoordSystem; +use Scalar::Util qw(weaken); + +=head2 new + + Arg [1] : Bio::EnsEMBL::DBAdaptor $dbadaptor the adaptor for + the database this mapper is using. + Arg [2] : Toplevel CoordSystem + Arg [3] : Other CoordSystem + Description: Creates a new TopLevelAssemblyMapper object + Returntype : Bio::EnsEMBL::DBSQL::TopLevelAssemblyMapper + Exceptions : throws if any of the 3 arguments are missing/ not + : of the correct type. + Caller : Bio::EnsEMBL::DBSQL::AssemblyMapperAdaptor + Status : Stable + +=cut + + +sub new { + my ($caller, $adaptor, $toplevel_cs, $other_cs) = @_; + + my $class = ref($caller) || $caller; + + if(!ref($toplevel_cs)) { + throw('Toplevel CoordSystem argument expected.'); + } + if(!ref($other_cs)) { + throw('Other CoordSystem argument expected.'); + } + + if(!$toplevel_cs->is_top_level()) { + throw($toplevel_cs->name() . " is not the toplevel CoordSystem."); + } + if($other_cs->is_top_level()) { + throw("Other CoordSystem argument should not be toplevel CoordSystem."); + } + + my $cs_adaptor = $adaptor->db()->get_CoordSystemAdaptor(); + my $coord_systems = $cs_adaptor->fetch_all(); + + my $self = bless {'coord_systems' => $coord_systems, + 'toplevel_cs' => $toplevel_cs, + 'other_cs' => $other_cs}, $class; + + $self->adaptor($adaptor); + return $self; +} + +sub adaptor { + my $self = shift; + + weaken($self->{'adaptor'} = shift) if(@_); + + return $self->{'adaptor'}; +} + +=head2 map + + Arg [1] : string $frm_seq_region + The name of the sequence region to transform FROM + Arg [2] : int $frm_start + The start of the region to transform FROM + Arg [3] : int $frm_end + The end of the region to transform FROM + Arg [4] : int $strand + The strand of the region to transform FROM + Arg [5] : Bio::EnsEMBL::CoordSystem + The coordinate system to transform FROM + Arg [6] : if set will do a fastmap + Example : @coords = $mapper->map('X', 1_000_000, 2_000_000, + 1, $chr_cs); + Description: Transforms coordinates from one coordinate system + to another. + Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or + Bio::EnsEMBL::Mapper:Gap objects + Exceptions : thrown if if the specified TO coordinate system is not one + of the coordinate systems associated with this mapper + Caller : general + Status : Stable + +=cut + + +sub map { + throw('Incorrect number of arguments.') if(@_ != 6 && @_ != 7); + + my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_strand, $frm_cs, + $fastmap) = @_; + + if($frm_cs->is_top_level()) { + throw("The toplevel CoordSystem can only be mapped TO, not FROM."); + } + + my @tmp; + push @tmp, $frm_seq_region_name; + my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; + + my $mapper = $self->{'mapper'}; + my $toplevel_cs = $self->{'toplevel_cs'}; + my $other_cs = $self->{'other_cs'}; + my $adaptor = $self->adaptor; + + if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the assembled nor the component coordinate system " . + " of this AssemblyMapper"); + } + + my $coord_systems = $self->{'coord_systems'}; + + my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); + + # + # TBD try to make this more efficient + # + my $from_rank = $other_cs->rank(); + foreach my $cs (@$coord_systems) { + last if($cs->rank >= $from_rank); + + #check if a mapping path even exists to this coordinate system + my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; + + if(@mapping_path) { + + # Try to map to this coord system. If we get back any coordinates then + # it is our 'toplevel' that we were looking for + my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); + + if($fastmap) { + my @result = $mapper->fastmap($frm_seq_region_name, $frm_start, $frm_end, + $frm_strand, $frm_cs); + return @result if(@result); + } else { + my @coords = $mapper->map($frm_seq_region_name, $frm_start, $frm_end, + $frm_strand, $frm_cs); + + if(@coords > 1 || !$coords[0]->isa('Bio::EnsEMBL::Mapper::Gap')) { + return @coords; + } + } + } + } + + # the toplevel coordinate system for the region requested *is* the + # requested region. + if($fastmap) { + return ($seq_region_id,$frm_start, $frm_end, $frm_strand, $other_cs); + } + return Bio::EnsEMBL::Mapper::Coordinate->new + ($seq_region_id,$frm_start,$frm_end, $frm_strand, $other_cs); +} + +# +# for polymorphism with AssemblyMapper +# +=head2 flush + + Args : none + Example : none + Description: polymorphism with AssemblyMapper, does nothing + Returntype : none + Exceptions : none + Status : Stable + +=cut + +sub flush {} + +=head2 fastmap + + Arg [1] : string $frm_seq_region + The name of the sequence region to transform FROM + Arg [2] : int $frm_start + The start of the region to transform FROM + Arg [3] : int $frm_end + The end of the region to transform FROM + Arg [4] : int $strand + The strand of the region to transform FROM + Arg [5] : Bio::EnsEMBL::CoordSystem + The coordinate system to transform FROM + Example : @coords = $mapper->fastmap('X', 1_000_000, 2_000_000, + 1, $chr_cs); + Description: Transforms coordinates from one coordinate system + to another. + Returntype : List of Bio::EnsEMBL::Mapper::Coordinate and/or + Bio::EnsEMBL::Mapper:Gap objects + Exceptions : thrown if if the specified TO coordinate system is not one + of the coordinate systems associated with this mapper + Caller : general + Status : Stable + +=cut + +sub fastmap { + my $self = shift; + return $self->map(@_,1); +} + +=head2 assembled_CoordSystem + + Arg [1] : none + Example : $cs = $mapper->assembled_CoordSystem + Description: Retrieves the assembled CoordSystem from this mapper + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub assembled_CoordSystem { + my $self = shift; + return $self->{'toplevel_cs'}; +} + +=head2 component_CoordSystem + + Arg [1] : none + Example : $cs = $mapper->component_CoordSystem + Description: Retrieves the component CoordSystem from this mapper + Returntype : Bio::EnsEMBL::CoordSystem + Exceptions : none + Caller : internal, AssemblyMapperAdaptor + Status : Stable + +=cut + +sub component_CoordSystem { + my $self = shift; + return $self->{'other_cs'}; +} + + +sub _list { + my($self, $frm_seq_region_name, $frm_start, $frm_end, $frm_cs, $seq_regions) = @_; + + my $mapper = $self->{'mapper'}; + my $toplevel_cs = $self->{'toplevel_cs'}; + my $other_cs = $self->{'other_cs'}; + my $adaptor = $self->adaptor; + + if($frm_cs->is_top_level()) { + throw("The toplevel CoordSystem can only be mapped TO, not FROM."); + } + if($frm_cs != $other_cs && !$frm_cs->equals($other_cs)) { + throw("Coordinate system " . $frm_cs->name . " " . $frm_cs->version . + " is neither the assembled nor the component coordinate system " . + " of this AssemblyMapper"); + } + + my $coord_systems = $self->{'coord_systems'}; + my $csa = $self->adaptor()->db()->get_CoordSystemAdaptor(); + + # + # TBD try to make this more efficient + # + my $from_rank = $other_cs->rank(); + foreach my $cs (@$coord_systems) { + last if($cs->rank >= $from_rank); + + #check if a mapping path even exists to this coordinate system + my @mapping_path = @{ $csa->get_mapping_path( $cs, $other_cs ) }; + + if(@mapping_path) { + + # Try to map to this coord system. If we get back any coordinates then + # it is our 'toplevel' that we were looking for + my $mapper = $adaptor->fetch_by_CoordSystems($other_cs, $cs); + + my @result; + + my @tmp; + push @tmp, $frm_seq_region_name; + my $seq_region_id = @{$self->adaptor()->seq_regions_to_ids($frm_cs, \@tmp)}[0]; + + if($seq_regions) { + @result = $mapper->list_seq_regions($frm_seq_region_name, $frm_start, + $frm_end, $frm_cs); + } else { + @result = $mapper->list_ids($frm_seq_region_name, $frm_start, + $frm_end, $frm_cs); + } + + return @result if(@result); + } + } + + # the toplevel coordinate system for the region requested *is* the + return ($frm_seq_region_name); + + + # requested region. + if($seq_regions) { + return ($frm_seq_region_name); + } + + #this seems a bit silly and inefficient, but it is probably never + #called anyway. + my $slice_adaptor = $adaptor->db()->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region($other_cs->name(), + $frm_seq_region_name, + undef,undef,undef,$other_cs); + return ($slice_adaptor->get_seq_region_id($slice)); +} + + + +=head2 list_seq_regions + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$ctg_cs)) {...} + Description: Retrieves a list of overlapping seq_region names + of another coordinate system. This is the same as the + list_ids method but uses seq_region names rather internal ids + Returntype : List of strings + Exceptions : none + Caller : general + Status : Stable + +=cut + +sub list_seq_regions { + throw('Incorrect number of arguments.') if(@_ != 5); + return _list(@_,1); +} + + +=head2 list_ids + + Arg [1] : string $frm_seq_region + The name of the sequence region of interest. + Arg [2] : int $frm_start + The start of the region of interest + Arg [3] : int $frm_end + The end of the region to transform of interest + Arg [5] : Bio::EnsEMBL::CoordSystem $frm_cs + The coordinate system to obtain overlapping ids of + Example : foreach $id ($asm_mapper->list_ids('X',1,1000,$chr_cs)) {...} + Description: Retrieves a list of overlapping seq_region internal identifiers + of another coordinate system. This is the same as the + list_seq_regions method but uses internal identfiers rather + than seq_region strings + Returntype : List of ints + Exceptions : thrown if the from CoordSystem is the toplevel coord system + thrown if the from CoordSystem is not the one used in the mapper + Caller : general + Status : Stable + +=cut + +sub list_ids { + throw('Incorrect number of arguments.') if(@_ != 5); + return _list(@_,0); +} + + + + + +1;