Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/External/ExternalFeatureAdaptor.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/External/ExternalFeatureAdaptor.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,702 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::External::ExternalFeatureAdaptor + +=head 1 SUMMARY + +Allows features created externally from Ensembl in a single +coordinate system to be retrieved in several other (Ensembl-style) +coordinate systems. This is intended to be a replacement for the old +Bio::EnsEMBL::DB::ExternalFeatureFactoryI interface. + +=head1 SYNOPSIS + + $database_adaptor = new Bio::EnsEMBL::DBSQL::DBAdaptor( + -host => 'kaka.sanger.ac.uk', + -dbname => 'homo_sapiens_core_9_30', + -pass => 'anonymous' + ); + + $xf_adaptor = new ExternalFeatureAdaptorSubClass; + + # Connect the Ensembl core database: + $xf_adaptor->db($database_adaptor); + + # get some features in vontig coords + @feats = + @{ $xf_adaptor->fetch_all_by_contig_name('AC000087.2.1.42071') }; + + # get some features in assembly coords + @feats = + @{ $xf_adaptor->fetch_all_by_chr_start_end( 'X', 100000, 200000 ) }; + + # get some features in clone coords + @feats = @{ $xf_adaptor->fetch_all_by_clone_accession('AC000087') }; + + # Add the adaptor to the ensembl core dbadaptor (implicitly sets db + # attribute) + $database_adaptor->add_ExternalFeatureAdaptor($xf_adaptor); + + # get some features in Slice coords + $slice_adaptor = $database_adaptor->get_SliceAdaptor; + $slice = + $slice_adaptor->fetch_all_by_chr_start_end( 1, 100000, 200000 ); + @feats = @{ $xf_adaptor->fetch_all_by_Slice($slice) }; + + # now features can be retrieved directly from Slice + @feats = @{ $slice->get_all_ExternalFeatures }; + +=head1 DESCRIPTION + +This class is intended to be used as a method of getting external +features into EnsEMBL. To work, this class must be extended and must +implement the the coordinate_systems method. As well, the subclass +is required to implement a single fetch method so that the external +features may be retrieved. By implementing a single fetch_method in a +single coordinate system all of the other ExternalFeatureAdaptor fetch +methods become available for retrieving the data in several different +coordinate systems. + +The coordinate_systems method should return a list of strings indicating +which coordinate system(s) have been implemented. If a given string is +returned from the coordinate_systems method then the corresponding fetch +method must be implemented. The reverse is also true: if a fetch method +is implemented then coordinate_systems must return the appropriate +string in its list of return values. The following are the valid +coordinate system values and the corresponding fetch methods that must +be implemented: + + COORD SYSTEM STRING FETCH_METHOD + ------------------- ------------ + 'ASSEMBLY' fetch_all_by_chr_start_end + 'CLONE' fetch_all_by_clone_accession + 'CONTIG' fetch_all_by_contig_name + 'SUPERCONTIG' fetch_all_by_supercontig_name + 'SLICE' fetch_all_by_Slice + +The objects returned by the fetch methods should be EnsEMBL or BioPerl +style Feature objects. These objects MUST have start, end and strand +methods. + +Before the non-overridden ExternalFeatureAdaptor fetch methods may +be called an EnsEMBL core database adaptor must be attached to the +ExternalFeatureAdaptor . This database adaptor is required to perform +the remappings between various coordinate system. This may be done +implicitly by adding the ExternalFeatureAdaptor to the database adaptor +through a call to the DBAdaptor add_ExternalFeatureAdaptor method or +explicitly by calling the ExternalFeatureAdaptor ensembl_db method. + +=head1 METHODS + +=cut + +package Bio::EnsEMBL::External::ExternalFeatureAdaptor; + +use strict; + +use Bio::EnsEMBL::Utils::Exception qw(warning throw); + + +=head2 new + + Arg [1] : none + Example : $xfa = new Bio::EnsEMBL::External::ExternalFeatureAdaptor; + Description: Creates a new ExternalFeatureAdaptor object. You may wish to + extend this constructor and provide your own set of paremeters. + Returntype : Bio::EnsEMBL::External::ExternalFeatureAdaptor + Exceptions : none + Caller : general + +=cut + +sub new { + my $class = shift; + + if(ref $class) { + return bless {}, ref $class; + } + + return bless {}, $class; +} + + + +=head2 ensembl_db + + Arg [1] : (optional) Bio::EnsEMBL::DBSQL::DBAdaptor + Example : $external_feature_adaptor->ensembl_db($new_val); + Description: none + Returntype : Bio::EnsEMBL::DBSQL::DBAdaptor + Exceptions : none + Caller : internal + +=cut + +sub ensembl_db { + my ($self, $value) = @_; + + if($value) { + $self->{'ensembl_db'} = $value; + } + + return $self->{'ensembl_db'}; +} + + + +=head2 coordinate_systems + + Arg [1] : none + Example : @implemented_coord_systems = $ext_adaptor->coordinate_systems; + Description: ABSTRACT method. Must be implemented by all + ExternalFeatureAdaptor subclasses. This method returns a list + of coordinate systems which are implemented by the subclass. + A minimum of on valid coordinate system must be implemented. + Valid coordinate systems are: 'SLICE', 'ASSEMBLY', 'CONTIG', + and 'CLONE'. + Returntype : list of strings + Exceptions : none + Caller : internal + +=cut + +sub coordinate_systems { + my $self = shift; + + throw("abstract method coordinate_systems not implemented\n"); + + return ''; +} + + +=head2 track_name + + Arg [1] : none + Example : $track_name = $xf_adaptor->track_name; + Description: Currently this is not really used. In the future it may be + possible to have ExternalFeatures automatically displayed by + the EnsEMBL web code. By default this method returns + 'External features' but you are encouraged to override this + method and provide your own meaningful name for the features + your adaptor provides. This also allows you to distinguish the + type of features retrieved from Slices. See + the PODs for Bio::EnsEMBL::Slice::get_all_ExternalFeatures and + Bio::EnsEMBL::DBSQL::DBAdaptor::add_ExternalFeatureAdaptor + methods. + Returntype : string + Exceptions : none + Caller : Bio::EnsEMBL::DBSQL::DBAdaptor::add_ExternalFeatureAdaptor + +=cut + +sub track_name { + my $self = shift; + + return 'External features'; +} + + + +=head2 feature_type + + Arg [1] : none + Example : $feature_type = $xf_adaptor->track_name + Description: Currently this is not used. In the future it may be possible + to have ExternalFeatures automatically displayed by the EnsEMBL + web code. This method would then be used do determine the + type of glyphs used to draw the features which are returned + from this external adaptor. + Returntype : string + Exceptions : none + Caller : none + +=cut + +sub feature_type { + my $self = shift; + + return qw(SIMPLE); +} + + + +=head2 fetch_all_by_Slice + + Arg [1] : Bio::EnsEMBL::Slice $slice + Example : @features = @{$ext_adaptor->fetch_all_by_Slice($slice)}; + Description: Retrieves all features which lie in the region defined + by $slice in slice coordinates. + + If this method is overridden then the coordinate_systems method + must return 'SLICE' as one of its values. + + This method will work as is (i.e. without overriding it) + providing at least one of the other fetch methods is overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the Slice + coordinate system + Exceptions : Thrown on incorrect input arguments + Caller : general, fetch_all_by_chr_start_end + +=cut + +sub fetch_all_by_Slice { + my ($self, $slice) = @_; + + unless($slice && ref $slice && $slice->isa('Bio::EnsEMBL::Slice')) { + throw("[$slice] is not a Bio::EnsEMBL::Slice"); + } + + my $out = []; + + my $csa = $self->ensembl_db->get_CoordSystemAdaptor(); + + my $slice_start = $slice->start; + my $slice_end = $slice->end; + my $slice_strand = $slice->strand; + my $slice_seq_region = $slice->seq_region_name; + my $slice_seq_region_id = $slice->get_seq_region_id; + my $coord_system = $slice->coord_system; + + if($self->_supported('SLICE')) { + throw("ExternalFeatureAdaptor supports SLICE coordinate system" . + " but fetch_all_by_Slice not implemented"); + } + + my %features; + my $from_coord_system; + + my $fetch_method; + + # + # Get all of the features from whatever coord system they are computed in + # + if($self->_supported('CLONE')) { + $fetch_method = sub { + my $self = shift; + my $name = shift; + my ($acc, $ver) = split(/\./, $name); + $self->fetch_all_by_clone_accession($acc,$ver,@_); + }; + $from_coord_system = $csa->fetch_by_name('clone'); + } elsif($self->_supported('ASSEMBLY')) { + $from_coord_system = $csa->fetch_by_name('chromosome'); + $fetch_method = $self->can('fetch_all_by_chr_start_end'); + } elsif($self->_supported('CONTIG')) { + $from_coord_system = $csa->fetch_by_name('contig'); + $fetch_method = $self->can('fetch_all_by_contig_name'); + } elsif($self->_supported('SUPERCONTIG')) { + $from_coord_system = $csa->fetch_by_name('supercontig'); + $fetch_method = $self->can('fetch_all_by_supercontig_name'); + } else { + $self->_no_valid_coord_systems(); + } + + if($from_coord_system->equals($coord_system)) { + $features{$slice_seq_region} = &$fetch_method($self, $slice_seq_region, + $slice_start,$slice_end); + } else { + foreach my $segment (@{$slice->project($from_coord_system->name, + $from_coord_system->version)}) { + my ($start,$end,$pslice) = @$segment; + $features{$pslice->seq_region_name } ||= []; + push @{$features{$pslice->seq_region_name }}, + @{&$fetch_method($self, $pslice->seq_region_name, + $pslice->start(), + $pslice->end())}; + } + } + + my @out; + + if(!$coord_system->equals($from_coord_system)) { + my $asma = $self->ensembl_db->get_AssemblyMapperAdaptor(); + my $mapper = $asma->fetch_by_CoordSystems($from_coord_system, + $coord_system); + my %slice_cache; + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + my $slice_setter; + + #convert the coordinates of each of the features retrieved + foreach my $fseq_region (keys %features) { + my $feats = $features{$fseq_region}; + next if(!$feats); + $slice_setter = _guess_slice_setter($feats) if(!$slice_setter); + + foreach my $f (@$feats) { + my($sr_id, $start, $end, $strand) = + $mapper->fastmap($fseq_region,$f->start,$f->end,$f->strand, + $from_coord_system); + + #maps to gap + next if(!defined($sr_id)); + + #maps to unexpected seq region, probably error in the externally + if($sr_id ne $slice_seq_region_id) { + warning("Externally created Feature mapped to [$sr_id] " . + "which is not on requested seq_region_id [$slice_seq_region_id]"); + next; + } + + #update the coordinates of the feature + &$slice_setter($f,$slice); + $f->start($start); + $f->end($end); + $f->strand($strand); + push @out, $f; + } + } + } else { + #we already know the seqregion the featues are on, we just have + #to put them on the slice + @out = @{$features{$slice_seq_region}}; + my $slice_setter = _guess_slice_setter(\@out); + + foreach my $f (@out) { + &$slice_setter($f,$slice); + } + } + + #shift the feature coordinates again if + #the requested slice is not the full seqregion + if($slice->start != 1 || $slice->strand != 1) { + #convert from assembly coords to slice coords + my($f_start, $f_end, $f_strand); + foreach my $f (@out) { + if($slice_strand == 1) { + $f_start = $f->start - $slice_start + 1; + $f_end = $f->end - $slice_start + 1; + $f_strand = $f->strand; + } else { + $f_start = $slice_end - $f->end + 1; + $f_end = $slice_end - $f->start + 1; + $f_strand = $f->strand * -1; + } + + $f->start($f_start); + $f->end($f_end); + $f->strand($f_strand); + } + } + + return \@out; +} + + +sub _guess_slice_setter { + my $features = shift; + + #we do not know what type of features these are. They might + #be bioperl features or old ensembl features, hopefully they are new + #style features. Try to come up with a setter method for the + #slice. + + return undef if(!@$features); + + my ($f) = @$features; + + my $slice_setter; + foreach my $method (qw(slice contig attach_seq)) { + last if($slice_setter = $f->can($method)); + } + + if(!$slice_setter) { + if($f->can('seqname')) { + $slice_setter = sub { $_[0]->seqname($_[1]->seq_region_name()); }; + } else { + $slice_setter = sub{} if(!$slice_setter); + } + } + + return $slice_setter; +} + + +=head2 fetch_all_by_contig_name + + Arg [1] : string $contig_name + Arg [2] : int $start (optional) + The start of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Arg [3] : int $end (optional) + The end of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Example : @fs = @{$self->fetch_all_by_contig_name('AB00879.1.1.39436')}; + Description: Retrieves features on the contig defined by the name + $contig_name in contig coordinates. + + If this method is overridden then the coordinate_systems + method must return 'CONTIG' as one of its values. + + This method will work as is (i.e. without being overridden) + providing at least one other fetch method has + been overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the contig + coordinate system. + Exceptions : thrown if the input argument is incorrect + thrown if the coordinate_systems method returns the value + 'CONTIG' and this method has not been overridden. + Caller : general, fetch_all_by_Slice + +=cut + +sub fetch_all_by_contig_name { + my ($self, $contig_name, $start, $end) = @_; + + unless($contig_name) { + throw("contig_name argument not defined"); + } + + if($self->_supported('CONTIG')) { + throw("ExternalFeatureAdaptor supports CONTIG coordinate system" . + " but fetch_all_by_contig_name is not implemented"); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region('contig', $contig_name, + $start, $end); + return $self->fetch_all_by_Slice($slice); +} + + + +=head2 fetch_all_by_supercontig_name + + Arg [1] : string $supercontig_name + Arg [2] : int $start (optional) + The start of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Arg [3] : int $end (optional) + The end of the region on the contig to retrieve features on + if not specified the whole of the contig is used. + Example : @fs = @{$self->fetch_all_by_contig_name('NT_004321')}; + Description: Retrieves features on the contig defined by the name + $supercontigname in supercontig coordinates. + + If this method is overridden then the coordinate_systems + method must return 'SUPERCONTIG' as one of its values. + + This method will work as is (i.e. without being overridden) + providing at least one other fetch method has + been overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the contig + coordinate system. + Exceptions : thrown if the input argument is incorrect + thrown if the coordinate_systems method returns the value + 'SUPERCONTIG' and this method has not been overridden. + Caller : general, fetch_all_by_Slice + +=cut + + +sub fetch_all_by_supercontig_name { + my ($self, $supercontig_name, $start, $end) = @_; + + unless($supercontig_name) { + throw("supercontig_name argument not defined"); + } + + if($self->_supported('SUPERCONTIG')) { + throw("ExternalFeatureAdaptor supports SUPERCONTIG coordinate system" . + " but fetch_all_by_supercontig_name is not implemented"); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + my $slice = $slice_adaptor->fetch_by_region('supercontig', $supercontig_name, + $start, $end); + return $self->fetch_all_by_Slice($slice); +} + + +=head2 fetch_all_by_clone_accession + + Arg [1] : string $acc + The EMBL accession number of the clone to fetch features from. + Arg [2] : (optional) string $ver + Arg [3] : (optional) int $start + Arg [4] : (optional) int $end + + Example : @fs = @{$self->fetch_all_by_clone_accession('AC000093')}; + Description: Retrieves features on the clone defined by the $acc arg in + Clone coordinates. + + If this method is overridden then the coordinate_systems method + must return 'CLONE' as one of its values. The arguments + start, end, version are passed if this method is overridden and + can optionally be used to reduce the scope of the query and + improve performance. + + This method will work as is - providing at least one other + fetch method has been overridden. + Returntype : reference to a list of Bio::SeqFeature objects in the Clone + coordinate system + Exceptions : thrown if the input argument is incorrect + thrown if the coordinate system method returns the value 'CLONE' + and this method is not overridden. + thrown if the coordinate systems method does not return any + valid values. + Caller : general, fetch_all_by_clone_accession + +=cut + +sub fetch_all_by_clone_accession { + my ($self, $acc, $version, $start, $end) = @_; + + unless($acc) { + throw("clone accession argument not defined"); + } + + if($self->_supported('CLONE')) { + throw('ExternalFeatureAdaptor supports CLONE coordinate system ' . + 'but does not implement fetch_all_by_clone_accession'); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + if(defined($version)) { + $acc = "$acc.$version"; + } elsif(!$acc =~ /\./) { + $acc = "$acc.1"; + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor; + + my $slice = $slice_adaptor->fetch_by_region('clone', $acc, $start, $end); + + return $self->fetch_all_by_Slice($slice); +} + + + +=head2 fetch_all_by_chr_start_end + + Arg [1] : string $chr_name + The name of the chromosome to retrieve features from + Arg [2] : int $start + The start coordinate of the chromosomal region to retrieve + features from. + Arg [3] : int $end + The end coordinate of the chromosomal region to retrieve + features from. + Example : @features + Description: Retrieves features on the region defined by the $chr_name, + $start, and $end args in assembly (chromosomal) coordinates. + + If this method is overridden then the coordinate_systems method + must return 'ASSEMBLY' as one of its values. + + This method will work as is (i.e. without overriding it) + providing at least one of the other fetch methods is overridden. + Returntype : reference to a list of Bio::SeqFeatures + Exceptions : Thrown if the coordinate_systems method returns ASSEMBLY as a + value and this method is not overridden. + Thrown if any of the input arguments are incorrect + Caller : general, fetch_all_by_Slice + +=cut + +sub fetch_all_by_chr_start_end { + my ($self, $chr_name, $start, $end) = @_; + + unless($chr_name && defined $start && defined $end && $start < $end) { + throw("Incorrect start [$start] end [$end] or chr [$chr_name] arg"); + } + + unless($self->ensembl_db) { + throw('DB attribute not set. This value must be set for the ' . + 'ExternalFeatureAdaptor to function correctly'); + } + + my $slice_adaptor = $self->ensembl_db->get_SliceAdaptor(); + + my $slice = $slice_adaptor->fetch_by_region('toplevel', $chr_name, $start, + $end); + + return $self->fetch_all_by_Slice($slice); +} + + +=head2 _no_valid_coord_system + + Arg [1] : none + Example : none + Description: PRIVATE method - throws an error with a descriptive message + Returntype : none + Exceptions : always thrown + Caller : internal + +=cut + +sub _no_valid_coord_system { + my $self = shift; + + throw("This ExternalFeatureAdaptor does not support a known " . + "coordinate system.\n Valid coordinate systems are: " . + "[SLICE,ASSEMBLY,SUPERCONTIG,CONTIG,CLONE].\n This External Adaptor " . + "supports: [" . join(', ', $self->coordinate_systems) . "]"); +} + + + + +=head2 _supported + + Arg [1] : string $system + Example : print "CONTIG system supported" if($self->_supported('CONTIG')); + Description: PRIVATE method. Tests if the coordinate system defined by + the $system argument is implemented. + Returntype : boolean + Exceptions : none + Caller : internal + +=cut + +sub _supported { + my ($self, $system) = @_; + + #construct the hash of supported features if it has not been already + unless(exists $self->{_supported}) { + $self->{_supported} = {}; + foreach my $coord_system ($self->coordinate_systems) { + $self->{_supported}->{$coord_system} = 1; + } + } + + return $self->{_supported}->{$system}; +} + + + +1;