diff variant_effect_predictor/Bio/SeqFeatureI.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SeqFeatureI.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,630 @@
+# $Id: SeqFeatureI.pm,v 1.43.2.5 2003/08/28 19:29:34 jason Exp $
+#
+# BioPerl module for Bio::SeqFeatureI
+#
+# Cared for by Ewan Birney <birney@sanger.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SeqFeatureI - Abstract interface of a Sequence Feature
+
+=head1 SYNOPSIS
+
+    # get a seqfeature somehow, eg,
+
+    foreach $feat ( $seq->top_SeqFeatures() ) {
+            print "Feature from ", $feat->start, "to ", 
+	          $feat->end, " Primary tag  ", $feat->primary_tag, 
+	          ", produced by ", $feat->source_tag(), "\n";
+
+            if( $feat->strand == 0 ) {
+		print "Feature applicable to either strand\n";
+            } else {
+                print "Feature on strand ", $feat->strand,"\n"; # -1,1
+            }
+
+            foreach $tag ( $feat->all_tags() ) {
+		print "Feature has tag ", $tag, "with values, ",
+		      join(' ',$feat->each_tag_value($tag)), "\n";
+            }
+	    print "new feature\n" if $feat->has_tag('new');
+	    # features can have sub features
+	    my @subfeat = $feat->get_SeqFeatures();
+	}
+
+=head1 DESCRIPTION
+
+This interface is the functions one can expect for any Sequence
+Feature, whatever its implementation or whether it is a more complex
+type (eg, a Gene). This object doesn\'t actually provide any
+implemention, it just provides the definitions of what methods one can
+call. See Bio::SeqFeature::Generic for a good standard implementation
+of this object
+
+=head1 FEEDBACK
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                     - General discussion
+  http://bio.perl.org/MailList.html         - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::SeqFeatureI;
+use vars qw(@ISA $HasInMemory);
+use strict;
+
+BEGIN {
+    eval { require Bio::DB::InMemoryCache };
+    if( $@ ) { $HasInMemory = 0 }
+    else { $HasInMemory = 1 }
+}
+
+use Bio::RangeI;
+use Bio::Seq;
+
+use Carp;
+
+@ISA = qw(Bio::RangeI);
+
+=head1 SeqFeatureI specific methods
+
+New method interfaces.
+
+=cut
+
+=head2 get_SeqFeatures
+
+ Title   : get_SeqFeatures
+ Usage   : @feats = $feat->get_SeqFeatures();
+ Function: Returns an array of sub Sequence Features
+ Returns : An array
+ Args    : none
+
+
+=cut
+
+sub get_SeqFeatures{
+   my ($self,@args) = @_;
+
+   $self->throw_not_implemented();
+}
+
+=head2 display_name
+
+ Title   : display_name
+ Usage   : $name = $feat->display_name()
+ Function: Returns the human-readable name of the feature for displays.
+ Returns : a string
+ Args    : none
+
+=cut
+
+sub display_name { 
+    shift->throw_not_implemented();
+}
+
+=head2 primary_tag
+
+ Title   : primary_tag
+ Usage   : $tag = $feat->primary_tag()
+ Function: Returns the primary tag for a feature,
+           eg 'exon'
+ Returns : a string 
+ Args    : none
+
+
+=cut
+
+sub primary_tag{
+   my ($self,@args) = @_;
+
+   $self->throw_not_implemented();
+
+}
+
+=head2 source_tag
+
+ Title   : source_tag
+ Usage   : $tag = $feat->source_tag()
+ Function: Returns the source tag for a feature,
+           eg, 'genscan' 
+ Returns : a string 
+ Args    : none
+
+
+=cut
+
+sub source_tag{
+   my ($self,@args) = @_;
+
+   $self->throw_not_implemented();
+}
+
+=head2 has_tag
+
+ Title   : has_tag
+ Usage   : $tag_exists = $self->has_tag('some_tag')
+ Function: 
+ Returns : TRUE if the specified tag exists, and FALSE otherwise
+ Args    :
+
+
+=cut
+
+sub has_tag{
+   my ($self,@args) = @_;
+
+   $self->throw_not_implemented();
+
+}
+
+=head2 get_tag_values
+
+ Title   : get_tag_values
+ Usage   : @values = $self->get_tag_values('some_tag')
+ Function: 
+ Returns : An array comprising the values of the specified tag.
+ Args    :
+
+
+=cut
+
+sub get_tag_values {
+    shift->throw_not_implemented();
+}
+
+=head2 get_all_tags
+
+ Title   : get_all_tags
+ Usage   : @tags = $feat->get_all_tags()
+ Function: gives all tags for this feature
+ Returns : an array of strings
+ Args    : none
+
+
+=cut
+
+sub get_all_tags{
+    shift->throw_not_implemented();
+}
+
+=head2 attach_seq
+
+ Title   : attach_seq
+ Usage   : $sf->attach_seq($seq)
+ Function: Attaches a Bio::Seq object to this feature. This
+           Bio::Seq object is for the *entire* sequence: ie
+           from 1 to 10000
+
+           Note that it is not guaranteed that if you obtain a feature
+           from an object in bioperl, it will have a sequence
+           attached. Also, implementors of this interface can choose
+           to provide an empty implementation of this method. I.e.,
+           there is also no guarantee that if you do attach a
+           sequence, seq() or entire_seq() will not return undef.
+
+           The reason that this method is here on the interface is to
+           enable you to call it on every SeqFeatureI compliant
+           object, and that it will be implemented in a useful way and
+           set to a useful value for the great majority of use
+           cases. Implementors who choose to ignore the call are
+           encouraged to specifically state this in their
+           documentation.
+
+ Example :
+ Returns : TRUE on success
+ Args    : a Bio::PrimarySeqI compliant object
+
+
+=cut
+
+sub attach_seq {
+    shift->throw_not_implemented();
+}
+
+=head2 seq
+
+ Title   : seq
+ Usage   : $tseq = $sf->seq()
+ Function: returns the truncated sequence (if there is a sequence attached) 
+           for this feature
+ Example :
+ Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
+           bounded by start & end, or undef if there is no sequence attached
+ Args    : none
+
+
+=cut
+
+sub seq {
+    shift->throw_not_implemented();
+}
+
+=head2 entire_seq
+
+ Title   : entire_seq
+ Usage   : $whole_seq = $sf->entire_seq()
+ Function: gives the entire sequence that this seqfeature is attached to
+ Example :
+ Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
+           sequence attached
+ Args    : none
+
+
+=cut
+
+sub entire_seq {
+    shift->throw_not_implemented();
+}
+
+
+=head2 seq_id
+
+ Title   : seq_id
+ Usage   : $obj->seq_id($newval)
+ Function: There are many cases when you make a feature that you
+           do know the sequence name, but do not know its actual
+           sequence. This is an attribute such that you can store
+           the ID (e.g., display_id) of the sequence.
+
+           This attribute should *not* be used in GFF dumping, as
+           that should come from the collection in which the seq
+           feature was found.
+ Returns : value of seq_id
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub seq_id {
+    shift->throw_not_implemented();
+}
+
+=head2 gff_string
+
+ Title   : gff_string
+ Usage   : $str = $feat->gff_string;
+           $str = $feat->gff_string($gff_formatter);
+ Function: Provides the feature information in GFF format.
+
+           The implementation provided here returns GFF2 by default. If you
+           want a different version, supply an object implementing a method
+           gff_string() accepting a SeqFeatureI object as argument. E.g., to
+           obtain GFF1 format, do the following:
+
+                my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
+                $gff1str = $feat->gff_string($gff1io);
+
+ Returns : A string
+ Args    : Optionally, an object implementing gff_string().
+
+
+=cut
+
+sub gff_string{
+   my ($self,$formatter) = @_;
+
+   $formatter = $self->_static_gff_formatter unless $formatter;
+   return $formatter->gff_string($self);
+}
+
+my $static_gff_formatter = undef;
+
+=head2 _static_gff_formatter
+
+ Title   : _static_gff_formatter
+ Usage   :
+ Function:
+ Example :
+ Returns : 
+ Args    :
+
+
+=cut
+
+sub _static_gff_formatter{
+   my ($self,@args) = @_;
+
+   if( !defined $static_gff_formatter ) {
+       $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
+   }
+   return $static_gff_formatter;
+}
+
+=head1 Bio::RangeI methods
+
+List of interfaces inherited from Bio::RangeI (see L<Bio::RangeI>
+for details).
+
+=cut
+
+=head2 start
+
+ Title   : start
+ Usage   : $start = $feat->start
+ Function: Returns the start coordinate of the feature
+ Returns : integer
+ Args    : none
+
+
+=head2 end
+
+ Title   : end
+ Usage   : $end = $feat->end
+ Function: Returns the end coordinate of the feature
+ Returns : integer
+ Args    : none
+
+=head2 strand
+
+ Title   : strand
+ Usage   : $strand = $feat->strand()
+ Function: Returns strand information, being 1,-1 or 0
+ Returns : -1,1 or 0
+ Args    : none
+
+
+=cut
+
+=head1 Decorating methods
+
+These methods have an implementation provided by Bio::SeqFeatureI,
+but can be validly overwritten by subclasses
+
+=head2 spliced_seq
+
+  Title   : spliced_seq
+
+  Usage   : $seq = $feature->spliced_seq()
+            $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
+
+  Function: Provides a sequence of the feature which is the most
+            semantically "relevant" feature for this sequence. A
+            default implementation is provided which for simple cases
+            returns just the sequence, but for split cases, loops over
+            the split location to return the sequence. In the case of
+            split locations with remote locations, eg
+
+            join(AB000123:5567-5589,80..1144)
+
+            in the case when a database object is passed in, it will
+            attempt to retrieve the sequence from the database object,
+            and "Do the right thing", however if no database object is
+            provided, it will generate the correct number of N's (DNA)
+            or X's (protein, though this is unlikely).
+
+            This function is deliberately "magical" attempting to
+            second guess what a user wants as "the" sequence for this
+            feature
+
+            Implementing classes are free to override this method with
+            their own magic if they have a better idea what the user
+            wants
+
+  Args    : [optional] A Bio::DB::RandomAccessI compliant object
+  Returns : A Bio::Seq
+
+=cut
+
+sub spliced_seq {
+    my $self = shift;
+    my $db = shift;
+
+    if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
+	return $self->seq(); # nice and easy!
+    }
+
+    # redundant test, but the above ISA is probably not ideal.
+    if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
+	$self->throw("not atomic, not split, yikes, in trouble!");
+    }
+
+    my $seqstr;
+    my $seqid = $self->entire_seq->display_id;
+    # This is to deal with reverse strand features
+    # so we are really sorting features 5' -> 3' on their strand
+    # i.e. rev strand features will be sorted largest to smallest
+    # as this how revcom CDSes seem to be annotated in genbank.
+    # Might need to eventually allow this to be programable?
+    # (can I mention how much fun this is NOT! --jason)
+
+    my ($mixed,$mixedloc,$fstrand) = (0);
+
+    if( defined $db && 
+	ref($db) &&  !$db->isa('Bio::DB::RandomAccessI') ) {
+	$self->warn("Must pass in a valid Bio::DB::RandomAccessI object for access to remote locations for spliced_seq");
+	$db = undef;
+    } elsif( defined $db && 
+	     $HasInMemory && ! $db->isa('Bio::DB::InMemoryCache') ) {
+	$db = new Bio::DB::InMemoryCache(-seqdb => $db);
+    }
+    
+    if( $self->isa('Bio::Das::SegmentI') &&
+	! $self->absolute ) { 
+	$self->warn("Calling spliced_seq with a Bio::Das::SegmentI ".
+                    "which does have absolute set to 1 -- be warned ".
+                    "you may not be getting things on the correct strand");
+    }
+
+    my @locs = map { $_->[0] }
+    # sort so that most negative is first basically to order
+    # the features on the opposite strand 5'->3' on their strand
+    # rather than they way most are input which is on the fwd strand
+
+    sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
+    map { 
+	$fstrand = $_->strand unless defined $fstrand;
+	$mixed = 1 if defined $_->strand && $fstrand != $_->strand;
+	if( defined $_->seq_id ) {
+	    $mixedloc = 1 if( $_->seq_id ne $seqid );
+	}
+	[ $_, $_->start* ($_->strand || 1)];
+    } $self->location->each_Location; 
+
+    if ( $mixed ) { 
+	$self->warn("Mixed strand locations, spliced seq using the input ".
+                    "order rather than trying to sort");
+	@locs = $self->location->each_Location; 
+    } elsif( $mixedloc ) {
+	# we'll use the prescribed location order
+	@locs = $self->location->each_Location; 
+    }
+
+
+    foreach my $loc ( @locs  ) {
+	if( ! $loc->isa("Bio::Location::Atomic") ) {
+	    $self->throw("Can only deal with one level deep locations");
+	}
+	my $called_seq;
+	if( $fstrand != $loc->strand ) {
+	    $self->warn("feature strand is different from location strand!");
+	}
+	# deal with remote sequences
+	if( defined $loc->seq_id && 
+	    $loc->seq_id ne $seqid ) {
+	    if( defined $db ) {
+		my $sid = $loc->seq_id;
+		$sid =~ s/\.\d+$//g;
+		eval {
+		    $called_seq = $db->get_Seq_by_acc($sid);
+		};
+		if( $@ ) {
+		    $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
+		    $called_seq = undef;
+		}
+	    } else {
+		$self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
+		$called_seq = undef;
+	    }
+	    if( !defined $called_seq ) {
+		$seqstr .= 'N' x $self->length;
+		next;
+	    }
+	} else {
+	    $called_seq = $self->entire_seq;
+	}
+	
+	if( $self->isa('Bio::Das::SegmentI') ) {
+	    my ($s,$e) = ($loc->start,$loc->end);
+	    $seqstr .= $called_seq->subseq($s,$e)->seq();
+	} else { 
+	    # This is dumb subseq should work on locations...
+	    if( $loc->strand == 1 ) {
+		$seqstr .= $called_seq->subseq($loc->start,$loc->end);
+	    } else {
+		$seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq();
+	    }
+	}
+    }
+    my $out = Bio::Seq->new( -id => $self->entire_seq->display_id . "_spliced_feat",
+				      -seq => $seqstr);
+
+    return $out;
+}
+
+=head1 RangeI methods
+
+These methods are inherited from RangeI and can be used
+directly from a SeqFeatureI interface. Remember that a 
+SeqFeature is-a RangeI, and so wherever you see RangeI you
+can use a feature ($r in the below documentation).
+
+=head2 overlaps
+
+  Title   : overlaps
+  Usage   : if($feat->overlaps($r)) { do stuff }
+            if($feat->overlaps(200)) { do stuff }
+  Function: tests if $feat overlaps $r
+  Args    : a RangeI to test for overlap with, or a point
+  Returns : true if the Range overlaps with the feature, false otherwise
+
+
+=head2 contains
+
+  Title   : contains
+  Usage   : if($feat->contains($r) { do stuff }
+  Function: tests whether $feat totally contains $r
+  Args    : a RangeI to test for being contained
+  Returns : true if the argument is totaly contained within this range
+
+
+=head2 equals
+
+  Title   : equals
+  Usage   : if($feat->equals($r))
+  Function: test whether $feat has the same start, end, strand as $r
+  Args    : a RangeI to test for equality
+  Returns : true if they are describing the same range
+
+
+=head1 Geometrical methods
+
+These methods do things to the geometry of ranges, and return
+triplets (start, stop, strand) from which new ranges could be built.
+
+=head2 intersection
+
+  Title   : intersection
+  Usage   : ($start, $stop, $strand) = $feat->intersection($r)
+  Function: gives the range that is contained by both ranges
+  Args    : a RangeI to compare this one to
+  Returns : nothing if they do not overlap, or the range that they do overlap
+
+=head2 union
+
+  Title   : union
+  Usage   : ($start, $stop, $strand) = $feat->union($r);
+          : ($start, $stop, $strand) = Bio::RangeI->union(@ranges);
+  Function: finds the minimal range that contains all of the ranges
+  Args    : a range or list of ranges to find the union of
+  Returns : the range containing all of the ranges
+
+=cut
+
+=head2 location
+
+ Title   : location
+ Usage   : my $location = $seqfeature->location()
+ Function: returns a location object suitable for identifying location 
+	   of feature on sequence or parent feature  
+ Returns : Bio::LocationI object
+ Args    : none
+
+
+=cut
+
+sub location {
+   my ($self) = @_;
+
+   $self->throw_not_implemented();
+}
+
+
+1;