diff variant_effect_predictor/Bio/PrimarySeqI.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/PrimarySeqI.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,722 @@
+# $Id: PrimarySeqI.pm,v 1.50.2.3 2003/08/29 15:37:39 birney Exp $
+#
+# BioPerl module for Bio::PrimarySeqI
+#
+# Cared for by Ewan Birney <birney@sanger.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::PrimarySeqI [Developers] - Interface definition for a Bio::PrimarySeq
+
+=head1 SYNOPSIS
+
+
+    # Bio::PrimarySeqI is the interface class for sequences.
+
+    # If you are a newcomer to bioperl, you should
+    # start with Bio::Seq documentation. This
+    # documentation is mainly for developers using
+    # Bioperl.
+
+    # to test this is a seq object
+
+    $obj->isa("Bio::PrimarySeqI") ||
+      $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
+
+    # accessors
+
+    $string    = $obj->seq();
+    $substring = $obj->subseq(12,50);
+    $display   = $obj->display_id(); # for human display
+    $id        = $obj->primary_id(); # unique id for this object,
+                                     # implementation defined
+    $unique_key= $obj->accession_number();
+                       # unique biological id
+
+    # object manipulation
+
+    eval {
+	$rev    = $obj->revcom();
+    };
+    if( $@ ) {
+	$obj->throw("Could not reverse complement. ".
+		    "Probably not DNA. Actual exception\n$@\n");
+    }
+
+    $trunc = $obj->trunc(12,50);
+
+    # $rev and $trunc are Bio::PrimarySeqI compliant objects
+
+
+=head1 DESCRIPTION
+
+This object defines an abstract interface to basic sequence
+information - for most users of the package the documentation (and
+methods) in this class are not useful - this is a developers only
+class which defines what methods have to be implmented by other Perl
+objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
+Bio::Seq" or "man Bio::Seq" for more information on the main class for
+sequences.
+
+
+PrimarySeq is an object just for the sequence and its name(s), nothing
+more. Seq is the larger object complete with features. There is a pure
+perl implementation of this in Bio::PrimarySeq. If you just want to
+use Bio::PrimarySeq objects, then please read that module first. This
+module defines the interface, and is of more interest to people who
+want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
+they "are" bioperl sequence objects, even though it is not using Perl
+to store the sequence etc.
+
+
+This interface defines what bioperl consideres necessary to "be" a
+sequence, without providing an implementation of this. (An
+implementation is provided in Bio::PrimarySeq). If you want to provide
+a Bio::PrimarySeq 'compliant' object which in fact wraps another
+object/database/out-of-perl experience, then this is the correct thing
+to wrap, generally by providing a wrapper class which would inheriet
+from your object and this Bio::PrimarySeqI interface. The wrapper class
+then would have methods lists in the "Implementation Specific
+Functions" which would provide these methods for your object.
+
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org                       - General discussion
+  http://bio.perl.org/MailList.html           - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Ewan Birney
+
+Email birney@sanger.ac.uk
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::PrimarySeqI;
+use vars qw(@ISA );
+use strict;
+use Bio::Root::RootI;
+use Bio::Tools::CodonTable;
+
+@ISA = qw(Bio::Root::RootI);
+
+=head1 Implementation Specific Functions
+
+These functions are the ones that a specific implementation must
+define.
+
+=head2 seq
+
+ Title   : seq
+ Usage   : $string    = $obj->seq()
+ Function: Returns the sequence as a string of letters. The
+           case of the letters is left up to the implementer.
+           Suggested cases are upper case for proteins and lower case for
+           DNA sequence (IUPAC standard),
+           but implementations are suggested to keep an open mind about
+           case (some users... want mixed case!)
+ Returns : A scalar
+ Status  : Virtual
+
+=cut
+
+sub seq {
+   my ($self) = @_;
+   $self->throw_not_implemented();
+}
+
+=head2 subseq
+
+ Title   : subseq
+ Usage   : $substring = $obj->subseq(10,40);
+ Function: returns the subseq from start to end, where the first base
+           is 1 and the number is inclusive, ie 1-2 are the first two
+           bases of the sequence
+
+           Start cannot be larger than end but can be equal
+
+ Returns : a string
+ Args    :
+ Status  : Virtual
+
+=cut
+
+sub subseq{
+   my ($self) = @_;
+   $self->throw_not_implemented();
+}
+
+=head2 display_id
+
+ Title   : display_id
+ Usage   : $id_string = $obj->display_id();
+ Function: returns the display id, aka the common name of the Sequence object.
+
+           The semantics of this is that it is the most likely string
+           to be used as an identifier of the sequence, and likely to
+           have "human" readability.  The id is equivalent to the ID
+           field of the GenBank/EMBL databanks and the id field of the
+           Swissprot/sptrembl database. In fasta format, the >(\S+) is
+           presumed to be the id, though some people overload the id
+           to embed other information. Bioperl does not use any
+           embedded information in the ID field, and people are
+           encouraged to use other mechanisms (accession field for
+           example, or extending the sequence object) to solve this.
+
+           Notice that $seq->id() maps to this function, mainly for
+           legacy/convience issues
+ Returns : A string
+ Args    : None
+ Status  : Virtual
+
+
+=cut
+
+sub display_id {
+   my ($self) = @_;
+   $self->throw_not_implemented();
+}
+
+
+=head2 accession_number
+
+ Title   : accession_number
+ Usage   : $unique_biological_key = $obj->accession_number;
+ Function: Returns the unique biological id for a sequence, commonly
+           called the accession_number. For sequences from established
+           databases, the implementors should try to use the correct
+           accession number. Notice that primary_id() provides the
+           unique id for the implemetation, allowing multiple objects
+           to have the same accession number in a particular implementation.
+
+           For sequences with no accession number, this method should return
+           "unknown".
+ Returns : A string
+ Args    : None
+ Status  : Virtual
+
+
+=cut
+
+sub accession_number {
+   my ($self,@args) = @_;
+   $self->throw_not_implemented();
+}
+
+
+
+=head2 primary_id
+
+ Title   : primary_id
+ Usage   : $unique_implementation_key = $obj->primary_id;
+ Function: Returns the unique id for this object in this
+           implementation. This allows implementations to manage their
+           own object ids in a way the implementaiton can control
+           clients can expect one id to map to one object.
+
+           For sequences with no accession number, this method should
+           return a stringified memory location.
+
+           [Note this method name is likely to change in 1.3]
+
+ Returns : A string
+ Args    : None
+ Status  : Virtual
+
+
+=cut
+
+sub primary_id {
+   my ($self,@args) = @_;
+   $self->throw_not_implemented();
+}
+
+
+=head2 can_call_new
+
+ Title   : can_call_new
+ Usage   : if( $obj->can_call_new ) {
+             $newobj = $obj->new( %param );
+	 }
+ Function: can_call_new returns 1 or 0 depending
+           on whether an implementation allows new
+           constructor to be called. If a new constructor
+           is allowed, then it should take the followed hashed
+           constructor list.
+
+           $myobject->new( -seq => $sequence_as_string,
+			   -display_id  => $id
+			   -accession_number => $accession
+			   -alphabet => 'dna',
+			   );
+ Example :
+ Returns : 1 or 0
+ Args    :
+
+
+=cut
+
+sub can_call_new{
+   my ($self,@args) = @_;
+
+   # we default to 0 here
+
+   return 0;
+}
+
+=head2 alphabet
+
+ Title   : alphabet
+ Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
+ Function: Returns the type of sequence being one of
+           'dna', 'rna' or 'protein'. This is case sensitive.
+
+           This is not called <type> because this would cause
+           upgrade problems from the 0.5 and earlier Seq objects.
+
+ Returns : a string either 'dna','rna','protein'. NB - the object must
+           make a call of the type - if there is no type specified it
+           has to guess.
+ Args    : none
+ Status  : Virtual
+
+
+=cut
+
+sub alphabet{
+    my ( $self ) = @_;
+    $self->throw_not_implemented();
+}
+
+sub moltype{
+   my ($self,@args) = @_;
+
+   $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
+   $self->alphabet(@args);
+}
+
+
+=head1 Optional Implementation Functions
+
+The following functions rely on the above functions. An
+implementing class does not need to provide these functions, as they
+will be provided by this class, but is free to override these
+functions.
+
+All of revcom(), trunc(), and translate() create new sequence
+objects. They will call new() on the class of the sequence object
+instance passed as argument, unless can_call_new() returns FALSE. In
+the latter case a Bio::PrimarySeq object will be created. Implementors
+which really want to control how objects are created (eg, for object
+persistence over a database, or objects in a CORBA framework), they
+are encouraged to override these methods
+
+=head2 revcom
+
+ Title   : revcom
+ Usage   : $rev = $seq->revcom()
+ Function: Produces a new Bio::PrimarySeqI implementing object which
+           is the reversed complement of the sequence. For protein
+           sequences this throws an exception of "Sequence is a
+           protein. Cannot revcom"
+
+           The id is the same id as the original sequence, and the
+           accession number is also indentical. If someone wants to
+           track that this sequence has be reversed, it needs to
+           define its own extensions
+
+           To do an inplace edit of an object you can go:
+
+           $seq = $seq->revcom();
+
+           This of course, causes Perl to handle the garbage
+           collection of the old object, but it is roughly speaking as
+           efficient as an inplace edit.
+
+ Returns : A new (fresh) Bio::PrimarySeqI object
+ Args    : none
+
+
+=cut
+
+sub revcom{
+   my ($self) = @_;
+
+   # check the type is good first.
+   my $t = $self->alphabet;
+
+   if( $t eq 'protein' ) {
+       $self->throw("Sequence is a protein. Cannot revcom");
+   }
+
+   if( $t ne 'dna' && $t ne 'rna' ) {
+       if( $self->can('warn') ) {
+	   $self->warn("Sequence is not dna or rna, but [$t]. ".
+		       "Attempting to revcom, but unsure if this is right");
+       } else {
+	   warn("[$self] Sequence is not dna or rna, but [$t]. ".
+		"Attempting to revcom, but unsure if this is right");
+       }
+   }
+
+   # yank out the sequence string
+
+   my $str = $self->seq();
+
+   # if is RNA - map to DNA then map back
+
+   if( $t eq 'rna' ) {
+       $str =~ tr/uU/tT/;
+   }
+
+   # revcom etc...
+
+   $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
+   my $revseq = CORE::reverse $str;
+
+   if( $t eq 'rna' ) {
+       $revseq =~ tr/tT/uU/;
+   }
+
+   my $seqclass;
+   if($self->can_call_new()) {
+       $seqclass = ref($self);
+   } else {
+       $seqclass = 'Bio::PrimarySeq';
+       $self->_attempt_to_load_Seq();
+   }
+   my $out = $seqclass->new( '-seq' => $revseq,
+			     '-display_id'  => $self->display_id,
+			     '-accession_number' => $self->accession_number,
+			     '-alphabet' => $self->alphabet,
+			     '-desc' => $self->desc(),
+                             '-verbose' => $self->verbose
+			     );
+   return $out;
+
+}
+
+=head2 trunc
+
+ Title   : trunc
+ Usage   : $subseq = $myseq->trunc(10,100);
+ Function: Provides a truncation of a sequence,
+
+ Example :
+ Returns : a fresh Bio::PrimarySeqI implementing object
+ Args    : Two integers denoting first and last base of the sub-sequence.
+
+
+=cut
+
+sub trunc{
+   my ($self,$start,$end) = @_;
+
+   my $str;
+   if( defined $start && ref($start) &&
+       $start->isa('Bio::LocationI') ) {
+       $str = $self->subseq($start); # start is a location actually
+   } elsif( !$end ) {
+       $self->throw("trunc start,end -- there was no end for $start");
+   } elsif( $end < $start ) {
+       my $msg = "start [$start] is greater than end [$end]. \n".
+	   "If you want to truncated and reverse complement, \n".
+	       "you must call trunc followed by revcom. Sorry.";
+       $self->throw($msg);
+   } else {
+       $str = $self->subseq($start,$end);
+   }
+
+   my $seqclass;
+   if($self->can_call_new()) {
+       $seqclass = ref($self);
+   } else {
+       $seqclass = 'Bio::PrimarySeq';
+       $self->_attempt_to_load_Seq();
+   }
+
+   my $out = $seqclass->new( '-seq' => $str,
+			     '-display_id'  => $self->display_id,
+			     '-accession_number' => $self->accession_number,
+			     '-alphabet' => $self->alphabet,
+			     '-desc' => $self->desc(),
+                             '-verbose' => $self->verbose
+			     );
+   return $out;
+}
+
+=head2 translate
+
+ Title   : translate
+ Usage   : $protein_seq_obj = $dna_seq_obj->translate
+           #if full CDS expected:
+           $protein_seq_obj = $cds_seq_obj->translate(undef,undef,undef,undef,1);
+ Function:
+
+           Provides the translation of the DNA sequence using full
+           IUPAC ambiguities in DNA/RNA and amino acid codes.
+
+           The full CDS translation is identical to EMBL/TREMBL
+           database translation. Note that the trailing terminator
+           character is removed before returning the translation
+           object.
+
+           Note: if you set $dna_seq_obj->verbose(1) you will get a
+           warning if the first codon is not a valid initiator.
+
+
+ Returns : A Bio::PrimarySeqI implementing object
+ Args    : character for terminator (optional) defaults to '*'
+           character for unknown amino acid (optional) defaults to 'X'
+           frame (optional) valid values 0, 1, 2, defaults to 0
+           codon table id (optional) defaults to 1
+           complete coding sequence expected, defaults to 0 (false)
+           boolean, throw exception if not complete CDS (true) or defaults to
+warning (false)
+           coding sequence expected to be complete at 5', defaults to false
+           coding sequence expected to be complete at 3', defaults to false
+
+=cut
+
+sub translate {
+    my($self) = shift;
+    my($stop, $unknown, $frame, $tableid, $fullCDS, $throw, $complete5,
+$complete3) = @_;
+    my($i, $len, $output) = (0,0,'');
+    my($codon)   = "";
+    my $aa;
+
+    ## User can pass in symbol for stop and unknown codons
+    unless(defined($stop) and $stop ne '')    { $stop = "*"; }
+    unless(defined($unknown) and $unknown ne '') { $unknown = "X"; }
+    unless(defined($frame) and $frame ne '') { $frame = 0; }
+
+    ## the codon table ID
+    unless(defined($tableid) and $tableid ne '')    { $tableid = 1; }
+
+    ##Error if monomer is "Amino"
+    $self->throw("Can't translate an amino acid sequence.") if
+	($self->alphabet eq 'protein');
+
+    ##Error if frame is not 0, 1 or 2
+    $self->throw("Valid values for frame are 0, 1, 2, not [$frame].") unless
+	($frame == 0 or $frame == 1 or $frame == 2);
+
+    #warns if ID is invalid
+    my $codonTable = Bio::Tools::CodonTable->new( -id => $tableid);
+
+    my ($seq) = $self->seq();
+
+    # deal with frame offset.
+    if( $frame ) {
+	$seq = substr ($seq,$frame);
+    }
+
+    # Translate it
+    $output = $codonTable->translate($seq);
+    # Use user-input stop/unknown
+    $output =~ s/\*/$stop/g;
+    $output =~ s/X/$unknown/g;
+
+    # $complete5 and $complete3 indicate completeness of
+    # the coding sequence at the 5' and 3' ends. Complete
+    # if true, default to false. These are in addition to
+    # $fullCDS, for backwards compatibility
+    defined($complete5) or ($complete5 = $fullCDS ? 1 : 0);
+    defined($complete3) or ($complete3 = $fullCDS ? 1 : 0);
+
+    my $id = $self->display_id;
+
+    # only if we are expecting to be complete at the 5' end
+    if($complete5) {
+	# if the initiator codon is not ATG, the amino acid needs to changed into M
+	if(substr($output,0,1) ne 'M') {
+	    if($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
+		$output = 'M' . substr($output, 1);
+	    }
+	    elsif($throw) {
+		$self->throw("Seq [$id]: Not using a valid initiator codon!");
+	    } else {
+		$self->warn("Seq [$id]: Not using a valid initiator codon!");
+	    }
+	}
+    }
+
+    # only if we are expecting to be complete at the 3' end
+    if($complete3) {
+	#remove the stop character
+	if(substr($output, -1, 1) eq $stop) {
+	    chop $output;
+	} else {
+	    $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
+	    $self->warn("Seq [$id]: Not using a valid terminator codon!");
+	}
+    }
+
+    # only if we are expecting to translate a complete coding region
+    if($complete5 and $complete3) {
+	# test if there are terminator characters inside the protein sequence!
+	if($output =~ /\*/) {
+	    $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
+	    $self->warn("Seq [$id]: Terminator codon inside CDS!");
+	}
+    }
+
+    my $seqclass;
+    if($self->can_call_new()) {
+	$seqclass = ref($self);
+    } else {
+	$seqclass = 'Bio::PrimarySeq';
+	$self->_attempt_to_load_Seq();
+    }
+    my $out = $seqclass->new( '-seq' => $output,
+			      '-display_id'  => $self->display_id,
+			      '-accession_number' => $self->accession_number,
+			      # is there anything wrong with retaining the
+			      # description?
+			      '-desc' => $self->desc(),
+			      '-alphabet' => 'protein',
+                              '-verbose' => $self->verbose
+			      );
+    return $out;
+
+}
+
+=head2 id
+
+ Title   : id
+ Usage   : $id = $seq->id()
+ Function: ID of the sequence. This should normally be (and actually is in
+           the implementation provided here) just a synonym for display_id().
+ Example :
+ Returns : A string.
+ Args    :
+
+
+=cut
+
+sub  id {
+   return shift->display_id();
+}
+
+
+=head2 length
+
+ Title   : length
+ Usage   : $len = $seq->length()
+ Function:
+ Example :
+ Returns : integer representing the length of the sequence.
+ Args    :
+
+
+=cut
+
+sub  length {
+   shift->throw_not_implemented();
+}
+
+=head2 desc
+
+ Title   : desc
+ Usage   : $seq->desc($newval);
+           $description = $seq->desc();
+ Function: Get/set description text for a seq object
+ Example :
+ Returns : value of desc
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub desc {
+   my ($self,$value) = @_;
+   $self->throw_not_implemented();
+}
+
+
+=head2 is_circular
+
+ Title   : is_circular
+ Usage   : if( $obj->is_circular) { /Do Something/ }
+ Function: Returns true if the molecule is circular
+ Returns : Boolean value
+ Args    : none
+
+=cut
+
+sub is_circular{
+    shift->throw_not_implemented();
+}
+
+=head1 Private functions
+
+These are some private functions for the PrimarySeqI interface. You do not
+need to implement these functions
+
+=head2 _attempt_to_load_Seq
+
+ Title   : _attempt_to_load_Seq
+ Usage   :
+ Function:
+ Example :
+ Returns :
+ Args    :
+
+
+=cut
+
+sub _attempt_to_load_Seq{
+   my ($self) = @_;
+
+   if( $main::{'Bio::PrimarySeq'} ) {
+       return 1;
+   } else {
+       eval {
+	   require Bio::PrimarySeq;
+       };
+       if( $@ ) {
+	   my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
+	       "This indicates that you are using Bio::PrimarySeqI ".
+	       "without Bio::PrimarySeq loaded or without providing a ".
+	       "complete implementation.\nThe most likely problem is that there ".
+	       "has been a misconfiguration of the bioperl environment\n".
+	       "Actual exception:\n\n";
+	   $self->throw("$text$@\n");
+	   return 0;
+       }
+       return 1;
+   }
+
+}
+
+1;