ensemb_rep_gvl: variant_effect_predictor/Bio/PrimarySeq.pm comparison

comparison variant_effect_predictor/Bio/PrimarySeq.pm @ 0:2bc9b66ada89 draft default tip

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 06:29:17 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:2bc9b66ada89
+# $Id: PrimarySeq.pm,v 1.73.2.1 2003/06/29 00:25:27 jason Exp $
+#
+# bioperl module for Bio::PrimarySeq
+#
+# Cared for by Ewan Birney <birney@sanger.ac.uk>
+#
+# Copyright Ewan Birney
+#
+# You may distribute this module under the same terms as perl itself
+# POD documentation - main docs before the code
+=head1 NAME
+Bio::PrimarySeq - Bioperl lightweight Sequence Object
+=head1 SYNOPSIS
+# The Bio::SeqIO for file reading, Bio::DB::GenBank for
+# database reading
+use Bio::Seq;
+use Bio::SeqIO;
+use Bio::DB::GenBank;
+#make from memory
+$seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
+				   -id  => 'GeneFragment-12',
+				   -accession_number => 'X78121',
+				   -alphabet => 'dna',
+				   -is_circular => 1
+				   );
+print "Sequence ", $seqobj->id(), " with accession ",
+$seqobj->accession_number, "\n";
+# read from file
+$inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta');
+$seqobj = $inputstream->next_seq();
+print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
+# to get out parts of the sequence.
+print "Sequence ", $seqobj->id(), " with accession ",
+$seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
+$string  = $seqobj->seq();
+$string2 = $seqobj->subseq(1,40);
+=head1 DESCRIPTION
+PrimarySeq is a lightweight Sequence object, storing little more than
+the sequence, its name, a computer useful unique name. It does not
+contain sequence features or other information.  To have a sequence
+with sequence features you should use the Seq object which uses this
+object - go perldoc Bio::Seq
+Although new users will use Bio::PrimarySeq a lot, in general you will
+be using it from the Bio::Seq object. For more information on Bio::Seq
+go perldoc Bio::Seq. For interest you might like to know that
+Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
+to do with sequence to it (the has-a relationship lets us get out of a
+otherwise nasty cyclical reference in Perl which would leak memory).
+Sequence objects are defined by the Bio::PrimarySeqI interface, and this
+object is a pure Perl implementation of the interface (if that's
+gibberish to you, don't worry. The take home message is that this
+object is the bioperl default sequence object, but other people can
+use their own objects as sequences if they so wish). If you are
+interested in wrapping your own objects as compliant Bioperl sequence
+objects, then you should read the Bio::PrimarySeqI documentation
+The documentation of this object is a merge of the Bio::PrimarySeq and
+Bio::PrimarySeqI documentation.  This allows all the methods which you can
+call on sequence objects here.
+=head1 FEEDBACK
+=head2 Mailing Lists
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+bioperl-l@bioperl.org             - General discussion
+http://bio.perl.org/MailList.html - About the mailing lists
+=head2 Reporting Bugs
+Report bugs to the Bioperl bug tracking system to help us keep track of
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+bioperl-bugs@bio.perl.org
+http://bugzilla.bioperl.org/
+=head1 AUTHOR - Ewan Birney
+Email birney@sanger.ac.uk
+Describe contact details here
+=head1 APPENDIX
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+=cut
+# Let the code begin...
+package Bio::PrimarySeq;
+use vars qw(@ISA);
+use strict;
+use Bio::Root::Root;
+use Bio::PrimarySeqI;
+use Bio::IdentifiableI;
+use Bio::DescribableI;
+@ISA = qw(Bio::Root::Root Bio::PrimarySeqI
+	  Bio::IdentifiableI Bio::DescribableI);
+#
+# Lookup table of the values alphabet() accepts; the keys are the
+# lowercase alphabet names and every value is simply true.
+#
+my %valid_type = map { ($_ => 1) } qw(dna rna protein);
+=head2 new
+Title   : new
+Usage   : $seq    = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
+-id  => 'human_id',
+					   -accession_number => 'AL000012',
+					   );
+Function: Returns a new primary seq object from
+basic constructors, being a string for the sequence
+and strings for id and accession_number.
+Note that you can provide an empty sequence string. However, in
+this case you MUST specify the type of sequence you wish to
+initialize by the parameter -alphabet. See alphabet() for possible
+values.
+Returns : a new Bio::PrimarySeq object
+Args    : -seq         => sequence string
+-display_id  => display id of the sequence (locus name)
+-accession_number => accession number
+-primary_id  => primary id (Genbank id)
+-namespace   => the namespace for the accession
+-authority   => the authority for the namespace
+-desc        => description text
+-alphabet    => sequence type (alphabet) (dna|rna|protein)
+-id          => alias for display id
+-is_circular => boolean field for whether or not sequence is circular
+=cut
+# Constructor. Object creation is delegated to the parent class; the named
+# arguments are then unpacked with _rearrange() and each supplied value is
+# handed to the matching accessor.
+# Throws when -id and -display_id are both given but differ.
+# Returns the new object.
+sub new {
+    my ($class, @args) = @_;
+    my $self = $class->SUPER::new(@args);
+
+    my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid,
+        $desc, $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len) =
+        $self->_rearrange([qw(SEQ
+                              DISPLAY_ID
+                              ACCESSION_NUMBER
+                              PRIMARY_ID
+                              NAMESPACE
+                              AUTHORITY
+                              VERSION
+                              OBJECT_ID
+                              DESC
+                              ALPHABET
+                              ID
+                              IS_CIRCULAR
+                              DIRECT
+                              REF_TO_SEQ
+                              LENGTH
+                              )],
+                          @args);
+
+    # -id is an alias for -display_id; passing both is only acceptable when
+    # they agree.
+    if (defined $id && defined $given_id && $id ne $given_id) {
+        $self->throw("Provided both id and display_id constructor ".
+                     "functions. [$id] [$given_id]");
+    }
+    $id = $given_id if defined $given_id;
+
+    # let's set the length before the seq -- if there is one, this length is
+    # going to be invalidated
+    $self->length($len) if defined $len;
+
+    # if alphabet is provided we set it first, so that it won't be guessed
+    # when the sequence is set
+    $self->alphabet($alphabet) if $alphabet;
+
+    if ($direct && $ref_to_seq) {
+        # -direct with -ref_to_seq: the referenced string is stored as is,
+        # without going through seq() and its validation.
+        $self->{'seq'} = $$ref_to_seq;
+        # the alphabet is only guessed when the caller did not supply one
+        $self->_guess_alphabet() if !$alphabet;
+    }
+    elsif (defined $seq) {
+        # note: the sequence string may be empty
+        $self->seq($seq);
+    }
+
+    # Text attributes are skipped when undef or empty, but a value such as
+    # "0" (false in boolean context) is a legitimate identifier and is kept.
+    $self->display_id($id)           if defined($id)   && $id   ne '';
+    $self->accession_number($acc)    if defined($acc)  && $acc  ne '';
+    $self->primary_id($pid)          if defined $pid;
+    $self->desc($desc)               if defined($desc) && $desc ne '';
+    $self->is_circular($is_circular) if $is_circular;
+    $self->namespace($ns)            if defined($ns)   && $ns   ne '';
+    $self->authority($auth)          if defined($auth) && $auth ne '';
+    $self->version($v)               if defined $v;
+    $self->object_id($oid)           if defined $oid;
+
+    return $self;
+}
+# Stores the given value straight into the object's 'seq' slot; nothing
+# else (validation, alphabet, length) is touched here.
+# Returns the stored value when an argument is supplied, undef otherwise.
+sub direct_seq_set {
+    my ($obj, @rest) = @_;
+    if (@rest) {
+        return $obj->{'seq'} = $rest[0];
+    }
+    return undef;
+}
+=head2 seq
+Title   : seq
+Usage   : $string    = $obj->seq()
+Function: Returns the sequence as a string of letters. The
+case of the letters is left up to the implementer.
+Suggested cases are upper case for proteins and lower case for
+DNA sequence (IUPAC standard), but you should not rely on this
+Returns : A scalar
+Args    : Optionally on set the new value (a string). An optional second
+argument presets the alphabet (otherwise it will be guessed).
+Both parameters may also be given in named parameter style
+with -seq and -alphabet being the names.
+=cut
+# Get or set the sequence string.
+#   no arguments       : returns the stored sequence
+#   ($value)           : validates and stores $value, guessing the alphabet
+#                        when needed
+#   ($value,$alphabet) : as above, but the alphabet is set explicitly
+# Throws if a defined $value is rejected by validate_seq().
+# Returns the stored sequence.
+sub seq {
+    my ($obj, @args) = @_;
+
+    # plain getter
+    return $obj->{'seq'} unless @args;
+
+    my ($value, $alphabet) = @args;
+    if (defined($value) && !$obj->validate_seq($value)) {
+        $obj->throw("Attempting to set the sequence to [$value] ".
+                    "which does not look healthy");
+    }
+
+    # if a sequence was already set we make sure that we re-adjust the
+    # mol.type, otherwise we skip guessing if mol.type is already set
+    # note: if the new seq is empty or undef, we don't consider that a
+    # change (we wouldn't have anything to guess on anyway)
+    my $is_nonempty    = CORE::length($value || '') > 0;
+    my $is_changed_seq = exists($obj->{'seq'}) && $is_nonempty;
+    $obj->{'seq'} = $value;
+
+    if ($alphabet) {
+        # an explicitly given alphabet always wins
+        $obj->alphabet($alphabet);
+    }
+    elsif ($is_changed_seq || !defined($obj->alphabet())) {
+        # a previous sequence was replaced, or there is no alphabet yet:
+        # guess the (possibly new) alphabet
+        $obj->_guess_alphabet();
+    } # else (seq not changed and alphabet was defined) do nothing
+
+    # Any non-empty sequence invalidates a previously set length. This must
+    # also happen on the very first assignment (when no 'seq' key existed
+    # yet), otherwise a length preset beforehand would keep being reported
+    # even though it disagrees with the sequence string.
+    $obj->length(undef) if $is_nonempty;
+
+    return $obj->{'seq'};
+}
+=head2 validate_seq
+Title   : validate_seq
+Usage   : if(! $seq->validate_seq($seq_str) ) {
+print "sequence $seq_str is not valid for an object of type ",
+		      ref($seq), "\n";
+	   }
+Function: Validates a given sequence string. A validating sequence string
+must be accepted by seq(). A string that does not validate will
+lead to an exception if passed to seq().
+The implementation provided here does not take alphabet() into
+account. Allowed are all letters (A-Z) and '-','.', '*' and '?'.
+Example :
+Returns : 1 if the supplied sequence string is valid for the object, and
+0 otherwise.
+Args    : The sequence string to be validated.
+=cut
+# Checks whether a sequence string consists only of the accepted symbols
+# (letters, '-', '.', '*' and '?'). Without an argument the object's own
+# sequence is checked.
+# Returns 1 for an acceptable (or empty) string, 0 for an undefined or
+# rejected one; a rejection is also reported through warn().
+sub validate_seq {
+    my ($self, $seqstr) = @_;
+    $seqstr = $self->seq() unless defined $seqstr;
+    return 0 unless defined $seqstr;
+
+    if ((CORE::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) {
+        # Collect the offending runs in list context; concatenating the
+        # match directly would evaluate it in scalar context and only
+        # yield a true/false value instead of the characters themselves.
+        my @mismatches = ($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g);
+        $self->warn("seq doesn't validate, mismatch is " .
+                    join(',', @mismatches));
+        return 0;
+    }
+    return 1;
+}
+=head2 subseq
+Title   : subseq
+Usage   : $substring = $obj->subseq(10,40);
+Function: returns the subseq from start to end, where the first base
+is 1 and the number is inclusive, ie 1-2 are the first two
+bases of the sequence
+Returns : a string
+Args    : integer for start position
+integer for end position
+OR
+Bio::LocationI location for subseq (strand honored)
+=cut
+# Returns part of the sequence.
+#   ($start,$end) : 1-based, inclusive coordinates
+#   ($location)   : a Bio::LocationI; the pieces of all sub-locations are
+#                   concatenated, reverse-complementing pieces on the
+#                   negative strand
+# An optional trailing argument is passed to substr() as replacement text.
+# Throws when start > end, start <= 0 or end exceeds the sequence length.
+# With any other argument combination a warning is issued.
+sub subseq {
+    my ($self, $start, $end, $replace) = @_;
+
+    if (ref($start) && $start->isa('Bio::LocationI')) {
+        my $loc = $start;
+        $replace = $end; # do we really use this anywhere? scary. HL
+        my $seq = "";
+        foreach my $subloc ($loc->each_Location()) {
+            my $piece = $self->subseq($subloc->start(),
+                                      $subloc->end(), $replace);
+            if ($subloc->strand() < 0) {
+                $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq();
+            }
+            $seq .= $piece;
+        }
+        return $seq;
+    }
+    elsif (defined $start && defined $end) {
+        if ($start > $end) {
+            # start == end is fine (a single symbol), so the requirement is
+            # "less than or equal to"
+            $self->throw("in subseq, start [$start] has to be ".
+                         "less than or equal to end [$end]");
+        }
+        if ($start <= 0 || $end > $self->length) {
+            $self->throw("You have to have start positive\n\tand length less ".
+                         "than the total length of sequence [$start:$end] ".
+                         "Total ".$self->length."");
+        }
+
+        # remove one from start, and then length is end-start
+        $start--;
+        if (defined $replace) {
+            # NOTE(review): the replacement is applied to the value returned
+            # by seq(), which looks like a temporary copy rather than the
+            # stored sequence -- confirm whether callers rely on this.
+            return substr($self->seq(), $start, ($end - $start), $replace);
+        }
+        else {
+            return substr($self->seq(), $start, ($end - $start));
+        }
+    }
+    else {
+        # show missing arguments as 'undef' instead of interpolating undef
+        my ($shown_start, $shown_end) =
+            map { defined $_ ? $_ : 'undef' } ($start, $end);
+        $self->warn("Incorrect parameters to subseq - must be two integers ".
+                    "or a Bio::LocationI object not ($shown_start,$shown_end)");
+    }
+}
+=head2 length
+Title   : length
+Usage   : $len = $seq->length();
+Function: Get the length of the sequence in number of symbols (bases
+or amino acids).
+You can also set this attribute, even to a number that does
+not match the length of the sequence string. This is useful
+if you don't want to set the sequence too, or if you want
+to free up memory by unsetting the sequence. In the latter
+case you could do e.g.
+$seq->length($seq->length);
+$seq->seq(undef);
+Note that if you set the sequence to a value other than
+undef at any time, the length attribute will be
+invalidated, and the length of the sequence string will be
+reported again. Also, we won't let you lie about the length.
+Example :
+Returns : integer representing the length of the sequence.
+Args    : Optionally, the value on set
+=cut
+# Get or set the sequence length.
+#   getter : the length of the stored sequence string when there is one,
+#            otherwise a previously set length, otherwise 0
+#   setter : remembers the given length (undef clears it); throws when a
+#            non-empty sequence is present and its length differs
+# Returns the effective length after the call.
+sub length {
+    my $self = shift;
+
+    my $seqstr = $self->seq();
+    # test definedness rather than truth so that a string like "0" counts
+    my $len = CORE::length(defined($seqstr) ? $seqstr : '');
+
+    if (@_) {
+        my $val = shift;
+        if (defined($val) && $len && ($len != $val)) {
+            $self->throw("You're trying to lie about the length: ".
+                         "is $len but you say ".$val);
+        }
+        $self->{'_seq_length'} = $val;
+    }
+
+    # A real sequence string always wins: a remembered length either
+    # equals it (enforced above) or is stale.
+    return $len if $len > 0;
+
+    # No sequence string: report the remembered length, if any. This also
+    # makes the setter return the value it has just stored.
+    return defined($self->{'_seq_length'}) ? $self->{'_seq_length'} : $len;
+}
+=head2 display_id
+Title   : display_id or display_name
+Usage   : $id_string = $obj->display_id();
+Function: returns the display id, aka the common name of the Sequence object.
+The semantics of this is that it is the most likely string to
+be used as an identifier of the sequence, and likely to have
+"human" readability.  The id is equivalent to the ID field of
+the GenBank/EMBL databanks and the id field of the
+Swissprot/sptrembl database. In fasta format, the >(\S+) is
+presumed to be the id, though some people overload the id to
+embed other information. Bioperl does not use any embedded
+information in the ID field, and people are encouraged to use
+other mechanisms (accession field for example, or extending
+the sequence object) to solve this.
+With the new Bio::DescribableI interface, display_name aliases
+to this method.
+Returns : A string
+Args    : A string (optional, for setting)
+=cut
+# Get/set accessor for the display id. A defined argument replaces the
+# stored value; the current value is returned in either case.
+sub display_id {
+    my ($obj, $value) = @_;
+    $obj->{'display_id'} = $value if defined $value;
+    return $obj->{'display_id'};
+}
+=head2 accession_number
+Title   : accession_number or object_id
+Usage   : $unique_key = $obj->accession_number;
+Function: Returns the unique biological id for a sequence, commonly
+called the accession_number. For sequences from established
+databases, the implementors should try to use the correct
+accession number. Notice that primary_id() provides the
+unique id for the implementation, allowing multiple objects
+to have the same accession number in a particular implementation.
+For sequences with no accession number, this method should
+return "unknown".
+[Note this method name is likely to change in 1.3]
+With the new Bio::IdentifiableI interface, this is aliased
+to object_id
+Returns : A string
+Args    : A string (optional) for setting
+=cut
+# Get/set accessor for the accession number. A defined argument is stored
+# and returned; otherwise the stored value is returned, or the string
+# 'unknown' when none has been set.
+sub accession_number {
+    my ($obj, $acc) = @_;
+    if (defined $acc) {
+        $obj->{'accession_number'} = $acc;
+        return $acc;
+    }
+    my $stored = $obj->{'accession_number'};
+    return defined($stored) ? $stored : 'unknown';
+}
+=head2 primary_id
+Title   : primary_id
+Usage   : $unique_key = $obj->primary_id;
+Function: Returns the unique id for this object in this
+implementation. This allows implementations to manage their
+own object ids in a way the implementation can control.
+Clients can expect one id to map to one object.
+For sequences with no natural primary id, this method
+should return a stringified memory location.
+Returns : A string
+Args    : A string (optional, for setting)
+=cut
+# Get/set accessor for the primary id. A defined argument is stored.
+# While no primary id has ever been stored, the stringified object
+# reference is returned instead.
+sub primary_id {
+    my ($obj, $value) = @_;
+    $obj->{'primary_id'} = $value if defined $value;
+    return exists($obj->{'primary_id'}) ? $obj->{'primary_id'} : "$obj";
+}
+=head2 alphabet
+Title   : alphabet
+Usage   : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
+Function: Returns the type of sequence being one of
+'dna', 'rna' or 'protein'. This is case sensitive.
+This is not called <type> because this would cause
+upgrade problems from the 0.5 and earlier Seq objects.
+Returns : a string either 'dna','rna','protein'. NB - the object must
+make a call of the type - if there is no type specified it
+has to guess.
+Args    : optionally, the new alphabet ('dna', 'rna' or 'protein')
+=cut
+# Get/set accessor for the alphabet. A defined argument is lowercased and
+# checked against %valid_type; an unknown value is reported through
+# throw(). Returns the stored alphabet.
+sub alphabet {
+    my ($obj, $value) = @_;
+
+    # getter
+    return $obj->{'alphabet'} unless defined $value;
+
+    $value = lc $value;
+    if (!$valid_type{$value}) {
+        my $allowed = join(',', map "'$_'", sort keys %valid_type);
+        $obj->throw("Molecular type '$value' is not a valid type (".
+                    $allowed .
+                    ") lowercase");
+    }
+    $obj->{'alphabet'} = $value;
+    return $obj->{'alphabet'};
+}
+=head2 desc
+Title   : desc or description
+Usage   : $obj->desc($newval)
+Function: Get/set description of the sequence.
+description is an alias for this for compliance with the
+Bio::DescribableI interface.
+Example :
+Returns : value of desc (a string)
+Args    : newvalue (a string or undef, optional)
+=cut
+# Get/set accessor for the description. Any argument, including undef,
+# replaces the stored value; the current value is returned.
+sub desc {
+    my ($self, @new) = @_;
+    if (@new) {
+        return $self->{'desc'} = $new[0];
+    }
+    return $self->{'desc'};
+}
+=head2 can_call_new
+Title   : can_call_new
+Usage   :
+Function:
+Example :
+Returns : true
+Args    :
+=cut
+# Always returns true (1), whatever it is called on or with.
+sub can_call_new {
+    return 1;
+}
+=head2 id
+Title   : id
+Usage   : $id = $seq->id()
+Function: This is mapped on display_id
+Example :
+Returns :
+Args    :
+=cut
+# Alias: forwards all arguments to display_id() and returns its result.
+sub id {
+    my ($self, @args) = @_;
+    return $self->display_id(@args);
+}
+=head2 is_circular
+Title   : is_circular
+Usage   : if( $obj->is_circular) { /Do Something/ }
+Function: Returns true if the molecule is circular
+Returns : Boolean value
+Args    : optionally, the new boolean value
+=cut
+# Get/set accessor for the circularity flag. Any argument replaces the
+# stored value; the current value is returned.
+sub is_circular {
+    my ($self, @new) = @_;
+    if (@new) {
+        return $self->{'is_circular'} = $new[0];
+    }
+    return $self->{'is_circular'};
+}
+=head1 Methods for Bio::IdentifiableI compliance
+=cut
+=head2 object_id
+Title   : object_id
+Usage   : $string    = $obj->object_id()
+Function: a string which represents the stable primary identifier
+in this namespace of this object. For DNA sequences this
+is its accession_number, similarly for protein sequences
+This is aliased to accession_number().
+Returns : A scalar
+=cut
+# Alias: forwards all arguments to accession_number() and returns its
+# result.
+sub object_id {
+    my ($self, @args) = @_;
+    return $self->accession_number(@args);
+}
+=head2 version
+Title   : version
+Usage   : $version    = $obj->version()
+Function: a number which differentiates between versions of
+the same object. Higher numbers are considered to be
+later and more relevant, but a single object described by
+the same identifier should represent the same concept
+Returns : A number
+=cut
+# Get/set accessor for the version. A defined argument replaces the
+# stored value; the current value is returned.
+sub version {
+    my ($self, $value) = @_;
+    $self->{'_version'} = $value if defined $value;
+    return $self->{'_version'};
+}
+=head2 authority
+Title   : authority
+Usage   : $authority    = $obj->authority()
+Function: a string which represents the organisation which
+granted the namespace, written as the DNS name for
+organisation (eg, wormbase.org)
+Returns : A scalar
+=cut
+# Get/set accessor for the authority. A defined argument replaces the
+# stored value; the current value is returned.
+sub authority {
+    my ($obj, $value) = @_;
+    $obj->{'authority'} = $value if defined $value;
+    return $obj->{'authority'};
+}
+=head2 namespace
+Title   : namespace
+Usage   : $string    = $obj->namespace()
+Function: A string representing the name space this identifier
+is valid in, often the database name or the name
+describing the collection
+Returns : A scalar
+=cut
+# Get/set accessor for the namespace. A defined argument replaces the
+# stored value. Returns the stored value, or the empty string when the
+# stored value is unset or false.
+sub namespace {
+    my ($self, $value) = @_;
+    $self->{'namespace'} = $value if defined $value;
+    my $current = $self->{'namespace'};
+    return $current ? $current : "";
+}
+=head1 Methods for Bio::DescribableI compliance
+This comprises display_name and description.
+=cut
+=head2 display_name
+Title   : display_name
+Usage   : $string    = $obj->display_name()
+Function: A string which is what should be displayed to the user
+the string should have no spaces (ideally, though a cautious
+user of this interface would not assume this) and should be
+less than thirty characters (though again, double checking
+this is a good idea)
+This is aliased to display_id().
+Returns : A scalar
+=cut
+# Alias: forwards all arguments to display_id() and returns its result.
+sub display_name {
+    my ($self, @args) = @_;
+    return $self->display_id(@args);
+}
+=head2 description
+Title   : description
+Usage   : $string    = $obj->description()
+Function: A text string suitable for displaying to the user a
+description. This string is likely to have spaces, but
+should not have any newlines or formatting - just plain
+text. The string should not be greater than 255 characters
+and clients can feel justified at truncating strings at 255
+characters for the purposes of display
+This is aliased to desc().
+Returns : A scalar
+=cut
+# Alias: forwards all arguments to desc() and returns its result.
+sub description {
+    my ($self, @args) = @_;
+    return $self->desc(@args);
+}
+=head1 Methods Inherited from Bio::PrimarySeqI
+These methods are available on Bio::PrimarySeq, although they are
+actually implemented on Bio::PrimarySeqI
+=head2 revcom
+Title   : revcom
+Usage   : $rev = $seq->revcom()
+Function: Produces a new Bio::SeqI implementing object which
+is the reversed complement of the sequence. For protein
+sequences this throws an exception of
+"Sequence is a protein. Cannot revcom"
+The id is the same id as the original sequence, and the
+accession number is also identical. If someone wants to
+track that this sequence has been reversed, it needs to
+define its own extensions
+To do an inplace edit of an object you can go:
+$seqobj = $seqobj->revcom();
+This of course, causes Perl to handle the garbage
+collection of the old object, but it is roughly speaking as
+efficient as an inplace edit.
+Returns : A new (fresh) Bio::SeqI object
+Args    : none
+=cut
+=head2 trunc
+Title   : trunc
+Usage   : $subseq = $myseq->trunc(10,100);
+Function: Provides a truncation of a sequence,
+Example :
+Returns : a fresh Bio::SeqI implementing object
+Args    :
+=cut
+=head1 Internal methods
+These are internal methods to PrimarySeq
+=cut
+=head2 _guess_alphabet
+Title   : _guess_alphabet
+Usage   :
+Function:
+Example :
+Returns :
+Args    :
+=cut
+# Guesses the alphabet from the composition of the stored sequence and
+# records it through alphabet().
+#   more than 85% A/T/G/C/N (either case)    -> 'dna'
+#   more than 85% once U/u is counted as well -> 'rna'
+#   anything else                             -> 'protein'
+# The symbols '-', '.' and '?' are ignored when computing the fractions.
+# Throws when the sequence is empty or undefined.
+# Returns the guessed alphabet.
+sub _guess_alphabet {
+    my ($self) = @_;
+
+    my $raw = $self->seq();
+    $raw = '' unless defined $raw;    # avoid uninitialized-value warnings
+
+    if (CORE::length($raw) == 0) {
+        $self->throw("Got a sequence with no letters in - ".
+                     "cannot guess alphabet [$raw]");
+    }
+
+    # Drop every individual gap/unknown symbol. A character class is needed
+    # here: without the brackets only the literal three-character string
+    # "-.?" would be removed and gaps would dilute the nucleotide fraction.
+    (my $letters = $raw) =~ s/[\-\.\?]//g;
+    my $total = CORE::length($letters);
+
+    # A string made up solely of ignored symbols has nothing to count; it
+    # falls through to 'protein', as such strings always have.
+    my $type = 'protein';
+    if ($total > 0) {
+        my $u    = ($letters =~ tr/Uu//);
+        my $atgc = ($letters =~ tr/ATGCNatgcn//);
+        if (($atgc / $total) > 0.85) {
+            $type = 'dna';
+        }
+        elsif ((($atgc + $u) / $total) > 0.85) {
+            $type = 'rna';
+        }
+    }
+
+    $self->alphabet($type);
+    return $type;
+}
+############################################################################
+# aliases due to name changes or to compensate for our lack of consistency #
+############################################################################
+# Deprecated alias: issues a deprecation warning through warn(), then
+# forwards all arguments to accession_number() and returns its result.
+sub accession {
+    my ($self, @args) = @_;
+    my $class = ref($self);
+    $self->warn($class."::accession is deprecated, ".
+                "use accession_number() instead");
+    return $self->accession_number(@args);
+}
+1;

Mercurial > repos > mahtabm > ensemb_rep_gvl

comparison variant_effect_predictor/Bio/PrimarySeq.pm @ 0:2bc9b66ada89 draft default tip