Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/PrimarySeq.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/PrimarySeq.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,866 @@ +# $Id: PrimarySeq.pm,v 1.73.2.1 2003/06/29 00:25:27 jason Exp $ +# +# bioperl module for Bio::PrimarySeq +# +# Cared for by Ewan Birney <birney@sanger.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::PrimarySeq - Bioperl lightweight Sequence Object + +=head1 SYNOPSIS + + # The Bio::SeqIO for file reading, Bio::DB::GenBank for + # database reading + + use Bio::Seq; + use Bio::SeqIO; + use Bio::DB::GenBank; + + #make from memory + $seqobj = Bio::PrimarySeq->new ( -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG', + -id => 'GeneFragment-12', + -accession_number => 'X78121', + -alphabet => 'dna', + -is_circular => 1 + ); + print "Sequence ", $seqobj->id(), " with accession ", + $seqobj->accession_number, "\n"; + + # read from file + $inputstream = Bio::SeqIO->new(-file => "myseq.fa",-format => 'Fasta'); + $seqobj = $inputstream->next_seq(); + print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n"; + + + # to get out parts of the sequence. + + print "Sequence ", $seqobj->id(), " with accession ", + $seqobj->accession_number, " and desc ", $seqobj->desc, "\n"; + + $string = $seqobj->seq(); + $string2 = $seqobj->subseq(1,40); + + +=head1 DESCRIPTION + +PrimarySeq is a lightweight Sequence object, storing little more than +the sequence, its name, a computer useful unique name. It does not +contain sequence features or other information. To have a sequence +with sequence features you should use the Seq object which uses this +object - go perldoc Bio::Seq + +Although newusers will use Bio::PrimarySeq alot, in general you will +be using it from the Bio::Seq object. For more information on Bio::Seq +go perldoc Bio::Seq. For interest you might like to known that +Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls +to do with sequence to it (the has-a relationship lets us get out of a +otherwise nasty cyclical reference in Perl which would leak memory). + +Sequence objects are defined by the Bio::PrimarySeqI interface, and this +object is a pure Perl implementation of the interface (if that's +gibberish to you, don't worry. The take home message is that this +object is the bioperl default sequence object, but other people can +use their own objects as sequences if they so wish). If you are +interested in wrapping your own objects as compliant Bioperl sequence +objects, then you should read the Bio::PrimarySeqI documentation + +The documenation of this object is a merge of the Bio::PrimarySeq and +Bio::PrimarySeqI documentation. This allows all the methods which you can +call on sequence objects here. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney + +Email birney@sanger.ac.uk + +Describe contact details here + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::PrimarySeq; +use vars qw(@ISA); +use strict; + +use Bio::Root::Root; +use Bio::PrimarySeqI; +use Bio::IdentifiableI; +use Bio::DescribableI; + +@ISA = qw(Bio::Root::Root Bio::PrimarySeqI + Bio::IdentifiableI Bio::DescribableI); + +# +# setup the allowed values for alphabet() +# + +my %valid_type = map {$_, 1} qw( dna rna protein ); + +=head2 new + + Title : new + Usage : $seq = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT', + -id => 'human_id', + -accession_number => 'AL000012', + ); + + Function: Returns a new primary seq object from + basic constructors, being a string for the sequence + and strings for id and accession_number. + + Note that you can provide an empty sequence string. However, in + this case you MUST specify the type of sequence you wish to + initialize by the parameter -alphabet. See alphabet() for possible + values. + Returns : a new Bio::PrimarySeq object + Args : -seq => sequence string + -display_id => display id of the sequence (locus name) + -accession_number => accession number + -primary_id => primary id (Genbank id) + -namespace => the namespace for the accession + -authority => the authority for the namespace + -desc => description text + -alphabet => sequence type (alphabet) (dna|rna|protein) + -id => alias for display id + -is_circular => boolean field for whether or not sequence is circular + +=cut + + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + my($seq,$id,$acc,$pid,$ns,$auth,$v,$oid, + $desc,$alphabet,$given_id,$is_circular,$direct,$ref_to_seq,$len) = + $self->_rearrange([qw(SEQ + DISPLAY_ID + ACCESSION_NUMBER + PRIMARY_ID + NAMESPACE + AUTHORITY + VERSION + OBJECT_ID + DESC + ALPHABET + ID + IS_CIRCULAR + DIRECT + REF_TO_SEQ + LENGTH + )], + @args); + if( defined $id && defined $given_id ) { + if( $id ne $given_id ) { + $self->throw("Provided both id and display_id constructor ". + "functions. [$id] [$given_id]"); + } + } + if( defined $given_id ) { $id = $given_id; } + + # let's set the length before the seq -- if there is one, this length is + # going to be invalidated + defined $len && $self->length($len); + + # if alphabet is provided we set it first, so that it won't be guessed + # when the sequence is set + $alphabet && $self->alphabet($alphabet); + + # if there is an alphabet, and direct is passed in, assumme the alphabet + # and sequence is ok + + if( $direct && $ref_to_seq) { + $self->{'seq'} = $$ref_to_seq; + if( ! $alphabet ) { + $self->_guess_alphabet(); + } # else it has been set already above + } else { +# print STDERR "DEBUG: setting sequence to [$seq]\n"; + # note: the sequence string may be empty + $self->seq($seq) if defined($seq); + } + + $id && $self->display_id($id); + $acc && $self->accession_number($acc); + defined $pid && $self->primary_id($pid); + $desc && $self->desc($desc); + $is_circular && $self->is_circular($is_circular); + $ns && $self->namespace($ns); + $auth && $self->authority($auth); + defined($v) && $self->version($v); + defined($oid) && $self->object_id($oid); + + return $self; +} + +sub direct_seq_set { + my $obj = shift; + return $obj->{'seq'} = shift if @_; + return undef; +} + + +=head2 seq + + Title : seq + Usage : $string = $obj->seq() + Function: Returns the sequence as a string of letters. The + case of the letters is left up to the implementer. + Suggested cases are upper case for proteins and lower case for + DNA sequence (IUPAC standard), but you should not rely on this + Returns : A scalar + Args : Optionally on set the new value (a string). An optional second + argument presets the alphabet (otherwise it will be guessed). + Both parameters may also be given in named paramater style + with -seq and -alphabet being the names. + +=cut + +sub seq { + my ($obj,@args) = @_; + + if( scalar(@args) == 0 ) { + return $obj->{'seq'}; + } + + my ($value,$alphabet) = @args; + + + if(@args) { + if(defined($value) && (! $obj->validate_seq($value))) { + $obj->throw("Attempting to set the sequence to [$value] ". + "which does not look healthy"); + } + # if a sequence was already set we make sure that we re-adjust the + # mol.type, otherwise we skip guessing if mol.type is already set + # note: if the new seq is empty or undef, we don't consider that a + # change (we wouldn't have anything to guess on anyway) + my $is_changed_seq = + exists($obj->{'seq'}) && (CORE::length($value || '') > 0); + $obj->{'seq'} = $value; + # new alphabet overridden by arguments? + if($alphabet) { + # yes, set it no matter what + $obj->alphabet($alphabet); + } elsif( # if we changed a previous sequence to a new one + $is_changed_seq || + # or if there is no alphabet yet at all + (! defined($obj->alphabet()))) { + # we need to guess the (possibly new) alphabet + $obj->_guess_alphabet(); + } # else (seq not changed and alphabet was defined) do nothing + # if the seq is changed, make sure we unset a possibly set length + $obj->length(undef) if $is_changed_seq; + } + return $obj->{'seq'}; +} + +=head2 validate_seq + + Title : validate_seq + Usage : if(! $seq->validate_seq($seq_str) ) { + print "sequence $seq_str is not valid for an object of type ", + ref($seq), "\n"; + } + Function: Validates a given sequence string. A validating sequence string + must be accepted by seq(). A string that does not validate will + lead to an exception if passed to seq(). + + The implementation provided here does not take alphabet() into + account. Allowed are all letters (A-Z) and '-','.', '*' and '?'. + + Example : + Returns : 1 if the supplied sequence string is valid for the object, and + 0 otherwise. + Args : The sequence string to be validated. + + +=cut + +sub validate_seq { + my ($self,$seqstr) = @_; + if( ! defined $seqstr ){ $seqstr = $self->seq(); } + return 0 unless( defined $seqstr); + if((CORE::length($seqstr) > 0) && ($seqstr !~ /^([A-Za-z\-\.\*\?]+)$/)) { + $self->warn("seq doesn't validate, mismatch is " . + ($seqstr =~ /([^A-Za-z\-\.\*\?]+)/g)); + return 0; + } + return 1; +} + +=head2 subseq + + Title : subseq + Usage : $substring = $obj->subseq(10,40); + Function: returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence + Returns : a string + Args : integer for start position + integer for end position + OR + Bio::LocationI location for subseq (strand honored) + +=cut + +sub subseq { + my ($self,$start,$end,$replace) = @_; + + if( ref($start) && $start->isa('Bio::LocationI') ) { + my $loc = $start; + $replace = $end; # do we really use this anywhere? scary. HL + my $seq = ""; + foreach my $subloc ($loc->each_Location()) { + my $piece = $self->subseq($subloc->start(), + $subloc->end(), $replace); + if($subloc->strand() < 0) { + $piece = Bio::PrimarySeq->new('-seq' => $piece)->revcom()->seq(); + } + $seq .= $piece; + } + return $seq; + } elsif( defined $start && defined $end ) { + if( $start > $end ){ + $self->throw("in subseq, start [$start] has to be ". + "greater than end [$end]"); + } + if( $start <= 0 || $end > $self->length ) { + $self->throw("You have to have start positive\n\tand length less ". + "than the total length of sequence [$start:$end] ". + "Total ".$self->length.""); + } + + # remove one from start, and then length is end-start + $start--; + if( defined $replace ) { + return substr( $self->seq(), $start, ($end-$start), $replace); + } else { + return substr( $self->seq(), $start, ($end-$start)); + } + } else { + $self->warn("Incorrect parameters to subseq - must be two integers ". + "or a Bio::LocationI object not ($start,$end)"); + } +} + +=head2 length + + Title : length + Usage : $len = $seq->length(); + Function: Get the length of the sequence in number of symbols (bases + or amino acids). + + You can also set this attribute, even to a number that does + not match the length of the sequence string. This is useful + if you don''t want to set the sequence too, or if you want + to free up memory by unsetting the sequence. In the latter + case you could do e.g. + + $seq->length($seq->length); + $seq->seq(undef); + + Note that if you set the sequence to a value other than + undef at any time, the length attribute will be + invalidated, and the length of the sequence string will be + reported again. Also, we won''t let you lie about the length. + + Example : + Returns : integer representing the length of the sequence. + Args : Optionally, the value on set + +=cut + +sub length { + my $self = shift; + my $len = CORE::length($self->seq() || ''); + + if(@_) { + my $val = shift; + if(defined($val) && $len && ($len != $val)) { + $self->throw("You're trying to lie about the length: ". + "is $len but you say ".$val); + } + $self->{'_seq_length'} = $val; + } elsif(defined($self->{'_seq_length'})) { + return $self->{'_seq_length'}; + } + return $len; +} + +=head2 display_id + + Title : display_id or display_name + Usage : $id_string = $obj->display_id(); + Function: returns the display id, aka the common name of the Sequence object. + + The semantics of this is that it is the most likely string to + be used as an identifier of the sequence, and likely to have + "human" readability. The id is equivalent to the ID field of + the GenBank/EMBL databanks and the id field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id to + embed other information. Bioperl does not use any embedded + information in the ID field, and people are encouraged to use + other mechanisms (accession field for example, or extending + the sequence object) to solve this. + + With the new Bio::DescribeableI interface, display_name aliases + to this method. + + Returns : A string + Args : None + + +=cut + +sub display_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'display_id'} = $value; + } + return $obj->{'display_id'}; + +} + +=head2 accession_number + + Title : accession_number or object_id + Usage : $unique_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the + unique id for the implemetation, allowing multiple objects + to have the same accession number in a particular implementation. + + For sequences with no accession number, this method should + return "unknown". + + [Note this method name is likely to change in 1.3] + + With the new Bio::IdentifiableI interface, this is aliased + to object_id + + Returns : A string + Args : A string (optional) for setting + +=cut + +sub accession_number { + my( $obj, $acc ) = @_; + + if (defined $acc) { + $obj->{'accession_number'} = $acc; + } else { + $acc = $obj->{'accession_number'}; + $acc = 'unknown' unless defined $acc; + } + return $acc; +} + +=head2 primary_id + + Title : primary_id + Usage : $unique_key = $obj->primary_id; + Function: Returns the unique id for this object in this + implementation. This allows implementations to manage their + own object ids in a way the implementaiton can control + clients can expect one id to map to one object. + + For sequences with no natural primary id, this method + should return a stringified memory location. + + Returns : A string + Args : A string (optional, for setting) + +=cut + +sub primary_id { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'primary_id'} = $value; + } + if( ! exists $obj->{'primary_id'} ) { + return "$obj"; + } + return $obj->{'primary_id'}; + +} + + +=head2 alphabet + + Title : alphabet + Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + This is not called <type> because this would cause + upgrade problems from the 0.5 and earlier Seq objects. + + Returns : a string either 'dna','rna','protein'. NB - the object must + make a call of the type - if there is no type specified it + has to guess. + Args : none + + +=cut + +sub alphabet { + my ($obj,$value) = @_; + if (defined $value) { + $value = lc $value; + unless ( $valid_type{$value} ) { + $obj->throw("Molecular type '$value' is not a valid type (". + join(',', map "'$_'", sort keys %valid_type) . + ") lowercase"); + } + $obj->{'alphabet'} = $value; + } + return $obj->{'alphabet'}; +} + +=head2 desc + + Title : desc or description + Usage : $obj->desc($newval) + Function: Get/set description of the sequence. + + description is an alias for this for compliance with the + Bio::DescribeableI interface. + + Example : + Returns : value of desc (a string) + Args : newvalue (a string or undef, optional) + + +=cut + +sub desc{ + my $self = shift; + + return $self->{'desc'} = shift if @_; + return $self->{'desc'}; +} + +=head2 can_call_new + + Title : can_call_new + Usage : + Function: + Example : + Returns : true + Args : + + +=cut + +sub can_call_new { + my ($self) = @_; + + return 1; + +} + +=head2 id + + Title : id + Usage : $id = $seq->id() + Function: This is mapped on display_id + Example : + Returns : + Args : + + +=cut + +sub id { + return shift->display_id(@_); +} + +=head2 is_circular + + Title : is_circular + Usage : if( $obj->is_circular) { /Do Something/ } + Function: Returns true if the molecule is circular + Returns : Boolean value + Args : none + +=cut + +sub is_circular{ + my $self = shift; + return $self->{'is_circular'} = shift if @_; + return $self->{'is_circular'}; +} + +=head1 Methods for Bio::IdentifiableI compliance + +=cut + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. For DNA sequences this + is its accession_number, similarly for protein sequences + + This is aliased to accession_number(). + Returns : A scalar + + +=cut + +sub object_id { + return shift->accession_number(@_); +} + +=head2 version + + Title : version + Usage : $version = $obj->version() + Function: a number which differentiates between versions of + the same object. Higher numbers are considered to be + later and more relevant, but a single object described + the same identifier should represent the same concept + + Returns : A number + +=cut + +sub version{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'_version'} = $value; + } + return $self->{'_version'}; +} + + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + Returns : A scalar + +=cut + +sub authority { + my ($obj,$value) = @_; + if( defined $value) { + $obj->{'authority'} = $value; + } + return $obj->{'authority'}; +} + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection + + Returns : A scalar + + +=cut + +sub namespace{ + my ($self,$value) = @_; + if( defined $value) { + $self->{'namespace'} = $value; + } + return $self->{'namespace'} || ""; +} + +=head1 Methods for Bio::DescribableI compliance + +This comprises of display_name and description. + +=cut + +=head2 display_name + + Title : display_name + Usage : $string = $obj->display_name() + Function: A string which is what should be displayed to the user + the string should have no spaces (ideally, though a cautious + user of this interface would not assumme this) and should be + less than thirty characters (though again, double checking + this is a good idea) + + This is aliased to display_id(). + Returns : A scalar + +=cut + +sub display_name { + return shift->display_id(@_); +} + +=head2 description + + Title : description + Usage : $string = $obj->description() + Function: A text string suitable for displaying to the user a + description. This string is likely to have spaces, but + should not have any newlines or formatting - just plain + text. The string should not be greater than 255 characters + and clients can feel justified at truncating strings at 255 + characters for the purposes of display + + This is aliased to desc(). + Returns : A scalar + +=cut + +sub description { + return shift->desc(@_); +} + +=head1 Methods Inherited from Bio::PrimarySeqI + +These methods are available on Bio::PrimarySeq, although they are +actually implemented on Bio::PrimarySeqI + +=head2 revcom + + Title : revcom + Usage : $rev = $seq->revcom() + Function: Produces a new Bio::SeqI implementing object which + is the reversed complement of the sequence. For protein + sequences this throws an exception of + "Sequence is a protein. Cannot revcom" + + The id is the same id as the orginal sequence, and the + accession number is also indentical. If someone wants to + track that this sequence has be reversed, it needs to + define its own extensions + + To do an inplace edit of an object you can go: + + $seqobj = $seqobj->revcom(); + + This of course, causes Perl to handle the garbage + collection of the old object, but it is roughly speaking as + efficient as an inplace edit. + + Returns : A new (fresh) Bio::SeqI object + Args : none + +=cut + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence, + + Example : + Returns : a fresh Bio::SeqI implementing object + Args : + + +=cut + +=head1 Internal methods + +These are internal methods to PrimarySeq + +=cut + +=head2 _guess_alphabet + + Title : _guess_alphabet + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub _guess_alphabet { + my ($self) = @_; + my ($str,$str2,$total,$atgc,$u,$type); + + $str = $self->seq(); + $str =~ s/\-\.\?//g; + + $total = CORE::length($str); + if( $total == 0 ) { + $self->throw("Got a sequence with no letters in - ". + "cannot guess alphabet [$str]"); + } + + $u = ($str =~ tr/Uu//); + $atgc = ($str =~ tr/ATGCNatgcn//); + + if( ($atgc / $total) > 0.85 ) { + $type = 'dna'; + } elsif( (($atgc + $u) / $total) > 0.85 ) { + $type = 'rna'; + } else { + $type = 'protein'; + } + + $self->alphabet($type); + return $type; +} + +############################################################################ +# aliases due to name changes or to compensate for our lack of consistency # +############################################################################ + +sub accession { + my $self = shift; + + $self->warn(ref($self)."::accession is deprecated, ". + "use accession_number() instead"); + return $self->accession_number(@_); +} + +1; +