Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Seq.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Seq.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1250 @@ +# $Id: Seq.pm,v 1.76.2.2 2003/07/03 20:01:32 jason Exp $ +# +# BioPerl module for Bio::Seq +# +# Cared for by Ewan Birney <birney@ebi.ac.uk> +# +# Copyright Ewan Birney +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Seq - Sequence object, with features + +=head1 SYNOPSIS + + # This is the main sequence object in Bioperl + + # gets a sequence from a file + $seqio = Bio::SeqIO->new( '-format' => 'embl' , -file => 'myfile.dat'); + $seqobj = $seqio->next_seq(); + + # SeqIO can both read and write sequences; see Bio::SeqIO + # for more information and examples + + # get from database + $db = Bio::DB::GenBank->new(); + $seqobj = $db->get_Seq_by_acc('X78121'); + + # make from strings in script + $seqobj = Bio::Seq->new( -display_id => 'my_id', + -seq => $sequence_as_string); + + # gets sequence as a string from sequence object + $seqstr = $seqobj->seq(); # actual sequence as a string + $seqstr = $seqobj->subseq(10,50); # slice in biological coordinates + + # retrieves information from the sequence + # features must implement Bio::SeqFeatureI interface + + @features = $seqobj->get_SeqFeatures(); # just top level + foreach my $feat ( @features ) { + print "Feature ",$feat->primary_tag," starts ",$feat->start," ends ", + $feat->end," strand ",$feat->strand,"\n"; + + # features retain link to underlying sequence object + print "Feature sequence is ",$feat->seq->seq(),"\n" + } + + # sequences may have a species + + if( defined $seq->species ) { + print "Sequence is from ",$species->binomial_name," [",$species->common_name,"]\n"; + } + + # annotation objects are Bio::AnnotationCollectionI's + $ann = $seqobj->annotation(); # annotation object + + # references is one type of annotations to get. Also get + # comment and dblink. Look at Bio::AnnotationCollection for + # more information + + foreach my $ref ( $ann->get_Annotations('reference') ) { + print "Reference ",$ref->title,"\n"; + } + + # you can get truncations, translations and reverse complements, these + # all give back Bio::Seq objects themselves, though currently with no + # features transfered + + my $trunc = $seqobj->trunc(100,200); + my $rev = $seqobj->revcom(); + + # there are many options to translate - check out the docs + my $trans = $seqobj->translate(); + + # these functions can be chained together + + my $trans_trunc_rev = $seqobj->trunc(100,200)->revcom->translate(); + + + +=head1 DESCRIPTION + +A Seq object is a sequence with sequence features placed on it. The +Seq object contains a PrimarySeq object for the actual sequence and +also implements its interface. + +In Bioperl we have 3 main players that people are going to use frequently + + Bio::PrimarySeq - just the sequence and its names, nothing else. + Bio::SeqFeatureI - a location on a sequence, potentially with a sequence + and annotation. + Bio::Seq - A sequence and a collection of sequence features + (an aggregate) with its own annotation. + +Although Bioperl is not tied heavily to file formats these distinctions do +map to file formats sensibly and for some bioinformaticians this might help + + Bio::PrimarySeq - Fasta file of a sequence + Bio::SeqFeatureI - A single entry in an EMBL/GenBank/DDBJ feature table + Bio::Seq - A single EMBL/GenBank/DDBJ entry + +By having this split we avoid a lot of nasty circular references +(sequence features can hold a reference to a sequence without the sequence +holding a reference to the sequence feature). See L<Bio::PrimarySeq> and +L<Bio::SeqFeatureI> for more information. + +Ian Korf really helped in the design of the Seq and SeqFeature system. + +=head1 EXAMPLES + +A simple and fundamental block of code + + use Bio::SeqIO; + + my $seqIOobj = Bio::SeqIO->new(-file=>"1.fa"); # create a SeqIO object + my $seqobj = $seqIOobj->next_seq; # get a Seq object + +With the Seq object in hand one has access to a powerful set of Bioperl +methods and Bioperl objects. This next script will take a file of sequences +in EMBL format and create a file of the reverse-complemented sequences +in Fasta format using Seq objects. It also prints out details about the +exons it finds as sequence features in Genbank Flat File format. + + use Bio::Seq; + use Bio::SeqIO; + + $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); + $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); + + while((my $seqobj = $seqin->next_seq())) { + print "Seen sequence ",$seqobj->display_id,", start of seq ", + substr($seqobj->seq,1,10),"\n"; + if( $seqobj->alphabet eq 'dna') { + $rev = $seqobj->revcom; + $id = $seqobj->display_id(); + $id = "$id.rev"; + $rev->display_id($id); + $seqout->write_seq($rev); + } + + foreach $feat ( $seqobj->get_SeqFeatures() ) { + if( $feat->primary_tag eq 'exon' ) { + print STDOUT "Location ",$feat->start,":", + $feat->end," GFF[",$feat->gff_string,"]\n"; + } + } + } + +Let's examine the script. The lines below import the Bioperl modules. +Seq is the main Bioperl sequence object and SeqIO is the Bioperl support +for reading sequences from files and to files + + use Bio::Seq; + use Bio::SeqIO; + +These two lines create two SeqIO streams: one for reading in sequences +and one for outputting sequences: + + $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); + $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); + +Notice that in the "$seqout" case there is a greater-than sign, +indicating the file is being opened for writing. + +Using the + + '-argument' => value + +syntax is common in Bioperl. The file argument is like an argument +to open() . You can also pass in filehandles or FileHandle objects by +using the -fh argument (see L<Bio::SeqIO> documentation for details). +Many formats in Bioperl are handled, including Fasta, EMBL, GenBank, +Swissprot (swiss), PIR, and GCG. + + $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); + $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); + +This is the main loop which will loop progressively through sequences +in a file, and each call to $seqio-E<gt>next_seq() provides a new Seq +object from the file: + + while((my $seqobj = $seqio->next_seq())) { + +This print line below accesses fields in the Seq object directly. The +$seqobj-E<gt>display_id is the way to access the display_id attribute +of the Seq object. The $seqobj-E<gt>seq method gets the actual +sequence out as string. Then you can do manipulation of this if +you want to (there are however easy ways of doing truncation, +reverse-complement and translation). + + print "Seen sequence ",$seqobj->display_id,", start of seq ", + substr($seqobj->seq,1,10),"\n"; + +Bioperl has to guess the alphabet of the sequence, being either 'dna', +'rna', or 'protein'. The alphabet attribute is one of these three +possibilities. + + if( $seqobj->alphabet eq 'dna') { + +The $seqobj-E<gt>revcom method provides the reverse complement of the Seq +object as another Seq object. Thus, the $rev variable is a reference to +another Seq object. For example, one could repeat the above print line +for this Seq object (putting $rev in place of $seqobj). In this +case we are going to output the object into the file stream we built +earlier on. + + $rev = $seqobj->revcom; + +When we output it, we want the id of the outputted object +to be changed to "$id.rev", ie, with .rev on the end of the name. The +following lines retrieve the id of the sequence object, add .rev +to this and then set the display_id of the rev sequence object to +this. Notice that to set the display_id attribute you just need +call the same method, display_id(), with the new value as an argument. +Getting and setting values with the same method is common in Bioperl. + + $id = $seqobj->display_id(); + $id = "$id.rev"; + $rev->display_id($id); + +The write_seq method on the SeqIO output object, $seqout, writes the +$rev object to the filestream we built at the top of the script. +The filestream knows that it is outputting in fasta format, and +so it provides fasta output. + + $seqout->write_seq($rev); + +This block of code loops over sequence features in the sequence +object, trying to find ones who have been tagged as 'exon'. +Features have start and end attributes and can be outputted +in Genbank Flat File format, GFF, a standarized format for sequence +features. + + foreach $feat ( $seqobj->get_SeqFeatures() ) { + if( $feat->primary_tag eq 'exon' ) { + print STDOUT "Location ",$feat->start,":", + $feat->end," GFF[",$feat->gff_string,"]\n"; + } + } + +The code above shows how a few Bio::Seq methods suffice to read, parse, +reformat and analyze sequences from a file. A full list of methods +available to Bio::Seq objects is shown below. Bear in mind that some of +these methods come from PrimarySeq objects, which are simpler +than Seq objects, stripped of features (see L<Bio::PrimarySeq> for +more information). + + # these methods return strings, and accept strings in some cases: + + $seqobj->seq(); # string of sequence + $seqobj->subseq(5,10); # part of the sequence as a string + $seqobj->accession_number(); # when there, the accession number + $seqobj->moltype(); # one of 'dna','rna',or 'protein' + $seqobj->seq_version() # when there, the version + $seqobj->keywords(); # when there, the Keywords line + $seqobj->length() # length + $seqobj->desc(); # description + $seqobj->primary_id(); # a unique id for this sequence regardless + # of its display_id or accession number + $seqobj->display_id(); # the human readable id of the sequence + +Some of these values map to fields in common formats. For example, The +display_id() method returns the LOCUS name of a Genbank entry, +the (\S+) following the E<gt> character in a Fasta file, the ID from +a SwissProt file, and so on. The desc() method will return the DEFINITION +line of a Genbank file, the description following the display_id in a +Fasta file, and the DE field in a SwissProt file. + + # the following methods return new Seq objects, but + # do not transfer features across to the new object: + + $seqobj->trunc(5,10) # truncation from 5 to 10 as new object + $seqobj->revcom # reverse complements sequence + $seqobj->translate # translation of the sequence + + # if new() can be called this method returns 1, else 0 + + $seqobj->can_call_new + + # the following method determines if the given string will be accepted + # by the seq() method - if the string is acceptable then validate() + # returns 1, or 0 if not + + $seqobj->validate_seq($string) + + # the following method returns or accepts a Species object: + + $seqobj->species(); + +Please see L<Bio::Species> for more information on this object. + + # the following method returns or accepts an Annotation object + # which in turn allows access to Annotation::Reference + # and Annotation::Comment objects: + + $seqobj->annotation(); + +These annotations typically refer to entire sequences, unlike +features. See L<Bio::AnnotationCollectionI>, +L<Bio::Annotation::Collection>, L<Bio::Annotation::Reference>, and +L<Bio::Annotation::Comment> for details. + +It is also important to be able to describe defined portions of a +sequence. The combination of some description and the corresponding +sub-sequence is called a feature - an exon and its coordinates within +a gene is an example of a feature, or a domain within a protein. + + # the following methods return an array of SeqFeatureI objects: + + $seqobj->get_SeqFeatures # The 'top level' sequence features + $seqobj->get_all_SeqFeatures # All sequence features, including sub-seq + # features, such as features in an exon + + # to find out the number of features use: + + $seqobj->feature_count + +Here are just some of the methods available to SeqFeatureI objects: + + # these methods return numbers: + + $feat->start # start position (1 is the first base) + $feat->end # end position (2 is the second base) + $feat->strand # 1 means forward, -1 reverse, 0 not relevant + + # these methods return or accept strings: + + $feat->primary_tag # the name of the sequence feature, eg + # 'exon', 'glycoslyation site', 'TM domain' + $feat->source_tag # where the feature comes from, eg, 'EMBL_GenBank', + # or 'BLAST' + + # this method returns the more austere PrimarySeq object, not a + # Seq object - the main difference is that PrimarySeq objects do not + # themselves contain sequence features + + $feat->seq # the sequence between start,end on the + # correct strand of the sequence + +See L<Bio::PrimarySeq> for more details on PrimarySeq objects. + + # useful methods for feature comparisons, for start/end points + + $feat->overlaps($other) # do $feat and $other overlap? + $feat->contains($other) # is $other completely within $feat? + $feat->equals($other) # do $feat and $other completely agree? + + # one can also add features + + $seqobj->add_SeqFeature($feat) # returns 1 if successful + $seqobj->add_SeqFeature(@features) # returns 1 if successful + + # sub features. For complex join() statements, the feature + # is one sequence feature with many sub SeqFeatures + + $feat->sub_SeqFeature # returns array of sub seq features + +Please see L<Bio::SeqFeatureI> and L<Bio::SeqFeature::Generic>, +for more information on sequence features. + +It is worth mentioning that one can also retrieve the start and end +positions of a feature using a Bio::LocationI object: + + $location = $feat->location # $location is a Bio::LocationI object + $location->start; # start position + $location->end; # end position + +This is useful because one needs a Bio::Location::SplitLocationI object +in order to retrieve the coordinates inside the Genbank or EMBL join() +statements (e.g. "CDS join(51..142,273..495,1346..1474)"): + + if ( $feat->location->isa('Bio::Location::SplitLocationI') && + $feat->primary_tag eq 'CDS' ) { + foreach $loc ( $feat->location->sub_Location ) { + print $loc->start . ".." . $loc->end . "\n"; + } + } + +See L<Bio::LocationI> and L<Bio::Location::SplitLocationI> for more +information. + +=head1 Implemented Interfaces + +This class implements the following interfaces. + +=over 4 + +=item Bio::SeqI + +Note that this includes implementing Bio::PrimarySeqI. + +=item Bio::IdentifiableI + +=item Bio::DescribableI + +=item Bio::AnnotatableI + +=item Bio::FeatureHolderI + +=back + +=head1 FEEDBACK + + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Ewan Birney, inspired by Ian Korf objects + +Email birney@ebi.ac.uk + +=head1 CONTRIBUTORS + +Jason Stajich E<lt>jason@bioperl.orgE<gt> + +=head1 APPENDIX + + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a "_". + +=cut + +#' +# Let the code begin... + + +package Bio::Seq; +use vars qw(@ISA $VERSION); +use strict; + + +# Object preamble - inherits from Bio::Root::Object + +use Bio::Root::Root; +use Bio::SeqI; +use Bio::Annotation::Collection; +use Bio::PrimarySeq; +use Bio::IdentifiableI; +use Bio::DescribableI; +use Bio::AnnotatableI; +use Bio::FeatureHolderI; + +$VERSION = '1.1'; +@ISA = qw(Bio::Root::Root Bio::SeqI + Bio::IdentifiableI Bio::DescribableI + Bio::AnnotatableI Bio::FeatureHolderI); + +=head2 new + + Title : new + Usage : $seq = Bio::Seq->new( -seq => 'ATGGGGGTGGTGGTACCCT', + -id => 'human_id', + -accession_number => 'AL000012', + ); + + Function: Returns a new Seq object from + basic constructors, being a string for the sequence + and strings for id and accession_number + Returns : a new Bio::Seq object + +=cut + +sub new { + my($caller,@args) = @_; + + if( $caller ne 'Bio::Seq') { + $caller = ref($caller) if ref($caller); + } + + # we know our inherietance heirarchy + my $self = Bio::Root::Root->new(@args); + bless $self,$caller; + + # this is way too sneaky probably. We delegate the construction of + # the Seq object onto PrimarySeq and then pop primary_seq into + # our primary_seq slot + + my $pseq = Bio::PrimarySeq->new(@args); + + # as we have just made this, we know it is ok to set hash directly + # rather than going through the method + + $self->{'primary_seq'} = $pseq; + + # setting this array is now delayed until the final + # moment, again speed ups for non feature containing things + # $self->{'_as_feat'} = []; + + + my ($ann, $pid,$feat,$species) = &Bio::Root::RootI::_rearrange($self,[qw(ANNOTATION PRIMARY_ID FEATURES SPECIES)], @args); + + # for a number of cases - reading fasta files - these are never set. This + # gives a quick optimisation around testing things later on + + if( defined $ann || defined $pid || defined $feat || defined $species ) { + $pid && $self->primary_id($pid); + $species && $self->species($species); + $ann && $self->annotation($ann); + + if( defined $feat ) { + if( ref($feat) !~ /ARRAY/i ) { + if( ref($feat) && $feat->isa('Bio::SeqFeatureI') ) { + $self->add_SeqFeature($feat); + } else { + $self->warn("Must specify a valid Bio::SeqFeatureI or ArrayRef of Bio::SeqFeatureI's with the -features init parameter for ".ref($self)); + } + } else { + foreach my $feature ( @$feat ) { + $self->add_SeqFeature($feature); + } + } + } + } + + return $self; +} + +=head1 PrimarySeq interface + + +The PrimarySeq interface provides the basic sequence getting +and setting methods for on all sequences. + +These methods implement the Bio::PrimarySeq interface by delegating +to the primary_seq inside the object. This means that you +can use a Seq object wherever there is a PrimarySeq, and +of course, you are free to use these functions anyway. + +=cut + +=head2 seq + + Title : seq + Usage : $string = $obj->seq() + Function: Get/Set the sequence as a string of letters. The + case of the letters is left up to the implementer. + Suggested cases are upper case for proteins and lower case for + DNA sequence (IUPAC standard), + but implementations are suggested to keep an open mind about + case (some users... want mixed case!) + Returns : A scalar + Args : Optionally on set the new value (a string). An optional second + argument presets the alphabet (otherwise it will be guessed). + Both parameters may also be given in named paramater style + with -seq and -alphabet being the names. + +=cut + +sub seq { + return shift->primary_seq()->seq(@_); +} + +=head2 validate_seq + + Title : validate_seq + Usage : if(! $seq->validate_seq($seq_str) ) { + print "sequence $seq_str is not valid for an object of type ", + ref($seq), "\n"; + } + Function: Validates a given sequence string. A validating sequence string + must be accepted by seq(). A string that does not validate will + lead to an exception if passed to seq(). + + The implementation provided here does not take alphabet() into + account. Allowed are all letters (A-Z) and '-','.', and '*'. + + Example : + Returns : 1 if the supplied sequence string is valid for the object, and + 0 otherwise. + Args : The sequence string to be validated. + + +=cut + +sub validate_seq { + return shift->primary_seq()->validate_seq(@_); +} + +=head2 length + + Title : length + Usage : $len = $seq->length() + Function: + Example : + Returns : Integer representing the length of the sequence. + Args : None + +=cut + +sub length { + return shift->primary_seq()->length(@_); +} + +=head1 Methods from the Bio::PrimarySeqI interface + +=cut + +=head2 subseq + + Title : subseq + Usage : $substring = $obj->subseq(10,40); + Function: Returns the subseq from start to end, where the first base + is 1 and the number is inclusive, ie 1-2 are the first two + bases of the sequence + + Start cannot be larger than end but can be equal + + Returns : A string + Args : 2 integers + + +=cut + +sub subseq { + return shift->primary_seq()->subseq(@_); +} + +=head2 display_id + + Title : display_id + Usage : $id = $obj->display_id or $obj->display_id($newid); + Function: Gets or sets the display id, also known as the common name of + the Seq object. + + The semantics of this is that it is the most likely string + to be used as an identifier of the sequence, and likely to + have "human" readability. The id is equivalent to the LOCUS + field of the GenBank/EMBL databanks and the ID field of the + Swissprot/sptrembl database. In fasta format, the >(\S+) is + presumed to be the id, though some people overload the id + to embed other information. Bioperl does not use any + embedded information in the ID field, and people are + encouraged to use other mechanisms (accession field for + example, or extending the sequence object) to solve this. + + Notice that $seq->id() maps to this function, mainly for + legacy/convenience issues. + Returns : A string + Args : None or a new id + + +=cut + +sub display_id { + return shift->primary_seq->display_id(@_); +} + + + +=head2 accession_number + + Title : accession_number + Usage : $unique_biological_key = $obj->accession_number; + Function: Returns the unique biological id for a sequence, commonly + called the accession_number. For sequences from established + databases, the implementors should try to use the correct + accession number. Notice that primary_id() provides the + unique id for the implemetation, allowing multiple objects + to have the same accession number in a particular implementation. + + For sequences with no accession number, this method should return + "unknown". + + Can also be used to set the accession number. + Example : $key = $seq->accession_number or $seq->accession_number($key) + Returns : A string + Args : None or an accession number + + +=cut + +sub accession_number { + return shift->primary_seq->accession_number(@_); +} + +=head2 desc + + Title : desc + Usage : $seqobj->desc($string) or $seqobj->desc() + Function: Sets or gets the description of the sequence + Example : + Returns : The description + Args : The description or none + + +=cut + +sub desc { + return shift->primary_seq->desc(@_); +} + +=head2 primary_id + + Title : primary_id + Usage : $unique_implementation_key = $obj->primary_id; + Function: Returns the unique id for this object in this + implementation. This allows implementations to manage + their own object ids in a way the implementation can control + clients can expect one id to map to one object. + + For sequences with no natural id, this method should return + a stringified memory location. + + Can also be used to set the primary_id. + + Also notice that this method is not delegated to the + internal Bio::PrimarySeq object + + [Note this method name is likely to change in 1.3] + + Example : $id = $seq->primary_id or $seq->primary_id($id) + Returns : A string + Args : None or an id + + +=cut + +sub primary_id { + my ($obj,$value) = @_; + + if( defined $value) { + $obj->{'primary_id'} = $value; + } + if( ! exists $obj->{'primary_id'} ) { + return "$obj"; + } + return $obj->{'primary_id'}; +} + +=head2 can_call_new + + Title : can_call_new + Usage : if ( $obj->can_call_new ) { + $newobj = $obj->new( %param ); + } + Function: can_call_new returns 1 or 0 depending + on whether an implementation allows new + constructor to be called. If a new constructor + is allowed, then it should take the followed hashed + constructor list. + + $myobject->new( -seq => $sequence_as_string, + -display_id => $id + -accession_number => $accession + -alphabet => 'dna', + ); + Example : + Returns : 1 or 0 + Args : None + + +=cut + +sub can_call_new { + return 1; +} + +=head2 alphabet + + Title : alphabet + Usage : if ( $obj->alphabet eq 'dna' ) { /Do Something/ } + Function: Returns the type of sequence being one of + 'dna', 'rna' or 'protein'. This is case sensitive. + + This is not called <type> because this would cause + upgrade problems from the 0.5 and earlier Seq objects. + + Returns : A string either 'dna','rna','protein'. NB - the object must + make a call of the type - if there is no type specified it + has to guess. + Args : None + + +=cut + +sub alphabet { + my $self = shift; + return $self->primary_seq->alphabet(@_) if @_ && defined $_[0]; + return $self->primary_seq->alphabet(); +} + +sub is_circular { shift->primary_seq->is_circular } + +=head1 Methods for Bio::IdentifiableI compliance + +=cut + +=head2 object_id + + Title : object_id + Usage : $string = $obj->object_id() + Function: a string which represents the stable primary identifier + in this namespace of this object. For DNA sequences this + is its accession_number, similarly for protein sequences + + This is aliased to accession_number(). + Returns : A scalar + + +=cut + +sub object_id { + return shift->accession_number(@_); +} + +=head2 version + + Title : version + Usage : $version = $obj->version() + Function: a number which differentiates between versions of + the same object. Higher numbers are considered to be + later and more relevant, but a single object described + the same identifier should represent the same concept + + Returns : A number + +=cut + +sub version{ + return shift->primary_seq->version(@_); +} + + +=head2 authority + + Title : authority + Usage : $authority = $obj->authority() + Function: a string which represents the organisation which + granted the namespace, written as the DNS name for + organisation (eg, wormbase.org) + + Returns : A scalar + +=cut + +sub authority { + return shift->primary_seq()->authority(@_); +} + +=head2 namespace + + Title : namespace + Usage : $string = $obj->namespace() + Function: A string representing the name space this identifier + is valid in, often the database name or the name + describing the collection + + Returns : A scalar + + +=cut + +sub namespace{ + return shift->primary_seq()->namespace(@_); +} + +=head1 Methods for Bio::DescribableI compliance + +=cut + +=head2 display_name + + Title : display_name + Usage : $string = $obj->display_name() + Function: A string which is what should be displayed to the user + the string should have no spaces (ideally, though a cautious + user of this interface would not assumme this) and should be + less than thirty characters (though again, double checking + this is a good idea) + + This is aliased to display_id(). + Returns : A scalar + +=cut + +sub display_name { + return shift->display_id(@_); +} + +=head2 description + + Title : description + Usage : $string = $obj->description() + Function: A text string suitable for displaying to the user a + description. This string is likely to have spaces, but + should not have any newlines or formatting - just plain + text. The string should not be greater than 255 characters + and clients can feel justified at truncating strings at 255 + characters for the purposes of display + + This is aliased to desc(). + Returns : A scalar + +=cut + +sub description { + return shift->desc(@_); +} + +=head1 Methods for implementing Bio::AnnotatableI + +=cut + +=head2 annotation + + Title : annotation + Usage : $ann = $seq->annotation or $seq->annotation($annotation) + Function: Gets or sets the annotation + Returns : L<Bio::AnnotationCollectionI> object + Args : None or L<Bio::AnnotationCollectionI> object + +See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection> +for more information + +=cut + +sub annotation { + my ($obj,$value) = @_; + if( defined $value ) { + $obj->throw("object of class ".ref($value)." does not implement ". + "Bio::AnnotationCollectionI. Too bad.") + unless $value->isa("Bio::AnnotationCollectionI"); + $obj->{'_annotation'} = $value; + } elsif( ! defined $obj->{'_annotation'}) { + $obj->{'_annotation'} = new Bio::Annotation::Collection; + } + return $obj->{'_annotation'}; +} + +=head1 Methods to implement Bio::FeatureHolderI + +This includes methods for retrieving, adding, and removing features. + +=cut + +=head2 get_SeqFeatures + + Title : get_SeqFeatures + Usage : + Function: Get the feature objects held by this feature holder. + + Features which are not top-level are subfeatures of one or + more of the returned feature objects, which means that you + must traverse the subfeature arrays of each top-level + feature object in order to traverse all features associated + with this sequence. + + Use get_all_SeqFeatures() if you want the feature tree + flattened into one single array. + + Example : + Returns : an array of Bio::SeqFeatureI implementing objects + Args : none + +At some day we may want to expand this method to allow for a feature +filter to be passed in. + +=cut + +sub get_SeqFeatures{ + my $self = shift; + + if( !defined $self->{'_as_feat'} ) { + $self->{'_as_feat'} = []; + } + + return @{$self->{'_as_feat'}}; +} + +=head2 get_all_SeqFeatures + + Title : get_all_SeqFeatures + Usage : @feat_ary = $seq->get_all_SeqFeatures(); + Function: Returns the tree of feature objects attached to this + sequence object flattened into one single array. Top-level + features will still contain their subfeature-arrays, which + means that you will encounter subfeatures twice if you + traverse the subfeature tree of the returned objects. + + Use get_SeqFeatures() if you want the array to contain only + the top-level features. + + Returns : An array of Bio::SeqFeatureI implementing objects. + Args : None + + +=cut + +# this implementation is inherited from FeatureHolderI + +=head2 feature_count + + Title : feature_count + Usage : $seq->feature_count() + Function: Return the number of SeqFeatures attached to a sequence + Returns : integer representing the number of SeqFeatures + Args : None + + +=cut + +sub feature_count { + my ($self) = @_; + + if (defined($self->{'_as_feat'})) { + return ($#{$self->{'_as_feat'}} + 1); + } else { + return 0; + } +} + +=head2 add_SeqFeature + + Title : add_SeqFeature + Usage : $seq->add_SeqFeature($feat); + $seq->add_SeqFeature(@feat); + Function: Adds the given feature object (or each of an array of feature + objects to the feature array of this + sequence. The object passed is required to implement the + Bio::SeqFeatureI interface. + Returns : 1 on success + Args : A Bio::SeqFeatureI implementing object, or an array of such objects. + + +=cut + +sub add_SeqFeature { + my ($self,@feat) = @_; + + $self->{'_as_feat'} = [] unless $self->{'_as_feat'}; + + foreach my $feat ( @feat ) { + if( !$feat->isa("Bio::SeqFeatureI") ) { + $self->throw("$feat is not a SeqFeatureI and that's what we expect..."); + } + + # make sure we attach ourselves to the feature if the feature wants it + my $aseq = $self->primary_seq; + $feat->attach_seq($aseq) if $aseq; + + push(@{$self->{'_as_feat'}},$feat); + } + return 1; +} + +=head2 remove_SeqFeatures + + Title : remove_SeqFeatures + Usage : $seq->remove_SeqFeatures(); + Function: Flushes all attached SeqFeatureI objects. + + To remove individual feature objects, delete those from the returned + array and re-add the rest. + Example : + Returns : The array of Bio::SeqFeatureI objects removed from this seq. + Args : None + + +=cut + +sub remove_SeqFeatures { + my $self = shift; + + return () unless $self->{'_as_feat'}; + my @feats = @{$self->{'_as_feat'}}; + $self->{'_as_feat'} = []; + return @feats; +} + +=head1 Methods provided in the Bio::PrimarySeqI interface + + +These methods are inherited from the PrimarySeq interface +and work as one expects, building new Bio::Seq objects +or other information as expected. See L<Bio::PrimarySeq> +for more information. + +Sequence Features are B<not> transfered to the new objects. +This is possibly a mistake. Anyone who feels the urge in +dealing with this is welcome to give it a go. + +=head2 revcom + + Title : revcom + Usage : $rev = $seq->revcom() + Function: Produces a new Bio::Seq object which + is the reversed complement of the sequence. For protein + sequences this throws an exception of "Sequence is a protein. + Cannot revcom" + + The id is the same id as the original sequence, and the + accession number is also identical. If someone wants to track + that this sequence has be reversed, it needs to define its own + extensions + + To do an in-place edit of an object you can go: + + $seq = $seq->revcom(); + + This of course, causes Perl to handle the garbage collection of + the old object, but it is roughly speaking as efficient as an + in-place edit. + + Returns : A new (fresh) Bio::Seq object + Args : None + + +=cut + +=head2 trunc + + Title : trunc + Usage : $subseq = $myseq->trunc(10,100); + Function: Provides a truncation of a sequence + + Example : + Returns : A fresh Seq object + Args : A Seq object + + +=cut + +=head2 id + + Title : id + Usage : $id = $seq->id() + Function: This is mapped on display_id + Returns : value of display_id() + Args : [optional] value to update display_id + + +=cut + +sub id { + return shift->display_id(@_); +} + + +=head1 Seq only methods + + +These methods are specific to the Bio::Seq object, and not +found on the Bio::PrimarySeq object + +=head2 primary_seq + + Title : primary_seq + Usage : $seq->primary_seq or $seq->primary_seq($newval) + Function: Get or set a PrimarySeq object + Example : + Returns : PrimarySeq object + Args : None or PrimarySeq object + + +=cut + +sub primary_seq { + my ($obj,$value) = @_; + + if( defined $value) { + if( ! ref $value || ! $value->isa('Bio::PrimarySeqI') ) { + $obj->throw("$value is not a Bio::PrimarySeq compliant object"); + } + + $obj->{'primary_seq'} = $value; + # descend down over all seqfeature objects, seeing whether they + # want an attached seq. + + foreach my $sf ( $obj->get_SeqFeatures() ) { + $sf->attach_seq($value); + } + + } + return $obj->{'primary_seq'}; + +} + +=head2 species + + Title : species + Usage : $species = $seq->species() or $seq->species($species) + Function: Gets or sets the species + Returns : L<Bio::Species> object + Args : None or L<Bio::Species> object + +See L<Bio::Species> for more information + +=cut + +sub species { + my ($self, $species) = @_; + if ($species) { + $self->{'species'} = $species; + } else { + return $self->{'species'}; + } +} + +=head1 Internal methods + +=cut + +# keep AUTOLOAD happy +sub DESTROY { } + +############################################################################ +# aliases due to name changes or to compensate for our lack of consistency # +############################################################################ + +# in all other modules we use the object in the singular -- +# lack of consistency sucks +*flush_SeqFeature = \&remove_SeqFeatures; +*flush_SeqFeatures = \&remove_SeqFeatures; + +# this is now get_SeqFeatures() (from FeatureHolderI) +*top_SeqFeatures = \&get_SeqFeatures; + +# this is now get_all_SeqFeatures() in FeatureHolderI +sub all_SeqFeatures{ + return shift->get_all_SeqFeatures(@_); +} + +sub accession { + my $self = shift; + $self->warn(ref($self)."::accession is deprecated, ". + "use accession_number() instead"); + return $self->accession_number(@_); +} + +1;