comparison variant_effect_predictor/Bio/Seq.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: Seq.pm,v 1.76.2.2 2003/07/03 20:01:32 jason Exp $
2 #
3 # BioPerl module for Bio::Seq
4 #
5 # Cared for by Ewan Birney <birney@ebi.ac.uk>
6 #
7 # Copyright Ewan Birney
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Seq - Sequence object, with features
16
17 =head1 SYNOPSIS
18
19 # This is the main sequence object in Bioperl
20
21 # gets a sequence from a file
22 $seqio = Bio::SeqIO->new( '-format' => 'embl' , -file => 'myfile.dat');
23 $seqobj = $seqio->next_seq();
24
25 # SeqIO can both read and write sequences; see Bio::SeqIO
26 # for more information and examples
27
28 # get from database
29 $db = Bio::DB::GenBank->new();
30 $seqobj = $db->get_Seq_by_acc('X78121');
31
32 # make from strings in script
33 $seqobj = Bio::Seq->new( -display_id => 'my_id',
34 -seq => $sequence_as_string);
35
36 # gets sequence as a string from sequence object
37 $seqstr = $seqobj->seq(); # actual sequence as a string
38 $seqstr = $seqobj->subseq(10,50); # slice in biological coordinates
39
40 # retrieves information from the sequence
41 # features must implement Bio::SeqFeatureI interface
42
43 @features = $seqobj->get_SeqFeatures(); # just top level
44 foreach my $feat ( @features ) {
45 print "Feature ",$feat->primary_tag," starts ",$feat->start," ends ",
46 $feat->end," strand ",$feat->strand,"\n";
47
48 # features retain link to underlying sequence object
49 print "Feature sequence is ",$feat->seq->seq(),"\n"
50 }
51
52 # sequences may have a species
53
54 if( defined $seq->species ) {
55 print "Sequence is from ",$species->binomial_name," [",$species->common_name,"]\n";
56 }
57
58 # annotation objects are Bio::AnnotationCollectionI's
59 $ann = $seqobj->annotation(); # annotation object
60
61 # references is one type of annotations to get. Also get
62 # comment and dblink. Look at Bio::AnnotationCollection for
63 # more information
64
65 foreach my $ref ( $ann->get_Annotations('reference') ) {
66 print "Reference ",$ref->title,"\n";
67 }
68
69 # you can get truncations, translations and reverse complements, these
70 # all give back Bio::Seq objects themselves, though currently with no
71 # features transfered
72
73 my $trunc = $seqobj->trunc(100,200);
74 my $rev = $seqobj->revcom();
75
76 # there are many options to translate - check out the docs
77 my $trans = $seqobj->translate();
78
79 # these functions can be chained together
80
81 my $trans_trunc_rev = $seqobj->trunc(100,200)->revcom->translate();
82
83
84
85 =head1 DESCRIPTION
86
87 A Seq object is a sequence with sequence features placed on it. The
88 Seq object contains a PrimarySeq object for the actual sequence and
89 also implements its interface.
90
91 In Bioperl we have 3 main players that people are going to use frequently
92
93 Bio::PrimarySeq - just the sequence and its names, nothing else.
94 Bio::SeqFeatureI - a location on a sequence, potentially with a sequence
95 and annotation.
96 Bio::Seq - A sequence and a collection of sequence features
97 (an aggregate) with its own annotation.
98
99 Although Bioperl is not tied heavily to file formats these distinctions do
100 map to file formats sensibly and for some bioinformaticians this might help
101
102 Bio::PrimarySeq - Fasta file of a sequence
103 Bio::SeqFeatureI - A single entry in an EMBL/GenBank/DDBJ feature table
104 Bio::Seq - A single EMBL/GenBank/DDBJ entry
105
106 By having this split we avoid a lot of nasty circular references
107 (sequence features can hold a reference to a sequence without the sequence
108 holding a reference to the sequence feature). See L<Bio::PrimarySeq> and
109 L<Bio::SeqFeatureI> for more information.
110
111 Ian Korf really helped in the design of the Seq and SeqFeature system.
112
113 =head1 EXAMPLES
114
115 A simple and fundamental block of code
116
117 use Bio::SeqIO;
118
119 my $seqIOobj = Bio::SeqIO->new(-file=>"1.fa"); # create a SeqIO object
120 my $seqobj = $seqIOobj->next_seq; # get a Seq object
121
122 With the Seq object in hand one has access to a powerful set of Bioperl
123 methods and Bioperl objects. This next script will take a file of sequences
124 in EMBL format and create a file of the reverse-complemented sequences
125 in Fasta format using Seq objects. It also prints out details about the
126 exons it finds as sequence features in Genbank Flat File format.
127
128 use Bio::Seq;
129 use Bio::SeqIO;
130
131 $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat');
132 $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa');
133
134 while((my $seqobj = $seqin->next_seq())) {
135 print "Seen sequence ",$seqobj->display_id,", start of seq ",
136 substr($seqobj->seq,1,10),"\n";
137 if( $seqobj->alphabet eq 'dna') {
138 $rev = $seqobj->revcom;
139 $id = $seqobj->display_id();
140 $id = "$id.rev";
141 $rev->display_id($id);
142 $seqout->write_seq($rev);
143 }
144
145 foreach $feat ( $seqobj->get_SeqFeatures() ) {
146 if( $feat->primary_tag eq 'exon' ) {
147 print STDOUT "Location ",$feat->start,":",
148 $feat->end," GFF[",$feat->gff_string,"]\n";
149 }
150 }
151 }
152
153 Let's examine the script. The lines below import the Bioperl modules.
154 Seq is the main Bioperl sequence object and SeqIO is the Bioperl support
155 for reading sequences from files and to files
156
157 use Bio::Seq;
158 use Bio::SeqIO;
159
160 These two lines create two SeqIO streams: one for reading in sequences
161 and one for outputting sequences:
162
163 $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat');
164 $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa');
165
166 Notice that in the "$seqout" case there is a greater-than sign,
167 indicating the file is being opened for writing.
168
169 Using the
170
171 '-argument' => value
172
173 syntax is common in Bioperl. The file argument is like an argument
174 to open() . You can also pass in filehandles or FileHandle objects by
175 using the -fh argument (see L<Bio::SeqIO> documentation for details).
176 Many formats in Bioperl are handled, including Fasta, EMBL, GenBank,
177 Swissprot (swiss), PIR, and GCG.
178
179 $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat');
180 $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa');
181
182 This is the main loop which will loop progressively through sequences
183 in a file, and each call to $seqio-E<gt>next_seq() provides a new Seq
184 object from the file:
185
186 while((my $seqobj = $seqio->next_seq())) {
187
188 This print line below accesses fields in the Seq object directly. The
189 $seqobj-E<gt>display_id is the way to access the display_id attribute
190 of the Seq object. The $seqobj-E<gt>seq method gets the actual
191 sequence out as string. Then you can do manipulation of this if
192 you want to (there are however easy ways of doing truncation,
193 reverse-complement and translation).
194
195 print "Seen sequence ",$seqobj->display_id,", start of seq ",
196 substr($seqobj->seq,1,10),"\n";
197
198 Bioperl has to guess the alphabet of the sequence, being either 'dna',
199 'rna', or 'protein'. The alphabet attribute is one of these three
200 possibilities.
201
202 if( $seqobj->alphabet eq 'dna') {
203
204 The $seqobj-E<gt>revcom method provides the reverse complement of the Seq
205 object as another Seq object. Thus, the $rev variable is a reference to
206 another Seq object. For example, one could repeat the above print line
207 for this Seq object (putting $rev in place of $seqobj). In this
208 case we are going to output the object into the file stream we built
209 earlier on.
210
211 $rev = $seqobj->revcom;
212
213 When we output it, we want the id of the outputted object
214 to be changed to "$id.rev", ie, with .rev on the end of the name. The
215 following lines retrieve the id of the sequence object, add .rev
216 to this and then set the display_id of the rev sequence object to
217 this. Notice that to set the display_id attribute you just need
218 call the same method, display_id(), with the new value as an argument.
219 Getting and setting values with the same method is common in Bioperl.
220
221 $id = $seqobj->display_id();
222 $id = "$id.rev";
223 $rev->display_id($id);
224
225 The write_seq method on the SeqIO output object, $seqout, writes the
226 $rev object to the filestream we built at the top of the script.
227 The filestream knows that it is outputting in fasta format, and
228 so it provides fasta output.
229
230 $seqout->write_seq($rev);
231
232 This block of code loops over sequence features in the sequence
233 object, trying to find ones who have been tagged as 'exon'.
234 Features have start and end attributes and can be outputted
235 in Genbank Flat File format, GFF, a standarized format for sequence
236 features.
237
238 foreach $feat ( $seqobj->get_SeqFeatures() ) {
239 if( $feat->primary_tag eq 'exon' ) {
240 print STDOUT "Location ",$feat->start,":",
241 $feat->end," GFF[",$feat->gff_string,"]\n";
242 }
243 }
244
245 The code above shows how a few Bio::Seq methods suffice to read, parse,
246 reformat and analyze sequences from a file. A full list of methods
247 available to Bio::Seq objects is shown below. Bear in mind that some of
248 these methods come from PrimarySeq objects, which are simpler
249 than Seq objects, stripped of features (see L<Bio::PrimarySeq> for
250 more information).
251
252 # these methods return strings, and accept strings in some cases:
253
254 $seqobj->seq(); # string of sequence
255 $seqobj->subseq(5,10); # part of the sequence as a string
256 $seqobj->accession_number(); # when there, the accession number
257 $seqobj->moltype(); # one of 'dna','rna',or 'protein'
258 $seqobj->seq_version() # when there, the version
259 $seqobj->keywords(); # when there, the Keywords line
260 $seqobj->length() # length
261 $seqobj->desc(); # description
262 $seqobj->primary_id(); # a unique id for this sequence regardless
263 # of its display_id or accession number
264 $seqobj->display_id(); # the human readable id of the sequence
265
266 Some of these values map to fields in common formats. For example, The
267 display_id() method returns the LOCUS name of a Genbank entry,
268 the (\S+) following the E<gt> character in a Fasta file, the ID from
269 a SwissProt file, and so on. The desc() method will return the DEFINITION
270 line of a Genbank file, the description following the display_id in a
271 Fasta file, and the DE field in a SwissProt file.
272
273 # the following methods return new Seq objects, but
274 # do not transfer features across to the new object:
275
276 $seqobj->trunc(5,10) # truncation from 5 to 10 as new object
277 $seqobj->revcom # reverse complements sequence
278 $seqobj->translate # translation of the sequence
279
280 # if new() can be called this method returns 1, else 0
281
282 $seqobj->can_call_new
283
284 # the following method determines if the given string will be accepted
285 # by the seq() method - if the string is acceptable then validate()
286 # returns 1, or 0 if not
287
288 $seqobj->validate_seq($string)
289
290 # the following method returns or accepts a Species object:
291
292 $seqobj->species();
293
294 Please see L<Bio::Species> for more information on this object.
295
296 # the following method returns or accepts an Annotation object
297 # which in turn allows access to Annotation::Reference
298 # and Annotation::Comment objects:
299
300 $seqobj->annotation();
301
302 These annotations typically refer to entire sequences, unlike
303 features. See L<Bio::AnnotationCollectionI>,
304 L<Bio::Annotation::Collection>, L<Bio::Annotation::Reference>, and
305 L<Bio::Annotation::Comment> for details.
306
307 It is also important to be able to describe defined portions of a
308 sequence. The combination of some description and the corresponding
309 sub-sequence is called a feature - an exon and its coordinates within
310 a gene is an example of a feature, or a domain within a protein.
311
312 # the following methods return an array of SeqFeatureI objects:
313
314 $seqobj->get_SeqFeatures # The 'top level' sequence features
315 $seqobj->get_all_SeqFeatures # All sequence features, including sub-seq
316 # features, such as features in an exon
317
318 # to find out the number of features use:
319
320 $seqobj->feature_count
321
322 Here are just some of the methods available to SeqFeatureI objects:
323
324 # these methods return numbers:
325
326 $feat->start # start position (1 is the first base)
327 $feat->end # end position (2 is the second base)
328 $feat->strand # 1 means forward, -1 reverse, 0 not relevant
329
330 # these methods return or accept strings:
331
332 $feat->primary_tag # the name of the sequence feature, eg
333 # 'exon', 'glycoslyation site', 'TM domain'
334 $feat->source_tag # where the feature comes from, eg, 'EMBL_GenBank',
335 # or 'BLAST'
336
337 # this method returns the more austere PrimarySeq object, not a
338 # Seq object - the main difference is that PrimarySeq objects do not
339 # themselves contain sequence features
340
341 $feat->seq # the sequence between start,end on the
342 # correct strand of the sequence
343
344 See L<Bio::PrimarySeq> for more details on PrimarySeq objects.
345
346 # useful methods for feature comparisons, for start/end points
347
348 $feat->overlaps($other) # do $feat and $other overlap?
349 $feat->contains($other) # is $other completely within $feat?
350 $feat->equals($other) # do $feat and $other completely agree?
351
352 # one can also add features
353
354 $seqobj->add_SeqFeature($feat) # returns 1 if successful
355 $seqobj->add_SeqFeature(@features) # returns 1 if successful
356
357 # sub features. For complex join() statements, the feature
358 # is one sequence feature with many sub SeqFeatures
359
360 $feat->sub_SeqFeature # returns array of sub seq features
361
362 Please see L<Bio::SeqFeatureI> and L<Bio::SeqFeature::Generic>,
363 for more information on sequence features.
364
365 It is worth mentioning that one can also retrieve the start and end
366 positions of a feature using a Bio::LocationI object:
367
368 $location = $feat->location # $location is a Bio::LocationI object
369 $location->start; # start position
370 $location->end; # end position
371
372 This is useful because one needs a Bio::Location::SplitLocationI object
373 in order to retrieve the coordinates inside the Genbank or EMBL join()
374 statements (e.g. "CDS join(51..142,273..495,1346..1474)"):
375
376 if ( $feat->location->isa('Bio::Location::SplitLocationI') &&
377 $feat->primary_tag eq 'CDS' ) {
378 foreach $loc ( $feat->location->sub_Location ) {
379 print $loc->start . ".." . $loc->end . "\n";
380 }
381 }
382
383 See L<Bio::LocationI> and L<Bio::Location::SplitLocationI> for more
384 information.
385
386 =head1 Implemented Interfaces
387
388 This class implements the following interfaces.
389
390 =over 4
391
392 =item Bio::SeqI
393
394 Note that this includes implementing Bio::PrimarySeqI.
395
396 =item Bio::IdentifiableI
397
398 =item Bio::DescribableI
399
400 =item Bio::AnnotatableI
401
402 =item Bio::FeatureHolderI
403
404 =back
405
406 =head1 FEEDBACK
407
408
409 =head2 Mailing Lists
410
411 User feedback is an integral part of the evolution of this and other
412 Bioperl modules. Send your comments and suggestions preferably to one
413 of the Bioperl mailing lists. Your participation is much appreciated.
414
415 bioperl-l@bioperl.org - General discussion
416 http://bio.perl.org/MailList.html - About the mailing lists
417
418 =head2 Reporting Bugs
419
420 Report bugs to the Bioperl bug tracking system to help us keep track
421 the bugs and their resolution. Bug reports can be submitted via email
422 or the web:
423
424 bioperl-bugs@bioperl.org
425 http://bugzilla.bioperl.org/
426
427 =head1 AUTHOR - Ewan Birney, inspired by Ian Korf objects
428
429 Email birney@ebi.ac.uk
430
431 =head1 CONTRIBUTORS
432
433 Jason Stajich E<lt>jason@bioperl.orgE<gt>
434
435 =head1 APPENDIX
436
437
438 The rest of the documentation details each of the object
439 methods. Internal methods are usually preceded with a "_".
440
441 =cut
442
443 #'
444 # Let the code begin...
445
446
447 package Bio::Seq;
448 use vars qw(@ISA $VERSION);
449 use strict;
450
451
452 # Object preamble - inherits from Bio::Root::Object
453
454 use Bio::Root::Root;
455 use Bio::SeqI;
456 use Bio::Annotation::Collection;
457 use Bio::PrimarySeq;
458 use Bio::IdentifiableI;
459 use Bio::DescribableI;
460 use Bio::AnnotatableI;
461 use Bio::FeatureHolderI;
462
463 $VERSION = '1.1';
464 @ISA = qw(Bio::Root::Root Bio::SeqI
465 Bio::IdentifiableI Bio::DescribableI
466 Bio::AnnotatableI Bio::FeatureHolderI);
467
468 =head2 new
469
470 Title : new
471 Usage : $seq = Bio::Seq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
472 -id => 'human_id',
473 -accession_number => 'AL000012',
474 );
475
476 Function: Returns a new Seq object from
477 basic constructors, being a string for the sequence
478 and strings for id and accession_number
479 Returns : a new Bio::Seq object
480
481 =cut
482
483 sub new {
484 my($caller,@args) = @_;
485
486 if( $caller ne 'Bio::Seq') {
487 $caller = ref($caller) if ref($caller);
488 }
489
490 # we know our inherietance heirarchy
491 my $self = Bio::Root::Root->new(@args);
492 bless $self,$caller;
493
494 # this is way too sneaky probably. We delegate the construction of
495 # the Seq object onto PrimarySeq and then pop primary_seq into
496 # our primary_seq slot
497
498 my $pseq = Bio::PrimarySeq->new(@args);
499
500 # as we have just made this, we know it is ok to set hash directly
501 # rather than going through the method
502
503 $self->{'primary_seq'} = $pseq;
504
505 # setting this array is now delayed until the final
506 # moment, again speed ups for non feature containing things
507 # $self->{'_as_feat'} = [];
508
509
510 my ($ann, $pid,$feat,$species) = &Bio::Root::RootI::_rearrange($self,[qw(ANNOTATION PRIMARY_ID FEATURES SPECIES)], @args);
511
512 # for a number of cases - reading fasta files - these are never set. This
513 # gives a quick optimisation around testing things later on
514
515 if( defined $ann || defined $pid || defined $feat || defined $species ) {
516 $pid && $self->primary_id($pid);
517 $species && $self->species($species);
518 $ann && $self->annotation($ann);
519
520 if( defined $feat ) {
521 if( ref($feat) !~ /ARRAY/i ) {
522 if( ref($feat) && $feat->isa('Bio::SeqFeatureI') ) {
523 $self->add_SeqFeature($feat);
524 } else {
525 $self->warn("Must specify a valid Bio::SeqFeatureI or ArrayRef of Bio::SeqFeatureI's with the -features init parameter for ".ref($self));
526 }
527 } else {
528 foreach my $feature ( @$feat ) {
529 $self->add_SeqFeature($feature);
530 }
531 }
532 }
533 }
534
535 return $self;
536 }
537
538 =head1 PrimarySeq interface
539
540
541 The PrimarySeq interface provides the basic sequence getting
542 and setting methods for on all sequences.
543
544 These methods implement the Bio::PrimarySeq interface by delegating
545 to the primary_seq inside the object. This means that you
546 can use a Seq object wherever there is a PrimarySeq, and
547 of course, you are free to use these functions anyway.
548
549 =cut
550
551 =head2 seq
552
553 Title : seq
554 Usage : $string = $obj->seq()
555 Function: Get/Set the sequence as a string of letters. The
556 case of the letters is left up to the implementer.
557 Suggested cases are upper case for proteins and lower case for
558 DNA sequence (IUPAC standard),
559 but implementations are suggested to keep an open mind about
560 case (some users... want mixed case!)
561 Returns : A scalar
562 Args : Optionally on set the new value (a string). An optional second
563 argument presets the alphabet (otherwise it will be guessed).
564 Both parameters may also be given in named paramater style
565 with -seq and -alphabet being the names.
566
567 =cut
568
569 sub seq {
570 return shift->primary_seq()->seq(@_);
571 }
572
573 =head2 validate_seq
574
575 Title : validate_seq
576 Usage : if(! $seq->validate_seq($seq_str) ) {
577 print "sequence $seq_str is not valid for an object of type ",
578 ref($seq), "\n";
579 }
580 Function: Validates a given sequence string. A validating sequence string
581 must be accepted by seq(). A string that does not validate will
582 lead to an exception if passed to seq().
583
584 The implementation provided here does not take alphabet() into
585 account. Allowed are all letters (A-Z) and '-','.', and '*'.
586
587 Example :
588 Returns : 1 if the supplied sequence string is valid for the object, and
589 0 otherwise.
590 Args : The sequence string to be validated.
591
592
593 =cut
594
595 sub validate_seq {
596 return shift->primary_seq()->validate_seq(@_);
597 }
598
599 =head2 length
600
601 Title : length
602 Usage : $len = $seq->length()
603 Function:
604 Example :
605 Returns : Integer representing the length of the sequence.
606 Args : None
607
608 =cut
609
610 sub length {
611 return shift->primary_seq()->length(@_);
612 }
613
614 =head1 Methods from the Bio::PrimarySeqI interface
615
616 =cut
617
618 =head2 subseq
619
620 Title : subseq
621 Usage : $substring = $obj->subseq(10,40);
622 Function: Returns the subseq from start to end, where the first base
623 is 1 and the number is inclusive, ie 1-2 are the first two
624 bases of the sequence
625
626 Start cannot be larger than end but can be equal
627
628 Returns : A string
629 Args : 2 integers
630
631
632 =cut
633
634 sub subseq {
635 return shift->primary_seq()->subseq(@_);
636 }
637
638 =head2 display_id
639
640 Title : display_id
641 Usage : $id = $obj->display_id or $obj->display_id($newid);
642 Function: Gets or sets the display id, also known as the common name of
643 the Seq object.
644
645 The semantics of this is that it is the most likely string
646 to be used as an identifier of the sequence, and likely to
647 have "human" readability. The id is equivalent to the LOCUS
648 field of the GenBank/EMBL databanks and the ID field of the
649 Swissprot/sptrembl database. In fasta format, the >(\S+) is
650 presumed to be the id, though some people overload the id
651 to embed other information. Bioperl does not use any
652 embedded information in the ID field, and people are
653 encouraged to use other mechanisms (accession field for
654 example, or extending the sequence object) to solve this.
655
656 Notice that $seq->id() maps to this function, mainly for
657 legacy/convenience issues.
658 Returns : A string
659 Args : None or a new id
660
661
662 =cut
663
664 sub display_id {
665 return shift->primary_seq->display_id(@_);
666 }
667
668
669
670 =head2 accession_number
671
672 Title : accession_number
673 Usage : $unique_biological_key = $obj->accession_number;
674 Function: Returns the unique biological id for a sequence, commonly
675 called the accession_number. For sequences from established
676 databases, the implementors should try to use the correct
677 accession number. Notice that primary_id() provides the
678 unique id for the implemetation, allowing multiple objects
679 to have the same accession number in a particular implementation.
680
681 For sequences with no accession number, this method should return
682 "unknown".
683
684 Can also be used to set the accession number.
685 Example : $key = $seq->accession_number or $seq->accession_number($key)
686 Returns : A string
687 Args : None or an accession number
688
689
690 =cut
691
692 sub accession_number {
693 return shift->primary_seq->accession_number(@_);
694 }
695
696 =head2 desc
697
698 Title : desc
699 Usage : $seqobj->desc($string) or $seqobj->desc()
700 Function: Sets or gets the description of the sequence
701 Example :
702 Returns : The description
703 Args : The description or none
704
705
706 =cut
707
708 sub desc {
709 return shift->primary_seq->desc(@_);
710 }
711
712 =head2 primary_id
713
714 Title : primary_id
715 Usage : $unique_implementation_key = $obj->primary_id;
716 Function: Returns the unique id for this object in this
717 implementation. This allows implementations to manage
718 their own object ids in a way the implementation can control
719 clients can expect one id to map to one object.
720
721 For sequences with no natural id, this method should return
722 a stringified memory location.
723
724 Can also be used to set the primary_id.
725
726 Also notice that this method is not delegated to the
727 internal Bio::PrimarySeq object
728
729 [Note this method name is likely to change in 1.3]
730
731 Example : $id = $seq->primary_id or $seq->primary_id($id)
732 Returns : A string
733 Args : None or an id
734
735
736 =cut
737
738 sub primary_id {
739 my ($obj,$value) = @_;
740
741 if( defined $value) {
742 $obj->{'primary_id'} = $value;
743 }
744 if( ! exists $obj->{'primary_id'} ) {
745 return "$obj";
746 }
747 return $obj->{'primary_id'};
748 }
749
750 =head2 can_call_new
751
752 Title : can_call_new
753 Usage : if ( $obj->can_call_new ) {
754 $newobj = $obj->new( %param );
755 }
756 Function: can_call_new returns 1 or 0 depending
757 on whether an implementation allows new
758 constructor to be called. If a new constructor
759 is allowed, then it should take the followed hashed
760 constructor list.
761
762 $myobject->new( -seq => $sequence_as_string,
763 -display_id => $id
764 -accession_number => $accession
765 -alphabet => 'dna',
766 );
767 Example :
768 Returns : 1 or 0
769 Args : None
770
771
772 =cut
773
774 sub can_call_new {
775 return 1;
776 }
777
778 =head2 alphabet
779
780 Title : alphabet
781 Usage : if ( $obj->alphabet eq 'dna' ) { /Do Something/ }
782 Function: Returns the type of sequence being one of
783 'dna', 'rna' or 'protein'. This is case sensitive.
784
785 This is not called <type> because this would cause
786 upgrade problems from the 0.5 and earlier Seq objects.
787
788 Returns : A string either 'dna','rna','protein'. NB - the object must
789 make a call of the type - if there is no type specified it
790 has to guess.
791 Args : None
792
793
794 =cut
795
796 sub alphabet {
797 my $self = shift;
798 return $self->primary_seq->alphabet(@_) if @_ && defined $_[0];
799 return $self->primary_seq->alphabet();
800 }
801
802 sub is_circular { shift->primary_seq->is_circular }
803
804 =head1 Methods for Bio::IdentifiableI compliance
805
806 =cut
807
808 =head2 object_id
809
810 Title : object_id
811 Usage : $string = $obj->object_id()
812 Function: a string which represents the stable primary identifier
813 in this namespace of this object. For DNA sequences this
814 is its accession_number, similarly for protein sequences
815
816 This is aliased to accession_number().
817 Returns : A scalar
818
819
820 =cut
821
822 sub object_id {
823 return shift->accession_number(@_);
824 }
825
826 =head2 version
827
828 Title : version
829 Usage : $version = $obj->version()
830 Function: a number which differentiates between versions of
831 the same object. Higher numbers are considered to be
832 later and more relevant, but a single object described
833 the same identifier should represent the same concept
834
835 Returns : A number
836
837 =cut
838
839 sub version{
840 return shift->primary_seq->version(@_);
841 }
842
843
844 =head2 authority
845
846 Title : authority
847 Usage : $authority = $obj->authority()
848 Function: a string which represents the organisation which
849 granted the namespace, written as the DNS name for
850 organisation (eg, wormbase.org)
851
852 Returns : A scalar
853
854 =cut
855
856 sub authority {
857 return shift->primary_seq()->authority(@_);
858 }
859
860 =head2 namespace
861
862 Title : namespace
863 Usage : $string = $obj->namespace()
864 Function: A string representing the name space this identifier
865 is valid in, often the database name or the name
866 describing the collection
867
868 Returns : A scalar
869
870
871 =cut
872
873 sub namespace{
874 return shift->primary_seq()->namespace(@_);
875 }
876
877 =head1 Methods for Bio::DescribableI compliance
878
879 =cut
880
881 =head2 display_name
882
883 Title : display_name
884 Usage : $string = $obj->display_name()
885 Function: A string which is what should be displayed to the user
886 the string should have no spaces (ideally, though a cautious
887 user of this interface would not assumme this) and should be
888 less than thirty characters (though again, double checking
889 this is a good idea)
890
891 This is aliased to display_id().
892 Returns : A scalar
893
894 =cut
895
896 sub display_name {
897 return shift->display_id(@_);
898 }
899
900 =head2 description
901
902 Title : description
903 Usage : $string = $obj->description()
904 Function: A text string suitable for displaying to the user a
905 description. This string is likely to have spaces, but
906 should not have any newlines or formatting - just plain
907 text. The string should not be greater than 255 characters
908 and clients can feel justified at truncating strings at 255
909 characters for the purposes of display
910
911 This is aliased to desc().
912 Returns : A scalar
913
914 =cut
915
916 sub description {
917 return shift->desc(@_);
918 }
919
920 =head1 Methods for implementing Bio::AnnotatableI
921
922 =cut
923
924 =head2 annotation
925
926 Title : annotation
927 Usage : $ann = $seq->annotation or $seq->annotation($annotation)
928 Function: Gets or sets the annotation
929 Returns : L<Bio::AnnotationCollectionI> object
930 Args : None or L<Bio::AnnotationCollectionI> object
931
932 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection>
933 for more information
934
935 =cut
936
937 sub annotation {
938 my ($obj,$value) = @_;
939 if( defined $value ) {
940 $obj->throw("object of class ".ref($value)." does not implement ".
941 "Bio::AnnotationCollectionI. Too bad.")
942 unless $value->isa("Bio::AnnotationCollectionI");
943 $obj->{'_annotation'} = $value;
944 } elsif( ! defined $obj->{'_annotation'}) {
945 $obj->{'_annotation'} = new Bio::Annotation::Collection;
946 }
947 return $obj->{'_annotation'};
948 }
949
950 =head1 Methods to implement Bio::FeatureHolderI
951
952 This includes methods for retrieving, adding, and removing features.
953
954 =cut
955
956 =head2 get_SeqFeatures
957
958 Title : get_SeqFeatures
959 Usage :
960 Function: Get the feature objects held by this feature holder.
961
962 Features which are not top-level are subfeatures of one or
963 more of the returned feature objects, which means that you
964 must traverse the subfeature arrays of each top-level
965 feature object in order to traverse all features associated
966 with this sequence.
967
968 Use get_all_SeqFeatures() if you want the feature tree
969 flattened into one single array.
970
971 Example :
972 Returns : an array of Bio::SeqFeatureI implementing objects
973 Args : none
974
975 At some day we may want to expand this method to allow for a feature
976 filter to be passed in.
977
978 =cut
979
980 sub get_SeqFeatures{
981 my $self = shift;
982
983 if( !defined $self->{'_as_feat'} ) {
984 $self->{'_as_feat'} = [];
985 }
986
987 return @{$self->{'_as_feat'}};
988 }
989
990 =head2 get_all_SeqFeatures
991
992 Title : get_all_SeqFeatures
993 Usage : @feat_ary = $seq->get_all_SeqFeatures();
994 Function: Returns the tree of feature objects attached to this
995 sequence object flattened into one single array. Top-level
996 features will still contain their subfeature-arrays, which
997 means that you will encounter subfeatures twice if you
998 traverse the subfeature tree of the returned objects.
999
1000 Use get_SeqFeatures() if you want the array to contain only
1001 the top-level features.
1002
1003 Returns : An array of Bio::SeqFeatureI implementing objects.
1004 Args : None
1005
1006
1007 =cut
1008
1009 # this implementation is inherited from FeatureHolderI
1010
1011 =head2 feature_count
1012
1013 Title : feature_count
1014 Usage : $seq->feature_count()
1015 Function: Return the number of SeqFeatures attached to a sequence
1016 Returns : integer representing the number of SeqFeatures
1017 Args : None
1018
1019
1020 =cut
1021
1022 sub feature_count {
1023 my ($self) = @_;
1024
1025 if (defined($self->{'_as_feat'})) {
1026 return ($#{$self->{'_as_feat'}} + 1);
1027 } else {
1028 return 0;
1029 }
1030 }
1031
1032 =head2 add_SeqFeature
1033
1034 Title : add_SeqFeature
1035 Usage : $seq->add_SeqFeature($feat);
1036 $seq->add_SeqFeature(@feat);
1037 Function: Adds the given feature object (or each of an array of feature
1038 objects to the feature array of this
1039 sequence. The object passed is required to implement the
1040 Bio::SeqFeatureI interface.
1041 Returns : 1 on success
1042 Args : A Bio::SeqFeatureI implementing object, or an array of such objects.
1043
1044
1045 =cut
1046
1047 sub add_SeqFeature {
1048 my ($self,@feat) = @_;
1049
1050 $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
1051
1052 foreach my $feat ( @feat ) {
1053 if( !$feat->isa("Bio::SeqFeatureI") ) {
1054 $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
1055 }
1056
1057 # make sure we attach ourselves to the feature if the feature wants it
1058 my $aseq = $self->primary_seq;
1059 $feat->attach_seq($aseq) if $aseq;
1060
1061 push(@{$self->{'_as_feat'}},$feat);
1062 }
1063 return 1;
1064 }
1065
1066 =head2 remove_SeqFeatures
1067
1068 Title : remove_SeqFeatures
1069 Usage : $seq->remove_SeqFeatures();
1070 Function: Flushes all attached SeqFeatureI objects.
1071
1072 To remove individual feature objects, delete those from the returned
1073 array and re-add the rest.
1074 Example :
1075 Returns : The array of Bio::SeqFeatureI objects removed from this seq.
1076 Args : None
1077
1078
1079 =cut
1080
1081 sub remove_SeqFeatures {
1082 my $self = shift;
1083
1084 return () unless $self->{'_as_feat'};
1085 my @feats = @{$self->{'_as_feat'}};
1086 $self->{'_as_feat'} = [];
1087 return @feats;
1088 }
1089
1090 =head1 Methods provided in the Bio::PrimarySeqI interface
1091
1092
1093 These methods are inherited from the PrimarySeq interface
1094 and work as one expects, building new Bio::Seq objects
1095 or other information as expected. See L<Bio::PrimarySeq>
1096 for more information.
1097
1098 Sequence Features are B<not> transfered to the new objects.
1099 This is possibly a mistake. Anyone who feels the urge in
1100 dealing with this is welcome to give it a go.
1101
1102 =head2 revcom
1103
1104 Title : revcom
1105 Usage : $rev = $seq->revcom()
1106 Function: Produces a new Bio::Seq object which
1107 is the reversed complement of the sequence. For protein
1108 sequences this throws an exception of "Sequence is a protein.
1109 Cannot revcom"
1110
1111 The id is the same id as the original sequence, and the
1112 accession number is also identical. If someone wants to track
1113 that this sequence has be reversed, it needs to define its own
1114 extensions
1115
1116 To do an in-place edit of an object you can go:
1117
1118 $seq = $seq->revcom();
1119
1120 This of course, causes Perl to handle the garbage collection of
1121 the old object, but it is roughly speaking as efficient as an
1122 in-place edit.
1123
1124 Returns : A new (fresh) Bio::Seq object
1125 Args : None
1126
1127
1128 =cut
1129
1130 =head2 trunc
1131
1132 Title : trunc
1133 Usage : $subseq = $myseq->trunc(10,100);
1134 Function: Provides a truncation of a sequence
1135
1136 Example :
1137 Returns : A fresh Seq object
1138 Args : A Seq object
1139
1140
1141 =cut
1142
1143 =head2 id
1144
1145 Title : id
1146 Usage : $id = $seq->id()
1147 Function: This is mapped on display_id
1148 Returns : value of display_id()
1149 Args : [optional] value to update display_id
1150
1151
1152 =cut
1153
1154 sub id {
1155 return shift->display_id(@_);
1156 }
1157
1158
1159 =head1 Seq only methods
1160
1161
1162 These methods are specific to the Bio::Seq object, and not
1163 found on the Bio::PrimarySeq object
1164
1165 =head2 primary_seq
1166
1167 Title : primary_seq
1168 Usage : $seq->primary_seq or $seq->primary_seq($newval)
1169 Function: Get or set a PrimarySeq object
1170 Example :
1171 Returns : PrimarySeq object
1172 Args : None or PrimarySeq object
1173
1174
1175 =cut
1176
1177 sub primary_seq {
1178 my ($obj,$value) = @_;
1179
1180 if( defined $value) {
1181 if( ! ref $value || ! $value->isa('Bio::PrimarySeqI') ) {
1182 $obj->throw("$value is not a Bio::PrimarySeq compliant object");
1183 }
1184
1185 $obj->{'primary_seq'} = $value;
1186 # descend down over all seqfeature objects, seeing whether they
1187 # want an attached seq.
1188
1189 foreach my $sf ( $obj->get_SeqFeatures() ) {
1190 $sf->attach_seq($value);
1191 }
1192
1193 }
1194 return $obj->{'primary_seq'};
1195
1196 }
1197
1198 =head2 species
1199
1200 Title : species
1201 Usage : $species = $seq->species() or $seq->species($species)
1202 Function: Gets or sets the species
1203 Returns : L<Bio::Species> object
1204 Args : None or L<Bio::Species> object
1205
1206 See L<Bio::Species> for more information
1207
1208 =cut
1209
1210 sub species {
1211 my ($self, $species) = @_;
1212 if ($species) {
1213 $self->{'species'} = $species;
1214 } else {
1215 return $self->{'species'};
1216 }
1217 }
1218
1219 =head1 Internal methods
1220
1221 =cut
1222
1223 # keep AUTOLOAD happy
1224 sub DESTROY { }
1225
1226 ############################################################################
1227 # aliases due to name changes or to compensate for our lack of consistency #
1228 ############################################################################
1229
1230 # in all other modules we use the object in the singular --
1231 # lack of consistency sucks
1232 *flush_SeqFeature = \&remove_SeqFeatures;
1233 *flush_SeqFeatures = \&remove_SeqFeatures;
1234
1235 # this is now get_SeqFeatures() (from FeatureHolderI)
1236 *top_SeqFeatures = \&get_SeqFeatures;
1237
1238 # this is now get_all_SeqFeatures() in FeatureHolderI
1239 sub all_SeqFeatures{
1240 return shift->get_all_SeqFeatures(@_);
1241 }
1242
1243 sub accession {
1244 my $self = shift;
1245 $self->warn(ref($self)."::accession is deprecated, ".
1246 "use accession_number() instead");
1247 return $self->accession_number(@_);
1248 }
1249
1250 1;