Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Seq.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Seq.pm,v 1.76.2.2 2003/07/03 20:01:32 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::Seq | |
4 # | |
5 # Cared for by Ewan Birney <birney@ebi.ac.uk> | |
6 # | |
7 # Copyright Ewan Birney | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Seq - Sequence object, with features | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # This is the main sequence object in Bioperl | |
20 | |
21 # gets a sequence from a file | |
22 $seqio = Bio::SeqIO->new( '-format' => 'embl' , -file => 'myfile.dat'); | |
23 $seqobj = $seqio->next_seq(); | |
24 | |
25 # SeqIO can both read and write sequences; see Bio::SeqIO | |
26 # for more information and examples | |
27 | |
28 # get from database | |
29 $db = Bio::DB::GenBank->new(); | |
30 $seqobj = $db->get_Seq_by_acc('X78121'); | |
31 | |
32 # make from strings in script | |
33 $seqobj = Bio::Seq->new( -display_id => 'my_id', | |
34 -seq => $sequence_as_string); | |
35 | |
36 # gets sequence as a string from sequence object | |
37 $seqstr = $seqobj->seq(); # actual sequence as a string | |
38 $seqstr = $seqobj->subseq(10,50); # slice in biological coordinates | |
39 | |
40 # retrieves information from the sequence | |
41 # features must implement Bio::SeqFeatureI interface | |
42 | |
43 @features = $seqobj->get_SeqFeatures(); # just top level | |
44 foreach my $feat ( @features ) { | |
45 print "Feature ",$feat->primary_tag," starts ",$feat->start," ends ", | |
46 $feat->end," strand ",$feat->strand,"\n"; | |
47 | |
48 # features retain link to underlying sequence object | |
49 print "Feature sequence is ",$feat->seq->seq(),"\n" | |
50 } | |
51 | |
52 # sequences may have a species | |
53 | |
54 if( defined $seq->species ) { | |
55 print "Sequence is from ",$species->binomial_name," [",$species->common_name,"]\n"; | |
56 } | |
57 | |
58 # annotation objects are Bio::AnnotationCollectionI's | |
59 $ann = $seqobj->annotation(); # annotation object | |
60 | |
61 # references is one type of annotations to get. Also get | |
62 # comment and dblink. Look at Bio::AnnotationCollection for | |
63 # more information | |
64 | |
65 foreach my $ref ( $ann->get_Annotations('reference') ) { | |
66 print "Reference ",$ref->title,"\n"; | |
67 } | |
68 | |
69 # you can get truncations, translations and reverse complements, these | |
70 # all give back Bio::Seq objects themselves, though currently with no | |
71 # features transfered | |
72 | |
73 my $trunc = $seqobj->trunc(100,200); | |
74 my $rev = $seqobj->revcom(); | |
75 | |
76 # there are many options to translate - check out the docs | |
77 my $trans = $seqobj->translate(); | |
78 | |
79 # these functions can be chained together | |
80 | |
81 my $trans_trunc_rev = $seqobj->trunc(100,200)->revcom->translate(); | |
82 | |
83 | |
84 | |
85 =head1 DESCRIPTION | |
86 | |
87 A Seq object is a sequence with sequence features placed on it. The | |
88 Seq object contains a PrimarySeq object for the actual sequence and | |
89 also implements its interface. | |
90 | |
91 In Bioperl we have 3 main players that people are going to use frequently | |
92 | |
93 Bio::PrimarySeq - just the sequence and its names, nothing else. | |
94 Bio::SeqFeatureI - a location on a sequence, potentially with a sequence | |
95 and annotation. | |
96 Bio::Seq - A sequence and a collection of sequence features | |
97 (an aggregate) with its own annotation. | |
98 | |
99 Although Bioperl is not tied heavily to file formats these distinctions do | |
100 map to file formats sensibly and for some bioinformaticians this might help | |
101 | |
102 Bio::PrimarySeq - Fasta file of a sequence | |
103 Bio::SeqFeatureI - A single entry in an EMBL/GenBank/DDBJ feature table | |
104 Bio::Seq - A single EMBL/GenBank/DDBJ entry | |
105 | |
106 By having this split we avoid a lot of nasty circular references | |
107 (sequence features can hold a reference to a sequence without the sequence | |
108 holding a reference to the sequence feature). See L<Bio::PrimarySeq> and | |
109 L<Bio::SeqFeatureI> for more information. | |
110 | |
111 Ian Korf really helped in the design of the Seq and SeqFeature system. | |
112 | |
113 =head1 EXAMPLES | |
114 | |
115 A simple and fundamental block of code | |
116 | |
117 use Bio::SeqIO; | |
118 | |
119 my $seqIOobj = Bio::SeqIO->new(-file=>"1.fa"); # create a SeqIO object | |
120 my $seqobj = $seqIOobj->next_seq; # get a Seq object | |
121 | |
122 With the Seq object in hand one has access to a powerful set of Bioperl | |
123 methods and Bioperl objects. This next script will take a file of sequences | |
124 in EMBL format and create a file of the reverse-complemented sequences | |
125 in Fasta format using Seq objects. It also prints out details about the | |
126 exons it finds as sequence features in Genbank Flat File format. | |
127 | |
128 use Bio::Seq; | |
129 use Bio::SeqIO; | |
130 | |
131 $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); | |
132 $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); | |
133 | |
134 while((my $seqobj = $seqin->next_seq())) { | |
135 print "Seen sequence ",$seqobj->display_id,", start of seq ", | |
136 substr($seqobj->seq,1,10),"\n"; | |
137 if( $seqobj->alphabet eq 'dna') { | |
138 $rev = $seqobj->revcom; | |
139 $id = $seqobj->display_id(); | |
140 $id = "$id.rev"; | |
141 $rev->display_id($id); | |
142 $seqout->write_seq($rev); | |
143 } | |
144 | |
145 foreach $feat ( $seqobj->get_SeqFeatures() ) { | |
146 if( $feat->primary_tag eq 'exon' ) { | |
147 print STDOUT "Location ",$feat->start,":", | |
148 $feat->end," GFF[",$feat->gff_string,"]\n"; | |
149 } | |
150 } | |
151 } | |
152 | |
153 Let's examine the script. The lines below import the Bioperl modules. | |
154 Seq is the main Bioperl sequence object and SeqIO is the Bioperl support | |
155 for reading sequences from files and to files | |
156 | |
157 use Bio::Seq; | |
158 use Bio::SeqIO; | |
159 | |
160 These two lines create two SeqIO streams: one for reading in sequences | |
161 and one for outputting sequences: | |
162 | |
163 $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); | |
164 $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); | |
165 | |
166 Notice that in the "$seqout" case there is a greater-than sign, | |
167 indicating the file is being opened for writing. | |
168 | |
169 Using the | |
170 | |
171 '-argument' => value | |
172 | |
173 syntax is common in Bioperl. The file argument is like an argument | |
174 to open() . You can also pass in filehandles or FileHandle objects by | |
175 using the -fh argument (see L<Bio::SeqIO> documentation for details). | |
176 Many formats in Bioperl are handled, including Fasta, EMBL, GenBank, | |
177 Swissprot (swiss), PIR, and GCG. | |
178 | |
179 $seqin = Bio::SeqIO->new( -format => 'EMBL' , -file => 'myfile.dat'); | |
180 $seqout= Bio::SeqIO->new( -format => 'Fasta', -file => '>output.fa'); | |
181 | |
182 This is the main loop which will loop progressively through sequences | |
183 in a file, and each call to $seqio-E<gt>next_seq() provides a new Seq | |
184 object from the file: | |
185 | |
186 while((my $seqobj = $seqio->next_seq())) { | |
187 | |
188 This print line below accesses fields in the Seq object directly. The | |
189 $seqobj-E<gt>display_id is the way to access the display_id attribute | |
190 of the Seq object. The $seqobj-E<gt>seq method gets the actual | |
191 sequence out as string. Then you can do manipulation of this if | |
192 you want to (there are however easy ways of doing truncation, | |
193 reverse-complement and translation). | |
194 | |
195 print "Seen sequence ",$seqobj->display_id,", start of seq ", | |
196 substr($seqobj->seq,1,10),"\n"; | |
197 | |
198 Bioperl has to guess the alphabet of the sequence, being either 'dna', | |
199 'rna', or 'protein'. The alphabet attribute is one of these three | |
200 possibilities. | |
201 | |
202 if( $seqobj->alphabet eq 'dna') { | |
203 | |
204 The $seqobj-E<gt>revcom method provides the reverse complement of the Seq | |
205 object as another Seq object. Thus, the $rev variable is a reference to | |
206 another Seq object. For example, one could repeat the above print line | |
207 for this Seq object (putting $rev in place of $seqobj). In this | |
208 case we are going to output the object into the file stream we built | |
209 earlier on. | |
210 | |
211 $rev = $seqobj->revcom; | |
212 | |
213 When we output it, we want the id of the outputted object | |
214 to be changed to "$id.rev", ie, with .rev on the end of the name. The | |
215 following lines retrieve the id of the sequence object, add .rev | |
216 to this and then set the display_id of the rev sequence object to | |
217 this. Notice that to set the display_id attribute you just need | |
218 call the same method, display_id(), with the new value as an argument. | |
219 Getting and setting values with the same method is common in Bioperl. | |
220 | |
221 $id = $seqobj->display_id(); | |
222 $id = "$id.rev"; | |
223 $rev->display_id($id); | |
224 | |
225 The write_seq method on the SeqIO output object, $seqout, writes the | |
226 $rev object to the filestream we built at the top of the script. | |
227 The filestream knows that it is outputting in fasta format, and | |
228 so it provides fasta output. | |
229 | |
230 $seqout->write_seq($rev); | |
231 | |
232 This block of code loops over sequence features in the sequence | |
233 object, trying to find ones who have been tagged as 'exon'. | |
234 Features have start and end attributes and can be outputted | |
235 in Genbank Flat File format, GFF, a standarized format for sequence | |
236 features. | |
237 | |
238 foreach $feat ( $seqobj->get_SeqFeatures() ) { | |
239 if( $feat->primary_tag eq 'exon' ) { | |
240 print STDOUT "Location ",$feat->start,":", | |
241 $feat->end," GFF[",$feat->gff_string,"]\n"; | |
242 } | |
243 } | |
244 | |
245 The code above shows how a few Bio::Seq methods suffice to read, parse, | |
246 reformat and analyze sequences from a file. A full list of methods | |
247 available to Bio::Seq objects is shown below. Bear in mind that some of | |
248 these methods come from PrimarySeq objects, which are simpler | |
249 than Seq objects, stripped of features (see L<Bio::PrimarySeq> for | |
250 more information). | |
251 | |
252 # these methods return strings, and accept strings in some cases: | |
253 | |
254 $seqobj->seq(); # string of sequence | |
255 $seqobj->subseq(5,10); # part of the sequence as a string | |
256 $seqobj->accession_number(); # when there, the accession number | |
257 $seqobj->moltype(); # one of 'dna','rna',or 'protein' | |
258 $seqobj->seq_version() # when there, the version | |
259 $seqobj->keywords(); # when there, the Keywords line | |
260 $seqobj->length() # length | |
261 $seqobj->desc(); # description | |
262 $seqobj->primary_id(); # a unique id for this sequence regardless | |
263 # of its display_id or accession number | |
264 $seqobj->display_id(); # the human readable id of the sequence | |
265 | |
266 Some of these values map to fields in common formats. For example, The | |
267 display_id() method returns the LOCUS name of a Genbank entry, | |
268 the (\S+) following the E<gt> character in a Fasta file, the ID from | |
269 a SwissProt file, and so on. The desc() method will return the DEFINITION | |
270 line of a Genbank file, the description following the display_id in a | |
271 Fasta file, and the DE field in a SwissProt file. | |
272 | |
273 # the following methods return new Seq objects, but | |
274 # do not transfer features across to the new object: | |
275 | |
276 $seqobj->trunc(5,10) # truncation from 5 to 10 as new object | |
277 $seqobj->revcom # reverse complements sequence | |
278 $seqobj->translate # translation of the sequence | |
279 | |
280 # if new() can be called this method returns 1, else 0 | |
281 | |
282 $seqobj->can_call_new | |
283 | |
284 # the following method determines if the given string will be accepted | |
285 # by the seq() method - if the string is acceptable then validate() | |
286 # returns 1, or 0 if not | |
287 | |
288 $seqobj->validate_seq($string) | |
289 | |
290 # the following method returns or accepts a Species object: | |
291 | |
292 $seqobj->species(); | |
293 | |
294 Please see L<Bio::Species> for more information on this object. | |
295 | |
296 # the following method returns or accepts an Annotation object | |
297 # which in turn allows access to Annotation::Reference | |
298 # and Annotation::Comment objects: | |
299 | |
300 $seqobj->annotation(); | |
301 | |
302 These annotations typically refer to entire sequences, unlike | |
303 features. See L<Bio::AnnotationCollectionI>, | |
304 L<Bio::Annotation::Collection>, L<Bio::Annotation::Reference>, and | |
305 L<Bio::Annotation::Comment> for details. | |
306 | |
307 It is also important to be able to describe defined portions of a | |
308 sequence. The combination of some description and the corresponding | |
309 sub-sequence is called a feature - an exon and its coordinates within | |
310 a gene is an example of a feature, or a domain within a protein. | |
311 | |
312 # the following methods return an array of SeqFeatureI objects: | |
313 | |
314 $seqobj->get_SeqFeatures # The 'top level' sequence features | |
315 $seqobj->get_all_SeqFeatures # All sequence features, including sub-seq | |
316 # features, such as features in an exon | |
317 | |
318 # to find out the number of features use: | |
319 | |
320 $seqobj->feature_count | |
321 | |
322 Here are just some of the methods available to SeqFeatureI objects: | |
323 | |
324 # these methods return numbers: | |
325 | |
326 $feat->start # start position (1 is the first base) | |
327 $feat->end # end position (2 is the second base) | |
328 $feat->strand # 1 means forward, -1 reverse, 0 not relevant | |
329 | |
330 # these methods return or accept strings: | |
331 | |
332 $feat->primary_tag # the name of the sequence feature, eg | |
333 # 'exon', 'glycoslyation site', 'TM domain' | |
334 $feat->source_tag # where the feature comes from, eg, 'EMBL_GenBank', | |
335 # or 'BLAST' | |
336 | |
337 # this method returns the more austere PrimarySeq object, not a | |
338 # Seq object - the main difference is that PrimarySeq objects do not | |
339 # themselves contain sequence features | |
340 | |
341 $feat->seq # the sequence between start,end on the | |
342 # correct strand of the sequence | |
343 | |
344 See L<Bio::PrimarySeq> for more details on PrimarySeq objects. | |
345 | |
346 # useful methods for feature comparisons, for start/end points | |
347 | |
348 $feat->overlaps($other) # do $feat and $other overlap? | |
349 $feat->contains($other) # is $other completely within $feat? | |
350 $feat->equals($other) # do $feat and $other completely agree? | |
351 | |
352 # one can also add features | |
353 | |
354 $seqobj->add_SeqFeature($feat) # returns 1 if successful | |
355 $seqobj->add_SeqFeature(@features) # returns 1 if successful | |
356 | |
357 # sub features. For complex join() statements, the feature | |
358 # is one sequence feature with many sub SeqFeatures | |
359 | |
360 $feat->sub_SeqFeature # returns array of sub seq features | |
361 | |
362 Please see L<Bio::SeqFeatureI> and L<Bio::SeqFeature::Generic>, | |
363 for more information on sequence features. | |
364 | |
365 It is worth mentioning that one can also retrieve the start and end | |
366 positions of a feature using a Bio::LocationI object: | |
367 | |
368 $location = $feat->location # $location is a Bio::LocationI object | |
369 $location->start; # start position | |
370 $location->end; # end position | |
371 | |
372 This is useful because one needs a Bio::Location::SplitLocationI object | |
373 in order to retrieve the coordinates inside the Genbank or EMBL join() | |
374 statements (e.g. "CDS join(51..142,273..495,1346..1474)"): | |
375 | |
376 if ( $feat->location->isa('Bio::Location::SplitLocationI') && | |
377 $feat->primary_tag eq 'CDS' ) { | |
378 foreach $loc ( $feat->location->sub_Location ) { | |
379 print $loc->start . ".." . $loc->end . "\n"; | |
380 } | |
381 } | |
382 | |
383 See L<Bio::LocationI> and L<Bio::Location::SplitLocationI> for more | |
384 information. | |
385 | |
386 =head1 Implemented Interfaces | |
387 | |
388 This class implements the following interfaces. | |
389 | |
390 =over 4 | |
391 | |
392 =item Bio::SeqI | |
393 | |
394 Note that this includes implementing Bio::PrimarySeqI. | |
395 | |
396 =item Bio::IdentifiableI | |
397 | |
398 =item Bio::DescribableI | |
399 | |
400 =item Bio::AnnotatableI | |
401 | |
402 =item Bio::FeatureHolderI | |
403 | |
404 =back | |
405 | |
406 =head1 FEEDBACK | |
407 | |
408 | |
409 =head2 Mailing Lists | |
410 | |
411 User feedback is an integral part of the evolution of this and other | |
412 Bioperl modules. Send your comments and suggestions preferably to one | |
413 of the Bioperl mailing lists. Your participation is much appreciated. | |
414 | |
415 bioperl-l@bioperl.org - General discussion | |
416 http://bio.perl.org/MailList.html - About the mailing lists | |
417 | |
418 =head2 Reporting Bugs | |
419 | |
420 Report bugs to the Bioperl bug tracking system to help us keep track | |
421 the bugs and their resolution. Bug reports can be submitted via email | |
422 or the web: | |
423 | |
424 bioperl-bugs@bioperl.org | |
425 http://bugzilla.bioperl.org/ | |
426 | |
427 =head1 AUTHOR - Ewan Birney, inspired by Ian Korf objects | |
428 | |
429 Email birney@ebi.ac.uk | |
430 | |
431 =head1 CONTRIBUTORS | |
432 | |
433 Jason Stajich E<lt>jason@bioperl.orgE<gt> | |
434 | |
435 =head1 APPENDIX | |
436 | |
437 | |
438 The rest of the documentation details each of the object | |
439 methods. Internal methods are usually preceded with a "_". | |
440 | |
441 =cut | |
442 | |
443 #' | |
444 # Let the code begin... | |
445 | |
446 | |
447 package Bio::Seq; | |
448 use vars qw(@ISA $VERSION); | |
449 use strict; | |
450 | |
451 | |
452 # Object preamble - inherits from Bio::Root::Object | |
453 | |
454 use Bio::Root::Root; | |
455 use Bio::SeqI; | |
456 use Bio::Annotation::Collection; | |
457 use Bio::PrimarySeq; | |
458 use Bio::IdentifiableI; | |
459 use Bio::DescribableI; | |
460 use Bio::AnnotatableI; | |
461 use Bio::FeatureHolderI; | |
462 | |
463 $VERSION = '1.1'; | |
464 @ISA = qw(Bio::Root::Root Bio::SeqI | |
465 Bio::IdentifiableI Bio::DescribableI | |
466 Bio::AnnotatableI Bio::FeatureHolderI); | |
467 | |
468 =head2 new | |
469 | |
470 Title : new | |
471 Usage : $seq = Bio::Seq->new( -seq => 'ATGGGGGTGGTGGTACCCT', | |
472 -id => 'human_id', | |
473 -accession_number => 'AL000012', | |
474 ); | |
475 | |
476 Function: Returns a new Seq object from | |
477 basic constructors, being a string for the sequence | |
478 and strings for id and accession_number | |
479 Returns : a new Bio::Seq object | |
480 | |
481 =cut | |
482 | |
483 sub new { | |
484 my($caller,@args) = @_; | |
485 | |
486 if( $caller ne 'Bio::Seq') { | |
487 $caller = ref($caller) if ref($caller); | |
488 } | |
489 | |
490 # we know our inherietance heirarchy | |
491 my $self = Bio::Root::Root->new(@args); | |
492 bless $self,$caller; | |
493 | |
494 # this is way too sneaky probably. We delegate the construction of | |
495 # the Seq object onto PrimarySeq and then pop primary_seq into | |
496 # our primary_seq slot | |
497 | |
498 my $pseq = Bio::PrimarySeq->new(@args); | |
499 | |
500 # as we have just made this, we know it is ok to set hash directly | |
501 # rather than going through the method | |
502 | |
503 $self->{'primary_seq'} = $pseq; | |
504 | |
505 # setting this array is now delayed until the final | |
506 # moment, again speed ups for non feature containing things | |
507 # $self->{'_as_feat'} = []; | |
508 | |
509 | |
510 my ($ann, $pid,$feat,$species) = &Bio::Root::RootI::_rearrange($self,[qw(ANNOTATION PRIMARY_ID FEATURES SPECIES)], @args); | |
511 | |
512 # for a number of cases - reading fasta files - these are never set. This | |
513 # gives a quick optimisation around testing things later on | |
514 | |
515 if( defined $ann || defined $pid || defined $feat || defined $species ) { | |
516 $pid && $self->primary_id($pid); | |
517 $species && $self->species($species); | |
518 $ann && $self->annotation($ann); | |
519 | |
520 if( defined $feat ) { | |
521 if( ref($feat) !~ /ARRAY/i ) { | |
522 if( ref($feat) && $feat->isa('Bio::SeqFeatureI') ) { | |
523 $self->add_SeqFeature($feat); | |
524 } else { | |
525 $self->warn("Must specify a valid Bio::SeqFeatureI or ArrayRef of Bio::SeqFeatureI's with the -features init parameter for ".ref($self)); | |
526 } | |
527 } else { | |
528 foreach my $feature ( @$feat ) { | |
529 $self->add_SeqFeature($feature); | |
530 } | |
531 } | |
532 } | |
533 } | |
534 | |
535 return $self; | |
536 } | |
537 | |
538 =head1 PrimarySeq interface | |
539 | |
540 | |
541 The PrimarySeq interface provides the basic sequence getting | |
542 and setting methods for on all sequences. | |
543 | |
544 These methods implement the Bio::PrimarySeq interface by delegating | |
545 to the primary_seq inside the object. This means that you | |
546 can use a Seq object wherever there is a PrimarySeq, and | |
547 of course, you are free to use these functions anyway. | |
548 | |
549 =cut | |
550 | |
551 =head2 seq | |
552 | |
553 Title : seq | |
554 Usage : $string = $obj->seq() | |
555 Function: Get/Set the sequence as a string of letters. The | |
556 case of the letters is left up to the implementer. | |
557 Suggested cases are upper case for proteins and lower case for | |
558 DNA sequence (IUPAC standard), | |
559 but implementations are suggested to keep an open mind about | |
560 case (some users... want mixed case!) | |
561 Returns : A scalar | |
562 Args : Optionally on set the new value (a string). An optional second | |
563 argument presets the alphabet (otherwise it will be guessed). | |
564 Both parameters may also be given in named paramater style | |
565 with -seq and -alphabet being the names. | |
566 | |
567 =cut | |
568 | |
569 sub seq { | |
570 return shift->primary_seq()->seq(@_); | |
571 } | |
572 | |
573 =head2 validate_seq | |
574 | |
575 Title : validate_seq | |
576 Usage : if(! $seq->validate_seq($seq_str) ) { | |
577 print "sequence $seq_str is not valid for an object of type ", | |
578 ref($seq), "\n"; | |
579 } | |
580 Function: Validates a given sequence string. A validating sequence string | |
581 must be accepted by seq(). A string that does not validate will | |
582 lead to an exception if passed to seq(). | |
583 | |
584 The implementation provided here does not take alphabet() into | |
585 account. Allowed are all letters (A-Z) and '-','.', and '*'. | |
586 | |
587 Example : | |
588 Returns : 1 if the supplied sequence string is valid for the object, and | |
589 0 otherwise. | |
590 Args : The sequence string to be validated. | |
591 | |
592 | |
593 =cut | |
594 | |
595 sub validate_seq { | |
596 return shift->primary_seq()->validate_seq(@_); | |
597 } | |
598 | |
599 =head2 length | |
600 | |
601 Title : length | |
602 Usage : $len = $seq->length() | |
603 Function: | |
604 Example : | |
605 Returns : Integer representing the length of the sequence. | |
606 Args : None | |
607 | |
608 =cut | |
609 | |
610 sub length { | |
611 return shift->primary_seq()->length(@_); | |
612 } | |
613 | |
614 =head1 Methods from the Bio::PrimarySeqI interface | |
615 | |
616 =cut | |
617 | |
618 =head2 subseq | |
619 | |
620 Title : subseq | |
621 Usage : $substring = $obj->subseq(10,40); | |
622 Function: Returns the subseq from start to end, where the first base | |
623 is 1 and the number is inclusive, ie 1-2 are the first two | |
624 bases of the sequence | |
625 | |
626 Start cannot be larger than end but can be equal | |
627 | |
628 Returns : A string | |
629 Args : 2 integers | |
630 | |
631 | |
632 =cut | |
633 | |
634 sub subseq { | |
635 return shift->primary_seq()->subseq(@_); | |
636 } | |
637 | |
638 =head2 display_id | |
639 | |
640 Title : display_id | |
641 Usage : $id = $obj->display_id or $obj->display_id($newid); | |
642 Function: Gets or sets the display id, also known as the common name of | |
643 the Seq object. | |
644 | |
645 The semantics of this is that it is the most likely string | |
646 to be used as an identifier of the sequence, and likely to | |
647 have "human" readability. The id is equivalent to the LOCUS | |
648 field of the GenBank/EMBL databanks and the ID field of the | |
649 Swissprot/sptrembl database. In fasta format, the >(\S+) is | |
650 presumed to be the id, though some people overload the id | |
651 to embed other information. Bioperl does not use any | |
652 embedded information in the ID field, and people are | |
653 encouraged to use other mechanisms (accession field for | |
654 example, or extending the sequence object) to solve this. | |
655 | |
656 Notice that $seq->id() maps to this function, mainly for | |
657 legacy/convenience issues. | |
658 Returns : A string | |
659 Args : None or a new id | |
660 | |
661 | |
662 =cut | |
663 | |
664 sub display_id { | |
665 return shift->primary_seq->display_id(@_); | |
666 } | |
667 | |
668 | |
669 | |
670 =head2 accession_number | |
671 | |
672 Title : accession_number | |
673 Usage : $unique_biological_key = $obj->accession_number; | |
674 Function: Returns the unique biological id for a sequence, commonly | |
675 called the accession_number. For sequences from established | |
676 databases, the implementors should try to use the correct | |
677 accession number. Notice that primary_id() provides the | |
678 unique id for the implemetation, allowing multiple objects | |
679 to have the same accession number in a particular implementation. | |
680 | |
681 For sequences with no accession number, this method should return | |
682 "unknown". | |
683 | |
684 Can also be used to set the accession number. | |
685 Example : $key = $seq->accession_number or $seq->accession_number($key) | |
686 Returns : A string | |
687 Args : None or an accession number | |
688 | |
689 | |
690 =cut | |
691 | |
692 sub accession_number { | |
693 return shift->primary_seq->accession_number(@_); | |
694 } | |
695 | |
696 =head2 desc | |
697 | |
698 Title : desc | |
699 Usage : $seqobj->desc($string) or $seqobj->desc() | |
700 Function: Sets or gets the description of the sequence | |
701 Example : | |
702 Returns : The description | |
703 Args : The description or none | |
704 | |
705 | |
706 =cut | |
707 | |
708 sub desc { | |
709 return shift->primary_seq->desc(@_); | |
710 } | |
711 | |
712 =head2 primary_id | |
713 | |
714 Title : primary_id | |
715 Usage : $unique_implementation_key = $obj->primary_id; | |
716 Function: Returns the unique id for this object in this | |
717 implementation. This allows implementations to manage | |
718 their own object ids in a way the implementation can control | |
719 clients can expect one id to map to one object. | |
720 | |
721 For sequences with no natural id, this method should return | |
722 a stringified memory location. | |
723 | |
724 Can also be used to set the primary_id. | |
725 | |
726 Also notice that this method is not delegated to the | |
727 internal Bio::PrimarySeq object | |
728 | |
729 [Note this method name is likely to change in 1.3] | |
730 | |
731 Example : $id = $seq->primary_id or $seq->primary_id($id) | |
732 Returns : A string | |
733 Args : None or an id | |
734 | |
735 | |
736 =cut | |
737 | |
738 sub primary_id { | |
739 my ($obj,$value) = @_; | |
740 | |
741 if( defined $value) { | |
742 $obj->{'primary_id'} = $value; | |
743 } | |
744 if( ! exists $obj->{'primary_id'} ) { | |
745 return "$obj"; | |
746 } | |
747 return $obj->{'primary_id'}; | |
748 } | |
749 | |
750 =head2 can_call_new | |
751 | |
752 Title : can_call_new | |
753 Usage : if ( $obj->can_call_new ) { | |
754 $newobj = $obj->new( %param ); | |
755 } | |
756 Function: can_call_new returns 1 or 0 depending | |
757 on whether an implementation allows new | |
758 constructor to be called. If a new constructor | |
759 is allowed, then it should take the followed hashed | |
760 constructor list. | |
761 | |
762 $myobject->new( -seq => $sequence_as_string, | |
763 -display_id => $id | |
764 -accession_number => $accession | |
765 -alphabet => 'dna', | |
766 ); | |
767 Example : | |
768 Returns : 1 or 0 | |
769 Args : None | |
770 | |
771 | |
772 =cut | |
773 | |
774 sub can_call_new { | |
775 return 1; | |
776 } | |
777 | |
778 =head2 alphabet | |
779 | |
780 Title : alphabet | |
781 Usage : if ( $obj->alphabet eq 'dna' ) { /Do Something/ } | |
782 Function: Returns the type of sequence being one of | |
783 'dna', 'rna' or 'protein'. This is case sensitive. | |
784 | |
785 This is not called <type> because this would cause | |
786 upgrade problems from the 0.5 and earlier Seq objects. | |
787 | |
788 Returns : A string either 'dna','rna','protein'. NB - the object must | |
789 make a call of the type - if there is no type specified it | |
790 has to guess. | |
791 Args : None | |
792 | |
793 | |
794 =cut | |
795 | |
796 sub alphabet { | |
797 my $self = shift; | |
798 return $self->primary_seq->alphabet(@_) if @_ && defined $_[0]; | |
799 return $self->primary_seq->alphabet(); | |
800 } | |
801 | |
802 sub is_circular { shift->primary_seq->is_circular } | |
803 | |
804 =head1 Methods for Bio::IdentifiableI compliance | |
805 | |
806 =cut | |
807 | |
808 =head2 object_id | |
809 | |
810 Title : object_id | |
811 Usage : $string = $obj->object_id() | |
812 Function: a string which represents the stable primary identifier | |
813 in this namespace of this object. For DNA sequences this | |
814 is its accession_number, similarly for protein sequences | |
815 | |
816 This is aliased to accession_number(). | |
817 Returns : A scalar | |
818 | |
819 | |
820 =cut | |
821 | |
822 sub object_id { | |
823 return shift->accession_number(@_); | |
824 } | |
825 | |
826 =head2 version | |
827 | |
828 Title : version | |
829 Usage : $version = $obj->version() | |
830 Function: a number which differentiates between versions of | |
831 the same object. Higher numbers are considered to be | |
832 later and more relevant, but a single object described | |
833 the same identifier should represent the same concept | |
834 | |
835 Returns : A number | |
836 | |
837 =cut | |
838 | |
839 sub version{ | |
840 return shift->primary_seq->version(@_); | |
841 } | |
842 | |
843 | |
844 =head2 authority | |
845 | |
846 Title : authority | |
847 Usage : $authority = $obj->authority() | |
848 Function: a string which represents the organisation which | |
849 granted the namespace, written as the DNS name for | |
850 organisation (eg, wormbase.org) | |
851 | |
852 Returns : A scalar | |
853 | |
854 =cut | |
855 | |
856 sub authority { | |
857 return shift->primary_seq()->authority(@_); | |
858 } | |
859 | |
860 =head2 namespace | |
861 | |
862 Title : namespace | |
863 Usage : $string = $obj->namespace() | |
864 Function: A string representing the name space this identifier | |
865 is valid in, often the database name or the name | |
866 describing the collection | |
867 | |
868 Returns : A scalar | |
869 | |
870 | |
871 =cut | |
872 | |
873 sub namespace{ | |
874 return shift->primary_seq()->namespace(@_); | |
875 } | |
876 | |
877 =head1 Methods for Bio::DescribableI compliance | |
878 | |
879 =cut | |
880 | |
881 =head2 display_name | |
882 | |
883 Title : display_name | |
884 Usage : $string = $obj->display_name() | |
885 Function: A string which is what should be displayed to the user | |
886 the string should have no spaces (ideally, though a cautious | |
887 user of this interface would not assumme this) and should be | |
888 less than thirty characters (though again, double checking | |
889 this is a good idea) | |
890 | |
891 This is aliased to display_id(). | |
892 Returns : A scalar | |
893 | |
894 =cut | |
895 | |
896 sub display_name { | |
897 return shift->display_id(@_); | |
898 } | |
899 | |
900 =head2 description | |
901 | |
902 Title : description | |
903 Usage : $string = $obj->description() | |
904 Function: A text string suitable for displaying to the user a | |
905 description. This string is likely to have spaces, but | |
906 should not have any newlines or formatting - just plain | |
907 text. The string should not be greater than 255 characters | |
908 and clients can feel justified at truncating strings at 255 | |
909 characters for the purposes of display | |
910 | |
911 This is aliased to desc(). | |
912 Returns : A scalar | |
913 | |
914 =cut | |
915 | |
916 sub description { | |
917 return shift->desc(@_); | |
918 } | |
919 | |
920 =head1 Methods for implementing Bio::AnnotatableI | |
921 | |
922 =cut | |
923 | |
924 =head2 annotation | |
925 | |
926 Title : annotation | |
927 Usage : $ann = $seq->annotation or $seq->annotation($annotation) | |
928 Function: Gets or sets the annotation | |
929 Returns : L<Bio::AnnotationCollectionI> object | |
930 Args : None or L<Bio::AnnotationCollectionI> object | |
931 | |
932 See L<Bio::AnnotationCollectionI> and L<Bio::Annotation::Collection> | |
933 for more information | |
934 | |
935 =cut | |
936 | |
937 sub annotation { | |
938 my ($obj,$value) = @_; | |
939 if( defined $value ) { | |
940 $obj->throw("object of class ".ref($value)." does not implement ". | |
941 "Bio::AnnotationCollectionI. Too bad.") | |
942 unless $value->isa("Bio::AnnotationCollectionI"); | |
943 $obj->{'_annotation'} = $value; | |
944 } elsif( ! defined $obj->{'_annotation'}) { | |
945 $obj->{'_annotation'} = new Bio::Annotation::Collection; | |
946 } | |
947 return $obj->{'_annotation'}; | |
948 } | |
949 | |
950 =head1 Methods to implement Bio::FeatureHolderI | |
951 | |
952 This includes methods for retrieving, adding, and removing features. | |
953 | |
954 =cut | |
955 | |
956 =head2 get_SeqFeatures | |
957 | |
958 Title : get_SeqFeatures | |
959 Usage : | |
960 Function: Get the feature objects held by this feature holder. | |
961 | |
962 Features which are not top-level are subfeatures of one or | |
963 more of the returned feature objects, which means that you | |
964 must traverse the subfeature arrays of each top-level | |
965 feature object in order to traverse all features associated | |
966 with this sequence. | |
967 | |
968 Use get_all_SeqFeatures() if you want the feature tree | |
969 flattened into one single array. | |
970 | |
971 Example : | |
972 Returns : an array of Bio::SeqFeatureI implementing objects | |
973 Args : none | |
974 | |
975 At some day we may want to expand this method to allow for a feature | |
976 filter to be passed in. | |
977 | |
978 =cut | |
979 | |
980 sub get_SeqFeatures{ | |
981 my $self = shift; | |
982 | |
983 if( !defined $self->{'_as_feat'} ) { | |
984 $self->{'_as_feat'} = []; | |
985 } | |
986 | |
987 return @{$self->{'_as_feat'}}; | |
988 } | |
989 | |
990 =head2 get_all_SeqFeatures | |
991 | |
992 Title : get_all_SeqFeatures | |
993 Usage : @feat_ary = $seq->get_all_SeqFeatures(); | |
994 Function: Returns the tree of feature objects attached to this | |
995 sequence object flattened into one single array. Top-level | |
996 features will still contain their subfeature-arrays, which | |
997 means that you will encounter subfeatures twice if you | |
998 traverse the subfeature tree of the returned objects. | |
999 | |
1000 Use get_SeqFeatures() if you want the array to contain only | |
1001 the top-level features. | |
1002 | |
1003 Returns : An array of Bio::SeqFeatureI implementing objects. | |
1004 Args : None | |
1005 | |
1006 | |
1007 =cut | |
1008 | |
1009 # this implementation is inherited from FeatureHolderI | |
1010 | |
1011 =head2 feature_count | |
1012 | |
1013 Title : feature_count | |
1014 Usage : $seq->feature_count() | |
1015 Function: Return the number of SeqFeatures attached to a sequence | |
1016 Returns : integer representing the number of SeqFeatures | |
1017 Args : None | |
1018 | |
1019 | |
1020 =cut | |
1021 | |
1022 sub feature_count { | |
1023 my ($self) = @_; | |
1024 | |
1025 if (defined($self->{'_as_feat'})) { | |
1026 return ($#{$self->{'_as_feat'}} + 1); | |
1027 } else { | |
1028 return 0; | |
1029 } | |
1030 } | |
1031 | |
1032 =head2 add_SeqFeature | |
1033 | |
1034 Title : add_SeqFeature | |
1035 Usage : $seq->add_SeqFeature($feat); | |
1036 $seq->add_SeqFeature(@feat); | |
1037 Function: Adds the given feature object (or each of an array of feature | |
1038 objects to the feature array of this | |
1039 sequence. The object passed is required to implement the | |
1040 Bio::SeqFeatureI interface. | |
1041 Returns : 1 on success | |
1042 Args : A Bio::SeqFeatureI implementing object, or an array of such objects. | |
1043 | |
1044 | |
1045 =cut | |
1046 | |
1047 sub add_SeqFeature { | |
1048 my ($self,@feat) = @_; | |
1049 | |
1050 $self->{'_as_feat'} = [] unless $self->{'_as_feat'}; | |
1051 | |
1052 foreach my $feat ( @feat ) { | |
1053 if( !$feat->isa("Bio::SeqFeatureI") ) { | |
1054 $self->throw("$feat is not a SeqFeatureI and that's what we expect..."); | |
1055 } | |
1056 | |
1057 # make sure we attach ourselves to the feature if the feature wants it | |
1058 my $aseq = $self->primary_seq; | |
1059 $feat->attach_seq($aseq) if $aseq; | |
1060 | |
1061 push(@{$self->{'_as_feat'}},$feat); | |
1062 } | |
1063 return 1; | |
1064 } | |
1065 | |
1066 =head2 remove_SeqFeatures | |
1067 | |
1068 Title : remove_SeqFeatures | |
1069 Usage : $seq->remove_SeqFeatures(); | |
1070 Function: Flushes all attached SeqFeatureI objects. | |
1071 | |
1072 To remove individual feature objects, delete those from the returned | |
1073 array and re-add the rest. | |
1074 Example : | |
1075 Returns : The array of Bio::SeqFeatureI objects removed from this seq. | |
1076 Args : None | |
1077 | |
1078 | |
1079 =cut | |
1080 | |
1081 sub remove_SeqFeatures { | |
1082 my $self = shift; | |
1083 | |
1084 return () unless $self->{'_as_feat'}; | |
1085 my @feats = @{$self->{'_as_feat'}}; | |
1086 $self->{'_as_feat'} = []; | |
1087 return @feats; | |
1088 } | |
1089 | |
1090 =head1 Methods provided in the Bio::PrimarySeqI interface | |
1091 | |
1092 | |
1093 These methods are inherited from the PrimarySeq interface | |
1094 and work as one expects, building new Bio::Seq objects | |
1095 or other information as expected. See L<Bio::PrimarySeq> | |
1096 for more information. | |
1097 | |
1098 Sequence Features are B<not> transfered to the new objects. | |
1099 This is possibly a mistake. Anyone who feels the urge in | |
1100 dealing with this is welcome to give it a go. | |
1101 | |
1102 =head2 revcom | |
1103 | |
1104 Title : revcom | |
1105 Usage : $rev = $seq->revcom() | |
1106 Function: Produces a new Bio::Seq object which | |
1107 is the reversed complement of the sequence. For protein | |
1108 sequences this throws an exception of "Sequence is a protein. | |
1109 Cannot revcom" | |
1110 | |
1111 The id is the same id as the original sequence, and the | |
1112 accession number is also identical. If someone wants to track | |
1113 that this sequence has be reversed, it needs to define its own | |
1114 extensions | |
1115 | |
1116 To do an in-place edit of an object you can go: | |
1117 | |
1118 $seq = $seq->revcom(); | |
1119 | |
1120 This of course, causes Perl to handle the garbage collection of | |
1121 the old object, but it is roughly speaking as efficient as an | |
1122 in-place edit. | |
1123 | |
1124 Returns : A new (fresh) Bio::Seq object | |
1125 Args : None | |
1126 | |
1127 | |
1128 =cut | |
1129 | |
1130 =head2 trunc | |
1131 | |
1132 Title : trunc | |
1133 Usage : $subseq = $myseq->trunc(10,100); | |
1134 Function: Provides a truncation of a sequence | |
1135 | |
1136 Example : | |
1137 Returns : A fresh Seq object | |
1138 Args : A Seq object | |
1139 | |
1140 | |
1141 =cut | |
1142 | |
1143 =head2 id | |
1144 | |
1145 Title : id | |
1146 Usage : $id = $seq->id() | |
1147 Function: This is mapped on display_id | |
1148 Returns : value of display_id() | |
1149 Args : [optional] value to update display_id | |
1150 | |
1151 | |
1152 =cut | |
1153 | |
1154 sub id { | |
1155 return shift->display_id(@_); | |
1156 } | |
1157 | |
1158 | |
1159 =head1 Seq only methods | |
1160 | |
1161 | |
1162 These methods are specific to the Bio::Seq object, and not | |
1163 found on the Bio::PrimarySeq object | |
1164 | |
1165 =head2 primary_seq | |
1166 | |
1167 Title : primary_seq | |
1168 Usage : $seq->primary_seq or $seq->primary_seq($newval) | |
1169 Function: Get or set a PrimarySeq object | |
1170 Example : | |
1171 Returns : PrimarySeq object | |
1172 Args : None or PrimarySeq object | |
1173 | |
1174 | |
1175 =cut | |
1176 | |
1177 sub primary_seq { | |
1178 my ($obj,$value) = @_; | |
1179 | |
1180 if( defined $value) { | |
1181 if( ! ref $value || ! $value->isa('Bio::PrimarySeqI') ) { | |
1182 $obj->throw("$value is not a Bio::PrimarySeq compliant object"); | |
1183 } | |
1184 | |
1185 $obj->{'primary_seq'} = $value; | |
1186 # descend down over all seqfeature objects, seeing whether they | |
1187 # want an attached seq. | |
1188 | |
1189 foreach my $sf ( $obj->get_SeqFeatures() ) { | |
1190 $sf->attach_seq($value); | |
1191 } | |
1192 | |
1193 } | |
1194 return $obj->{'primary_seq'}; | |
1195 | |
1196 } | |
1197 | |
1198 =head2 species | |
1199 | |
1200 Title : species | |
1201 Usage : $species = $seq->species() or $seq->species($species) | |
1202 Function: Gets or sets the species | |
1203 Returns : L<Bio::Species> object | |
1204 Args : None or L<Bio::Species> object | |
1205 | |
1206 See L<Bio::Species> for more information | |
1207 | |
1208 =cut | |
1209 | |
1210 sub species { | |
1211 my ($self, $species) = @_; | |
1212 if ($species) { | |
1213 $self->{'species'} = $species; | |
1214 } else { | |
1215 return $self->{'species'}; | |
1216 } | |
1217 } | |
1218 | |
1219 =head1 Internal methods | |
1220 | |
1221 =cut | |
1222 | |
1223 # keep AUTOLOAD happy | |
1224 sub DESTROY { } | |
1225 | |
1226 ############################################################################ | |
1227 # aliases due to name changes or to compensate for our lack of consistency # | |
1228 ############################################################################ | |
1229 | |
1230 # in all other modules we use the object in the singular -- | |
1231 # lack of consistency sucks | |
1232 *flush_SeqFeature = \&remove_SeqFeatures; | |
1233 *flush_SeqFeatures = \&remove_SeqFeatures; | |
1234 | |
1235 # this is now get_SeqFeatures() (from FeatureHolderI) | |
1236 *top_SeqFeatures = \&get_SeqFeatures; | |
1237 | |
1238 # this is now get_all_SeqFeatures() in FeatureHolderI | |
1239 sub all_SeqFeatures{ | |
1240 return shift->get_all_SeqFeatures(@_); | |
1241 } | |
1242 | |
1243 sub accession { | |
1244 my $self = shift; | |
1245 $self->warn(ref($self)."::accession is deprecated, ". | |
1246 "use accession_number() instead"); | |
1247 return $self->accession_number(@_); | |
1248 } | |
1249 | |
1250 1; |