0
|
1 # $Id: SeqFeatureI.pm,v 1.43.2.5 2003/08/28 19:29:34 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SeqFeatureI
|
|
4 #
|
|
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
|
|
6 #
|
|
7 # Copyright Ewan Birney
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SeqFeatureI - Abstract interface of a Sequence Feature
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # get a seqfeature somehow, eg,
|
|
20
|
|
21 foreach $feat ( $seq->top_SeqFeatures() ) {
|
|
22 print "Feature from ", $feat->start, "to ",
|
|
23 $feat->end, " Primary tag ", $feat->primary_tag,
|
|
24 ", produced by ", $feat->source_tag(), "\n";
|
|
25
|
|
26 if( $feat->strand == 0 ) {
|
|
27 print "Feature applicable to either strand\n";
|
|
28 } else {
|
|
29 print "Feature on strand ", $feat->strand,"\n"; # -1,1
|
|
30 }
|
|
31
|
|
32 foreach $tag ( $feat->all_tags() ) {
|
|
33 print "Feature has tag ", $tag, "with values, ",
|
|
34 join(' ',$feat->each_tag_value($tag)), "\n";
|
|
35 }
|
|
36 print "new feature\n" if $feat->has_tag('new');
|
|
37 # features can have sub features
|
|
38 my @subfeat = $feat->get_SeqFeatures();
|
|
39 }
|
|
40
|
|
41 =head1 DESCRIPTION
|
|
42
|
|
43 This interface is the functions one can expect for any Sequence
|
|
44 Feature, whatever its implementation or whether it is a more complex
|
|
45 type (eg, a Gene). This object doesn\'t actually provide any
|
|
46 implemention, it just provides the definitions of what methods one can
|
|
47 call. See Bio::SeqFeature::Generic for a good standard implementation
|
|
48 of this object
|
|
49
|
|
50 =head1 FEEDBACK
|
|
51
|
|
52 User feedback is an integral part of the evolution of this and other
|
|
53 Bioperl modules. Send your comments and suggestions preferably to one
|
|
54 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
55
|
|
56 bioperl-l@bioperl.org - General discussion
|
|
57 http://bio.perl.org/MailList.html - About the mailing lists
|
|
58
|
|
59 =head2 Reporting Bugs
|
|
60
|
|
61 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
62 the bugs and their resolution. Bug reports can be submitted via email
|
|
63 or the web:
|
|
64
|
|
65 bioperl-bugs@bio.perl.org
|
|
66 http://bugzilla.bioperl.org/
|
|
67
|
|
68 =head1 APPENDIX
|
|
69
|
|
70 The rest of the documentation details each of the object
|
|
71 methods. Internal methods are usually preceded with a _
|
|
72
|
|
73 =cut
|
|
74
|
|
75
|
|
76 # Let the code begin...
|
|
77
|
|
78
|
|
79 package Bio::SeqFeatureI;
|
|
80 use vars qw(@ISA $HasInMemory);
|
|
81 use strict;
|
|
82
|
|
83 BEGIN {
|
|
84 eval { require Bio::DB::InMemoryCache };
|
|
85 if( $@ ) { $HasInMemory = 0 }
|
|
86 else { $HasInMemory = 1 }
|
|
87 }
|
|
88
|
|
89 use Bio::RangeI;
|
|
90 use Bio::Seq;
|
|
91
|
|
92 use Carp;
|
|
93
|
|
94 @ISA = qw(Bio::RangeI);
|
|
95
|
|
96 =head1 SeqFeatureI specific methods
|
|
97
|
|
98 New method interfaces.
|
|
99
|
|
100 =cut
|
|
101
|
|
102 =head2 get_SeqFeatures
|
|
103
|
|
104 Title : get_SeqFeatures
|
|
105 Usage : @feats = $feat->get_SeqFeatures();
|
|
106 Function: Returns an array of sub Sequence Features
|
|
107 Returns : An array
|
|
108 Args : none
|
|
109
|
|
110
|
|
111 =cut
|
|
112
|
|
113 sub get_SeqFeatures{
|
|
114 my ($self,@args) = @_;
|
|
115
|
|
116 $self->throw_not_implemented();
|
|
117 }
|
|
118
|
|
119 =head2 display_name
|
|
120
|
|
121 Title : display_name
|
|
122 Usage : $name = $feat->display_name()
|
|
123 Function: Returns the human-readable name of the feature for displays.
|
|
124 Returns : a string
|
|
125 Args : none
|
|
126
|
|
127 =cut
|
|
128
|
|
129 sub display_name {
|
|
130 shift->throw_not_implemented();
|
|
131 }
|
|
132
|
|
133 =head2 primary_tag
|
|
134
|
|
135 Title : primary_tag
|
|
136 Usage : $tag = $feat->primary_tag()
|
|
137 Function: Returns the primary tag for a feature,
|
|
138 eg 'exon'
|
|
139 Returns : a string
|
|
140 Args : none
|
|
141
|
|
142
|
|
143 =cut
|
|
144
|
|
145 sub primary_tag{
|
|
146 my ($self,@args) = @_;
|
|
147
|
|
148 $self->throw_not_implemented();
|
|
149
|
|
150 }
|
|
151
|
|
152 =head2 source_tag
|
|
153
|
|
154 Title : source_tag
|
|
155 Usage : $tag = $feat->source_tag()
|
|
156 Function: Returns the source tag for a feature,
|
|
157 eg, 'genscan'
|
|
158 Returns : a string
|
|
159 Args : none
|
|
160
|
|
161
|
|
162 =cut
|
|
163
|
|
164 sub source_tag{
|
|
165 my ($self,@args) = @_;
|
|
166
|
|
167 $self->throw_not_implemented();
|
|
168 }
|
|
169
|
|
170 =head2 has_tag
|
|
171
|
|
172 Title : has_tag
|
|
173 Usage : $tag_exists = $self->has_tag('some_tag')
|
|
174 Function:
|
|
175 Returns : TRUE if the specified tag exists, and FALSE otherwise
|
|
176 Args :
|
|
177
|
|
178
|
|
179 =cut
|
|
180
|
|
181 sub has_tag{
|
|
182 my ($self,@args) = @_;
|
|
183
|
|
184 $self->throw_not_implemented();
|
|
185
|
|
186 }
|
|
187
|
|
188 =head2 get_tag_values
|
|
189
|
|
190 Title : get_tag_values
|
|
191 Usage : @values = $self->get_tag_values('some_tag')
|
|
192 Function:
|
|
193 Returns : An array comprising the values of the specified tag.
|
|
194 Args :
|
|
195
|
|
196
|
|
197 =cut
|
|
198
|
|
199 sub get_tag_values {
|
|
200 shift->throw_not_implemented();
|
|
201 }
|
|
202
|
|
203 =head2 get_all_tags
|
|
204
|
|
205 Title : get_all_tags
|
|
206 Usage : @tags = $feat->get_all_tags()
|
|
207 Function: gives all tags for this feature
|
|
208 Returns : an array of strings
|
|
209 Args : none
|
|
210
|
|
211
|
|
212 =cut
|
|
213
|
|
214 sub get_all_tags{
|
|
215 shift->throw_not_implemented();
|
|
216 }
|
|
217
|
|
218 =head2 attach_seq
|
|
219
|
|
220 Title : attach_seq
|
|
221 Usage : $sf->attach_seq($seq)
|
|
222 Function: Attaches a Bio::Seq object to this feature. This
|
|
223 Bio::Seq object is for the *entire* sequence: ie
|
|
224 from 1 to 10000
|
|
225
|
|
226 Note that it is not guaranteed that if you obtain a feature
|
|
227 from an object in bioperl, it will have a sequence
|
|
228 attached. Also, implementors of this interface can choose
|
|
229 to provide an empty implementation of this method. I.e.,
|
|
230 there is also no guarantee that if you do attach a
|
|
231 sequence, seq() or entire_seq() will not return undef.
|
|
232
|
|
233 The reason that this method is here on the interface is to
|
|
234 enable you to call it on every SeqFeatureI compliant
|
|
235 object, and that it will be implemented in a useful way and
|
|
236 set to a useful value for the great majority of use
|
|
237 cases. Implementors who choose to ignore the call are
|
|
238 encouraged to specifically state this in their
|
|
239 documentation.
|
|
240
|
|
241 Example :
|
|
242 Returns : TRUE on success
|
|
243 Args : a Bio::PrimarySeqI compliant object
|
|
244
|
|
245
|
|
246 =cut
|
|
247
|
|
248 sub attach_seq {
|
|
249 shift->throw_not_implemented();
|
|
250 }
|
|
251
|
|
252 =head2 seq
|
|
253
|
|
254 Title : seq
|
|
255 Usage : $tseq = $sf->seq()
|
|
256 Function: returns the truncated sequence (if there is a sequence attached)
|
|
257 for this feature
|
|
258 Example :
|
|
259 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence
|
|
260 bounded by start & end, or undef if there is no sequence attached
|
|
261 Args : none
|
|
262
|
|
263
|
|
264 =cut
|
|
265
|
|
266 sub seq {
|
|
267 shift->throw_not_implemented();
|
|
268 }
|
|
269
|
|
270 =head2 entire_seq
|
|
271
|
|
272 Title : entire_seq
|
|
273 Usage : $whole_seq = $sf->entire_seq()
|
|
274 Function: gives the entire sequence that this seqfeature is attached to
|
|
275 Example :
|
|
276 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no
|
|
277 sequence attached
|
|
278 Args : none
|
|
279
|
|
280
|
|
281 =cut
|
|
282
|
|
283 sub entire_seq {
|
|
284 shift->throw_not_implemented();
|
|
285 }
|
|
286
|
|
287
|
|
288 =head2 seq_id
|
|
289
|
|
290 Title : seq_id
|
|
291 Usage : $obj->seq_id($newval)
|
|
292 Function: There are many cases when you make a feature that you
|
|
293 do know the sequence name, but do not know its actual
|
|
294 sequence. This is an attribute such that you can store
|
|
295 the ID (e.g., display_id) of the sequence.
|
|
296
|
|
297 This attribute should *not* be used in GFF dumping, as
|
|
298 that should come from the collection in which the seq
|
|
299 feature was found.
|
|
300 Returns : value of seq_id
|
|
301 Args : newvalue (optional)
|
|
302
|
|
303
|
|
304 =cut
|
|
305
|
|
306 sub seq_id {
|
|
307 shift->throw_not_implemented();
|
|
308 }
|
|
309
|
|
310 =head2 gff_string
|
|
311
|
|
312 Title : gff_string
|
|
313 Usage : $str = $feat->gff_string;
|
|
314 $str = $feat->gff_string($gff_formatter);
|
|
315 Function: Provides the feature information in GFF format.
|
|
316
|
|
317 The implementation provided here returns GFF2 by default. If you
|
|
318 want a different version, supply an object implementing a method
|
|
319 gff_string() accepting a SeqFeatureI object as argument. E.g., to
|
|
320 obtain GFF1 format, do the following:
|
|
321
|
|
322 my $gffio = Bio::Tools::GFF->new(-gff_version => 1);
|
|
323 $gff1str = $feat->gff_string($gff1io);
|
|
324
|
|
325 Returns : A string
|
|
326 Args : Optionally, an object implementing gff_string().
|
|
327
|
|
328
|
|
329 =cut
|
|
330
|
|
331 sub gff_string{
|
|
332 my ($self,$formatter) = @_;
|
|
333
|
|
334 $formatter = $self->_static_gff_formatter unless $formatter;
|
|
335 return $formatter->gff_string($self);
|
|
336 }
|
|
337
|
|
338 my $static_gff_formatter = undef;
|
|
339
|
|
340 =head2 _static_gff_formatter
|
|
341
|
|
342 Title : _static_gff_formatter
|
|
343 Usage :
|
|
344 Function:
|
|
345 Example :
|
|
346 Returns :
|
|
347 Args :
|
|
348
|
|
349
|
|
350 =cut
|
|
351
|
|
352 sub _static_gff_formatter{
|
|
353 my ($self,@args) = @_;
|
|
354
|
|
355 if( !defined $static_gff_formatter ) {
|
|
356 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2);
|
|
357 }
|
|
358 return $static_gff_formatter;
|
|
359 }
|
|
360
|
|
361 =head1 Bio::RangeI methods
|
|
362
|
|
363 List of interfaces inherited from Bio::RangeI (see L<Bio::RangeI>
|
|
364 for details).
|
|
365
|
|
366 =cut
|
|
367
|
|
368 =head2 start
|
|
369
|
|
370 Title : start
|
|
371 Usage : $start = $feat->start
|
|
372 Function: Returns the start coordinate of the feature
|
|
373 Returns : integer
|
|
374 Args : none
|
|
375
|
|
376
|
|
377 =head2 end
|
|
378
|
|
379 Title : end
|
|
380 Usage : $end = $feat->end
|
|
381 Function: Returns the end coordinate of the feature
|
|
382 Returns : integer
|
|
383 Args : none
|
|
384
|
|
385 =head2 strand
|
|
386
|
|
387 Title : strand
|
|
388 Usage : $strand = $feat->strand()
|
|
389 Function: Returns strand information, being 1,-1 or 0
|
|
390 Returns : -1,1 or 0
|
|
391 Args : none
|
|
392
|
|
393
|
|
394 =cut
|
|
395
|
|
396 =head1 Decorating methods
|
|
397
|
|
398 These methods have an implementation provided by Bio::SeqFeatureI,
|
|
399 but can be validly overwritten by subclasses
|
|
400
|
|
401 =head2 spliced_seq
|
|
402
|
|
403 Title : spliced_seq
|
|
404
|
|
405 Usage : $seq = $feature->spliced_seq()
|
|
406 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs)
|
|
407
|
|
408 Function: Provides a sequence of the feature which is the most
|
|
409 semantically "relevant" feature for this sequence. A
|
|
410 default implementation is provided which for simple cases
|
|
411 returns just the sequence, but for split cases, loops over
|
|
412 the split location to return the sequence. In the case of
|
|
413 split locations with remote locations, eg
|
|
414
|
|
415 join(AB000123:5567-5589,80..1144)
|
|
416
|
|
417 in the case when a database object is passed in, it will
|
|
418 attempt to retrieve the sequence from the database object,
|
|
419 and "Do the right thing", however if no database object is
|
|
420 provided, it will generate the correct number of N's (DNA)
|
|
421 or X's (protein, though this is unlikely).
|
|
422
|
|
423 This function is deliberately "magical" attempting to
|
|
424 second guess what a user wants as "the" sequence for this
|
|
425 feature
|
|
426
|
|
427 Implementing classes are free to override this method with
|
|
428 their own magic if they have a better idea what the user
|
|
429 wants
|
|
430
|
|
431 Args : [optional] A Bio::DB::RandomAccessI compliant object
|
|
432 Returns : A Bio::Seq
|
|
433
|
|
434 =cut
|
|
435
|
|
436 sub spliced_seq {
|
|
437 my $self = shift;
|
|
438 my $db = shift;
|
|
439
|
|
440 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
|
|
441 return $self->seq(); # nice and easy!
|
|
442 }
|
|
443
|
|
444 # redundant test, but the above ISA is probably not ideal.
|
|
445 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) {
|
|
446 $self->throw("not atomic, not split, yikes, in trouble!");
|
|
447 }
|
|
448
|
|
449 my $seqstr;
|
|
450 my $seqid = $self->entire_seq->display_id;
|
|
451 # This is to deal with reverse strand features
|
|
452 # so we are really sorting features 5' -> 3' on their strand
|
|
453 # i.e. rev strand features will be sorted largest to smallest
|
|
454 # as this how revcom CDSes seem to be annotated in genbank.
|
|
455 # Might need to eventually allow this to be programable?
|
|
456 # (can I mention how much fun this is NOT! --jason)
|
|
457
|
|
458 my ($mixed,$mixedloc,$fstrand) = (0);
|
|
459
|
|
460 if( defined $db &&
|
|
461 ref($db) && !$db->isa('Bio::DB::RandomAccessI') ) {
|
|
462 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object for access to remote locations for spliced_seq");
|
|
463 $db = undef;
|
|
464 } elsif( defined $db &&
|
|
465 $HasInMemory && ! $db->isa('Bio::DB::InMemoryCache') ) {
|
|
466 $db = new Bio::DB::InMemoryCache(-seqdb => $db);
|
|
467 }
|
|
468
|
|
469 if( $self->isa('Bio::Das::SegmentI') &&
|
|
470 ! $self->absolute ) {
|
|
471 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI ".
|
|
472 "which does have absolute set to 1 -- be warned ".
|
|
473 "you may not be getting things on the correct strand");
|
|
474 }
|
|
475
|
|
476 my @locs = map { $_->[0] }
|
|
477 # sort so that most negative is first basically to order
|
|
478 # the features on the opposite strand 5'->3' on their strand
|
|
479 # rather than they way most are input which is on the fwd strand
|
|
480
|
|
481 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation
|
|
482 map {
|
|
483 $fstrand = $_->strand unless defined $fstrand;
|
|
484 $mixed = 1 if defined $_->strand && $fstrand != $_->strand;
|
|
485 if( defined $_->seq_id ) {
|
|
486 $mixedloc = 1 if( $_->seq_id ne $seqid );
|
|
487 }
|
|
488 [ $_, $_->start* ($_->strand || 1)];
|
|
489 } $self->location->each_Location;
|
|
490
|
|
491 if ( $mixed ) {
|
|
492 $self->warn("Mixed strand locations, spliced seq using the input ".
|
|
493 "order rather than trying to sort");
|
|
494 @locs = $self->location->each_Location;
|
|
495 } elsif( $mixedloc ) {
|
|
496 # we'll use the prescribed location order
|
|
497 @locs = $self->location->each_Location;
|
|
498 }
|
|
499
|
|
500
|
|
501 foreach my $loc ( @locs ) {
|
|
502 if( ! $loc->isa("Bio::Location::Atomic") ) {
|
|
503 $self->throw("Can only deal with one level deep locations");
|
|
504 }
|
|
505 my $called_seq;
|
|
506 if( $fstrand != $loc->strand ) {
|
|
507 $self->warn("feature strand is different from location strand!");
|
|
508 }
|
|
509 # deal with remote sequences
|
|
510 if( defined $loc->seq_id &&
|
|
511 $loc->seq_id ne $seqid ) {
|
|
512 if( defined $db ) {
|
|
513 my $sid = $loc->seq_id;
|
|
514 $sid =~ s/\.\d+$//g;
|
|
515 eval {
|
|
516 $called_seq = $db->get_Seq_by_acc($sid);
|
|
517 };
|
|
518 if( $@ ) {
|
|
519 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@");
|
|
520 $called_seq = undef;
|
|
521 }
|
|
522 } else {
|
|
523 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)");
|
|
524 $called_seq = undef;
|
|
525 }
|
|
526 if( !defined $called_seq ) {
|
|
527 $seqstr .= 'N' x $self->length;
|
|
528 next;
|
|
529 }
|
|
530 } else {
|
|
531 $called_seq = $self->entire_seq;
|
|
532 }
|
|
533
|
|
534 if( $self->isa('Bio::Das::SegmentI') ) {
|
|
535 my ($s,$e) = ($loc->start,$loc->end);
|
|
536 $seqstr .= $called_seq->subseq($s,$e)->seq();
|
|
537 } else {
|
|
538 # This is dumb subseq should work on locations...
|
|
539 if( $loc->strand == 1 ) {
|
|
540 $seqstr .= $called_seq->subseq($loc->start,$loc->end);
|
|
541 } else {
|
|
542 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq();
|
|
543 }
|
|
544 }
|
|
545 }
|
|
546 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id . "_spliced_feat",
|
|
547 -seq => $seqstr);
|
|
548
|
|
549 return $out;
|
|
550 }
|
|
551
|
|
552 =head1 RangeI methods
|
|
553
|
|
554 These methods are inherited from RangeI and can be used
|
|
555 directly from a SeqFeatureI interface. Remember that a
|
|
556 SeqFeature is-a RangeI, and so wherever you see RangeI you
|
|
557 can use a feature ($r in the below documentation).
|
|
558
|
|
559 =head2 overlaps
|
|
560
|
|
561 Title : overlaps
|
|
562 Usage : if($feat->overlaps($r)) { do stuff }
|
|
563 if($feat->overlaps(200)) { do stuff }
|
|
564 Function: tests if $feat overlaps $r
|
|
565 Args : a RangeI to test for overlap with, or a point
|
|
566 Returns : true if the Range overlaps with the feature, false otherwise
|
|
567
|
|
568
|
|
569 =head2 contains
|
|
570
|
|
571 Title : contains
|
|
572 Usage : if($feat->contains($r) { do stuff }
|
|
573 Function: tests whether $feat totally contains $r
|
|
574 Args : a RangeI to test for being contained
|
|
575 Returns : true if the argument is totaly contained within this range
|
|
576
|
|
577
|
|
578 =head2 equals
|
|
579
|
|
580 Title : equals
|
|
581 Usage : if($feat->equals($r))
|
|
582 Function: test whether $feat has the same start, end, strand as $r
|
|
583 Args : a RangeI to test for equality
|
|
584 Returns : true if they are describing the same range
|
|
585
|
|
586
|
|
587 =head1 Geometrical methods
|
|
588
|
|
589 These methods do things to the geometry of ranges, and return
|
|
590 triplets (start, stop, strand) from which new ranges could be built.
|
|
591
|
|
592 =head2 intersection
|
|
593
|
|
594 Title : intersection
|
|
595 Usage : ($start, $stop, $strand) = $feat->intersection($r)
|
|
596 Function: gives the range that is contained by both ranges
|
|
597 Args : a RangeI to compare this one to
|
|
598 Returns : nothing if they do not overlap, or the range that they do overlap
|
|
599
|
|
600 =head2 union
|
|
601
|
|
602 Title : union
|
|
603 Usage : ($start, $stop, $strand) = $feat->union($r);
|
|
604 : ($start, $stop, $strand) = Bio::RangeI->union(@ranges);
|
|
605 Function: finds the minimal range that contains all of the ranges
|
|
606 Args : a range or list of ranges to find the union of
|
|
607 Returns : the range containing all of the ranges
|
|
608
|
|
609 =cut
|
|
610
|
|
611 =head2 location
|
|
612
|
|
613 Title : location
|
|
614 Usage : my $location = $seqfeature->location()
|
|
615 Function: returns a location object suitable for identifying location
|
|
616 of feature on sequence or parent feature
|
|
617 Returns : Bio::LocationI object
|
|
618 Args : none
|
|
619
|
|
620
|
|
621 =cut
|
|
622
|
|
623 sub location {
|
|
624 my ($self) = @_;
|
|
625
|
|
626 $self->throw_not_implemented();
|
|
627 }
|
|
628
|
|
629
|
|
630 1;
|