Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqFeatureI.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: SeqFeatureI.pm,v 1.43.2.5 2003/08/28 19:29:34 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SeqFeatureI | |
4 # | |
5 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
6 # | |
7 # Copyright Ewan Birney | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SeqFeatureI - Abstract interface of a Sequence Feature | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # get a seqfeature somehow, eg, | |
20 | |
21 foreach $feat ( $seq->top_SeqFeatures() ) { | |
22 print "Feature from ", $feat->start, "to ", | |
23 $feat->end, " Primary tag ", $feat->primary_tag, | |
24 ", produced by ", $feat->source_tag(), "\n"; | |
25 | |
26 if( $feat->strand == 0 ) { | |
27 print "Feature applicable to either strand\n"; | |
28 } else { | |
29 print "Feature on strand ", $feat->strand,"\n"; # -1,1 | |
30 } | |
31 | |
32 foreach $tag ( $feat->all_tags() ) { | |
33 print "Feature has tag ", $tag, "with values, ", | |
34 join(' ',$feat->each_tag_value($tag)), "\n"; | |
35 } | |
36 print "new feature\n" if $feat->has_tag('new'); | |
37 # features can have sub features | |
38 my @subfeat = $feat->get_SeqFeatures(); | |
39 } | |
40 | |
41 =head1 DESCRIPTION | |
42 | |
43 This interface is the functions one can expect for any Sequence | |
44 Feature, whatever its implementation or whether it is a more complex | |
45 type (eg, a Gene). This object doesn\'t actually provide any | |
46 implemention, it just provides the definitions of what methods one can | |
47 call. See Bio::SeqFeature::Generic for a good standard implementation | |
48 of this object | |
49 | |
50 =head1 FEEDBACK | |
51 | |
52 User feedback is an integral part of the evolution of this and other | |
53 Bioperl modules. Send your comments and suggestions preferably to one | |
54 of the Bioperl mailing lists. Your participation is much appreciated. | |
55 | |
56 bioperl-l@bioperl.org - General discussion | |
57 http://bio.perl.org/MailList.html - About the mailing lists | |
58 | |
59 =head2 Reporting Bugs | |
60 | |
61 Report bugs to the Bioperl bug tracking system to help us keep track | |
62 the bugs and their resolution. Bug reports can be submitted via email | |
63 or the web: | |
64 | |
65 bioperl-bugs@bio.perl.org | |
66 http://bugzilla.bioperl.org/ | |
67 | |
68 =head1 APPENDIX | |
69 | |
70 The rest of the documentation details each of the object | |
71 methods. Internal methods are usually preceded with a _ | |
72 | |
73 =cut | |
74 | |
75 | |
76 # Let the code begin... | |
77 | |
78 | |
79 package Bio::SeqFeatureI; | |
80 use vars qw(@ISA $HasInMemory); | |
81 use strict; | |
82 | |
83 BEGIN { | |
84 eval { require Bio::DB::InMemoryCache }; | |
85 if( $@ ) { $HasInMemory = 0 } | |
86 else { $HasInMemory = 1 } | |
87 } | |
88 | |
89 use Bio::RangeI; | |
90 use Bio::Seq; | |
91 | |
92 use Carp; | |
93 | |
94 @ISA = qw(Bio::RangeI); | |
95 | |
96 =head1 SeqFeatureI specific methods | |
97 | |
98 New method interfaces. | |
99 | |
100 =cut | |
101 | |
102 =head2 get_SeqFeatures | |
103 | |
104 Title : get_SeqFeatures | |
105 Usage : @feats = $feat->get_SeqFeatures(); | |
106 Function: Returns an array of sub Sequence Features | |
107 Returns : An array | |
108 Args : none | |
109 | |
110 | |
111 =cut | |
112 | |
113 sub get_SeqFeatures{ | |
114 my ($self,@args) = @_; | |
115 | |
116 $self->throw_not_implemented(); | |
117 } | |
118 | |
119 =head2 display_name | |
120 | |
121 Title : display_name | |
122 Usage : $name = $feat->display_name() | |
123 Function: Returns the human-readable name of the feature for displays. | |
124 Returns : a string | |
125 Args : none | |
126 | |
127 =cut | |
128 | |
129 sub display_name { | |
130 shift->throw_not_implemented(); | |
131 } | |
132 | |
133 =head2 primary_tag | |
134 | |
135 Title : primary_tag | |
136 Usage : $tag = $feat->primary_tag() | |
137 Function: Returns the primary tag for a feature, | |
138 eg 'exon' | |
139 Returns : a string | |
140 Args : none | |
141 | |
142 | |
143 =cut | |
144 | |
145 sub primary_tag{ | |
146 my ($self,@args) = @_; | |
147 | |
148 $self->throw_not_implemented(); | |
149 | |
150 } | |
151 | |
152 =head2 source_tag | |
153 | |
154 Title : source_tag | |
155 Usage : $tag = $feat->source_tag() | |
156 Function: Returns the source tag for a feature, | |
157 eg, 'genscan' | |
158 Returns : a string | |
159 Args : none | |
160 | |
161 | |
162 =cut | |
163 | |
164 sub source_tag{ | |
165 my ($self,@args) = @_; | |
166 | |
167 $self->throw_not_implemented(); | |
168 } | |
169 | |
170 =head2 has_tag | |
171 | |
172 Title : has_tag | |
173 Usage : $tag_exists = $self->has_tag('some_tag') | |
174 Function: | |
175 Returns : TRUE if the specified tag exists, and FALSE otherwise | |
176 Args : | |
177 | |
178 | |
179 =cut | |
180 | |
181 sub has_tag{ | |
182 my ($self,@args) = @_; | |
183 | |
184 $self->throw_not_implemented(); | |
185 | |
186 } | |
187 | |
188 =head2 get_tag_values | |
189 | |
190 Title : get_tag_values | |
191 Usage : @values = $self->get_tag_values('some_tag') | |
192 Function: | |
193 Returns : An array comprising the values of the specified tag. | |
194 Args : | |
195 | |
196 | |
197 =cut | |
198 | |
199 sub get_tag_values { | |
200 shift->throw_not_implemented(); | |
201 } | |
202 | |
203 =head2 get_all_tags | |
204 | |
205 Title : get_all_tags | |
206 Usage : @tags = $feat->get_all_tags() | |
207 Function: gives all tags for this feature | |
208 Returns : an array of strings | |
209 Args : none | |
210 | |
211 | |
212 =cut | |
213 | |
214 sub get_all_tags{ | |
215 shift->throw_not_implemented(); | |
216 } | |
217 | |
218 =head2 attach_seq | |
219 | |
220 Title : attach_seq | |
221 Usage : $sf->attach_seq($seq) | |
222 Function: Attaches a Bio::Seq object to this feature. This | |
223 Bio::Seq object is for the *entire* sequence: ie | |
224 from 1 to 10000 | |
225 | |
226 Note that it is not guaranteed that if you obtain a feature | |
227 from an object in bioperl, it will have a sequence | |
228 attached. Also, implementors of this interface can choose | |
229 to provide an empty implementation of this method. I.e., | |
230 there is also no guarantee that if you do attach a | |
231 sequence, seq() or entire_seq() will not return undef. | |
232 | |
233 The reason that this method is here on the interface is to | |
234 enable you to call it on every SeqFeatureI compliant | |
235 object, and that it will be implemented in a useful way and | |
236 set to a useful value for the great majority of use | |
237 cases. Implementors who choose to ignore the call are | |
238 encouraged to specifically state this in their | |
239 documentation. | |
240 | |
241 Example : | |
242 Returns : TRUE on success | |
243 Args : a Bio::PrimarySeqI compliant object | |
244 | |
245 | |
246 =cut | |
247 | |
248 sub attach_seq { | |
249 shift->throw_not_implemented(); | |
250 } | |
251 | |
252 =head2 seq | |
253 | |
254 Title : seq | |
255 Usage : $tseq = $sf->seq() | |
256 Function: returns the truncated sequence (if there is a sequence attached) | |
257 for this feature | |
258 Example : | |
259 Returns : sub seq (a Bio::PrimarySeqI compliant object) on attached sequence | |
260 bounded by start & end, or undef if there is no sequence attached | |
261 Args : none | |
262 | |
263 | |
264 =cut | |
265 | |
266 sub seq { | |
267 shift->throw_not_implemented(); | |
268 } | |
269 | |
270 =head2 entire_seq | |
271 | |
272 Title : entire_seq | |
273 Usage : $whole_seq = $sf->entire_seq() | |
274 Function: gives the entire sequence that this seqfeature is attached to | |
275 Example : | |
276 Returns : a Bio::PrimarySeqI compliant object, or undef if there is no | |
277 sequence attached | |
278 Args : none | |
279 | |
280 | |
281 =cut | |
282 | |
283 sub entire_seq { | |
284 shift->throw_not_implemented(); | |
285 } | |
286 | |
287 | |
288 =head2 seq_id | |
289 | |
290 Title : seq_id | |
291 Usage : $obj->seq_id($newval) | |
292 Function: There are many cases when you make a feature that you | |
293 do know the sequence name, but do not know its actual | |
294 sequence. This is an attribute such that you can store | |
295 the ID (e.g., display_id) of the sequence. | |
296 | |
297 This attribute should *not* be used in GFF dumping, as | |
298 that should come from the collection in which the seq | |
299 feature was found. | |
300 Returns : value of seq_id | |
301 Args : newvalue (optional) | |
302 | |
303 | |
304 =cut | |
305 | |
306 sub seq_id { | |
307 shift->throw_not_implemented(); | |
308 } | |
309 | |
310 =head2 gff_string | |
311 | |
312 Title : gff_string | |
313 Usage : $str = $feat->gff_string; | |
314 $str = $feat->gff_string($gff_formatter); | |
315 Function: Provides the feature information in GFF format. | |
316 | |
317 The implementation provided here returns GFF2 by default. If you | |
318 want a different version, supply an object implementing a method | |
319 gff_string() accepting a SeqFeatureI object as argument. E.g., to | |
320 obtain GFF1 format, do the following: | |
321 | |
322 my $gffio = Bio::Tools::GFF->new(-gff_version => 1); | |
323 $gff1str = $feat->gff_string($gff1io); | |
324 | |
325 Returns : A string | |
326 Args : Optionally, an object implementing gff_string(). | |
327 | |
328 | |
329 =cut | |
330 | |
331 sub gff_string{ | |
332 my ($self,$formatter) = @_; | |
333 | |
334 $formatter = $self->_static_gff_formatter unless $formatter; | |
335 return $formatter->gff_string($self); | |
336 } | |
337 | |
338 my $static_gff_formatter = undef; | |
339 | |
340 =head2 _static_gff_formatter | |
341 | |
342 Title : _static_gff_formatter | |
343 Usage : | |
344 Function: | |
345 Example : | |
346 Returns : | |
347 Args : | |
348 | |
349 | |
350 =cut | |
351 | |
352 sub _static_gff_formatter{ | |
353 my ($self,@args) = @_; | |
354 | |
355 if( !defined $static_gff_formatter ) { | |
356 $static_gff_formatter = Bio::Tools::GFF->new('-gff_version' => 2); | |
357 } | |
358 return $static_gff_formatter; | |
359 } | |
360 | |
361 =head1 Bio::RangeI methods | |
362 | |
363 List of interfaces inherited from Bio::RangeI (see L<Bio::RangeI> | |
364 for details). | |
365 | |
366 =cut | |
367 | |
368 =head2 start | |
369 | |
370 Title : start | |
371 Usage : $start = $feat->start | |
372 Function: Returns the start coordinate of the feature | |
373 Returns : integer | |
374 Args : none | |
375 | |
376 | |
377 =head2 end | |
378 | |
379 Title : end | |
380 Usage : $end = $feat->end | |
381 Function: Returns the end coordinate of the feature | |
382 Returns : integer | |
383 Args : none | |
384 | |
385 =head2 strand | |
386 | |
387 Title : strand | |
388 Usage : $strand = $feat->strand() | |
389 Function: Returns strand information, being 1,-1 or 0 | |
390 Returns : -1,1 or 0 | |
391 Args : none | |
392 | |
393 | |
394 =cut | |
395 | |
396 =head1 Decorating methods | |
397 | |
398 These methods have an implementation provided by Bio::SeqFeatureI, | |
399 but can be validly overwritten by subclasses | |
400 | |
401 =head2 spliced_seq | |
402 | |
403 Title : spliced_seq | |
404 | |
405 Usage : $seq = $feature->spliced_seq() | |
406 $seq = $feature_with_remote_locations->spliced_seq($db_for_seqs) | |
407 | |
408 Function: Provides a sequence of the feature which is the most | |
409 semantically "relevant" feature for this sequence. A | |
410 default implementation is provided which for simple cases | |
411 returns just the sequence, but for split cases, loops over | |
412 the split location to return the sequence. In the case of | |
413 split locations with remote locations, eg | |
414 | |
415 join(AB000123:5567-5589,80..1144) | |
416 | |
417 in the case when a database object is passed in, it will | |
418 attempt to retrieve the sequence from the database object, | |
419 and "Do the right thing", however if no database object is | |
420 provided, it will generate the correct number of N's (DNA) | |
421 or X's (protein, though this is unlikely). | |
422 | |
423 This function is deliberately "magical" attempting to | |
424 second guess what a user wants as "the" sequence for this | |
425 feature | |
426 | |
427 Implementing classes are free to override this method with | |
428 their own magic if they have a better idea what the user | |
429 wants | |
430 | |
431 Args : [optional] A Bio::DB::RandomAccessI compliant object | |
432 Returns : A Bio::Seq | |
433 | |
434 =cut | |
435 | |
436 sub spliced_seq { | |
437 my $self = shift; | |
438 my $db = shift; | |
439 | |
440 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) { | |
441 return $self->seq(); # nice and easy! | |
442 } | |
443 | |
444 # redundant test, but the above ISA is probably not ideal. | |
445 if( ! $self->location->isa("Bio::Location::SplitLocationI") ) { | |
446 $self->throw("not atomic, not split, yikes, in trouble!"); | |
447 } | |
448 | |
449 my $seqstr; | |
450 my $seqid = $self->entire_seq->display_id; | |
451 # This is to deal with reverse strand features | |
452 # so we are really sorting features 5' -> 3' on their strand | |
453 # i.e. rev strand features will be sorted largest to smallest | |
454 # as this how revcom CDSes seem to be annotated in genbank. | |
455 # Might need to eventually allow this to be programable? | |
456 # (can I mention how much fun this is NOT! --jason) | |
457 | |
458 my ($mixed,$mixedloc,$fstrand) = (0); | |
459 | |
460 if( defined $db && | |
461 ref($db) && !$db->isa('Bio::DB::RandomAccessI') ) { | |
462 $self->warn("Must pass in a valid Bio::DB::RandomAccessI object for access to remote locations for spliced_seq"); | |
463 $db = undef; | |
464 } elsif( defined $db && | |
465 $HasInMemory && ! $db->isa('Bio::DB::InMemoryCache') ) { | |
466 $db = new Bio::DB::InMemoryCache(-seqdb => $db); | |
467 } | |
468 | |
469 if( $self->isa('Bio::Das::SegmentI') && | |
470 ! $self->absolute ) { | |
471 $self->warn("Calling spliced_seq with a Bio::Das::SegmentI ". | |
472 "which does have absolute set to 1 -- be warned ". | |
473 "you may not be getting things on the correct strand"); | |
474 } | |
475 | |
476 my @locs = map { $_->[0] } | |
477 # sort so that most negative is first basically to order | |
478 # the features on the opposite strand 5'->3' on their strand | |
479 # rather than they way most are input which is on the fwd strand | |
480 | |
481 sort { $a->[1] <=> $b->[1] } # Yes Tim, Schwartzian transformation | |
482 map { | |
483 $fstrand = $_->strand unless defined $fstrand; | |
484 $mixed = 1 if defined $_->strand && $fstrand != $_->strand; | |
485 if( defined $_->seq_id ) { | |
486 $mixedloc = 1 if( $_->seq_id ne $seqid ); | |
487 } | |
488 [ $_, $_->start* ($_->strand || 1)]; | |
489 } $self->location->each_Location; | |
490 | |
491 if ( $mixed ) { | |
492 $self->warn("Mixed strand locations, spliced seq using the input ". | |
493 "order rather than trying to sort"); | |
494 @locs = $self->location->each_Location; | |
495 } elsif( $mixedloc ) { | |
496 # we'll use the prescribed location order | |
497 @locs = $self->location->each_Location; | |
498 } | |
499 | |
500 | |
501 foreach my $loc ( @locs ) { | |
502 if( ! $loc->isa("Bio::Location::Atomic") ) { | |
503 $self->throw("Can only deal with one level deep locations"); | |
504 } | |
505 my $called_seq; | |
506 if( $fstrand != $loc->strand ) { | |
507 $self->warn("feature strand is different from location strand!"); | |
508 } | |
509 # deal with remote sequences | |
510 if( defined $loc->seq_id && | |
511 $loc->seq_id ne $seqid ) { | |
512 if( defined $db ) { | |
513 my $sid = $loc->seq_id; | |
514 $sid =~ s/\.\d+$//g; | |
515 eval { | |
516 $called_seq = $db->get_Seq_by_acc($sid); | |
517 }; | |
518 if( $@ ) { | |
519 $self->warn("In attempting to join a remote location, sequence $sid was not in database. Will provide padding N's. Full exception \n\n$@"); | |
520 $called_seq = undef; | |
521 } | |
522 } else { | |
523 $self->warn( "cannot get remote location for ".$loc->seq_id ." without a valid Bio::DB::RandomAccessI database handle (like Bio::DB::GenBank)"); | |
524 $called_seq = undef; | |
525 } | |
526 if( !defined $called_seq ) { | |
527 $seqstr .= 'N' x $self->length; | |
528 next; | |
529 } | |
530 } else { | |
531 $called_seq = $self->entire_seq; | |
532 } | |
533 | |
534 if( $self->isa('Bio::Das::SegmentI') ) { | |
535 my ($s,$e) = ($loc->start,$loc->end); | |
536 $seqstr .= $called_seq->subseq($s,$e)->seq(); | |
537 } else { | |
538 # This is dumb subseq should work on locations... | |
539 if( $loc->strand == 1 ) { | |
540 $seqstr .= $called_seq->subseq($loc->start,$loc->end); | |
541 } else { | |
542 $seqstr .= $called_seq->trunc($loc->start,$loc->end)->revcom->seq(); | |
543 } | |
544 } | |
545 } | |
546 my $out = Bio::Seq->new( -id => $self->entire_seq->display_id . "_spliced_feat", | |
547 -seq => $seqstr); | |
548 | |
549 return $out; | |
550 } | |
551 | |
552 =head1 RangeI methods | |
553 | |
554 These methods are inherited from RangeI and can be used | |
555 directly from a SeqFeatureI interface. Remember that a | |
556 SeqFeature is-a RangeI, and so wherever you see RangeI you | |
557 can use a feature ($r in the below documentation). | |
558 | |
559 =head2 overlaps | |
560 | |
561 Title : overlaps | |
562 Usage : if($feat->overlaps($r)) { do stuff } | |
563 if($feat->overlaps(200)) { do stuff } | |
564 Function: tests if $feat overlaps $r | |
565 Args : a RangeI to test for overlap with, or a point | |
566 Returns : true if the Range overlaps with the feature, false otherwise | |
567 | |
568 | |
569 =head2 contains | |
570 | |
571 Title : contains | |
572 Usage : if($feat->contains($r) { do stuff } | |
573 Function: tests whether $feat totally contains $r | |
574 Args : a RangeI to test for being contained | |
575 Returns : true if the argument is totaly contained within this range | |
576 | |
577 | |
578 =head2 equals | |
579 | |
580 Title : equals | |
581 Usage : if($feat->equals($r)) | |
582 Function: test whether $feat has the same start, end, strand as $r | |
583 Args : a RangeI to test for equality | |
584 Returns : true if they are describing the same range | |
585 | |
586 | |
587 =head1 Geometrical methods | |
588 | |
589 These methods do things to the geometry of ranges, and return | |
590 triplets (start, stop, strand) from which new ranges could be built. | |
591 | |
592 =head2 intersection | |
593 | |
594 Title : intersection | |
595 Usage : ($start, $stop, $strand) = $feat->intersection($r) | |
596 Function: gives the range that is contained by both ranges | |
597 Args : a RangeI to compare this one to | |
598 Returns : nothing if they do not overlap, or the range that they do overlap | |
599 | |
600 =head2 union | |
601 | |
602 Title : union | |
603 Usage : ($start, $stop, $strand) = $feat->union($r); | |
604 : ($start, $stop, $strand) = Bio::RangeI->union(@ranges); | |
605 Function: finds the minimal range that contains all of the ranges | |
606 Args : a range or list of ranges to find the union of | |
607 Returns : the range containing all of the ranges | |
608 | |
609 =cut | |
610 | |
611 =head2 location | |
612 | |
613 Title : location | |
614 Usage : my $location = $seqfeature->location() | |
615 Function: returns a location object suitable for identifying location | |
616 of feature on sequence or parent feature | |
617 Returns : Bio::LocationI object | |
618 Args : none | |
619 | |
620 | |
621 =cut | |
622 | |
623 sub location { | |
624 my ($self) = @_; | |
625 | |
626 $self->throw_not_implemented(); | |
627 } | |
628 | |
629 | |
630 1; |