comparison variant_effect_predictor/Bio/PrimarySeqI.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: PrimarySeqI.pm,v 1.50.2.3 2003/08/29 15:37:39 birney Exp $
2 #
3 # BioPerl module for Bio::PrimarySeqI
4 #
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
6 #
7 # Copyright Ewan Birney
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::PrimarySeqI [Developers] - Interface definition for a Bio::PrimarySeq
16
17 =head1 SYNOPSIS
18
19
20 # Bio::PrimarySeqI is the interface class for sequences.
21
22 # If you are a newcomer to bioperl, you should
23 # start with Bio::Seq documentation. This
24 # documentation is mainly for developers using
25 # Bioperl.
26
27 # to test this is a seq object
28
29 $obj->isa("Bio::PrimarySeqI") ||
30 $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
31
32 # accessors
33
34 $string = $obj->seq();
35 $substring = $obj->subseq(12,50);
36 $display = $obj->display_id(); # for human display
37 $id = $obj->primary_id(); # unique id for this object,
38 # implementation defined
39 $unique_key= $obj->accession_number();
40 # unique biological id
41
42 # object manipulation
43
44 eval {
45 $rev = $obj->revcom();
46 };
47 if( $@ ) {
48 $obj->throw("Could not reverse complement. ".
49 "Probably not DNA. Actual exception\n$@\n");
50 }
51
52 $trunc = $obj->trunc(12,50);
53
54 # $rev and $trunc are Bio::PrimarySeqI compliant objects
55
56
57 =head1 DESCRIPTION
58
59 This object defines an abstract interface to basic sequence
60 information - for most users of the package the documentation (and
61 methods) in this class are not useful - this is a developers only
62 class which defines what methods have to be implmented by other Perl
63 objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
64 Bio::Seq" or "man Bio::Seq" for more information on the main class for
65 sequences.
66
67
68 PrimarySeq is an object just for the sequence and its name(s), nothing
69 more. Seq is the larger object complete with features. There is a pure
70 perl implementation of this in Bio::PrimarySeq. If you just want to
71 use Bio::PrimarySeq objects, then please read that module first. This
72 module defines the interface, and is of more interest to people who
73 want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
74 they "are" bioperl sequence objects, even though it is not using Perl
75 to store the sequence etc.
76
77
78 This interface defines what bioperl consideres necessary to "be" a
79 sequence, without providing an implementation of this. (An
80 implementation is provided in Bio::PrimarySeq). If you want to provide
81 a Bio::PrimarySeq 'compliant' object which in fact wraps another
82 object/database/out-of-perl experience, then this is the correct thing
83 to wrap, generally by providing a wrapper class which would inheriet
84 from your object and this Bio::PrimarySeqI interface. The wrapper class
85 then would have methods lists in the "Implementation Specific
86 Functions" which would provide these methods for your object.
87
88
89 =head1 FEEDBACK
90
91 =head2 Mailing Lists
92
93 User feedback is an integral part of the evolution of this and other
94 Bioperl modules. Send your comments and suggestions preferably to one
95 of the Bioperl mailing lists. Your participation is much appreciated.
96
97 bioperl-l@bioperl.org - General discussion
98 http://bio.perl.org/MailList.html - About the mailing lists
99
100 =head2 Reporting Bugs
101
102 Report bugs to the Bioperl bug tracking system to help us keep track
103 the bugs and their resolution. Bug reports can be submitted via email
104 or the web:
105
106 bioperl-bugs@bio.perl.org
107 http://bugzilla.bioperl.org/
108
109 =head1 AUTHOR - Ewan Birney
110
111 Email birney@sanger.ac.uk
112
113 =head1 APPENDIX
114
115 The rest of the documentation details each of the object
116 methods. Internal methods are usually preceded with a _
117
118 =cut
119
120
121 # Let the code begin...
122
123
124 package Bio::PrimarySeqI;
125 use vars qw(@ISA );
126 use strict;
127 use Bio::Root::RootI;
128 use Bio::Tools::CodonTable;
129
130 @ISA = qw(Bio::Root::RootI);
131
132 =head1 Implementation Specific Functions
133
134 These functions are the ones that a specific implementation must
135 define.
136
137 =head2 seq
138
139 Title : seq
140 Usage : $string = $obj->seq()
141 Function: Returns the sequence as a string of letters. The
142 case of the letters is left up to the implementer.
143 Suggested cases are upper case for proteins and lower case for
144 DNA sequence (IUPAC standard),
145 but implementations are suggested to keep an open mind about
146 case (some users... want mixed case!)
147 Returns : A scalar
148 Status : Virtual
149
150 =cut
151
152 sub seq {
153 my ($self) = @_;
154 $self->throw_not_implemented();
155 }
156
157 =head2 subseq
158
159 Title : subseq
160 Usage : $substring = $obj->subseq(10,40);
161 Function: returns the subseq from start to end, where the first base
162 is 1 and the number is inclusive, ie 1-2 are the first two
163 bases of the sequence
164
165 Start cannot be larger than end but can be equal
166
167 Returns : a string
168 Args :
169 Status : Virtual
170
171 =cut
172
173 sub subseq{
174 my ($self) = @_;
175 $self->throw_not_implemented();
176 }
177
178 =head2 display_id
179
180 Title : display_id
181 Usage : $id_string = $obj->display_id();
182 Function: returns the display id, aka the common name of the Sequence object.
183
184 The semantics of this is that it is the most likely string
185 to be used as an identifier of the sequence, and likely to
186 have "human" readability. The id is equivalent to the ID
187 field of the GenBank/EMBL databanks and the id field of the
188 Swissprot/sptrembl database. In fasta format, the >(\S+) is
189 presumed to be the id, though some people overload the id
190 to embed other information. Bioperl does not use any
191 embedded information in the ID field, and people are
192 encouraged to use other mechanisms (accession field for
193 example, or extending the sequence object) to solve this.
194
195 Notice that $seq->id() maps to this function, mainly for
196 legacy/convience issues
197 Returns : A string
198 Args : None
199 Status : Virtual
200
201
202 =cut
203
204 sub display_id {
205 my ($self) = @_;
206 $self->throw_not_implemented();
207 }
208
209
210 =head2 accession_number
211
212 Title : accession_number
213 Usage : $unique_biological_key = $obj->accession_number;
214 Function: Returns the unique biological id for a sequence, commonly
215 called the accession_number. For sequences from established
216 databases, the implementors should try to use the correct
217 accession number. Notice that primary_id() provides the
218 unique id for the implemetation, allowing multiple objects
219 to have the same accession number in a particular implementation.
220
221 For sequences with no accession number, this method should return
222 "unknown".
223 Returns : A string
224 Args : None
225 Status : Virtual
226
227
228 =cut
229
230 sub accession_number {
231 my ($self,@args) = @_;
232 $self->throw_not_implemented();
233 }
234
235
236
237 =head2 primary_id
238
239 Title : primary_id
240 Usage : $unique_implementation_key = $obj->primary_id;
241 Function: Returns the unique id for this object in this
242 implementation. This allows implementations to manage their
243 own object ids in a way the implementaiton can control
244 clients can expect one id to map to one object.
245
246 For sequences with no accession number, this method should
247 return a stringified memory location.
248
249 [Note this method name is likely to change in 1.3]
250
251 Returns : A string
252 Args : None
253 Status : Virtual
254
255
256 =cut
257
258 sub primary_id {
259 my ($self,@args) = @_;
260 $self->throw_not_implemented();
261 }
262
263
264 =head2 can_call_new
265
266 Title : can_call_new
267 Usage : if( $obj->can_call_new ) {
268 $newobj = $obj->new( %param );
269 }
270 Function: can_call_new returns 1 or 0 depending
271 on whether an implementation allows new
272 constructor to be called. If a new constructor
273 is allowed, then it should take the followed hashed
274 constructor list.
275
276 $myobject->new( -seq => $sequence_as_string,
277 -display_id => $id
278 -accession_number => $accession
279 -alphabet => 'dna',
280 );
281 Example :
282 Returns : 1 or 0
283 Args :
284
285
286 =cut
287
288 sub can_call_new{
289 my ($self,@args) = @_;
290
291 # we default to 0 here
292
293 return 0;
294 }
295
296 =head2 alphabet
297
298 Title : alphabet
299 Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
300 Function: Returns the type of sequence being one of
301 'dna', 'rna' or 'protein'. This is case sensitive.
302
303 This is not called <type> because this would cause
304 upgrade problems from the 0.5 and earlier Seq objects.
305
306 Returns : a string either 'dna','rna','protein'. NB - the object must
307 make a call of the type - if there is no type specified it
308 has to guess.
309 Args : none
310 Status : Virtual
311
312
313 =cut
314
315 sub alphabet{
316 my ( $self ) = @_;
317 $self->throw_not_implemented();
318 }
319
320 sub moltype{
321 my ($self,@args) = @_;
322
323 $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
324 $self->alphabet(@args);
325 }
326
327
328 =head1 Optional Implementation Functions
329
330 The following functions rely on the above functions. An
331 implementing class does not need to provide these functions, as they
332 will be provided by this class, but is free to override these
333 functions.
334
335 All of revcom(), trunc(), and translate() create new sequence
336 objects. They will call new() on the class of the sequence object
337 instance passed as argument, unless can_call_new() returns FALSE. In
338 the latter case a Bio::PrimarySeq object will be created. Implementors
339 which really want to control how objects are created (eg, for object
340 persistence over a database, or objects in a CORBA framework), they
341 are encouraged to override these methods
342
343 =head2 revcom
344
345 Title : revcom
346 Usage : $rev = $seq->revcom()
347 Function: Produces a new Bio::PrimarySeqI implementing object which
348 is the reversed complement of the sequence. For protein
349 sequences this throws an exception of "Sequence is a
350 protein. Cannot revcom"
351
352 The id is the same id as the original sequence, and the
353 accession number is also indentical. If someone wants to
354 track that this sequence has be reversed, it needs to
355 define its own extensions
356
357 To do an inplace edit of an object you can go:
358
359 $seq = $seq->revcom();
360
361 This of course, causes Perl to handle the garbage
362 collection of the old object, but it is roughly speaking as
363 efficient as an inplace edit.
364
365 Returns : A new (fresh) Bio::PrimarySeqI object
366 Args : none
367
368
369 =cut
370
371 sub revcom{
372 my ($self) = @_;
373
374 # check the type is good first.
375 my $t = $self->alphabet;
376
377 if( $t eq 'protein' ) {
378 $self->throw("Sequence is a protein. Cannot revcom");
379 }
380
381 if( $t ne 'dna' && $t ne 'rna' ) {
382 if( $self->can('warn') ) {
383 $self->warn("Sequence is not dna or rna, but [$t]. ".
384 "Attempting to revcom, but unsure if this is right");
385 } else {
386 warn("[$self] Sequence is not dna or rna, but [$t]. ".
387 "Attempting to revcom, but unsure if this is right");
388 }
389 }
390
391 # yank out the sequence string
392
393 my $str = $self->seq();
394
395 # if is RNA - map to DNA then map back
396
397 if( $t eq 'rna' ) {
398 $str =~ tr/uU/tT/;
399 }
400
401 # revcom etc...
402
403 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
404 my $revseq = CORE::reverse $str;
405
406 if( $t eq 'rna' ) {
407 $revseq =~ tr/tT/uU/;
408 }
409
410 my $seqclass;
411 if($self->can_call_new()) {
412 $seqclass = ref($self);
413 } else {
414 $seqclass = 'Bio::PrimarySeq';
415 $self->_attempt_to_load_Seq();
416 }
417 my $out = $seqclass->new( '-seq' => $revseq,
418 '-display_id' => $self->display_id,
419 '-accession_number' => $self->accession_number,
420 '-alphabet' => $self->alphabet,
421 '-desc' => $self->desc(),
422 '-verbose' => $self->verbose
423 );
424 return $out;
425
426 }
427
428 =head2 trunc
429
430 Title : trunc
431 Usage : $subseq = $myseq->trunc(10,100);
432 Function: Provides a truncation of a sequence,
433
434 Example :
435 Returns : a fresh Bio::PrimarySeqI implementing object
436 Args : Two integers denoting first and last base of the sub-sequence.
437
438
439 =cut
440
441 sub trunc{
442 my ($self,$start,$end) = @_;
443
444 my $str;
445 if( defined $start && ref($start) &&
446 $start->isa('Bio::LocationI') ) {
447 $str = $self->subseq($start); # start is a location actually
448 } elsif( !$end ) {
449 $self->throw("trunc start,end -- there was no end for $start");
450 } elsif( $end < $start ) {
451 my $msg = "start [$start] is greater than end [$end]. \n".
452 "If you want to truncated and reverse complement, \n".
453 "you must call trunc followed by revcom. Sorry.";
454 $self->throw($msg);
455 } else {
456 $str = $self->subseq($start,$end);
457 }
458
459 my $seqclass;
460 if($self->can_call_new()) {
461 $seqclass = ref($self);
462 } else {
463 $seqclass = 'Bio::PrimarySeq';
464 $self->_attempt_to_load_Seq();
465 }
466
467 my $out = $seqclass->new( '-seq' => $str,
468 '-display_id' => $self->display_id,
469 '-accession_number' => $self->accession_number,
470 '-alphabet' => $self->alphabet,
471 '-desc' => $self->desc(),
472 '-verbose' => $self->verbose
473 );
474 return $out;
475 }
476
477 =head2 translate
478
479 Title : translate
480 Usage : $protein_seq_obj = $dna_seq_obj->translate
481 #if full CDS expected:
482 $protein_seq_obj = $cds_seq_obj->translate(undef,undef,undef,undef,1);
483 Function:
484
485 Provides the translation of the DNA sequence using full
486 IUPAC ambiguities in DNA/RNA and amino acid codes.
487
488 The full CDS translation is identical to EMBL/TREMBL
489 database translation. Note that the trailing terminator
490 character is removed before returning the translation
491 object.
492
493 Note: if you set $dna_seq_obj->verbose(1) you will get a
494 warning if the first codon is not a valid initiator.
495
496
497 Returns : A Bio::PrimarySeqI implementing object
498 Args : character for terminator (optional) defaults to '*'
499 character for unknown amino acid (optional) defaults to 'X'
500 frame (optional) valid values 0, 1, 2, defaults to 0
501 codon table id (optional) defaults to 1
502 complete coding sequence expected, defaults to 0 (false)
503 boolean, throw exception if not complete CDS (true) or defaults to
504 warning (false)
505 coding sequence expected to be complete at 5', defaults to false
506 coding sequence expected to be complete at 3', defaults to false
507
508 =cut
509
510 sub translate {
511 my($self) = shift;
512 my($stop, $unknown, $frame, $tableid, $fullCDS, $throw, $complete5,
513 $complete3) = @_;
514 my($i, $len, $output) = (0,0,'');
515 my($codon) = "";
516 my $aa;
517
518 ## User can pass in symbol for stop and unknown codons
519 unless(defined($stop) and $stop ne '') { $stop = "*"; }
520 unless(defined($unknown) and $unknown ne '') { $unknown = "X"; }
521 unless(defined($frame) and $frame ne '') { $frame = 0; }
522
523 ## the codon table ID
524 unless(defined($tableid) and $tableid ne '') { $tableid = 1; }
525
526 ##Error if monomer is "Amino"
527 $self->throw("Can't translate an amino acid sequence.") if
528 ($self->alphabet eq 'protein');
529
530 ##Error if frame is not 0, 1 or 2
531 $self->throw("Valid values for frame are 0, 1, 2, not [$frame].") unless
532 ($frame == 0 or $frame == 1 or $frame == 2);
533
534 #warns if ID is invalid
535 my $codonTable = Bio::Tools::CodonTable->new( -id => $tableid);
536
537 my ($seq) = $self->seq();
538
539 # deal with frame offset.
540 if( $frame ) {
541 $seq = substr ($seq,$frame);
542 }
543
544 # Translate it
545 $output = $codonTable->translate($seq);
546 # Use user-input stop/unknown
547 $output =~ s/\*/$stop/g;
548 $output =~ s/X/$unknown/g;
549
550 # $complete5 and $complete3 indicate completeness of
551 # the coding sequence at the 5' and 3' ends. Complete
552 # if true, default to false. These are in addition to
553 # $fullCDS, for backwards compatibility
554 defined($complete5) or ($complete5 = $fullCDS ? 1 : 0);
555 defined($complete3) or ($complete3 = $fullCDS ? 1 : 0);
556
557 my $id = $self->display_id;
558
559 # only if we are expecting to be complete at the 5' end
560 if($complete5) {
561 # if the initiator codon is not ATG, the amino acid needs to changed into M
562 if(substr($output,0,1) ne 'M') {
563 if($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
564 $output = 'M' . substr($output, 1);
565 }
566 elsif($throw) {
567 $self->throw("Seq [$id]: Not using a valid initiator codon!");
568 } else {
569 $self->warn("Seq [$id]: Not using a valid initiator codon!");
570 }
571 }
572 }
573
574 # only if we are expecting to be complete at the 3' end
575 if($complete3) {
576 #remove the stop character
577 if(substr($output, -1, 1) eq $stop) {
578 chop $output;
579 } else {
580 $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
581 $self->warn("Seq [$id]: Not using a valid terminator codon!");
582 }
583 }
584
585 # only if we are expecting to translate a complete coding region
586 if($complete5 and $complete3) {
587 # test if there are terminator characters inside the protein sequence!
588 if($output =~ /\*/) {
589 $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
590 $self->warn("Seq [$id]: Terminator codon inside CDS!");
591 }
592 }
593
594 my $seqclass;
595 if($self->can_call_new()) {
596 $seqclass = ref($self);
597 } else {
598 $seqclass = 'Bio::PrimarySeq';
599 $self->_attempt_to_load_Seq();
600 }
601 my $out = $seqclass->new( '-seq' => $output,
602 '-display_id' => $self->display_id,
603 '-accession_number' => $self->accession_number,
604 # is there anything wrong with retaining the
605 # description?
606 '-desc' => $self->desc(),
607 '-alphabet' => 'protein',
608 '-verbose' => $self->verbose
609 );
610 return $out;
611
612 }
613
614 =head2 id
615
616 Title : id
617 Usage : $id = $seq->id()
618 Function: ID of the sequence. This should normally be (and actually is in
619 the implementation provided here) just a synonym for display_id().
620 Example :
621 Returns : A string.
622 Args :
623
624
625 =cut
626
627 sub id {
628 return shift->display_id();
629 }
630
631
632 =head2 length
633
634 Title : length
635 Usage : $len = $seq->length()
636 Function:
637 Example :
638 Returns : integer representing the length of the sequence.
639 Args :
640
641
642 =cut
643
644 sub length {
645 shift->throw_not_implemented();
646 }
647
648 =head2 desc
649
650 Title : desc
651 Usage : $seq->desc($newval);
652 $description = $seq->desc();
653 Function: Get/set description text for a seq object
654 Example :
655 Returns : value of desc
656 Args : newvalue (optional)
657
658
659 =cut
660
661 sub desc {
662 my ($self,$value) = @_;
663 $self->throw_not_implemented();
664 }
665
666
667 =head2 is_circular
668
669 Title : is_circular
670 Usage : if( $obj->is_circular) { /Do Something/ }
671 Function: Returns true if the molecule is circular
672 Returns : Boolean value
673 Args : none
674
675 =cut
676
677 sub is_circular{
678 shift->throw_not_implemented();
679 }
680
681 =head1 Private functions
682
683 These are some private functions for the PrimarySeqI interface. You do not
684 need to implement these functions
685
686 =head2 _attempt_to_load_Seq
687
688 Title : _attempt_to_load_Seq
689 Usage :
690 Function:
691 Example :
692 Returns :
693 Args :
694
695
696 =cut
697
698 sub _attempt_to_load_Seq{
699 my ($self) = @_;
700
701 if( $main::{'Bio::PrimarySeq'} ) {
702 return 1;
703 } else {
704 eval {
705 require Bio::PrimarySeq;
706 };
707 if( $@ ) {
708 my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
709 "This indicates that you are using Bio::PrimarySeqI ".
710 "without Bio::PrimarySeq loaded or without providing a ".
711 "complete implementation.\nThe most likely problem is that there ".
712 "has been a misconfiguration of the bioperl environment\n".
713 "Actual exception:\n\n";
714 $self->throw("$text$@\n");
715 return 0;
716 }
717 return 1;
718 }
719
720 }
721
722 1;