comparison variant_effect_predictor/Bio/Tools/CodonTable.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: CodonTable.pm,v 1.23 2002/10/22 07:38:45 lapp Exp $
2 #
3 # bioperl module for Bio::Tools::CodonTable
4 #
5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
6 #
7 # Copyright Heikki Lehvaslaiho
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Tools::CodonTable - Bioperl codon table object
16
17 =head1 SYNOPSIS
18
19 This is a read-only class for all known codon tables. The IDs are
20 the ones used by nucleotide sequence databases. All common IUPAC
21 ambiguity codes for DNA, RNA and animo acids are recognized.
22
23 # to use
24 use Bio::Tools::CodonTable;
25
26 # defaults to ID 1 "Standard"
27 $myCodonTable = Bio::Tools::CodonTable->new();
28 $myCodonTable2 = Bio::Tools::CodonTable -> new ( -id => 3 );
29
30 # change codon table
31 $myCodonTable->id(5);
32
33 # examine codon table
34 print join (' ', "The name of the codon table no.", $myCodonTable->id(4),
35 "is:", $myCodonTable->name(), "\n");
36
37 # translate a codon
38 $aa = $myCodonTable->translate('ACU');
39 $aa = $myCodonTable->translate('act');
40 $aa = $myCodonTable->translate('ytr');
41
42 # reverse translate an amino acid
43 @codons = $myCodonTable->revtranslate('A');
44 @codons = $myCodonTable->revtranslate('Ser');
45 @codons = $myCodonTable->revtranslate('Glx');
46 @codons = $myCodonTable->revtranslate('cYS', 'rna');
47
48 #boolean tests
49 print "Is a start\n" if $myCodonTable->is_start_codon('ATG');
50 print "Is a termianator\n" if $myCodonTable->is_ter_codon('tar');
51 print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG');
52
53 =head1 DESCRIPTION
54
55 Codon tables are also called translation tables or genetics codes
56 since that is what they try to represent. A bit more complete picture
57 of the full complexity of codon usage in various taxonomic groups
58 presented at the NCBI Genetic Codes Home page.
59
60 CodonTable is a BioPerl class that knows all current translation
61 tables that are used by primary nucleotide sequence databases
62 (GenBank, EMBL and DDBJ). It provides methods to output information
63 about tables and relationships between codons and amino acids.
64
65 This class and its methods recognized all common IUPAC ambiguity codes
66 for DNA, RNA and animo acids. The translation method follows the
67 conventions in EMBL and TREMBL databases.
68
69 It is a nuisance to separate RNA and cDNA representations of nucleic
70 acid transcripts. The CodonTable object accepts codons of both type as
71 input and allows the user to set the mode for output when reverse
72 translating. Its default for output is DNA.
73
74 Note: This class deals primarily with individual codons and amino
75 acids. However in the interest of speed you can L<translate>
76 longer sequence, too. The full complexity of protein translation
77 is tackled by L<Bio::PrimarySeqI::translate>.
78
79
80 The amino acid codes are IUPAC recommendations for common amino acids:
81
82 A Ala Alanine
83 R Arg Arginine
84 N Asn Asparagine
85 D Asp Aspartic acid
86 C Cys Cysteine
87 Q Gln Glutamine
88 E Glu Glutamic acid
89 G Gly Glycine
90 H His Histidine
91 I Ile Isoleucine
92 L Leu Leucine
93 K Lys Lysine
94 M Met Methionine
95 F Phe Phenylalanine
96 P Pro Proline
97 S Ser Serine
98 T Thr Threonine
99 W Trp Tryptophan
100 Y Tyr Tyrosine
101 V Val Valine
102 B Asx Aspartic acid or Asparagine
103 Z Glx Glutamine or Glutamic acid
104 X Xaa Any or unknown amino acid
105
106
107 It is worth noting that, "Bacterial" codon table no. 11 produces an
108 polypeptide that is, confusingly, identical to the standard one. The
109 only differences are in available initiator codons.
110
111
112 NCBI Genetic Codes home page:
113 http://www.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c
114
115 EBI Translation Table Viewer:
116 http://www.ebi.ac.uk/cgi-bin/mutations/trtables.cgi
117
118 Amended ASN.1 version with ids 16 and 21 is at:
119 ftp://ftp.ebi.ac.uk/pub/databases/geneticcode/
120
121 Thank your for Matteo diTomasso for the original Perl implementation
122 of these tables.
123
124 =head1 FEEDBACK
125
126 =head2 Mailing Lists
127
128 User feedback is an integral part of the evolution of this and other
129 Bioperl modules. Send your comments and suggestions preferably to the
130 Bioperl mailing lists Your participation is much appreciated.
131
132 bioperl-l@bioperl.org - General discussion
133 http://bio.perl.org/MailList.html - About the mailing lists
134
135 =head2 Reporting Bugs
136
137 report bugs to the Bioperl bug tracking system to help us keep track
138 the bugs and their resolution. Bug reports can be submitted via
139 email or the web:
140
141 bioperl-bugs@bio.perl.org
142 http://bugzilla.bioperl.org/
143
144 =head1 AUTHOR - Heikki Lehvaslaiho
145
146 Email: heikki@ebi.ac.uk
147 Address:
148
149 EMBL Outstation, European Bioinformatics Institute
150 Wellcome Trust Genome Campus, Hinxton
151 Cambs. CB10 1SD, United Kingdom
152
153 =head1 APPENDIX
154
155 The rest of the documentation details each of the object
156 methods. Internal methods are usually preceded with a _
157
158 =cut
159
160
161 # Let the code begin...
162
163 package Bio::Tools::CodonTable;
164 use vars qw(@ISA @NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA
165 %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR);
166 use strict;
167
168 # Object preamble - inherits from Bio::Root::Root
169 use Bio::Root::Root;
170 use Bio::Tools::IUPAC;
171 use Bio::SeqUtils;
172
173 @ISA = qw(Bio::Root::Root);
174
175 # first set internal values for all translation tables
176
177 BEGIN {
178 @NAMES = #id
179 (
180 'Standard', #1
181 'Vertebrate Mitochondrial',#2
182 'Yeast Mitochondrial',# 3
183 'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4
184 'Invertebrate Mitochondrial',#5
185 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6
186 '', '',
187 'Echinoderm Mitochondrial',#9
188 'Euplotid Nuclear',#10
189 '"Bacterial"',# 11
190 'Alternative Yeast Nuclear',# 12
191 'Ascidian Mitochondrial',# 13
192 'Flatworm Mitochondrial',# 14
193 'Blepharisma Nuclear',# 15
194 'Chlorophycean Mitochondrial',# 16
195 '', '', '', '',
196 'Trematode Mitochondrial',# 21
197 'Scenedesmus obliquus Mitochondrial', #22
198 'Thraustochytrium Mitochondrial' #23
199 );
200
201 @TABLES =
202 qw(
203 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
204 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG
205 FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG
206 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
207 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG
208 FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
209 '' ''
210 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
211 FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
212 FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
213 FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
214 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG
215 FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG
216 FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
217 FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
218 '' '' '' ''
219 FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG
220 FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
221 FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG
222 );
223
224
225 @STARTS =
226 qw(
227 ---M---------------M---------------M----------------------------
228 --------------------------------MMMM---------------M------------
229 ----------------------------------MM----------------------------
230 --MM---------------M------------MMMM---------------M------------
231 ---M----------------------------MMMM---------------M------------
232 -----------------------------------M----------------------------
233 '' ''
234 -----------------------------------M----------------------------
235 -----------------------------------M----------------------------
236 ---M---------------M------------MMMM---------------M------------
237 -------------------M---------------M----------------------------
238 -----------------------------------M----------------------------
239 -----------------------------------M----------------------------
240 -----------------------------------M----------------------------
241 -----------------------------------M----------------------------
242 '' '' '' ''
243 -----------------------------------M---------------M------------
244 -----------------------------------M----------------------------
245 --------------------------------M--M---------------M------------
246 );
247
248 my @nucs = qw(t c a g);
249 my $x = 0;
250 ($CODONS, $TRCOL) = ({}, {});
251 for my $i (@nucs) {
252 for my $j (@nucs) {
253 for my $k (@nucs) {
254 my $codon = "$i$j$k";
255 $CODONS->{$codon} = $x;
256 $TRCOL->{$x} = $codon;
257 $x++;
258 }
259 }
260 }
261 %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub();
262 %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup();
263 %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2);
264 $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']';
265 $TERMINATOR = '*';
266 }
267
268 sub new {
269 my($class,@args) = @_;
270 my $self = $class->SUPER::new(@args);
271
272 my($id) =
273 $self->_rearrange([qw(ID
274 )],
275 @args);
276
277 $id = 1 if ( ! $id );
278 $id && $self->id($id);
279 return $self; # success - we hope!
280 }
281
282 =head2 id
283
284 Title : id
285 Usage : $obj->id(3); $id_integer = $obj->id();
286 Function:
287
288 Sets or returns the id of the translation table. IDs are
289 integers from 1 to 15, excluding 7 and 8 which have been
290 removed as redundant. If an invalid ID is given the method
291 returns 0, false.
292
293
294 Example :
295 Returns : value of id, a scalar, 0 if not a valid
296 Args : newvalue (optional)
297
298 =cut
299
300 sub id{
301 my ($self,$value) = @_;
302 if( defined $value) {
303 if ( !(defined $TABLES[$value-1]) or $TABLES[$value-1] eq '') {
304 $self->warn("Not a valid codon table ID [$value] ");
305 $value = 0;
306 }
307 $self->{'id'} = $value;
308 }
309 return $self->{'id'};
310 }
311
312 =head2 name
313
314 Title : name
315 Usage : $obj->name()
316 Function: returns the descriptive name of the translation table
317 Example :
318 Returns : A string
319 Args : None
320
321
322 =cut
323
324 sub name{
325 my ($self) = @_;
326
327 my ($id) = $self->{'id'};
328 return $NAMES[$id-1];
329 }
330
331 =head2 translate
332
333 Title : translate
334 Usage : $obj->translate('YTR')
335 Function: Returns a string of one letter amino acid codes from
336 nucleotide sequence input. The imput can be of any length.
337
338 Returns 'X' for unknown codons and codons that code for
339 more than one amino acid. Returns an empty string if input
340 is not three characters long. Exceptions for these are:
341
342 - IUPAC amino acid code B for Aspartic Acid and
343 Asparagine, is used.
344 - IUPAC amino acid code Z for Glutamic Acid, Glutamine is
345 used.
346 - if the codon is two nucleotides long and if by adding
347 an a third character 'N', it codes for a single amino
348 acid (with exceptions above), return that, otherwise
349 return empty string.
350
351 Returns empty string for other input strings that are not
352 three characters long.
353
354 Example :
355 Returns : a string of one letter ambiguous IUPAC amino acid codes
356 Args : ambiguous IUPAC nucleotide string
357
358
359 =cut
360
361 sub translate {
362 my ($self, $seq) = @_;
363 $self->throw("Calling translate without a seq argument!") unless defined $seq;
364 return '' unless $seq;
365
366 my $id = $self->id;
367 my ($partial) = 0;
368 $partial = 2 if length($seq) % 3 == 2;
369
370 $seq = lc $seq;
371 $seq =~ tr/u/t/;
372 my $protein = "";
373 if ($seq =~ /[^actg]/ ) { #ambiguous chars
374 for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) {
375 my $triplet = substr($seq, $i, 3);
376 if (exists $CODONS->{$triplet}) {
377 $protein .= substr($TABLES[$id-1],
378 $CODONS->{$triplet},1);
379 } else {
380 $protein .= $self->_translate_ambiguous_codon($triplet);
381 }
382 }
383 } else { # simple, strict translation
384 for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) {
385 my $triplet = substr($seq, $i, 3);
386 if (exists $CODONS->{$triplet}) {
387 $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1);
388 } else {
389 $protein .= 'X';
390 }
391 }
392 }
393 if ($partial == 2) { # 2 overhanging nucleotides
394 my $triplet = substr($seq, ($partial -4)). "n";
395 if (exists $CODONS->{$triplet}) {
396 my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1);
397 $protein .= $aa;
398 } else {
399 $protein .= $self->_translate_ambiguous_codon($triplet, $partial);
400 }
401 }
402 return $protein;
403 }
404
405 sub _translate_ambiguous_codon {
406 my ($self, $triplet, $partial) = @_;
407 $partial ||= 0;
408 my $id = $self->id;
409 my $aa;
410 my @codons = _unambiquous_codons($triplet);
411 my %aas =();
412 foreach my $codon (@codons) {
413 $aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1;
414 }
415 my $count = scalar keys %aas;
416 if ( $count == 1 ) {
417 $aa = (keys %aas)[0];
418 }
419 elsif ( $count == 2 ) {
420 if ($aas{'D'} and $aas{'N'}) {
421 $aa = 'B';
422 }
423 elsif ($aas{'E'} and $aas{'Q'}) {
424 $aa = 'Z';
425 } else {
426 $partial ? ($aa = '') : ($aa = 'X');
427 }
428 } else {
429 $partial ? ($aa = '') : ($aa = 'X');
430 }
431 return $aa;
432 }
433
434 =head2 translate_strict
435
436 Title : translate_strict
437 Usage : $obj->translate_strict('ACT')
438 Function: returns one letter amino acid code for a codon input
439
440 Fast and simple translation. User is responsible to resolve
441 ambiguous nucleotide codes before calling this
442 method. Returns 'X' for unknown codons and an empty string
443 for input strings that are not three characters long.
444
445 It is not recommended to use this method in a production
446 environment. Use method translate, instead.
447
448 Example :
449 Returns : A string
450 Args : a codon = a three nucleotide character string
451
452
453 =cut
454
455 sub translate_strict{
456 my ($self, $value) = @_;
457 my ($id) = $self->{'id'};
458
459 $value = lc $value;
460 $value =~ tr/u/t/;
461
462 if (length $value != 3 ) {
463 return '';
464 }
465 elsif (!(defined $CODONS->{$value})) {
466 return 'X';
467 }
468 else {
469 return substr($TABLES[$id-1],$CODONS->{$value},1);
470 }
471 }
472
473 =head2 revtranslate
474
475 Title : revtranslate
476 Usage : $obj->revtranslate('G')
477 Function: returns codons for an amino acid
478
479 Returns an empty string for unknown amino acid
480 codes. Ambiquous IUPAC codes Asx,B, (Asp,D; Asn,N) and
481 Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three
482 letter amino acid codes are accepted. '*' and 'Ter' are
483 used for terminator.
484
485 By default, the output codons are shown in DNA. If the
486 output is needed in RNA (tr/t/u/), add a second argument
487 'RNA'.
488
489 Example : $obj->revtranslate('Gly', 'RNA')
490 Returns : An array of three lower case letter strings i.e. codons
491 Args : amino acid, 'RNA'
492
493 =cut
494
495 sub revtranslate {
496 my ($self, $value, $coding) = @_;
497 my ($id) = $self->{'id'};
498 my (@aas, $p);
499 my (@codons) = ();
500
501 if (length($value) == 3 ) {
502 $value = lc $value;
503 $value = ucfirst $value;
504 $value = $THREELETTERSYMBOLS{$value};
505 }
506 if ( defined $value and $value =~ /$VALID_PROTEIN/
507 and length($value) == 1 ) {
508 $value = uc $value;
509 @aas = @{$IUPAC_AA{$value}};
510 foreach my $aa (@aas) {
511 #print $aa, " -2\n";
512 $aa = '\*' if $aa eq '*';
513 while ($TABLES[$id-1] =~ m/$aa/g) {
514 $p = pos $TABLES[$id-1];
515 push (@codons, $TRCOL->{--$p});
516 }
517 }
518 }
519
520 if ($coding and uc ($coding) eq 'RNA') {
521 for my $i (0..$#codons) {
522 $codons[$i] =~ tr/t/u/;
523 }
524 }
525
526 return @codons;
527 }
528
529 =head2 is_start_codon
530
531 Title : is_start_codon
532 Usage : $obj->is_start_codon('ATG')
533 Function: returns true (1) for all codons that can be used as a
534 translation start, false (0) for others.
535 Example : $myCodonTable->is_start_codon('ATG')
536 Returns : boolean
537 Args : codon
538
539
540 =cut
541
542 sub is_start_codon{
543 my ($self, $value) = @_;
544 my ($id) = $self->{'id'};
545
546 $value = lc $value;
547 $value =~ tr/u/t/;
548
549 if (length $value != 3 ) {
550 return 0;
551 }
552 else {
553 my $result = 1;
554 my @ms = map { substr($STARTS[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value);
555 foreach my $c (@ms) {
556 $result = 0 if $c ne 'M';
557 }
558 return $result;
559 }
560 }
561
562
563
564 =head2 is_ter_codon
565
566 Title : is_ter_codon
567 Usage : $obj->is_ter_codon('GAA')
568 Function: returns true (1) for all codons that can be used as a
569 translation tarminator, false (0) for others.
570 Example : $myCodonTable->is_ter_codon('ATG')
571 Returns : boolean
572 Args : codon
573
574
575 =cut
576
577 sub is_ter_codon{
578 my ($self, $value) = @_;
579 my ($id) = $self->{'id'};
580
581 $value = lc $value;
582 $value =~ tr/u/t/;
583
584 if (length $value != 3 ) {
585 return 0;
586 }
587 else {
588 my $result = 1;
589 my @ms = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value);
590 foreach my $c (@ms) {
591 $result = 0 if $c ne $TERMINATOR;
592 }
593 return $result;
594 }
595 }
596
597 =head2 is_unknown_codon
598
599 Title : is_unknown_codon
600 Usage : $obj->is_unknown_codon('GAJ')
601 Function: returns false (0) for all codons that are valid,
602 true (1) for others.
603 Example : $myCodonTable->is_unknown_codon('NTG')
604 Returns : boolean
605 Args : codon
606
607
608 =cut
609
610 sub is_unknown_codon{
611 my ($self, $value) = @_;
612 my ($id) = $self->{'id'};
613
614 $value = lc $value;
615 $value =~ tr/u/t/;
616
617 if (length $value != 3 ) {
618 return 1;
619 }
620 else {
621 my $result = 0;
622 my @cs = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value);
623 $result = 1 if scalar @cs == 0;
624 return $result;
625 }
626 }
627
628 =head2 _unambiquous_codons
629
630 Title : _unambiquous_codons
631 Usage : @codons = _unambiquous_codons('ACN')
632 Function:
633 Example :
634 Returns : array of strings (one letter unambiguous amino acid codes)
635 Args : a codon = a three IUPAC nucleotide character string
636
637 =cut
638
639 sub _unambiquous_codons{
640 my ($value) = @_;
641 my @nts = ();
642 my @codons = ();
643 my ($i, $j, $k);
644 @nts = map { $IUPAC_DNA{uc $_} } split(//, $value);
645 for my $i (@{$nts[0]}) {
646 for my $j (@{$nts[1]}) {
647 for my $k (@{$nts[2]}) {
648 push @codons, lc "$i$j$k";
649 }
650 }
651 }
652 return @codons;
653 }
654
655 1;