Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/CodonTable.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/CodonTable.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,655 @@ +# $Id: CodonTable.pm,v 1.23 2002/10/22 07:38:45 lapp Exp $ +# +# bioperl module for Bio::Tools::CodonTable +# +# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> +# +# Copyright Heikki Lehvaslaiho +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::CodonTable - Bioperl codon table object + +=head1 SYNOPSIS + + This is a read-only class for all known codon tables. The IDs are + the ones used by nucleotide sequence databases. All common IUPAC + ambiguity codes for DNA, RNA and animo acids are recognized. + + # to use + use Bio::Tools::CodonTable; + + # defaults to ID 1 "Standard" + $myCodonTable = Bio::Tools::CodonTable->new(); + $myCodonTable2 = Bio::Tools::CodonTable -> new ( -id => 3 ); + + # change codon table + $myCodonTable->id(5); + + # examine codon table + print join (' ', "The name of the codon table no.", $myCodonTable->id(4), + "is:", $myCodonTable->name(), "\n"); + + # translate a codon + $aa = $myCodonTable->translate('ACU'); + $aa = $myCodonTable->translate('act'); + $aa = $myCodonTable->translate('ytr'); + + # reverse translate an amino acid + @codons = $myCodonTable->revtranslate('A'); + @codons = $myCodonTable->revtranslate('Ser'); + @codons = $myCodonTable->revtranslate('Glx'); + @codons = $myCodonTable->revtranslate('cYS', 'rna'); + + #boolean tests + print "Is a start\n" if $myCodonTable->is_start_codon('ATG'); + print "Is a termianator\n" if $myCodonTable->is_ter_codon('tar'); + print "Is a unknown\n" if $myCodonTable->is_unknown_codon('JTG'); + +=head1 DESCRIPTION + +Codon tables are also called translation tables or genetics codes +since that is what they try to represent. A bit more complete picture +of the full complexity of codon usage in various taxonomic groups +presented at the NCBI Genetic Codes Home page. + +CodonTable is a BioPerl class that knows all current translation +tables that are used by primary nucleotide sequence databases +(GenBank, EMBL and DDBJ). It provides methods to output information +about tables and relationships between codons and amino acids. + +This class and its methods recognized all common IUPAC ambiguity codes +for DNA, RNA and animo acids. The translation method follows the +conventions in EMBL and TREMBL databases. + +It is a nuisance to separate RNA and cDNA representations of nucleic +acid transcripts. The CodonTable object accepts codons of both type as +input and allows the user to set the mode for output when reverse +translating. Its default for output is DNA. + +Note: This class deals primarily with individual codons and amino + acids. However in the interest of speed you can L<translate> + longer sequence, too. The full complexity of protein translation + is tackled by L<Bio::PrimarySeqI::translate>. + + +The amino acid codes are IUPAC recommendations for common amino acids: + + A Ala Alanine + R Arg Arginine + N Asn Asparagine + D Asp Aspartic acid + C Cys Cysteine + Q Gln Glutamine + E Glu Glutamic acid + G Gly Glycine + H His Histidine + I Ile Isoleucine + L Leu Leucine + K Lys Lysine + M Met Methionine + F Phe Phenylalanine + P Pro Proline + S Ser Serine + T Thr Threonine + W Trp Tryptophan + Y Tyr Tyrosine + V Val Valine + B Asx Aspartic acid or Asparagine + Z Glx Glutamine or Glutamic acid + X Xaa Any or unknown amino acid + + +It is worth noting that, "Bacterial" codon table no. 11 produces an +polypeptide that is, confusingly, identical to the standard one. The +only differences are in available initiator codons. + + +NCBI Genetic Codes home page: + http://www.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c + +EBI Translation Table Viewer: + http://www.ebi.ac.uk/cgi-bin/mutations/trtables.cgi + +Amended ASN.1 version with ids 16 and 21 is at: + ftp://ftp.ebi.ac.uk/pub/databases/geneticcode/ + +Thank your for Matteo diTomasso for the original Perl implementation +of these tables. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. Bug reports can be submitted via + email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Heikki Lehvaslaiho + +Email: heikki@ebi.ac.uk +Address: + + EMBL Outstation, European Bioinformatics Institute + Wellcome Trust Genome Campus, Hinxton + Cambs. CB10 1SD, United Kingdom + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + +package Bio::Tools::CodonTable; +use vars qw(@ISA @NAMES @TABLES @STARTS $TRCOL $CODONS %IUPAC_DNA + %IUPAC_AA %THREELETTERSYMBOLS $VALID_PROTEIN $TERMINATOR); +use strict; + +# Object preamble - inherits from Bio::Root::Root +use Bio::Root::Root; +use Bio::Tools::IUPAC; +use Bio::SeqUtils; + +@ISA = qw(Bio::Root::Root); + +# first set internal values for all translation tables + +BEGIN { + @NAMES = #id + ( + 'Standard', #1 + 'Vertebrate Mitochondrial',#2 + 'Yeast Mitochondrial',# 3 + 'Mold, Protozoan, and CoelenterateMitochondrial and Mycoplasma/Spiroplasma',#4 + 'Invertebrate Mitochondrial',#5 + 'Ciliate, Dasycladacean and Hexamita Nuclear',# 6 + '', '', + 'Echinoderm Mitochondrial',#9 + 'Euplotid Nuclear',#10 + '"Bacterial"',# 11 + 'Alternative Yeast Nuclear',# 12 + 'Ascidian Mitochondrial',# 13 + 'Flatworm Mitochondrial',# 14 + 'Blepharisma Nuclear',# 15 + 'Chlorophycean Mitochondrial',# 16 + '', '', '', '', + 'Trematode Mitochondrial',# 21 + 'Scenedesmus obliquus Mitochondrial', #22 + 'Thraustochytrium Mitochondrial' #23 + ); + + @TABLES = + qw( + FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG + FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + '' '' + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG + FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG + FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG + FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + '' '' '' '' + FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG + FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG + ); + + + @STARTS = + qw( + ---M---------------M---------------M---------------------------- + --------------------------------MMMM---------------M------------ + ----------------------------------MM---------------------------- + --MM---------------M------------MMMM---------------M------------ + ---M----------------------------MMMM---------------M------------ + -----------------------------------M---------------------------- + '' '' + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + ---M---------------M------------MMMM---------------M------------ + -------------------M---------------M---------------------------- + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + -----------------------------------M---------------------------- + '' '' '' '' + -----------------------------------M---------------M------------ + -----------------------------------M---------------------------- + --------------------------------M--M---------------M------------ + ); + + my @nucs = qw(t c a g); + my $x = 0; + ($CODONS, $TRCOL) = ({}, {}); + for my $i (@nucs) { + for my $j (@nucs) { + for my $k (@nucs) { + my $codon = "$i$j$k"; + $CODONS->{$codon} = $x; + $TRCOL->{$x} = $codon; + $x++; + } + } + } + %IUPAC_DNA = Bio::Tools::IUPAC->iupac_iub(); + %IUPAC_AA = Bio::Tools::IUPAC->iupac_iup(); + %THREELETTERSYMBOLS = Bio::SeqUtils->valid_aa(2); + $VALID_PROTEIN = '['.join('',Bio::SeqUtils->valid_aa(0)).']'; + $TERMINATOR = '*'; +} + +sub new { + my($class,@args) = @_; + my $self = $class->SUPER::new(@args); + + my($id) = + $self->_rearrange([qw(ID + )], + @args); + + $id = 1 if ( ! $id ); + $id && $self->id($id); + return $self; # success - we hope! +} + +=head2 id + + Title : id + Usage : $obj->id(3); $id_integer = $obj->id(); + Function: + + Sets or returns the id of the translation table. IDs are + integers from 1 to 15, excluding 7 and 8 which have been + removed as redundant. If an invalid ID is given the method + returns 0, false. + + + Example : + Returns : value of id, a scalar, 0 if not a valid + Args : newvalue (optional) + +=cut + +sub id{ + my ($self,$value) = @_; + if( defined $value) { + if ( !(defined $TABLES[$value-1]) or $TABLES[$value-1] eq '') { + $self->warn("Not a valid codon table ID [$value] "); + $value = 0; + } + $self->{'id'} = $value; + } + return $self->{'id'}; +} + +=head2 name + + Title : name + Usage : $obj->name() + Function: returns the descriptive name of the translation table + Example : + Returns : A string + Args : None + + +=cut + +sub name{ + my ($self) = @_; + + my ($id) = $self->{'id'}; + return $NAMES[$id-1]; +} + +=head2 translate + + Title : translate + Usage : $obj->translate('YTR') + Function: Returns a string of one letter amino acid codes from + nucleotide sequence input. The imput can be of any length. + + Returns 'X' for unknown codons and codons that code for + more than one amino acid. Returns an empty string if input + is not three characters long. Exceptions for these are: + + - IUPAC amino acid code B for Aspartic Acid and + Asparagine, is used. + - IUPAC amino acid code Z for Glutamic Acid, Glutamine is + used. + - if the codon is two nucleotides long and if by adding + an a third character 'N', it codes for a single amino + acid (with exceptions above), return that, otherwise + return empty string. + + Returns empty string for other input strings that are not + three characters long. + + Example : + Returns : a string of one letter ambiguous IUPAC amino acid codes + Args : ambiguous IUPAC nucleotide string + + +=cut + +sub translate { + my ($self, $seq) = @_; + $self->throw("Calling translate without a seq argument!") unless defined $seq; + return '' unless $seq; + + my $id = $self->id; + my ($partial) = 0; + $partial = 2 if length($seq) % 3 == 2; + + $seq = lc $seq; + $seq =~ tr/u/t/; + my $protein = ""; + if ($seq =~ /[^actg]/ ) { #ambiguous chars + for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) { + my $triplet = substr($seq, $i, 3); + if (exists $CODONS->{$triplet}) { + $protein .= substr($TABLES[$id-1], + $CODONS->{$triplet},1); + } else { + $protein .= $self->_translate_ambiguous_codon($triplet); + } + } + } else { # simple, strict translation + for (my $i = 0; $i < (length($seq) - 2 ); $i+=3) { + my $triplet = substr($seq, $i, 3); + if (exists $CODONS->{$triplet}) { + $protein .= substr($TABLES[$id-1], $CODONS->{$triplet}, 1); + } else { + $protein .= 'X'; + } + } + } + if ($partial == 2) { # 2 overhanging nucleotides + my $triplet = substr($seq, ($partial -4)). "n"; + if (exists $CODONS->{$triplet}) { + my $aa = substr($TABLES[$id-1], $CODONS->{$triplet},1); + $protein .= $aa; + } else { + $protein .= $self->_translate_ambiguous_codon($triplet, $partial); + } + } + return $protein; +} + +sub _translate_ambiguous_codon { + my ($self, $triplet, $partial) = @_; + $partial ||= 0; + my $id = $self->id; + my $aa; + my @codons = _unambiquous_codons($triplet); + my %aas =(); + foreach my $codon (@codons) { + $aas{substr($TABLES[$id-1],$CODONS->{$codon},1)} = 1; + } + my $count = scalar keys %aas; + if ( $count == 1 ) { + $aa = (keys %aas)[0]; + } + elsif ( $count == 2 ) { + if ($aas{'D'} and $aas{'N'}) { + $aa = 'B'; + } + elsif ($aas{'E'} and $aas{'Q'}) { + $aa = 'Z'; + } else { + $partial ? ($aa = '') : ($aa = 'X'); + } + } else { + $partial ? ($aa = '') : ($aa = 'X'); + } + return $aa; +} + +=head2 translate_strict + + Title : translate_strict + Usage : $obj->translate_strict('ACT') + Function: returns one letter amino acid code for a codon input + + Fast and simple translation. User is responsible to resolve + ambiguous nucleotide codes before calling this + method. Returns 'X' for unknown codons and an empty string + for input strings that are not three characters long. + + It is not recommended to use this method in a production + environment. Use method translate, instead. + + Example : + Returns : A string + Args : a codon = a three nucleotide character string + + +=cut + +sub translate_strict{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return ''; + } + elsif (!(defined $CODONS->{$value})) { + return 'X'; + } + else { + return substr($TABLES[$id-1],$CODONS->{$value},1); + } +} + +=head2 revtranslate + + Title : revtranslate + Usage : $obj->revtranslate('G') + Function: returns codons for an amino acid + + Returns an empty string for unknown amino acid + codes. Ambiquous IUPAC codes Asx,B, (Asp,D; Asn,N) and + Glx,Z (Glu,E; Gln,Q) are resolved. Both single and three + letter amino acid codes are accepted. '*' and 'Ter' are + used for terminator. + + By default, the output codons are shown in DNA. If the + output is needed in RNA (tr/t/u/), add a second argument + 'RNA'. + + Example : $obj->revtranslate('Gly', 'RNA') + Returns : An array of three lower case letter strings i.e. codons + Args : amino acid, 'RNA' + +=cut + +sub revtranslate { + my ($self, $value, $coding) = @_; + my ($id) = $self->{'id'}; + my (@aas, $p); + my (@codons) = (); + + if (length($value) == 3 ) { + $value = lc $value; + $value = ucfirst $value; + $value = $THREELETTERSYMBOLS{$value}; + } + if ( defined $value and $value =~ /$VALID_PROTEIN/ + and length($value) == 1 ) { + $value = uc $value; + @aas = @{$IUPAC_AA{$value}}; + foreach my $aa (@aas) { + #print $aa, " -2\n"; + $aa = '\*' if $aa eq '*'; + while ($TABLES[$id-1] =~ m/$aa/g) { + $p = pos $TABLES[$id-1]; + push (@codons, $TRCOL->{--$p}); + } + } + } + + if ($coding and uc ($coding) eq 'RNA') { + for my $i (0..$#codons) { + $codons[$i] =~ tr/t/u/; + } + } + + return @codons; +} + +=head2 is_start_codon + + Title : is_start_codon + Usage : $obj->is_start_codon('ATG') + Function: returns true (1) for all codons that can be used as a + translation start, false (0) for others. + Example : $myCodonTable->is_start_codon('ATG') + Returns : boolean + Args : codon + + +=cut + +sub is_start_codon{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return 0; + } + else { + my $result = 1; + my @ms = map { substr($STARTS[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value); + foreach my $c (@ms) { + $result = 0 if $c ne 'M'; + } + return $result; + } +} + + + +=head2 is_ter_codon + + Title : is_ter_codon + Usage : $obj->is_ter_codon('GAA') + Function: returns true (1) for all codons that can be used as a + translation tarminator, false (0) for others. + Example : $myCodonTable->is_ter_codon('ATG') + Returns : boolean + Args : codon + + +=cut + +sub is_ter_codon{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return 0; + } + else { + my $result = 1; + my @ms = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value); + foreach my $c (@ms) { + $result = 0 if $c ne $TERMINATOR; + } + return $result; + } +} + +=head2 is_unknown_codon + + Title : is_unknown_codon + Usage : $obj->is_unknown_codon('GAJ') + Function: returns false (0) for all codons that are valid, + true (1) for others. + Example : $myCodonTable->is_unknown_codon('NTG') + Returns : boolean + Args : codon + + +=cut + +sub is_unknown_codon{ + my ($self, $value) = @_; + my ($id) = $self->{'id'}; + + $value = lc $value; + $value =~ tr/u/t/; + + if (length $value != 3 ) { + return 1; + } + else { + my $result = 0; + my @cs = map { substr($TABLES[$id-1],$CODONS->{$_},1) } _unambiquous_codons($value); + $result = 1 if scalar @cs == 0; + return $result; + } +} + +=head2 _unambiquous_codons + + Title : _unambiquous_codons + Usage : @codons = _unambiquous_codons('ACN') + Function: + Example : + Returns : array of strings (one letter unambiguous amino acid codes) + Args : a codon = a three IUPAC nucleotide character string + +=cut + +sub _unambiquous_codons{ + my ($value) = @_; + my @nts = (); + my @codons = (); + my ($i, $j, $k); + @nts = map { $IUPAC_DNA{uc $_} } split(//, $value); + for my $i (@{$nts[0]}) { + for my $j (@{$nts[1]}) { + for my $k (@{$nts[2]}) { + push @codons, lc "$i$j$k"; + } + } + } + return @codons; +} + +1;