diff variant_effect_predictor/Bio/SeqUtils.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SeqUtils.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,320 @@
+# $Id: SeqUtils.pm,v 1.11.2.1 2003/08/11 20:11:17 jason Exp $
+#
+# BioPerl module for Bio::SeqUtils
+#
+# Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
+#
+# Copyright Heikki Lehvaslaiho
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SeqUtils - Additional methods for PrimarySeq objects
+
+=head1 SYNOPSIS
+
+    use Bio::SeqUtils;
+    # get a Bio::PrimarySeqI compliant object, $seq, somehow
+    $util = new Bio::SeqUtils;
+    $polypeptide_3char = $util->seq3($seq);
+    # or
+    $polypeptide_3char = Bio::SeqUtils->seq3($seq);
+
+    # set the sequence string (stored in one char code in the object)
+    Bio::SeqUtils->seq3($seq, $polypeptide_3char);
+
+    # translate a sequence in all six frames
+    @seqs = Bio::SeqUtils->translate_6frames($seq);
+
+=head1 DESCRIPTION
+
+This class is a holder of methods that work on Bio::PrimarySeqI-
+compliant sequence objects, e.g. Bio::PrimarySeq and
+Bio::Seq. These methods are not part of the Bio::PrimarySeqI
+interface and should in general not be essential to the primary function
+of sequence objects. If you are thinking of adding essential
+functions, it might be better to create your own sequence class.
+See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
+
+The methods take as their first argument a sequence object. It is
+possible to use methods without first creating a SeqUtils object,
+i.e. use it as an anonymous hash.
+
+The first two methods, seq3() and seq3in(), give out or read in protein
+sequences coded in three letter IUPAC amino acid codes.
+
+The next two methods, translate_3frames() and translate_6frames(), wrap
+around the standard translate method to give back an array of three
+forward or all six frame translations.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org          - General discussion
+  http://bio.perl.org/MailList.html             - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Heikki Lehvaslaiho
+
+Email:  heikki@ebi.ac.uk
+Address:
+
+     EMBL Outstation, European Bioinformatics Institute
+     Wellcome Trust Genome Campus, Hinxton
+     Cambs. CB10 1SD, United Kingdom
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object
+methods. Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::SeqUtils;
+use vars qw(@ISA %ONECODE %THREECODE);
+use strict;
+use Carp;
+
+@ISA = qw(Bio::Root::Root);
+# new inherited from RootI
+
+BEGIN {
+
+    %ONECODE =
+    ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
+     'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
+     'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
+     'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
+     'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
+     'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
+     'Sec' => 'U'
+     );
+
+    %THREECODE =
+    ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
+     'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
+     'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
+     'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
+     'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
+     'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
+     'U' => 'Sec'
+     );
+}
+
+=head2 seq3
+
+ Title   : seq3
+ Usage   : $string = Bio::SeqUtils->seq3($seq)
+ Function:
+
+           Read only method that returns the amino acid sequence as a
+           string of three letter codes. alphabet has to be
+           'protein'. Output follows the IUPAC standard plus 'Ter' for
+           terminator. Any unknown character, including the default
+           unknown character 'X', is changed into 'Xaa'. A noncoded
+           aminoacid selenocystein is recognized (Sec, U).
+
+ Returns : A scalar
+ Args    : character used for stop in the protein sequence optional,
+           defaults to '*' string used to separate the output amino
+           acid codes, optional, defaults to ''
+
+=cut
+
+sub seq3 {
+   my ($self, $seq, $stop, $sep ) = @_;
+
+   $seq->isa('Bio::PrimarySeqI') ||
+       $self->throw('Not a Bio::PrimarySeqI object but [$self]');
+   $seq->alphabet eq 'protein' ||
+       $self->throw('Not a protein sequence');
+
+   if (defined $stop) {
+       length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
+       $THREECODE{$stop} = "Ter";
+   }
+   $sep ||= '';
+
+   my $aa3s;
+   foreach my $aa  (split //, uc $seq->seq) {
+       $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
+       $aa3s .= 'Xaa'. $sep;
+   }
+   $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
+   return $aa3s;
+}
+
+=head2 seq3in
+
+ Title   : seq3in
+ Usage   : $string = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
+ Function:
+
+           Method for in-place changing of the sequence of a
+           Bio::PrimarySeqI sequence object. The three letter amino
+           acid input string is converted into one letter code.  Any
+           unknown character triplet, including the default 'Xaa', is
+           converted into 'X'.
+
+ Returns : Bio::PrimarySeq object;
+ Args    : character to be used for stop in the protein seqence,
+              optional, defaults to '*'
+           character to be used for unknown in the protein seqence,
+              optional, defaults to 'X'
+
+=cut
+
+sub seq3in {
+   my ($self, $seq, $string, $stop, $unknown) = @_;
+
+   $seq->isa('Bio::PrimarySeqI') ||
+       $self->throw('Not a Bio::PrimarySeqI object but [$self]');
+   $seq->alphabet eq 'protein' ||
+       $self->throw('Not a protein sequence');
+
+   if (defined $stop) {
+       length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
+       $ONECODE{'Ter'} = $stop;
+   }
+   if (defined $unknown) {
+       length $unknown != 1 and $self->throw('One character stop needed, not [$unknown]');
+       $ONECODE{'Xaa'} = $unknown;
+   }
+
+   my ($aas, $aa3);
+   my $length = (length $string) - 2;
+   for (my $i = 0 ; $i < $length ; $i += 3)  {
+       $aa3 = substr($string, $i, 3);
+       $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
+       $aas .= 'X';
+   }
+   $seq->seq($aas);
+   return $seq;
+}
+
+=head2 translate_3frames
+
+ Title   : translate_3frames
+ Usage   : @prots = Bio::SeqUtils->translate_3frames($seq)
+ Function: Translate a nucleotide sequence in three forward frames.
+           The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
+ Returns : An array of seq objects
+ Args    : sequence object
+           same arguments as to Bio::PrimarySeqI::translate
+
+=cut
+
+sub translate_3frames {
+    my ($self, $seq, @args ) = @_;
+    
+    $self->throw('Object [$seq] '. 'of class ['. ref($seq).  ']  can not be translated.')
+	unless $seq->can('translate');
+
+    my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
+    my @seqs;
+    my $f = 0;
+    while ($f != 3) {
+        my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
+	$translation->id($seq->id. "-". $f. "F");
+	push @seqs, $translation;
+	$f++;
+    }
+
+    return @seqs;
+}
+
+=head2 translate_6frames
+
+ Title   : translate_6frames
+ Usage   : @prots = Bio::SeqUtils->translate_6frames($seq)
+ Function: translate a nucleotide sequence in all six frames
+           The IDs of the sequences are appended with '-0F', '-1F', '-2F',
+           '-0R', '-1R', '-2R'.
+ Returns : An array of seq objects
+ Args    : sequence object
+           same arguments as to Bio::PrimarySeqI::translate
+
+=cut
+
+sub translate_6frames {
+    my ($self, $seq, @args ) = @_;
+    
+    my @seqs = $self->translate_3frames($seq, @args);
+    $seq->seq($seq->revcom->seq);
+    my @seqs2 = $self->translate_3frames($seq, @args);
+    foreach my $seq2 (@seqs2) {
+	my ($tmp) = $seq2->id;
+	$tmp =~ s/F$/R/g;
+	$seq2->id($tmp);
+    }
+    return @seqs, @seqs2;
+}
+
+
+=head2 valid_aa
+
+ Title   : valid_aa
+ Usage   : my @aa = $table->valid_aa
+ Function: Retrieves a list of the valid amino acid codes.
+           The list is ordered so that first 21 codes are for unique 
+           amino acids. The rest are ['B', 'Z', 'X', '*'].
+ Returns : array of all the valid amino acid codes
+ Args    : [optional] $code => [0 -> return list of 1 letter aa codes,
+				1 -> return list of 3 letter aa codes,
+				2 -> return associative array of both ]
+
+=cut
+
+sub valid_aa{
+   my ($self,$code) = @_;
+
+   if( ! $code ) { 
+       my @codes;
+       foreach my $c ( sort values %ONECODE ) {
+	   push @codes, $c unless ( $c =~ /[BZX\*]/ );
+       }
+       push @codes, qw(B Z X *); # so they are in correct order ?
+       return @codes;
+  }
+   elsif( $code == 1 ) { 
+       my @codes;
+       foreach my $c ( sort keys %ONECODE ) {
+	   push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
+       }
+       push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
+       return @codes;
+   }
+   elsif( $code == 2 ) { 
+       my %codes = %ONECODE;
+       foreach my $c ( keys %ONECODE ) {
+	   my $aa = $ONECODE{$c};
+	   $codes{$aa} = $c;
+       }
+       return %codes;
+   } else {
+       $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
+       return ();
+   }
+}
+
+1;