Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SeqUtils.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: SeqUtils.pm,v 1.11.2.1 2003/08/11 20:11:17 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::SeqUtils | |
| 4 # | |
| 5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> | |
| 6 # | |
| 7 # Copyright Heikki Lehvaslaiho | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::SeqUtils - Additional methods for PrimarySeq objects | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 use Bio::SeqUtils; | |
| 20 # get a Bio::PrimarySeqI compliant object, $seq, somehow | |
| 21 $util = new Bio::SeqUtils; | |
| 22 $polypeptide_3char = $util->seq3($seq); | |
| 23 # or | |
| 24 $polypeptide_3char = Bio::SeqUtils->seq3($seq); | |
| 25 | |
| 26 # set the sequence string (stored in one char code in the object) | |
| 27 Bio::SeqUtils->seq3($seq, $polypeptide_3char); | |
| 28 | |
| 29 # translate a sequence in all six frames | |
| 30 @seqs = Bio::SeqUtils->translate_6frames($seq); | |
| 31 | |
| 32 =head1 DESCRIPTION | |
| 33 | |
| 34 This class is a holder of methods that work on Bio::PrimarySeqI- | |
| 35 compliant sequence objects, e.g. Bio::PrimarySeq and | |
| 36 Bio::Seq. These methods are not part of the Bio::PrimarySeqI | |
| 37 interface and should in general not be essential to the primary function | |
| 38 of sequence objects. If you are thinking of adding essential | |
| 39 functions, it might be better to create your own sequence class. | |
| 40 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more. | |
| 41 | |
| 42 The methods take as their first argument a sequence object. It is | |
| 43 possible to use methods without first creating a SeqUtils object, | |
| 44 i.e. use it as an anonymous hash. | |
| 45 | |
| 46 The first two methods, seq3() and seq3in(), give out or read in protein | |
| 47 sequences coded in three letter IUPAC amino acid codes. | |
| 48 | |
| 49 The next two methods, translate_3frames() and translate_6frames(), wrap | |
| 50 around the standard translate method to give back an array of three | |
| 51 forward or all six frame translations. | |
| 52 | |
| 53 =head1 FEEDBACK | |
| 54 | |
| 55 =head2 Mailing Lists | |
| 56 | |
| 57 User feedback is an integral part of the evolution of this and other | |
| 58 Bioperl modules. Send your comments and suggestions preferably to one | |
| 59 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 60 | |
| 61 bioperl-l@bioperl.org - General discussion | |
| 62 http://bio.perl.org/MailList.html - About the mailing lists | |
| 63 | |
| 64 =head2 Reporting Bugs | |
| 65 | |
| 66 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 67 the bugs and their resolution. Bug reports can be submitted via email | |
| 68 or the web: | |
| 69 | |
| 70 bioperl-bugs@bio.perl.org | |
| 71 http://bugzilla.bioperl.org/ | |
| 72 | |
| 73 =head1 AUTHOR - Heikki Lehvaslaiho | |
| 74 | |
| 75 Email: heikki@ebi.ac.uk | |
| 76 Address: | |
| 77 | |
| 78 EMBL Outstation, European Bioinformatics Institute | |
| 79 Wellcome Trust Genome Campus, Hinxton | |
| 80 Cambs. CB10 1SD, United Kingdom | |
| 81 | |
| 82 =head1 APPENDIX | |
| 83 | |
| 84 The rest of the documentation details each of the object | |
| 85 methods. Internal methods are usually preceded with a _ | |
| 86 | |
| 87 =cut | |
| 88 | |
| 89 | |
| 90 # Let the code begin... | |
| 91 | |
| 92 | |
| 93 package Bio::SeqUtils; | |
| 94 use vars qw(@ISA %ONECODE %THREECODE); | |
| 95 use strict; | |
| 96 use Carp; | |
| 97 | |
| 98 @ISA = qw(Bio::Root::Root); | |
| 99 # new inherited from RootI | |
| 100 | |
| 101 BEGIN { | |
| 102 | |
| 103 %ONECODE = | |
| 104 ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D', | |
| 105 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H', | |
| 106 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M', | |
| 107 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R', | |
| 108 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W', | |
| 109 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*', | |
| 110 'Sec' => 'U' | |
| 111 ); | |
| 112 | |
| 113 %THREECODE = | |
| 114 ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp', | |
| 115 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His', | |
| 116 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met', | |
| 117 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg', | |
| 118 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp', | |
| 119 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter', | |
| 120 'U' => 'Sec' | |
| 121 ); | |
| 122 } | |
| 123 | |
| 124 =head2 seq3 | |
| 125 | |
| 126 Title : seq3 | |
| 127 Usage : $string = Bio::SeqUtils->seq3($seq) | |
| 128 Function: | |
| 129 | |
| 130 Read only method that returns the amino acid sequence as a | |
| 131 string of three letter codes. alphabet has to be | |
| 132 'protein'. Output follows the IUPAC standard plus 'Ter' for | |
| 133 terminator. Any unknown character, including the default | |
| 134 unknown character 'X', is changed into 'Xaa'. A noncoded | |
| 135 aminoacid selenocystein is recognized (Sec, U). | |
| 136 | |
| 137 Returns : A scalar | |
| 138 Args : character used for stop in the protein sequence optional, | |
| 139 defaults to '*' string used to separate the output amino | |
| 140 acid codes, optional, defaults to '' | |
| 141 | |
| 142 =cut | |
| 143 | |
| 144 sub seq3 { | |
| 145 my ($self, $seq, $stop, $sep ) = @_; | |
| 146 | |
| 147 $seq->isa('Bio::PrimarySeqI') || | |
| 148 $self->throw('Not a Bio::PrimarySeqI object but [$self]'); | |
| 149 $seq->alphabet eq 'protein' || | |
| 150 $self->throw('Not a protein sequence'); | |
| 151 | |
| 152 if (defined $stop) { | |
| 153 length $stop != 1 and $self->throw('One character stop needed, not [$stop]'); | |
| 154 $THREECODE{$stop} = "Ter"; | |
| 155 } | |
| 156 $sep ||= ''; | |
| 157 | |
| 158 my $aa3s; | |
| 159 foreach my $aa (split //, uc $seq->seq) { | |
| 160 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next; | |
| 161 $aa3s .= 'Xaa'. $sep; | |
| 162 } | |
| 163 $sep and substr($aa3s, -(length $sep), length $sep) = '' ; | |
| 164 return $aa3s; | |
| 165 } | |
| 166 | |
| 167 =head2 seq3in | |
| 168 | |
| 169 Title : seq3in | |
| 170 Usage : $string = Bio::SeqUtils->seq3in($seq, 'MetGlyTer') | |
| 171 Function: | |
| 172 | |
| 173 Method for in-place changing of the sequence of a | |
| 174 Bio::PrimarySeqI sequence object. The three letter amino | |
| 175 acid input string is converted into one letter code. Any | |
| 176 unknown character triplet, including the default 'Xaa', is | |
| 177 converted into 'X'. | |
| 178 | |
| 179 Returns : Bio::PrimarySeq object; | |
| 180 Args : character to be used for stop in the protein seqence, | |
| 181 optional, defaults to '*' | |
| 182 character to be used for unknown in the protein seqence, | |
| 183 optional, defaults to 'X' | |
| 184 | |
| 185 =cut | |
| 186 | |
| 187 sub seq3in { | |
| 188 my ($self, $seq, $string, $stop, $unknown) = @_; | |
| 189 | |
| 190 $seq->isa('Bio::PrimarySeqI') || | |
| 191 $self->throw('Not a Bio::PrimarySeqI object but [$self]'); | |
| 192 $seq->alphabet eq 'protein' || | |
| 193 $self->throw('Not a protein sequence'); | |
| 194 | |
| 195 if (defined $stop) { | |
| 196 length $stop != 1 and $self->throw('One character stop needed, not [$stop]'); | |
| 197 $ONECODE{'Ter'} = $stop; | |
| 198 } | |
| 199 if (defined $unknown) { | |
| 200 length $unknown != 1 and $self->throw('One character stop needed, not [$unknown]'); | |
| 201 $ONECODE{'Xaa'} = $unknown; | |
| 202 } | |
| 203 | |
| 204 my ($aas, $aa3); | |
| 205 my $length = (length $string) - 2; | |
| 206 for (my $i = 0 ; $i < $length ; $i += 3) { | |
| 207 $aa3 = substr($string, $i, 3); | |
| 208 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next; | |
| 209 $aas .= 'X'; | |
| 210 } | |
| 211 $seq->seq($aas); | |
| 212 return $seq; | |
| 213 } | |
| 214 | |
| 215 =head2 translate_3frames | |
| 216 | |
| 217 Title : translate_3frames | |
| 218 Usage : @prots = Bio::SeqUtils->translate_3frames($seq) | |
| 219 Function: Translate a nucleotide sequence in three forward frames. | |
| 220 The IDs of the sequences are appended with '-0F', '-1F', '-2F'. | |
| 221 Returns : An array of seq objects | |
| 222 Args : sequence object | |
| 223 same arguments as to Bio::PrimarySeqI::translate | |
| 224 | |
| 225 =cut | |
| 226 | |
| 227 sub translate_3frames { | |
| 228 my ($self, $seq, @args ) = @_; | |
| 229 | |
| 230 $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.') | |
| 231 unless $seq->can('translate'); | |
| 232 | |
| 233 my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args; | |
| 234 my @seqs; | |
| 235 my $f = 0; | |
| 236 while ($f != 3) { | |
| 237 my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw ); | |
| 238 $translation->id($seq->id. "-". $f. "F"); | |
| 239 push @seqs, $translation; | |
| 240 $f++; | |
| 241 } | |
| 242 | |
| 243 return @seqs; | |
| 244 } | |
| 245 | |
| 246 =head2 translate_6frames | |
| 247 | |
| 248 Title : translate_6frames | |
| 249 Usage : @prots = Bio::SeqUtils->translate_6frames($seq) | |
| 250 Function: translate a nucleotide sequence in all six frames | |
| 251 The IDs of the sequences are appended with '-0F', '-1F', '-2F', | |
| 252 '-0R', '-1R', '-2R'. | |
| 253 Returns : An array of seq objects | |
| 254 Args : sequence object | |
| 255 same arguments as to Bio::PrimarySeqI::translate | |
| 256 | |
| 257 =cut | |
| 258 | |
| 259 sub translate_6frames { | |
| 260 my ($self, $seq, @args ) = @_; | |
| 261 | |
| 262 my @seqs = $self->translate_3frames($seq, @args); | |
| 263 $seq->seq($seq->revcom->seq); | |
| 264 my @seqs2 = $self->translate_3frames($seq, @args); | |
| 265 foreach my $seq2 (@seqs2) { | |
| 266 my ($tmp) = $seq2->id; | |
| 267 $tmp =~ s/F$/R/g; | |
| 268 $seq2->id($tmp); | |
| 269 } | |
| 270 return @seqs, @seqs2; | |
| 271 } | |
| 272 | |
| 273 | |
| 274 =head2 valid_aa | |
| 275 | |
| 276 Title : valid_aa | |
| 277 Usage : my @aa = $table->valid_aa | |
| 278 Function: Retrieves a list of the valid amino acid codes. | |
| 279 The list is ordered so that first 21 codes are for unique | |
| 280 amino acids. The rest are ['B', 'Z', 'X', '*']. | |
| 281 Returns : array of all the valid amino acid codes | |
| 282 Args : [optional] $code => [0 -> return list of 1 letter aa codes, | |
| 283 1 -> return list of 3 letter aa codes, | |
| 284 2 -> return associative array of both ] | |
| 285 | |
| 286 =cut | |
| 287 | |
| 288 sub valid_aa{ | |
| 289 my ($self,$code) = @_; | |
| 290 | |
| 291 if( ! $code ) { | |
| 292 my @codes; | |
| 293 foreach my $c ( sort values %ONECODE ) { | |
| 294 push @codes, $c unless ( $c =~ /[BZX\*]/ ); | |
| 295 } | |
| 296 push @codes, qw(B Z X *); # so they are in correct order ? | |
| 297 return @codes; | |
| 298 } | |
| 299 elsif( $code == 1 ) { | |
| 300 my @codes; | |
| 301 foreach my $c ( sort keys %ONECODE ) { | |
| 302 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ ); | |
| 303 } | |
| 304 push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' ); | |
| 305 return @codes; | |
| 306 } | |
| 307 elsif( $code == 2 ) { | |
| 308 my %codes = %ONECODE; | |
| 309 foreach my $c ( keys %ONECODE ) { | |
| 310 my $aa = $ONECODE{$c}; | |
| 311 $codes{$aa} = $c; | |
| 312 } | |
| 313 return %codes; | |
| 314 } else { | |
| 315 $self->warn("unrecognized code in ".ref($self)." method valid_aa()"); | |
| 316 return (); | |
| 317 } | |
| 318 } | |
| 319 | |
| 320 1; |
