0
|
1 # $Id: SeqUtils.pm,v 1.11.2.1 2003/08/11 20:11:17 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SeqUtils
|
|
4 #
|
|
5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk>
|
|
6 #
|
|
7 # Copyright Heikki Lehvaslaiho
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SeqUtils - Additional methods for PrimarySeq objects
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 use Bio::SeqUtils;
|
|
20 # get a Bio::PrimarySeqI compliant object, $seq, somehow
|
|
21 $util = new Bio::SeqUtils;
|
|
22 $polypeptide_3char = $util->seq3($seq);
|
|
23 # or
|
|
24 $polypeptide_3char = Bio::SeqUtils->seq3($seq);
|
|
25
|
|
26 # set the sequence string (stored in one char code in the object)
|
|
27 Bio::SeqUtils->seq3($seq, $polypeptide_3char);
|
|
28
|
|
29 # translate a sequence in all six frames
|
|
30 @seqs = Bio::SeqUtils->translate_6frames($seq);
|
|
31
|
|
32 =head1 DESCRIPTION
|
|
33
|
|
34 This class is a holder of methods that work on Bio::PrimarySeqI-
|
|
35 compliant sequence objects, e.g. Bio::PrimarySeq and
|
|
36 Bio::Seq. These methods are not part of the Bio::PrimarySeqI
|
|
37 interface and should in general not be essential to the primary function
|
|
38 of sequence objects. If you are thinking of adding essential
|
|
39 functions, it might be better to create your own sequence class.
|
|
40 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more.
|
|
41
|
|
42 The methods take as their first argument a sequence object. It is
|
|
43 possible to use methods without first creating a SeqUtils object,
|
|
44 i.e. use it as an anonymous hash.
|
|
45
|
|
46 The first two methods, seq3() and seq3in(), give out or read in protein
|
|
47 sequences coded in three letter IUPAC amino acid codes.
|
|
48
|
|
49 The next two methods, translate_3frames() and translate_6frames(), wrap
|
|
50 around the standard translate method to give back an array of three
|
|
51 forward or all six frame translations.
|
|
52
|
|
53 =head1 FEEDBACK
|
|
54
|
|
55 =head2 Mailing Lists
|
|
56
|
|
57 User feedback is an integral part of the evolution of this and other
|
|
58 Bioperl modules. Send your comments and suggestions preferably to one
|
|
59 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
60
|
|
61 bioperl-l@bioperl.org - General discussion
|
|
62 http://bio.perl.org/MailList.html - About the mailing lists
|
|
63
|
|
64 =head2 Reporting Bugs
|
|
65
|
|
66 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
67 the bugs and their resolution. Bug reports can be submitted via email
|
|
68 or the web:
|
|
69
|
|
70 bioperl-bugs@bio.perl.org
|
|
71 http://bugzilla.bioperl.org/
|
|
72
|
|
73 =head1 AUTHOR - Heikki Lehvaslaiho
|
|
74
|
|
75 Email: heikki@ebi.ac.uk
|
|
76 Address:
|
|
77
|
|
78 EMBL Outstation, European Bioinformatics Institute
|
|
79 Wellcome Trust Genome Campus, Hinxton
|
|
80 Cambs. CB10 1SD, United Kingdom
|
|
81
|
|
82 =head1 APPENDIX
|
|
83
|
|
84 The rest of the documentation details each of the object
|
|
85 methods. Internal methods are usually preceded with a _
|
|
86
|
|
87 =cut
|
|
88
|
|
89
|
|
90 # Let the code begin...
|
|
91
|
|
92
|
|
93 package Bio::SeqUtils;
|
|
94 use vars qw(@ISA %ONECODE %THREECODE);
|
|
95 use strict;
|
|
96 use Carp;
|
|
97
|
|
98 @ISA = qw(Bio::Root::Root);
|
|
99 # new inherited from RootI
|
|
100
|
|
101 BEGIN {
|
|
102
|
|
103 %ONECODE =
|
|
104 ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D',
|
|
105 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H',
|
|
106 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M',
|
|
107 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R',
|
|
108 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W',
|
|
109 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*',
|
|
110 'Sec' => 'U'
|
|
111 );
|
|
112
|
|
113 %THREECODE =
|
|
114 ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp',
|
|
115 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His',
|
|
116 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met',
|
|
117 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg',
|
|
118 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp',
|
|
119 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter',
|
|
120 'U' => 'Sec'
|
|
121 );
|
|
122 }
|
|
123
|
|
124 =head2 seq3
|
|
125
|
|
126 Title : seq3
|
|
127 Usage : $string = Bio::SeqUtils->seq3($seq)
|
|
128 Function:
|
|
129
|
|
130 Read only method that returns the amino acid sequence as a
|
|
131 string of three letter codes. alphabet has to be
|
|
132 'protein'. Output follows the IUPAC standard plus 'Ter' for
|
|
133 terminator. Any unknown character, including the default
|
|
134 unknown character 'X', is changed into 'Xaa'. A noncoded
|
|
135 aminoacid selenocystein is recognized (Sec, U).
|
|
136
|
|
137 Returns : A scalar
|
|
138 Args : character used for stop in the protein sequence optional,
|
|
139 defaults to '*' string used to separate the output amino
|
|
140 acid codes, optional, defaults to ''
|
|
141
|
|
142 =cut
|
|
143
|
|
144 sub seq3 {
|
|
145 my ($self, $seq, $stop, $sep ) = @_;
|
|
146
|
|
147 $seq->isa('Bio::PrimarySeqI') ||
|
|
148 $self->throw('Not a Bio::PrimarySeqI object but [$self]');
|
|
149 $seq->alphabet eq 'protein' ||
|
|
150 $self->throw('Not a protein sequence');
|
|
151
|
|
152 if (defined $stop) {
|
|
153 length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
|
|
154 $THREECODE{$stop} = "Ter";
|
|
155 }
|
|
156 $sep ||= '';
|
|
157
|
|
158 my $aa3s;
|
|
159 foreach my $aa (split //, uc $seq->seq) {
|
|
160 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next;
|
|
161 $aa3s .= 'Xaa'. $sep;
|
|
162 }
|
|
163 $sep and substr($aa3s, -(length $sep), length $sep) = '' ;
|
|
164 return $aa3s;
|
|
165 }
|
|
166
|
|
167 =head2 seq3in
|
|
168
|
|
169 Title : seq3in
|
|
170 Usage : $string = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
|
|
171 Function:
|
|
172
|
|
173 Method for in-place changing of the sequence of a
|
|
174 Bio::PrimarySeqI sequence object. The three letter amino
|
|
175 acid input string is converted into one letter code. Any
|
|
176 unknown character triplet, including the default 'Xaa', is
|
|
177 converted into 'X'.
|
|
178
|
|
179 Returns : Bio::PrimarySeq object;
|
|
180 Args : character to be used for stop in the protein seqence,
|
|
181 optional, defaults to '*'
|
|
182 character to be used for unknown in the protein seqence,
|
|
183 optional, defaults to 'X'
|
|
184
|
|
185 =cut
|
|
186
|
|
187 sub seq3in {
|
|
188 my ($self, $seq, $string, $stop, $unknown) = @_;
|
|
189
|
|
190 $seq->isa('Bio::PrimarySeqI') ||
|
|
191 $self->throw('Not a Bio::PrimarySeqI object but [$self]');
|
|
192 $seq->alphabet eq 'protein' ||
|
|
193 $self->throw('Not a protein sequence');
|
|
194
|
|
195 if (defined $stop) {
|
|
196 length $stop != 1 and $self->throw('One character stop needed, not [$stop]');
|
|
197 $ONECODE{'Ter'} = $stop;
|
|
198 }
|
|
199 if (defined $unknown) {
|
|
200 length $unknown != 1 and $self->throw('One character stop needed, not [$unknown]');
|
|
201 $ONECODE{'Xaa'} = $unknown;
|
|
202 }
|
|
203
|
|
204 my ($aas, $aa3);
|
|
205 my $length = (length $string) - 2;
|
|
206 for (my $i = 0 ; $i < $length ; $i += 3) {
|
|
207 $aa3 = substr($string, $i, 3);
|
|
208 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
|
|
209 $aas .= 'X';
|
|
210 }
|
|
211 $seq->seq($aas);
|
|
212 return $seq;
|
|
213 }
|
|
214
|
|
215 =head2 translate_3frames
|
|
216
|
|
217 Title : translate_3frames
|
|
218 Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
|
|
219 Function: Translate a nucleotide sequence in three forward frames.
|
|
220 The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
|
|
221 Returns : An array of seq objects
|
|
222 Args : sequence object
|
|
223 same arguments as to Bio::PrimarySeqI::translate
|
|
224
|
|
225 =cut
|
|
226
|
|
227 sub translate_3frames {
|
|
228 my ($self, $seq, @args ) = @_;
|
|
229
|
|
230 $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.')
|
|
231 unless $seq->can('translate');
|
|
232
|
|
233 my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args;
|
|
234 my @seqs;
|
|
235 my $f = 0;
|
|
236 while ($f != 3) {
|
|
237 my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw );
|
|
238 $translation->id($seq->id. "-". $f. "F");
|
|
239 push @seqs, $translation;
|
|
240 $f++;
|
|
241 }
|
|
242
|
|
243 return @seqs;
|
|
244 }
|
|
245
|
|
246 =head2 translate_6frames
|
|
247
|
|
248 Title : translate_6frames
|
|
249 Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
|
|
250 Function: translate a nucleotide sequence in all six frames
|
|
251 The IDs of the sequences are appended with '-0F', '-1F', '-2F',
|
|
252 '-0R', '-1R', '-2R'.
|
|
253 Returns : An array of seq objects
|
|
254 Args : sequence object
|
|
255 same arguments as to Bio::PrimarySeqI::translate
|
|
256
|
|
257 =cut
|
|
258
|
|
259 sub translate_6frames {
|
|
260 my ($self, $seq, @args ) = @_;
|
|
261
|
|
262 my @seqs = $self->translate_3frames($seq, @args);
|
|
263 $seq->seq($seq->revcom->seq);
|
|
264 my @seqs2 = $self->translate_3frames($seq, @args);
|
|
265 foreach my $seq2 (@seqs2) {
|
|
266 my ($tmp) = $seq2->id;
|
|
267 $tmp =~ s/F$/R/g;
|
|
268 $seq2->id($tmp);
|
|
269 }
|
|
270 return @seqs, @seqs2;
|
|
271 }
|
|
272
|
|
273
|
|
274 =head2 valid_aa
|
|
275
|
|
276 Title : valid_aa
|
|
277 Usage : my @aa = $table->valid_aa
|
|
278 Function: Retrieves a list of the valid amino acid codes.
|
|
279 The list is ordered so that first 21 codes are for unique
|
|
280 amino acids. The rest are ['B', 'Z', 'X', '*'].
|
|
281 Returns : array of all the valid amino acid codes
|
|
282 Args : [optional] $code => [0 -> return list of 1 letter aa codes,
|
|
283 1 -> return list of 3 letter aa codes,
|
|
284 2 -> return associative array of both ]
|
|
285
|
|
286 =cut
|
|
287
|
|
288 sub valid_aa{
|
|
289 my ($self,$code) = @_;
|
|
290
|
|
291 if( ! $code ) {
|
|
292 my @codes;
|
|
293 foreach my $c ( sort values %ONECODE ) {
|
|
294 push @codes, $c unless ( $c =~ /[BZX\*]/ );
|
|
295 }
|
|
296 push @codes, qw(B Z X *); # so they are in correct order ?
|
|
297 return @codes;
|
|
298 }
|
|
299 elsif( $code == 1 ) {
|
|
300 my @codes;
|
|
301 foreach my $c ( sort keys %ONECODE ) {
|
|
302 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
|
|
303 }
|
|
304 push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' );
|
|
305 return @codes;
|
|
306 }
|
|
307 elsif( $code == 2 ) {
|
|
308 my %codes = %ONECODE;
|
|
309 foreach my $c ( keys %ONECODE ) {
|
|
310 my $aa = $ONECODE{$c};
|
|
311 $codes{$aa} = $c;
|
|
312 }
|
|
313 return %codes;
|
|
314 } else {
|
|
315 $self->warn("unrecognized code in ".ref($self)." method valid_aa()");
|
|
316 return ();
|
|
317 }
|
|
318 }
|
|
319
|
|
320 1;
|