Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/SeqUtils.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: SeqUtils.pm,v 1.11.2.1 2003/08/11 20:11:17 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SeqUtils | |
4 # | |
5 # Cared for by Heikki Lehvaslaiho <heikki@ebi.ac.uk> | |
6 # | |
7 # Copyright Heikki Lehvaslaiho | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SeqUtils - Additional methods for PrimarySeq objects | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 use Bio::SeqUtils; | |
20 # get a Bio::PrimarySeqI compliant object, $seq, somehow | |
21 $util = new Bio::SeqUtils; | |
22 $polypeptide_3char = $util->seq3($seq); | |
23 # or | |
24 $polypeptide_3char = Bio::SeqUtils->seq3($seq); | |
25 | |
26 # set the sequence string (stored in one char code in the object) | |
27 Bio::SeqUtils->seq3($seq, $polypeptide_3char); | |
28 | |
29 # translate a sequence in all six frames | |
30 @seqs = Bio::SeqUtils->translate_6frames($seq); | |
31 | |
32 =head1 DESCRIPTION | |
33 | |
34 This class is a holder of methods that work on Bio::PrimarySeqI- | |
35 compliant sequence objects, e.g. Bio::PrimarySeq and | |
36 Bio::Seq. These methods are not part of the Bio::PrimarySeqI | |
37 interface and should in general not be essential to the primary function | |
38 of sequence objects. If you are thinking of adding essential | |
39 functions, it might be better to create your own sequence class. | |
40 See L<Bio::PrimarySeqI>, L<Bio::PrimarySeq>, and L<Bio::Seq> for more. | |
41 | |
42 The methods take as their first argument a sequence object. It is | |
43 possible to use methods without first creating a SeqUtils object, | |
44 i.e. use it as an anonymous hash. | |
45 | |
46 The first two methods, seq3() and seq3in(), give out or read in protein | |
47 sequences coded in three letter IUPAC amino acid codes. | |
48 | |
49 The next two methods, translate_3frames() and translate_6frames(), wrap | |
50 around the standard translate method to give back an array of three | |
51 forward or all six frame translations. | |
52 | |
53 =head1 FEEDBACK | |
54 | |
55 =head2 Mailing Lists | |
56 | |
57 User feedback is an integral part of the evolution of this and other | |
58 Bioperl modules. Send your comments and suggestions preferably to one | |
59 of the Bioperl mailing lists. Your participation is much appreciated. | |
60 | |
61 bioperl-l@bioperl.org - General discussion | |
62 http://bio.perl.org/MailList.html - About the mailing lists | |
63 | |
64 =head2 Reporting Bugs | |
65 | |
66 Report bugs to the Bioperl bug tracking system to help us keep track | |
67 the bugs and their resolution. Bug reports can be submitted via email | |
68 or the web: | |
69 | |
70 bioperl-bugs@bio.perl.org | |
71 http://bugzilla.bioperl.org/ | |
72 | |
73 =head1 AUTHOR - Heikki Lehvaslaiho | |
74 | |
75 Email: heikki@ebi.ac.uk | |
76 Address: | |
77 | |
78 EMBL Outstation, European Bioinformatics Institute | |
79 Wellcome Trust Genome Campus, Hinxton | |
80 Cambs. CB10 1SD, United Kingdom | |
81 | |
82 =head1 APPENDIX | |
83 | |
84 The rest of the documentation details each of the object | |
85 methods. Internal methods are usually preceded with a _ | |
86 | |
87 =cut | |
88 | |
89 | |
90 # Let the code begin... | |
91 | |
92 | |
93 package Bio::SeqUtils; | |
94 use vars qw(@ISA %ONECODE %THREECODE); | |
95 use strict; | |
96 use Carp; | |
97 | |
98 @ISA = qw(Bio::Root::Root); | |
99 # new inherited from RootI | |
100 | |
101 BEGIN { | |
102 | |
103 %ONECODE = | |
104 ('Ala' => 'A', 'Asx' => 'B', 'Cys' => 'C', 'Asp' => 'D', | |
105 'Glu' => 'E', 'Phe' => 'F', 'Gly' => 'G', 'His' => 'H', | |
106 'Ile' => 'I', 'Lys' => 'K', 'Leu' => 'L', 'Met' => 'M', | |
107 'Asn' => 'N', 'Pro' => 'P', 'Gln' => 'Q', 'Arg' => 'R', | |
108 'Ser' => 'S', 'Thr' => 'T', 'Val' => 'V', 'Trp' => 'W', | |
109 'Xaa' => 'X', 'Tyr' => 'Y', 'Glx' => 'Z', 'Ter' => '*', | |
110 'Sec' => 'U' | |
111 ); | |
112 | |
113 %THREECODE = | |
114 ('A' => 'Ala', 'B' => 'Asx', 'C' => 'Cys', 'D' => 'Asp', | |
115 'E' => 'Glu', 'F' => 'Phe', 'G' => 'Gly', 'H' => 'His', | |
116 'I' => 'Ile', 'K' => 'Lys', 'L' => 'Leu', 'M' => 'Met', | |
117 'N' => 'Asn', 'P' => 'Pro', 'Q' => 'Gln', 'R' => 'Arg', | |
118 'S' => 'Ser', 'T' => 'Thr', 'V' => 'Val', 'W' => 'Trp', | |
119 'Y' => 'Tyr', 'Z' => 'Glx', 'X' => 'Xaa', '*' => 'Ter', | |
120 'U' => 'Sec' | |
121 ); | |
122 } | |
123 | |
124 =head2 seq3 | |
125 | |
126 Title : seq3 | |
127 Usage : $string = Bio::SeqUtils->seq3($seq) | |
128 Function: | |
129 | |
130 Read only method that returns the amino acid sequence as a | |
131 string of three letter codes. alphabet has to be | |
132 'protein'. Output follows the IUPAC standard plus 'Ter' for | |
133 terminator. Any unknown character, including the default | |
134 unknown character 'X', is changed into 'Xaa'. A noncoded | |
135 aminoacid selenocystein is recognized (Sec, U). | |
136 | |
137 Returns : A scalar | |
138 Args : character used for stop in the protein sequence optional, | |
139 defaults to '*' string used to separate the output amino | |
140 acid codes, optional, defaults to '' | |
141 | |
142 =cut | |
143 | |
144 sub seq3 { | |
145 my ($self, $seq, $stop, $sep ) = @_; | |
146 | |
147 $seq->isa('Bio::PrimarySeqI') || | |
148 $self->throw('Not a Bio::PrimarySeqI object but [$self]'); | |
149 $seq->alphabet eq 'protein' || | |
150 $self->throw('Not a protein sequence'); | |
151 | |
152 if (defined $stop) { | |
153 length $stop != 1 and $self->throw('One character stop needed, not [$stop]'); | |
154 $THREECODE{$stop} = "Ter"; | |
155 } | |
156 $sep ||= ''; | |
157 | |
158 my $aa3s; | |
159 foreach my $aa (split //, uc $seq->seq) { | |
160 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa}. $sep, next; | |
161 $aa3s .= 'Xaa'. $sep; | |
162 } | |
163 $sep and substr($aa3s, -(length $sep), length $sep) = '' ; | |
164 return $aa3s; | |
165 } | |
166 | |
167 =head2 seq3in | |
168 | |
169 Title : seq3in | |
170 Usage : $string = Bio::SeqUtils->seq3in($seq, 'MetGlyTer') | |
171 Function: | |
172 | |
173 Method for in-place changing of the sequence of a | |
174 Bio::PrimarySeqI sequence object. The three letter amino | |
175 acid input string is converted into one letter code. Any | |
176 unknown character triplet, including the default 'Xaa', is | |
177 converted into 'X'. | |
178 | |
179 Returns : Bio::PrimarySeq object; | |
180 Args : character to be used for stop in the protein seqence, | |
181 optional, defaults to '*' | |
182 character to be used for unknown in the protein seqence, | |
183 optional, defaults to 'X' | |
184 | |
185 =cut | |
186 | |
187 sub seq3in { | |
188 my ($self, $seq, $string, $stop, $unknown) = @_; | |
189 | |
190 $seq->isa('Bio::PrimarySeqI') || | |
191 $self->throw('Not a Bio::PrimarySeqI object but [$self]'); | |
192 $seq->alphabet eq 'protein' || | |
193 $self->throw('Not a protein sequence'); | |
194 | |
195 if (defined $stop) { | |
196 length $stop != 1 and $self->throw('One character stop needed, not [$stop]'); | |
197 $ONECODE{'Ter'} = $stop; | |
198 } | |
199 if (defined $unknown) { | |
200 length $unknown != 1 and $self->throw('One character stop needed, not [$unknown]'); | |
201 $ONECODE{'Xaa'} = $unknown; | |
202 } | |
203 | |
204 my ($aas, $aa3); | |
205 my $length = (length $string) - 2; | |
206 for (my $i = 0 ; $i < $length ; $i += 3) { | |
207 $aa3 = substr($string, $i, 3); | |
208 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next; | |
209 $aas .= 'X'; | |
210 } | |
211 $seq->seq($aas); | |
212 return $seq; | |
213 } | |
214 | |
215 =head2 translate_3frames | |
216 | |
217 Title : translate_3frames | |
218 Usage : @prots = Bio::SeqUtils->translate_3frames($seq) | |
219 Function: Translate a nucleotide sequence in three forward frames. | |
220 The IDs of the sequences are appended with '-0F', '-1F', '-2F'. | |
221 Returns : An array of seq objects | |
222 Args : sequence object | |
223 same arguments as to Bio::PrimarySeqI::translate | |
224 | |
225 =cut | |
226 | |
227 sub translate_3frames { | |
228 my ($self, $seq, @args ) = @_; | |
229 | |
230 $self->throw('Object [$seq] '. 'of class ['. ref($seq). '] can not be translated.') | |
231 unless $seq->can('translate'); | |
232 | |
233 my ($stop, $unknown, $frame, $tableid, $fullCDS, $throw) = @args; | |
234 my @seqs; | |
235 my $f = 0; | |
236 while ($f != 3) { | |
237 my $translation = $seq->translate($stop, $unknown,$f,$tableid, $fullCDS, $throw ); | |
238 $translation->id($seq->id. "-". $f. "F"); | |
239 push @seqs, $translation; | |
240 $f++; | |
241 } | |
242 | |
243 return @seqs; | |
244 } | |
245 | |
246 =head2 translate_6frames | |
247 | |
248 Title : translate_6frames | |
249 Usage : @prots = Bio::SeqUtils->translate_6frames($seq) | |
250 Function: translate a nucleotide sequence in all six frames | |
251 The IDs of the sequences are appended with '-0F', '-1F', '-2F', | |
252 '-0R', '-1R', '-2R'. | |
253 Returns : An array of seq objects | |
254 Args : sequence object | |
255 same arguments as to Bio::PrimarySeqI::translate | |
256 | |
257 =cut | |
258 | |
259 sub translate_6frames { | |
260 my ($self, $seq, @args ) = @_; | |
261 | |
262 my @seqs = $self->translate_3frames($seq, @args); | |
263 $seq->seq($seq->revcom->seq); | |
264 my @seqs2 = $self->translate_3frames($seq, @args); | |
265 foreach my $seq2 (@seqs2) { | |
266 my ($tmp) = $seq2->id; | |
267 $tmp =~ s/F$/R/g; | |
268 $seq2->id($tmp); | |
269 } | |
270 return @seqs, @seqs2; | |
271 } | |
272 | |
273 | |
274 =head2 valid_aa | |
275 | |
276 Title : valid_aa | |
277 Usage : my @aa = $table->valid_aa | |
278 Function: Retrieves a list of the valid amino acid codes. | |
279 The list is ordered so that first 21 codes are for unique | |
280 amino acids. The rest are ['B', 'Z', 'X', '*']. | |
281 Returns : array of all the valid amino acid codes | |
282 Args : [optional] $code => [0 -> return list of 1 letter aa codes, | |
283 1 -> return list of 3 letter aa codes, | |
284 2 -> return associative array of both ] | |
285 | |
286 =cut | |
287 | |
288 sub valid_aa{ | |
289 my ($self,$code) = @_; | |
290 | |
291 if( ! $code ) { | |
292 my @codes; | |
293 foreach my $c ( sort values %ONECODE ) { | |
294 push @codes, $c unless ( $c =~ /[BZX\*]/ ); | |
295 } | |
296 push @codes, qw(B Z X *); # so they are in correct order ? | |
297 return @codes; | |
298 } | |
299 elsif( $code == 1 ) { | |
300 my @codes; | |
301 foreach my $c ( sort keys %ONECODE ) { | |
302 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ ); | |
303 } | |
304 push @codes, ('Asx', 'Glx', 'Xaa', 'Ter' ); | |
305 return @codes; | |
306 } | |
307 elsif( $code == 2 ) { | |
308 my %codes = %ONECODE; | |
309 foreach my $c ( keys %ONECODE ) { | |
310 my $aa = $ONECODE{$c}; | |
311 $codes{$aa} = $c; | |
312 } | |
313 return %codes; | |
314 } else { | |
315 $self->warn("unrecognized code in ".ref($self)." method valid_aa()"); | |
316 return (); | |
317 } | |
318 } | |
319 | |
320 1; |