0
|
1 #$Id: OddCodes.pm,v 1.10.2.1 2003/04/07 04:27:42 heikki Exp $
|
|
2 #-----------------------------------------------------------------------------
|
|
3 # PACKAGE : OddCodes.pm
|
|
4 # PURPOSE : To write amino acid sequences in alternative alphabets
|
|
5 # AUTHOR : Derek Gatherer (D.Gatherer@organon.nhe.akzonobel.nl)
|
|
6 # SOURCE :
|
|
7 # CREATED : 8th July 2000
|
|
8 # MODIFIED :
|
|
9 # DISCLAIMER : I am employed in the pharmaceutical industry but my
|
|
10 # : employers do not endorse or sponsor this module
|
|
11 # : in any way whatsoever. The above email address is
|
|
12 # : given purely for the purpose of easy communication
|
|
13 # : with the author, and does not imply any connection
|
|
14 # : between my employers and anything written below.
|
|
15 # LICENCE : You may distribute this module under the same terms
|
|
16 # : as the rest of BioPerl.
|
|
17 #----------------------------------------------------------------------------
|
|
18
|
|
19 =head1 NAME
|
|
20
|
|
21 Bio::Tools::OddCodes - Object holding alternative alphabet coding for
|
|
22 one protein sequence
|
|
23
|
|
24 =head1 SYNOPSIS
|
|
25
|
|
26 Takes a sequence object from, e.g., an input stream, and creates an object
|
|
27 for the purposes of rewriting that sequence in another alphabet.
|
|
28 These are abbreviated amino acid sequence alphabets, designed to
|
|
29 simplify the statistical aspects of analysing protein sequences, by
|
|
30 reducing the combinatorial explosion of the 20-letter alphabet. These
|
|
31 abbreviated alphabets range in size from 2 to 8.
|
|
32
|
|
33 Creating the OddCodes object, eg:
|
|
34
|
|
35 my $inputstream = Bio::SeqIO->new( '-file' => "seqfile",
|
|
36 '-format' => 'Fasta');
|
|
37 my $seqobj = $inputstream->next_seq();
|
|
38 my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj);
|
|
39
|
|
40 or:
|
|
41
|
|
42 my $seqobj = Bio::PrimarySeq->new
|
|
43 (-seq=>'[cut and paste a sequence here]',
|
|
44 -alphabet => 'protein',
|
|
45 -id => 'test');
|
|
46 my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj);
|
|
47
|
|
48 do the alternative coding, returning the answer as a reference to a string
|
|
49
|
|
50 my $output = $oddcode_obj->structural();
|
|
51 my $output = $oddcode_obj->chemical();
|
|
52 my $output = $oddcode_obj->functional();
|
|
53 my $output = $oddcode_obj->charge();
|
|
54 my $output = $oddcode_obj->hydrophobic();
|
|
55 my $output = $oddcode_obj->Dayhoff();
|
|
56 my $output = $oddcode_obj->Sneath();
|
|
57 my $output = $oddcode_obj->Stanfel();
|
|
58
|
|
59
|
|
60 display sequence in new form, eg:
|
|
61
|
|
62 my $new_coding = $$output;
|
|
63 print "\n$new_coding";
|
|
64
|
|
65 =head1 DESCRIPTION
|
|
66
|
|
67 Bio::Tools::OddCodes is a welterweight object for rewriting a protein
|
|
68 sequence in an alternative alphabet. 8 of these are provided, ranging
|
|
69 from the 2-letter hydrophobic alphabet, to the 8-letter chemical
|
|
70 alphabet. These are useful for the statistical analysis of protein
|
|
71 sequences since they can partially avoid the combinatorial explosion
|
|
72 produced by the full 20-letter alphabet (eg. 400 dimers, 8000 trimers
|
|
73 etc.)
|
|
74
|
|
75 The objects will print out a warning if the input sequence is not a
|
|
76 protein. If you know what you are doing, you can silence the warning
|
|
77 by setting verbose() to a negative value.
|
|
78
|
|
79 See Synopsis above for object creation code.
|
|
80
|
|
81 =head1 FEEDBACK
|
|
82
|
|
83 =head2 Mailing Lists
|
|
84
|
|
85 User feedback is an integral part of the evolution of this
|
|
86 and other Bioperl modules. Send your comments and suggestions preferably
|
|
87 to one of the Bioperl mailing lists.
|
|
88 Your participation is much appreciated.
|
|
89
|
|
90 bioperl-l@bioperl.org - General discussion
|
|
91 http://www.bioperl.org/MailList.html - About the mailing lists
|
|
92
|
|
93 =head2 Reporting Bugs
|
|
94
|
|
95 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
96 of the bugs and their resolution. Bug reports can be submitted via email
|
|
97 or the web:
|
|
98
|
|
99 bioperl-bugs@bioperl.org
|
|
100 http://www.bugzilla.bioperl.org/
|
|
101
|
|
102 =head1 AUTHOR
|
|
103
|
|
104 Derek Gatherer
|
|
105
|
|
106 =head1 APPENDIX
|
|
107
|
|
108 The rest of the documentation details each of the object methods.
|
|
109 Internal methods are usually preceded with a _
|
|
110
|
|
111 =cut
|
|
112
|
|
113 #'
|
|
114
|
|
115 package Bio::Tools::OddCodes;
|
|
116 use vars qw(@ISA);
|
|
117 use strict;
|
|
118
|
|
119 use Bio::Root::Root;
|
|
120
|
|
121 @ISA = qw(Bio::Root::Root);
|
|
122
|
|
123
|
|
# Constructor.
#
# Usage   : my $oddcode_obj = Bio::Tools::OddCodes->new(-seq => $seqobj);
# Function: builds an OddCodes object around one sequence object
# Returns : the new object; the sequence object is kept in
#           $self->{'_seqref'}
# Args    : -seq => an object implementing Bio::PrimarySeqI
#           (a reference given as the first positional argument is
#           accepted as well)
# Throws  : dies when no Bio::PrimarySeqI object was supplied
sub new {
    my ($class, @args) = @_;

    my $self = $class->SUPER::new(@args);

    my ($seqobj) = $self->_rearrange([qw(SEQ)], @args);
    if (!defined $seqobj && @args && ref $args[0]) {
        # parameter not passed as named parameter?
        $seqobj = $args[0];
    }

    # blessed() guards the isa() call: invoking a method on undef or on an
    # unblessed reference would abort with an unrelated Perl error instead
    # of the message below.
    require Scalar::Util;
    unless (Scalar::Util::blessed($seqobj)
            && $seqobj->isa('Bio::PrimarySeqI')) {
        die("die in _init, OddCodes works only on PrimarySeqI objects\n");
    }

    $self->{'_seqref'} = $seqobj;

    return $self;
}
|
|
145
|
|
146 =head2 structural
|
|
147
|
|
148 Title : structural
|
|
149 Usage : $output = $oddcode_obj->structural();
|
|
150 Function: turns amino acid sequence into 3-letter structural alphabet
|
|
151 : A (ambivalent), E (external), I (internal)
|
|
152 Example : a sequence ACDEFGH will become AAEEIAE
|
|
153 Returns : Reference to the new sequence string
|
|
154 Args : none
|
|
155
|
|
156 =cut
|
|
157
|
|
# Rewrites the sequence in the 3-letter structural alphabet:
#   ACGPSTWY -> A (ambivalent)
#   RNDQEHK  -> E (external)
#   ILMFV    -> I (internal)
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub structural {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    $seqstring =~ tr{ACGPSTWYRNDQEHKILMFV}
                    {AAAAAAAAEEEEEEEIIIII};

    return \$seqstring;
}
|
|
176
|
|
177 =head2 functional
|
|
178
|
|
179 Title : functional
|
|
180 Usage : $output = $oddcode_obj->functional();
|
|
181 Function: turns amino acid sequence into 4-letter functional alphabet
|
|
182 : A (acidic), C (basic), H (hydrophobic), P (polar)
|
|
183 Example : a sequence ACDEFGH will become HPAAHPC
|
|
184 Returns : Reference to the new sequence string
|
|
185 Args : none
|
|
186
|
|
187 =cut
|
|
188
|
|
# Rewrites the sequence in the 4-letter functional alphabet:
#   DE       -> A (acidic)
#   HKR      -> C (basic)
#   AFILMPVW -> H (hydrophobic)
#   CGNQSTY  -> P (polar)
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub functional {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    $seqstring =~ tr{DEHKRAFILMPVWCGNQSTY}
                    {AACCCHHHHHHHHPPPPPPP};

    return \$seqstring;
}
|
|
209
|
|
210 =head2 hydrophobic
|
|
211
|
|
212 Title : hydrophobic
|
|
213 Usage : $output = $oddcode_obj->hydrophobic();
|
|
214 Function: turns amino acid sequence into 2-letter hydrophobicity alphabet
|
|
215 : O (hydrophobic), I (hydrophilic)
|
|
216 Example : a sequence ACDEFGH will become OIIIOII
|
|
217 Returns : Reference to the new sequence string
|
|
218 Args : none
|
|
219
|
|
220 =cut
|
|
221
|
|
# Rewrites the sequence in the 2-letter hydrophobicity alphabet:
#   AFILMPVW     -> I
#   CDEGHKNQRSTY -> O
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub hydrophobic {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    $seqstring =~ tr{AFILMPVWCDEGHKNQRSTY}
                    {IIIIIIIIOOOOOOOOOOOO};

    return \$seqstring;
}
|
|
238
|
|
239 =head2 Dayhoff
|
|
240
|
|
241 Title : Dayhoff
|
|
242 Usage : $output = $oddcode_obj->Dayhoff();
|
|
243 Function: turns amino acid sequence into 6-letter Dayhoff alphabet
|
|
244 Example : a sequence ACDEFGH will become CADDGCE
|
|
245 Returns : Reference to the new sequence string
|
|
246 Args : none
|
|
247
|
|
248 =cut
|
|
249
|
|
# Rewrites the sequence in the 6-letter Dayhoff alphabet:
#   C     -> A
#   AGPST -> C
#   DENQ  -> D
#   HKR   -> E
#   ILMV  -> F
#   FWY   -> G
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub Dayhoff {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    $seqstring =~ tr{CAGPSTDENQHKRILMVFWY}
                    {ACCCCCDDDDEEEFFFFGGG};

    return \$seqstring;
}
|
|
274
|
|
275 =head2 Sneath
|
|
276
|
|
277 Title : Sneath
|
|
278 Usage : $output = $oddcode_obj->Sneath();
|
|
279 Function: turns amino acid sequence into 7-letter Sneath alphabet
|
|
280 Example : a sequence ACDEFGH will become CEFFHCH
|
|
281 Returns : Reference to the new sequence string
|
|
282 Args : none
|
|
283
|
|
284 =cut
|
|
285
|
|
# Rewrites the sequence in the 7-letter Sneath alphabet:
#   ILV  -> A
#   AGP  -> C
#   MNQ  -> D
#   CST  -> E
#   DE   -> F
#   KR   -> G
#   FHWY -> H
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub Sneath {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    $seqstring =~ tr{ILVAGPMNQCSTDEKRFHWY}
                    {AAACCCDDDEEEFFGGHHHH};

    return \$seqstring;
}
|
|
312
|
|
313 =head2 Stanfel
|
|
314
|
|
315 Title : Stanfel
|
|
316 Usage : $output = $oddcode_obj->Stanfel();
|
|
317 Function: turns amino acid sequence into 4-letter Stanfel alphabet
|
|
318 Example : a sequence ACDEFGH will become AACCDAE
|
|
319 Returns : Reference to the new sequence string
|
|
320 Args : none
|
|
321
|
|
322 =cut
|
|
323
|
|
# Rewrites the sequence in the 4-letter Stanfel alphabet:
#   ACGILMPSTV -> A
#   DENQ       -> C
#   FWY        -> D
#   HKR        -> E
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub Stanfel {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    $seqstring =~ tr{ACGILMPSTVDENQFWYHKR}
                    {AAAAAAAAAACCCCDDDEEE};

    return \$seqstring;
}
|
|
344
|
|
345 =head2 chemical()
|
|
346
|
|
347 Title : chemical
|
|
348 Usage : $output = $oddcode_obj->chemical();
|
|
349 Function: turns amino acid sequence into 8-letter chemical alphabet
|
|
350 : A (acidic), L (aliphatic), M (amide), R (aromatic)
|
|
351 : C (basic), H (hydroxyl), I (imino), S (sulphur)
|
|
352 Example : a sequence ACDEFGH will become LSAARLC
|
|
353 Returns : Reference to the new sequence string
|
|
354 Args : none
|
|
355
|
|
356 =cut
|
|
357
|
|
# Rewrites the sequence in the 8-letter chemical alphabet:
#   DE    -> A (acidic)
#   AGILV -> L (aliphatic)
#   NQ    -> M (amide)
#   FWY   -> R (aromatic)
#   RHK   -> C (basic)
#   ST    -> H (hydroxyl)
#   P     -> I (imino)
#   CM    -> S (sulphur)
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub chemical {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    $seqstring =~ tr{DEAGILVNQFWYRHKSTPCM}
                    {AALLLLLMMRRRCCCHHISS};

    return \$seqstring;
}
|
|
386
|
|
387 =head2 charge
|
|
388
|
|
389 Title : charge
|
|
390 Usage : $output = $oddcode_obj->charge();
|
|
391 Function: turns amino acid sequence into 3-letter charge alphabet
|
|
392 Example : a sequence ACDEFGH will become NNAANNC
|
|
393 Returns : Reference to the new sequence string
|
|
394 Args : none
|
|
395
|
|
396 =cut
|
|
397
|
|
# Rewrites the sequence in the 3-letter charge alphabet:
#   DE              -> A
#   HKR             -> C
#   ACFGILMNPQSTVWY -> N
# Returns a reference to the recoded string.  Any character that is not
# one of the twenty residues listed above is left unchanged.
sub charge {
    my ($self) = @_;
    my $seqstring = $self->_pullseq();    # upper-cased sequence, see _pullseq()

    # tr/// takes plain character lists (no regex-style [] classes), and a
    # single pass maps all residues simultaneously, so a letter that has
    # already been recoded can never be translated a second time.
    # The replacement list is shorter than the search list on purpose:
    # tr/// repeats its final character (N) for the remaining residues.
    $seqstring =~ tr{DEHKRACFGILMNPQSTVWY}
                    {AACCCN};

    return \$seqstring;
}
|
|
416
|
|
417 # _pullseq is called within each of the subroutines
|
|
418 # it just checks a few things and returns the sequence
|
|
419
|
|
# Internal helper used by the alphabet methods.
#
# Returns : the upper-cased sequence string of the object stored in
#           $self->{'_seqref'}
# Warns   : via $self->warn when the sequence alphabet is not 'protein',
#           unless $self->verbose is negative
# Throws  : dies when the stored value is not a Bio::PrimarySeqI object
#           or when the sequence is empty
sub _pullseq {
    my ($self) = @_;

    my $seqobj = $self->{'_seqref'};

    # blessed() guards the isa() call: invoking a method on undef or on an
    # unblessed reference would abort with an unrelated Perl error instead
    # of the message below.
    require Scalar::Util;
    unless (Scalar::Util::blessed($seqobj)
            && $seqobj->isa('Bio::PrimarySeqI')) {
        die("die, OddCodes works only on PrimarySeqI objects\n");
    }

    # An undefined alphabet counts as "not protein" without triggering an
    # uninitialized-value warning in the comparison.
    my $alphabet   = $seqobj->alphabet;
    my $is_protein = defined $alphabet && $alphabet eq 'protein';
    $self->warn("\tAll OddCode alphabets need a protein sequence,\n".
                "\tbut BioPerl thinks this is not: [". $seqobj->id. "]")
        unless $is_protein or $self->verbose < 0;

    # Treat an undefined sequence like an empty one so that it reaches the
    # zero-length check below.
    my $seqstring = $seqobj->seq();
    $seqstring = '' unless defined $seqstring;
    $seqstring = uc $seqstring;

    if (length($seqstring) < 1) {
        die("$seqstring: die, sequence has zero length\n");
    }
    return $seqstring;
}
|
|
442
|
|
443 1;
|