comparison variant_effect_predictor/Bio/Tools/Sigcleave.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 #-----------------------------------------------------------------------------
2 # PACKAGE : Bio::Tools::Sigcleave
3 # AUTHOR : Chris Dagdigian, dag@sonsorol.org
4 # CREATED : Jan 28 1999
5 # REVISION: $Id: Sigcleave.pm,v 1.17 2002/10/22 07:45:22 lapp Exp $
6 #
7 # Copyright (c) 1997-9 bioperl, Chris Dagdigian and others. All Rights Reserved.
8 # This module is free software; you can redistribute it and/or
9 # modify it under the same terms as Perl itself.
10 #
11 # _History_
12 #
13 # Object framework ripped from Steve Chervits's SeqPattern.pm
14 #
15 # Core EGCG Sigcleave emulation from perl code developed by
16 # Danh Nguyen & Kamalakar Gulukota which itself was based
17 # loosely on Colgrove's signal.c program.
18 #
19 # The overall idea is to replicate the output of the sigcleave
20 # program which was distributed with the EGCG extension to the GCG sequence
21 # analysis package. There is also an accessor method for just getting at
22 # the raw results.
23 #
24 #-----------------------------------------------------------------------------
25
26 =head1 NAME
27
28 Bio::Tools::Sigcleave - Bioperl object for sigcleave analysis
29
30 =head1 SYNOPSIS
31
32 =head2 Object Creation
33
34 use Bio::Tools::Sigcleave ();
35
36 # to keep the module backwar compatible, you can pass it a sequence string, but
37 # there recommended say is to pass it a Seq object
38
39 # this works
40 $seq = "MVLLLILSVLLLKEDVRGSAQSSERRVVAHMPGDIIIGALFSVHHQPTVDKVHERKCGAVREQYGI";
41 $sig = new Bio::Tools::Sigcleave(-seq => $seq,
42 -type => 'protein',
43 -threshold=>'3.5',
44 );
45 # but you do:
46 $seqobj = Bio::PrimarySeq->new(-seq => $seq);
47
48 $sig = new Bio::Tools::Sigcleave(-seq => $seqobj,
49 -threshold=>'3.5',
50 );
51
52
53 # now you can detect procaryotic signal sequences as well as eucaryotic
54 $sig->matrix('eucaryotic'); # or 'procaryotic'
55
56
57 =head2 Object Methods & Accessors
58
59 # you can use this method to fine tune the threshod before printing out the results
60 $sig->result_count:
61
62 %raw_results = $sig->signals;
63 $formatted_output = $sig->pretty_print;
64
65 =head1 DESCRIPTION
66
67 "Sigcleave" was a program distributed as part of the free EGCG add-on
68 to earlier versions of the GCG Sequence Analysis package. A new
69 implementation of the algorithm is now part of EMBOSS package.
70
71 From the EGCG documentation:
72
73 SigCleave uses the von Heijne method to locate signal sequences, and
74 to identify the cleavage site. The method is 95% accurate in
75 resolving signal sequences from non-signal sequences with a cutoff
76 score of 3.5, and 75-80% accurate in identifying the cleavage
77 site. The program reports all hits above a minimum value.
78
79 The EGCG Sigcleave program was written by Peter Rice (E-mail:
80 pmr@sanger.ac.uk Post: Informatics Division, The Sanger Centre,
81 Wellcome Trust Genome Campus, Hinxton, Cambs, CB10 1SA, UK).
82
83 Since EGCG is no longer distributed for the latest versions of GCG,
84 this code was developed to emulate the output of the original program
85 as much as possible for those who lost access to sigcleave when
86 upgrading to newer versions of GCG.
87
88 There are 2 accessor methods for this object. "signals" will return a
89 perl associative array containing the sigcleave scores keyed by amino
90 acid position. "pretty_print" returns a formatted string similar to
91 the output of the original sigcleave utility.
92
93 In both cases, the "threshold" setting controls the score reporting
94 level. If no value for threshold is passed in by the user, the code
95 defaults to a reporting value of 3.5.
96
97 In this implemntation the accessor will never return any
98 score/position pair which does not meet the threshold limit. This is
99 the slightly different from the behaviour of the 8.1 EGCG sigcleave
100 program which will report the highest of the under-threshold results
101 if nothing else is found.
102
103
104 Example of pretty_print output:
105
106 SIGCLEAVE of sigtest from: 1 to 146
107
108 Report scores over 3.5
109 Maximum score 4.9 at residue 131
110
111 Sequence: FVILAAMSIQGSA-NLQTQWKSTASLALET
112 | (signal) | (mature peptide)
113 118 131
114
115 Other entries above 3.5
116
117 Maximum score 3.7 at residue 112
118
119 Sequence: CSRQLFGWLFCKV-HPGAIVFVILAAMSIQGSANLQTQWKSTASLALET
120 | (signal) | (mature peptide)
121 99 112
122
123
124 =head1 FEEDBACK
125
126 When updating and maintaining a module, it helps to know that people
127 are actually using it. Let us know if you find a bug, think this code
128 is useful or have any improvements/features to suggest.
129
130 =head2 Reporting Bugs
131
132 Report bugs to the Bioperl bug tracking system to help us keep track
133 the bugs and their resolution. Bug reports can be submitted via email
134 or the web:
135
136 bioperl-bugs@bio.perl.org
137 http://bugzilla.bioperl.org/
138
139 =head1 AUTHOR
140
141 Chris Dagdigian, dag@sonsorol.org & others
142
143 =head1 CONTRIBUTORS
144
145 Heikki Lehvaslaiho, heikki@ebi.ac.uk
146
147 =head1 VERSION
148
149 Bio::Tools::Sigcleave.pm, $Id: Sigcleave.pm,v 1.17 2002/10/22 07:45:22 lapp Exp $
150
151 =head1 COPYRIGHT
152
153 Copyright (c) 1999 Chris Dagdigian & others. All Rights Reserved.
154 This module is free software; you can redistribute it and/or modify it
155 under the same terms as Perl itself.
156
157 =head1 REFERENCES / SEE ALSO
158
159 von Heijne G. (1986) "A new method for predicting signal sequences
160 cleavage sites." Nucleic Acids Res. 14, 4683-4690.
161
162 von Heijne G. (1987) in "Sequence Analysis in Molecular Biology:
163 Treasure Trove or Trivial Pursuit" (Acad. Press, (1987), 113-117).
164
165
166 =head1 APPENDIX
167
168 The following documentation describes the various functions
169 contained in this module. Some functions are for internal
170 use and are not meant to be called by the user; they are
171 preceded by an underscore ("_").
172
173
174 =cut
175
176 #
177 ##
178 ###
179 #### END of main POD documentation.
180 ###
181 ##
182 #
183
184
185 package Bio::Tools::Sigcleave;
186
187 use Bio::Root::Root;
188 use Bio::PrimarySeq;
189
190 @ISA = qw(Bio::Root::Root);
191 use strict;
192 use vars qw ($ID $VERSION %WeightTable_euc %WeightTable_pro );
193 $ID = 'Bio::Tools::Sigcleave';
194 $VERSION = 0.02;
195
196
197 %WeightTable_euc = (
198 #Sample: 161 aligned sequences
199 # R -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 +1 +2 Expect
200 'A' => [16, 13, 14, 15, 20, 18, 18, 17, 25, 15, 47, 6, 80, 18, 6, 14.5],
201 'C' => [ 3, 6, 9, 7, 9, 14, 6, 8, 5, 6, 19, 3, 9, 8, 3, 4.5],
202 'D' => [ 0, 0, 0, 0, 0, 0, 0, 0, 5, 3, 0, 5, 0, 10, 11, 8.9],
203 'E' => [ 0, 0, 0, 1, 0, 0, 0, 0, 3, 7, 0, 7, 0, 13, 14, 10.0],
204 'F' => [13, 9, 11, 11, 6, 7, 18, 13, 4, 5, 0, 13, 0, 6, 4, 5.6],
205 'G' => [ 4, 4, 3, 6, 3, 13, 3, 2, 19, 34, 5, 7, 39, 10, 7, 12.1],
206 'H' => [ 0, 0, 0, 0, 0, 1, 1, 0, 5, 0, 0, 6, 0, 4, 2, 3.4],
207 'I' => [15, 15, 8, 6, 11, 5, 4, 8, 5, 1, 10, 5, 0, 8, 7, 7.4],
208 'K' => [ 0, 0, 0, 1, 0, 0, 1, 0, 0, 4, 0, 2, 0, 11, 9, 11.3],
209 'L' => [71, 68, 72, 79, 78, 45, 64, 49, 10, 23, 8, 20, 1, 8, 4, 12.1],
210 'M' => [ 0, 3, 7, 4, 1, 6, 2, 2, 0, 0, 0, 1, 0, 1, 2, 2.7],
211 'N' => [ 0, 1, 0, 1, 1, 0, 0, 0, 3, 3, 0, 10, 0, 4, 7, 7.1],
212 'P' => [ 2, 0, 2, 0, 0, 4, 1, 8, 20, 14, 0, 1, 3, 0, 22, 7.4],
213 'Q' => [ 0, 0, 0, 1, 0, 6, 1, 0, 10, 8, 0, 18, 3, 19, 10, 6.3],
214 'R' => [ 2, 0, 0, 0, 0, 1, 0, 0, 7, 4, 0, 15, 0, 12, 9, 7.6],
215 'S' => [ 9, 3, 8, 6, 13, 10, 15, 16, 26, 11, 23, 17, 20, 15, 10, 11.4],
216 'T' => [ 2, 10, 5, 4, 5, 13, 7, 7, 12, 6, 17, 8, 6, 3, 10, 9.7],
217 'V' => [20, 25, 15, 18, 13, 15, 11, 27, 0, 12, 32, 3, 0, 8, 17, 11.1],
218 'W' => [ 4, 3, 3, 1, 1, 2, 6, 3, 1, 3, 0, 9, 0, 2, 0, 1.8],
219 'Y' => [ 0, 1, 4, 0, 0, 1, 3, 1, 1, 2, 0, 5, 0, 1, 7, 5.6]
220 );
221
222 %WeightTable_pro = (
223 #Sample: 36 aligned sequences
224 # R -13 -12 -11 -10 -9 -8 -7 -6 -5 -4 -3 -2 -1 +1 +2 Expect
225 'A' => [0, 8, 8, 9, 6, 7, 5, 6, 7, 7, 24, 2, 31, 18, 4, 3.2],
226 'C' => [1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1.0],
227 'D' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 8, 2.0],
228 'E' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 4, 8, 2.2],
229 'F' => [2, 4, 3, 4, 1, 1, 8, 0, 4, 1, 0, 7, 0, 1, 0, 1.3],
230 'G' => [4, 2, 2, 2, 3, 5, 2, 4, 2, 2, 0, 2, 2, 1, 0, 2.7],
231 'H' => [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 7, 0, 1, 0, 0.8],
232 'I' => [3, 1, 5, 1, 5, 0, 1, 3, 0, 0, 0, 0, 0, 0, 2, 1.7],
233 'K' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 3, 0, 2.5],
234 'L' => [8, 11, 9, 8, 9, 13, 1, 0, 2, 2, 1, 2, 0, 0, 1, 2.7],
235 'M' => [0, 2, 1, 1, 3, 2, 3, 0, 1, 2, 0, 4, 0, 0, 1, 0.6],
236 'N' => [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 3, 0, 1, 4, 1.6],
237 'P' => [0, 1, 1, 1, 1, 1, 2, 3, 5, 2, 0, 0, 0, 0, 5, 1.7],
238 'Q' => [0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 3, 0, 0, 1, 1.4],
239 'R' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1.7],
240 'S' => [1, 0, 1, 4, 4, 1, 5, 15, 5, 8, 5, 2, 2, 0, 0, 2.6],
241 'T' => [2, 0, 4, 2, 2, 2, 2, 2, 5, 1, 3, 0, 1, 1, 2, 2.2],
242 'V' => [5, 7, 1, 3, 1, 4, 7, 0, 0, 4, 3, 0, 0, 2, 0, 2.5],
243 'W' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0.4],
244 'Y' => [0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 1.3]
245 );
246
247
248 ##
249 ## Now we calculate the _real_ values for the weight tables
250 ##
251 ##
252 ## yeah yeah yeah there is lots of math here that gets repeated
253 ## every single time a sigcleave object gets created. This is
254 ## a quick hack to make sure that we get the scores as accurate as
255 ## possible. Need all those significant digits....
256 ##
257 ## suggestions for speedup aproaches welcome
258 ##
259
260
261 foreach my $i (keys %WeightTable_euc) {
262 my $expected = $WeightTable_euc{$i}[15];
263 if ($expected > 0) {
264 for (my $j=0; $j<16; $j++) {
265 if ($WeightTable_euc{$i}[$j] == 0) {
266 $WeightTable_euc{$i}[$j] = 1;
267 if ($j == 10 || $j == 12) {
268 $WeightTable_euc{$i}[$j] = 1.e-10;
269 }
270 }
271 $WeightTable_euc{$i}[$j] = log($WeightTable_euc{$i}[$j]/$expected);
272 }
273 }
274 }
275
276
277 foreach my $i (keys %WeightTable_pro) {
278 my $expected = $WeightTable_pro{$i}[15];
279 if ($expected > 0) {
280 for (my $j=0; $j<16; $j++) {
281 if ($WeightTable_pro{$i}[$j] == 0) {
282 $WeightTable_pro{$i}[$j] = 1;
283 if ($j == 10 || $j == 12) {
284 $WeightTable_pro{$i}[$j] = 1.e-10;
285 }
286 }
287 $WeightTable_pro{$i}[$j] = log($WeightTable_pro{$i}[$j]/$expected);
288 }
289 }
290 }
291
292
293
294 #####################################################################################
295 ## CONSTRUCTOR ##
296 #####################################################################################
297
298
299 sub new {
300 my ($class, @args) = @_;
301
302 my $self = $class->SUPER::new(@args);
303 #my $self = Bio::Seq->new(@args);
304
305 my ($seq, $threshold, $matrix) = $self->_rearrange([qw(SEQ THRESHOLD MATRIX)],@args);
306
307 defined $threshold && $self->threshold($threshold);
308 $matrix && $self->matrix($matrix);
309 $seq && $self->seq($seq);
310
311 return $self;
312 }
313
314
315
316 =head1 threshold
317
318 Title : threshold
319 Usage : $value = $self->threshold
320 Purpose : Read/write method sigcleave score reporting threshold.
321 Returns : float.
322 Argument : new value, float
323 Throws : on non-number argument
324 Comments : defaults to 3.5
325 See Also : n/a
326
327 =cut
328
329 #----------------
330 sub threshold {
331 #----------------
332 my ($self, $value) = @_;
333 if( defined $value) {
334 $self->throw("I need a number, not [$value]")
335 if $value !~ /^[+-]?[\d\.]+$/;
336 $self->{'_threshold'} = $value;
337 }
338 return $self->{'_threshold'} || 3.5 ;
339 }
340
341 =head1 matrix
342
343 Title : matrix
344 Usage : $value = $self->matrix('procaryotic')
345 Purpose : Read/write method sigcleave matrix.
346 Returns : float.
347 Argument : new value: 'eucaryotic' or 'procaryotic'
348 Throws : on non-number argument
349 Comments : defaults to 3.5
350 See Also : n/a
351
352 =cut
353
354 #----------------
355 sub matrix {
356 #----------------
357 my ($self, $value) = @_;
358 if( defined $value) {
359 $self->throw("I need 'eucaryotic' or 'procaryotic', not [$value]")
360 unless $value eq 'eucaryotic' or $value eq 'procaryotic';
361 $self->{'_matrix'} = $value;
362 }
363 return $self->{'_matrix'} || 'eucaryotic' ;
364
365 }
366
367 =head1 seq
368
369 Title : seq
370 Usage : $value = $self->seq('procaryotic')
371 Purpose : Read/write method sigcleave seq.
372 Returns : float.
373 Argument : new value: 'eucaryotic' or 'procaryotic'
374 Throws : on non-number argument
375 Comments : defaults to 3.5
376 See Also : n/a
377
378 =cut
379
380 #----------------
381 sub seq {
382 #----------------
383 my ($self, $value) = @_;
384 if( defined $value) {
385 if ($value->isa('Bio::PrimarySeqI')) {
386 $self->{'_seq'} = $value;
387 } else {
388 $self->{'_seq'} = Bio::PrimarySeq->new(-seq=>$value,
389 -alphabet=>'protein');
390 }
391 }
392 return $self->{'_seq'};
393 }
394
395
396
397 =head1 _Analyze
398
399 Title : _Analyze
400 Usage : N/A This is an internal method. Not meant to be called from outside
401 : the package
402 :
403 Purpose : calculates sigcleave score and amino acid position for the
404 : given protein sequence. The score reporting threshold can
405 : be adjusted by passing in the "threshold" parameter during
406 : object construction. If no threshold is passed in, the code
407 : defaults to reporting any scores equal to or above 3.5
408 :
409 Returns : nothing. results are added to the object
410 Argument : none.
411 Throws : nothing.
412 Comments : nothing.
413 See Also : n/a
414
415 =cut
416
417 #----------------
418 sub _Analyze {
419 #----------------
420 my($self) = @_;
421
422 my %signals;
423 my @hitWeight = ();
424 my @hitsort = ();
425 my @hitpos = ();
426 my $maxSite = "";
427 my $seqPos = "";
428 my $istart = "";
429 my $iend = "";
430 my $icol = "";
431 my $i = "";
432 my $weight = "";
433 my $k = 0;
434 my $c = 0;
435 my $seqBegin = 0;
436 my $pVal = -13;
437 my $nVal = 2;
438 my $nHits = 0;
439 my $seqEnd = $self->seq->length;
440 my $pep = $self->seq->seq;
441 my $minWeight = $self->threshold;
442 my $matrix = $self->matrix;
443
444 ## The weight table is keyed by UPPERCASE letters so we uppercase
445 ## the pep string because we don't want to alter the actual object
446 ## sequence.
447
448 $pep =~ tr/a-z/A-Z/;
449
450 for ($seqPos = $seqBegin; $seqPos < $seqEnd; $seqPos++) {
451 $istart = (0 > $seqPos + $pVal)? 0 : $seqPos + $pVal;
452 $iend = ($seqPos + $nVal - 1 < $seqEnd)? $seqPos + $nVal - 1 : $seqEnd;
453 $icol= $iend - $istart + 1;
454 $weight = 0.00;
455 for ($k=0; $k<$icol; $k++) {
456 $c = substr($pep, $istart + $k, 1);
457
458 ## CD: The if(defined) stuff was put in here because Sigcleave.pm
459 ## CD: kept getting warnings about undefined vals during 'make test' ...
460 if ($matrix eq 'eucaryotic') {
461 $weight += $WeightTable_euc{$c}[$k] if defined $WeightTable_euc{$c}[$k];
462 } else {
463 $weight += $WeightTable_pro{$c}[$k] if defined $WeightTable_pro{$c}[$k];
464 }
465 }
466 $signals{$seqPos+1} = sprintf ("%.1f", $weight) if $weight >= $minWeight;
467 }
468
469 $self->{"_signal_scores"} = { %signals };
470 }
471
472
473 =head1 signals
474
475 Title : signals
476 Usage : %sigcleave_results = $sig->signals;
477 :
478 Purpose : Accessor method for sigcleave results
479 :
480 Returns : Associative array. The key value represents the amino acid position
481 : and the value represents the score. Only scores that
482 : are greater than or equal to the THRESHOLD value are reported.
483 :
484 Argument : none.
485 Throws : none.
486 Comments : none.
487 See Also : THRESHOLD
488
489 =cut
490
491 #----------------
492 sub signals {
493 #----------------
494 my $self = shift;
495 my %results;
496 my $position;
497
498 # do the calculations
499 $self->_Analyze;
500
501 foreach $position ( sort keys %{ $self->{'_signal_scores'} } ) {
502 $results{$position} = $self->{'_signal_scores'}{$position};
503 }
504 return %results;
505 }
506
507
508 =head1 result_count
509
510 Title : result_count
511 Usage : $count = $sig->result_count;
512 :
513 Purpose : Accessor method for sigcleave results
514 :
515 Returns : Integer, number of results above the threshold
516 :
517 Argument : none.
518 Throws : none.
519 Comments : none.
520
521 See Also : THRESHOLD
522
523 =cut
524
525 #----------------
526 sub result_count {
527 #----------------
528 my $self = shift;
529 $self->_Analyze;
530 return keys %{ $self->{'_signal_scores'} };
531 }
532
533
534 =head1 pretty_print
535
536 Title : pretty_print
537 Usage : $output = $sig->pretty_print;
538 : print $sig->pretty_print;
539 :
540 Purpose : Emulates the output of the EGCG Sigcleave
541 : utility.
542 :
543 Returns : A formatted string.
544 Argument : none.
545 Throws : none.
546 Comments : none.
547 See Also : n/a
548
549 =cut
550
551 #----------------
552 sub pretty_print {
553 #----------------
554 my $self = shift;
555 my $pos;
556 my $output;
557 my $cnt = 1;
558 my %results = $self->signals;
559 my @hits = keys %results;
560 my $hitcount = $#hits; $hitcount++;
561 my $thresh = $self->threshold;
562 my $seqlen = $self->seq->length || 0;
563 my $name = $self->seq->id || 'NONAME';
564 my $pep = $self->seq->seq;
565 $pep =~ tr/a-z/A-Z/;
566
567 $output = "SIGCLEAVE of $name from: 1 to $seqlen\n\n";
568
569 if ($hitcount > 0) {
570 $output .= "Report scores over $thresh\n";
571 foreach $pos ((sort { $results{$b} cmp $results{$a} } keys %results)) {
572 my $start = $pos - 15;
573 $start = 1 if $start < 1;
574 my $sig = substr($pep,$start -1,$pos-$start );
575
576 $output .= sprintf ("Maximum score %1.1f at residue %3d\n",$results{$pos},$pos);
577 $output .= "\n";
578 $output .= " Sequence: ";
579 $output .= $sig;
580 $output .= "-" x (15- length($sig));
581 $output .= "-";
582 $output .= substr($pep,$pos-1,50);
583 $output .= "\n";
584 $output .= " " x 12;
585 $output .= "| \(signal\) | \(mature peptide\)\n";
586 $output .= sprintf(" %3d %3d\n\n",$start,$pos);
587
588 if (($hitcount > 1) && ($cnt == 1)) {
589 $output .= " Other entries above $thresh\n\n";
590 }
591 $cnt++;
592 }
593 }
594 $output;
595 }
596
597
598 1;
599 __END__
600
601
602 #########################################################################
603 # End of class
604 #########################################################################