diff variant_effect_predictor/Bio/Tools/Sigcleave.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Tools/Sigcleave.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,604 @@
+#-----------------------------------------------------------------------------
+# PACKAGE : Bio::Tools::Sigcleave
+# AUTHOR  : Chris Dagdigian, dag@sonsorol.org
+# CREATED : Jan 28 1999
+# REVISION: $Id: Sigcleave.pm,v 1.17 2002/10/22 07:45:22 lapp Exp $
+#
+# Copyright (c) 1997-9 bioperl, Chris Dagdigian and others. All Rights Reserved.
+#           This module is free software; you can redistribute it and/or 
+#           modify it under the same terms as Perl itself.
+#
+# _History_
+#
+# Object framework ripped from Steve Chervits's SeqPattern.pm
+# 
+# Core EGCG Sigcleave emulation from perl code developed by
+# Danh Nguyen & Kamalakar Gulukota which itself was based 
+# loosely on Colgrove's signal.c program.
+#
+# The overall idea is to replicate the output of the sigcleave
+# program which was distributed with the EGCG extension to the GCG sequence
+# analysis package. There is also an accessor method for just getting at
+# the raw results.
+#
+#-----------------------------------------------------------------------------
+
+=head1 NAME
+
+Bio::Tools::Sigcleave - Bioperl object for sigcleave analysis
+
+=head1 SYNOPSIS
+
+=head2 Object Creation
+
+  use Bio::Tools::Sigcleave ();
+
+  # to keep the module backwar compatible, you can pass it a sequence string, but
+  # there recommended say is to pass it a Seq object
+
+  # this works
+  $seq = "MVLLLILSVLLLKEDVRGSAQSSERRVVAHMPGDIIIGALFSVHHQPTVDKVHERKCGAVREQYGI";
+  $sig = new Bio::Tools::Sigcleave(-seq  => $seq,
+                                                -type => 'protein',
+                                                -threshold=>'3.5',
+                                                );
+  # but you do:
+  $seqobj = Bio::PrimarySeq->new(-seq => $seq);
+
+  $sig = new Bio::Tools::Sigcleave(-seq  => $seqobj,
+                                                -threshold=>'3.5',
+                                                );
+
+
+  # now you can detect procaryotic signal sequences as well as eucaryotic
+  $sig->matrix('eucaryotic'); # or 'procaryotic'
+
+
+=head2 Object Methods & Accessors
+
+  # you can use this method to fine tune the threshod before printing out the results
+  $sig->result_count:
+
+  %raw_results      = $sig->signals;
+  $formatted_output = $sig->pretty_print;
+
+=head1 DESCRIPTION
+
+"Sigcleave" was a program distributed as part of the free EGCG add-on
+to earlier versions of the GCG Sequence Analysis package. A new
+implementation of the algorithm is now part of EMBOSS package.
+
+From the EGCG documentation:
+
+  SigCleave uses the von Heijne method to locate signal sequences, and
+  to identify the cleavage site. The method is 95% accurate in
+  resolving signal sequences from non-signal sequences with a cutoff
+  score of 3.5, and 75-80% accurate in identifying the cleavage
+  site. The program reports all hits above a minimum value.
+
+The EGCG Sigcleave program was written by Peter Rice (E-mail:
+pmr@sanger.ac.uk Post: Informatics Division, The Sanger Centre,
+Wellcome Trust Genome Campus, Hinxton, Cambs, CB10 1SA, UK).
+
+Since EGCG is no longer distributed for the latest versions of GCG,
+this code was developed to emulate the output of the original program
+as much as possible for those who lost access to sigcleave when
+upgrading to newer versions of GCG.
+
+There are 2 accessor methods for this object. "signals" will return a
+perl associative array containing the sigcleave scores keyed by amino
+acid position.  "pretty_print" returns a formatted string similar to
+the output of the original sigcleave utility.
+
+In both cases, the "threshold" setting controls the score reporting
+level. If no value for threshold is passed in by the user, the code
+defaults to a reporting value of 3.5.
+
+In this implemntation the accessor will never return any
+score/position pair which does not meet the threshold limit. This is
+the slightly different from the behaviour of the 8.1 EGCG sigcleave
+program which will report the highest of the under-threshold results
+if nothing else is found.
+
+
+Example of pretty_print output:
+
+	SIGCLEAVE of sigtest from: 1 to 146
+
+	Report scores over 3.5
+	Maximum score 4.9 at residue 131
+
+	 Sequence:  FVILAAMSIQGSA-NLQTQWKSTASLALET
+        	    | (signal)    | (mature peptide)
+          	118            131
+
+	 Other entries above 3.5
+
+	Maximum score 3.7 at residue 112
+
+	 Sequence:  CSRQLFGWLFCKV-HPGAIVFVILAAMSIQGSANLQTQWKSTASLALET
+         	   | (signal)    | (mature peptide)
+           	99            112
+
+
+=head1 FEEDBACK
+
+When updating and maintaining a module, it helps to know that people
+are actually using it. Let us know if you find a bug, think this code
+is useful or have any improvements/features to suggest.
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution. Bug reports can be submitted via email
+or the web:
+
+    bioperl-bugs@bio.perl.org
+    http://bugzilla.bioperl.org/
+
+=head1 AUTHOR
+
+Chris Dagdigian, dag@sonsorol.org  & others
+
+=head1 CONTRIBUTORS
+
+Heikki Lehvaslaiho, heikki@ebi.ac.uk
+
+=head1 VERSION
+
+Bio::Tools::Sigcleave.pm, $Id: Sigcleave.pm,v 1.17 2002/10/22 07:45:22 lapp Exp $
+
+=head1 COPYRIGHT
+
+Copyright (c) 1999 Chris Dagdigian & others. All Rights Reserved.
+This module is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=head1 REFERENCES / SEE ALSO
+
+von Heijne G. (1986) "A new method for predicting signal sequences
+cleavage sites."  Nucleic Acids Res. 14, 4683-4690.
+
+von Heijne G. (1987) in "Sequence Analysis in Molecular Biology:
+Treasure Trove or Trivial Pursuit" (Acad. Press, (1987), 113-117).
+
+
+=head1 APPENDIX
+
+The following documentation describes the various functions
+contained in this module. Some functions are for internal 
+use and are not meant to be called by the user; they are 
+preceded by an underscore ("_").
+
+
+=cut
+
+#
+##
+###
+#### END of main POD documentation.
+###
+##
+#
+
+
+package Bio::Tools::Sigcleave;
+
+use Bio::Root::Root;
+use Bio::PrimarySeq;
+
+@ISA = qw(Bio::Root::Root);
+use strict;
+use vars qw ($ID $VERSION %WeightTable_euc  %WeightTable_pro );
+$ID  = 'Bio::Tools::Sigcleave';
+$VERSION = 0.02;
+
+
+  %WeightTable_euc = (
+#Sample: 161 aligned sequences
+# R     -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1  +1  +2 Expect
+ 'A' => [16, 13, 14, 15, 20, 18, 18, 17, 25, 15, 47,  6, 80, 18,  6, 14.5],
+ 'C' => [ 3,  6,  9,  7,  9, 14,  6,  8,  5,  6, 19,  3,  9,  8,  3,  4.5],
+ 'D' => [ 0,  0,  0,  0,  0,  0,  0,  0,  5,  3,  0,  5,  0, 10, 11,  8.9],
+ 'E' => [ 0,  0,  0,  1,  0,  0,  0,  0,  3,  7,  0,  7,  0, 13, 14, 10.0],
+ 'F' => [13,  9, 11, 11,  6,  7, 18, 13,  4,  5,  0, 13,  0,  6,  4,  5.6],
+ 'G' => [ 4,  4,  3,  6,  3, 13,  3,  2, 19, 34,  5,  7, 39, 10,  7, 12.1],
+ 'H' => [ 0,  0,  0,  0,  0,  1,  1,  0,  5,  0,  0,  6,  0,  4,  2,  3.4],
+ 'I' => [15, 15,  8,  6, 11,  5,  4,  8,  5,  1, 10,  5,  0,  8,  7,  7.4],
+ 'K' => [ 0,  0,  0,  1,  0,  0,  1,  0,  0,  4,  0,  2,  0, 11,  9, 11.3],
+ 'L' => [71, 68, 72, 79, 78, 45, 64, 49, 10, 23,  8, 20,  1,  8,  4, 12.1],
+ 'M' => [ 0,  3,  7,  4,  1,  6,  2,  2,  0,  0,  0,  1,  0,  1,  2,  2.7],
+ 'N' => [ 0,  1,  0,  1,  1,  0,  0,  0,  3,  3,  0, 10,  0,  4,  7,  7.1],
+ 'P' => [ 2,  0,  2,  0,  0,  4,  1,  8, 20, 14,  0,  1,  3,  0, 22,  7.4],
+ 'Q' => [ 0,  0,  0,  1,  0,  6,  1,  0, 10,  8,  0, 18,  3, 19, 10,  6.3],
+ 'R' => [ 2,  0,  0,  0,  0,  1,  0,  0,  7,  4,  0, 15,  0, 12,  9,  7.6],
+ 'S' => [ 9,  3,  8,  6, 13, 10, 15, 16, 26, 11, 23, 17, 20, 15, 10, 11.4],
+ 'T' => [ 2, 10,  5,  4,  5, 13,  7,  7, 12,  6, 17,  8,  6,  3, 10,  9.7],
+ 'V' => [20, 25, 15, 18, 13, 15, 11, 27,  0, 12, 32,  3,  0,  8, 17, 11.1],
+ 'W' => [ 4,  3,  3,  1,  1,  2,  6,  3,  1,  3,  0,  9,  0,  2,  0,  1.8],
+ 'Y' => [ 0,  1,  4,  0,  0,  1,  3,  1,  1,  2,  0,  5,  0,  1,  7,  5.6]
+);
+
+  %WeightTable_pro = (
+#Sample: 36 aligned sequences
+#  R    -13 -12 -11 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1  +1  +2 Expect
+  'A' => [0,  8,  8,  9,  6,  7,  5,  6,  7,  7, 24,  2, 31, 18,  4,  3.2],
+  'C' => [1,  0,  0,  1,  1,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  1.0],
+  'D' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  8,  2.0],
+  'E' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  4,  8,  2.2],
+  'F' => [2,  4,  3,  4,  1,  1,  8,  0,  4,  1,  0,  7,  0,  1,  0,  1.3],
+  'G' => [4,  2,  2,  2,  3,  5,  2,  4,  2,  2,  0,  2,  2,  1,  0,  2.7],
+  'H' => [0,  0,  1,  0,  0,  0,  0,  1,  1,  0,  0,  7,  0,  1,  0,  0.8],
+  'I' => [3,  1,  5,  1,  5,  0,  1,  3,  0,  0,  0,  0,  0,  0,  2,  1.7],
+  'K' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  2,  0,  3,  0,  2.5],
+  'L' => [8, 11,  9,  8,  9, 13,  1,  0,  2,  2,  1,  2,  0,  0,  1,  2.7],
+  'M' => [0,  2,  1,  1,  3,  2,  3,  0,  1,  2,  0,  4,  0,  0,  1,  0.6],
+  'N' => [0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  0,  3,  0,  1,  4,  1.6],
+  'P' => [0,  1,  1,  1,  1,  1,  2,  3,  5,  2,  0,  0,  0,  0,  5,  1.7],
+  'Q' => [0,  0,  0,  0,  0,  0,  0,  0,  2,  2,  0,  3,  0,  0,  1,  1.4],
+  'R' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1.7],
+  'S' => [1,  0,  1,  4,  4,  1,  5, 15,  5,  8,  5,  2,  2,  0,  0,  2.6],
+  'T' => [2,  0,  4,  2,  2,  2,  2,  2,  5,  1,  3,  0,  1,  1,  2,  2.2],
+  'V' => [5,  7,  1,  3,  1,  4,  7,  0,  0,  4,  3,  0,  0,  2,  0,  2.5],
+  'W' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0.4],
+  'Y' => [0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  0,  1,  0,  0,  0,  1.3]
+);
+
+
+##
+## Now we calculate the _real_ values for the weight tables
+##
+##
+## yeah yeah yeah there is lots of math here that gets repeated
+## every single time a sigcleave object gets created. This is
+## a quick hack to make sure that we get the scores as accurate as
+## possible. Need all those significant digits....
+##
+## suggestions for speedup aproaches welcome
+##
+
+
+foreach my $i (keys %WeightTable_euc) {
+    my $expected = $WeightTable_euc{$i}[15];
+    if ($expected > 0) {
+	for (my $j=0; $j<16; $j++) {
+	    if ($WeightTable_euc{$i}[$j] == 0) {
+		$WeightTable_euc{$i}[$j] = 1; 
+		if ($j == 10 || $j == 12) {
+		    $WeightTable_euc{$i}[$j] = 1.e-10;
+		}
+	    }
+	    $WeightTable_euc{$i}[$j] = log($WeightTable_euc{$i}[$j]/$expected);
+	}
+    }
+}
+
+
+foreach my $i (keys %WeightTable_pro) {
+    my $expected = $WeightTable_pro{$i}[15];
+    if ($expected > 0) {
+	for (my $j=0; $j<16; $j++) {
+	    if ($WeightTable_pro{$i}[$j] == 0) {
+		$WeightTable_pro{$i}[$j] = 1; 
+		if ($j == 10 || $j == 12) {
+		    $WeightTable_pro{$i}[$j] = 1.e-10;
+		}
+	    }
+	    $WeightTable_pro{$i}[$j] = log($WeightTable_pro{$i}[$j]/$expected);
+	}
+    }
+}
+
+
+
+#####################################################################################
+##                                 CONSTRUCTOR                                     ##
+#####################################################################################
+
+
+sub new {
+    my ($class, @args) = @_;
+
+    my $self = $class->SUPER::new(@args);
+    #my $self = Bio::Seq->new(@args);
+
+    my ($seq, $threshold, $matrix) = $self->_rearrange([qw(SEQ THRESHOLD MATRIX)],@args);
+
+    defined $threshold && $self->threshold($threshold);
+    $matrix && $self->matrix($matrix);
+    $seq && $self->seq($seq);
+
+    return $self;
+}
+
+
+
+=head1 threshold
+
+ Title     : threshold
+ Usage     : $value = $self->threshold
+ Purpose   : Read/write method sigcleave score reporting threshold.
+ Returns   : float.
+ Argument  : new value, float
+ Throws    : on non-number argument
+ Comments  : defaults to 3.5
+ See Also   : n/a
+
+=cut
+
+#----------------
+sub threshold {
+#----------------
+    my ($self, $value) = @_;
+    if( defined $value) {
+	$self->throw("I need a number, not [$value]")
+	    if  $value !~ /^[+-]?[\d\.]+$/;
+	$self->{'_threshold'} = $value;
+    }
+    return $self->{'_threshold'} || 3.5 ;
+}
+
+=head1 matrix
+
+ Title     : matrix
+ Usage     : $value = $self->matrix('procaryotic')
+ Purpose   : Read/write method sigcleave matrix.
+ Returns   : float.
+ Argument  : new value: 'eucaryotic' or 'procaryotic'
+ Throws    : on non-number argument
+ Comments  : defaults to 3.5
+ See Also   : n/a
+
+=cut
+
+#----------------
+sub matrix {
+#----------------
+    my ($self, $value) = @_;
+    if( defined $value) {
+	$self->throw("I need 'eucaryotic' or 'procaryotic', not [$value]")
+	    unless  $value eq 'eucaryotic' or $value eq 'procaryotic';
+	$self->{'_matrix'} = $value;
+    }
+    return $self->{'_matrix'} || 'eucaryotic' ;
+
+}
+
+=head1 seq
+
+ Title     : seq
+ Usage     : $value = $self->seq('procaryotic')
+ Purpose   : Read/write method sigcleave seq.
+ Returns   : float.
+ Argument  : new value: 'eucaryotic' or 'procaryotic'
+ Throws    : on non-number argument
+ Comments  : defaults to 3.5
+ See Also   : n/a
+
+=cut
+
+#----------------
+sub seq {
+#----------------
+    my ($self, $value) = @_;
+    if( defined $value) {
+	if ($value->isa('Bio::PrimarySeqI')) {
+	    $self->{'_seq'} = $value;
+	} else {
+	    $self->{'_seq'} = Bio::PrimarySeq->new(-seq=>$value, 
+						   -alphabet=>'protein');
+	}
+    }
+    return $self->{'_seq'};
+}
+
+
+
+=head1 _Analyze
+
+ Title     : _Analyze
+ Usage     : N/A This is an internal method. Not meant to be called from outside
+           : the package
+           :
+ Purpose   : calculates sigcleave score and amino acid position for the
+           : given protein sequence. The score reporting threshold can
+           : be adjusted by passing in the "threshold" parameter during
+           : object construction. If no threshold is passed in, the code
+           : defaults to reporting any scores equal to or above 3.5
+           :
+ Returns   : nothing. results are added to the object
+ Argument  : none.
+ Throws    : nothing.
+ Comments  : nothing.
+See Also   : n/a
+
+=cut
+
+#----------------
+sub _Analyze {
+#----------------
+    my($self) = @_;
+
+    my %signals;
+    my @hitWeight = ();
+    my @hitsort   = ();
+    my @hitpos    = ();
+    my $maxSite   = "";
+    my $seqPos    = "";
+    my $istart    = "";
+    my $iend      = "";
+    my $icol      = "";
+    my $i         = "";
+    my $weight    = "";
+    my $k         = 0;
+    my $c         = 0;
+    my $seqBegin  = 0;
+    my $pVal      = -13;
+    my $nVal      = 2;
+    my $nHits     = 0;
+    my $seqEnd    = $self->seq->length;
+    my $pep       = $self->seq->seq;
+    my $minWeight = $self->threshold;
+    my $matrix    = $self->matrix;
+
+    ## The weight table is keyed by UPPERCASE letters so we uppercase
+    ## the pep string because we don't want to alter the actual object
+    ## sequence.
+
+    $pep =~ tr/a-z/A-Z/;
+
+    for ($seqPos = $seqBegin; $seqPos < $seqEnd; $seqPos++) {
+	$istart = (0 > $seqPos + $pVal)? 0 : $seqPos + $pVal;
+	$iend = ($seqPos + $nVal - 1 < $seqEnd)? $seqPos + $nVal - 1 : $seqEnd;
+	$icol= $iend - $istart + 1;
+	$weight = 0.00;
+	for ($k=0; $k<$icol; $k++) {
+	    $c = substr($pep, $istart + $k, 1);
+
+	    ## CD: The if(defined) stuff was put in here because Sigcleave.pm
+	    ## CD: kept getting warnings about undefined vals during 'make test' ...
+	    if ($matrix eq 'eucaryotic') {
+		$weight += $WeightTable_euc{$c}[$k] if defined $WeightTable_euc{$c}[$k];
+	    } else {
+		$weight += $WeightTable_pro{$c}[$k] if defined $WeightTable_pro{$c}[$k];
+	    }
+	}
+	$signals{$seqPos+1} = sprintf ("%.1f", $weight)	if $weight >= $minWeight;
+    }
+
+    $self->{"_signal_scores"} = { %signals };
+}
+
+
+=head1 signals
+
+ Title     : signals
+ Usage     : %sigcleave_results = $sig->signals;
+           :
+ Purpose   : Accessor method for sigcleave results
+           : 
+ Returns   : Associative array. The key value represents the amino acid position
+           : and the value represents the score. Only scores that
+           : are greater than or equal to the THRESHOLD value are reported.
+           : 
+ Argument  : none.
+ Throws    : none.
+ Comments  : none.
+See Also   : THRESHOLD
+
+=cut
+
+#----------------
+sub signals {
+#----------------
+    my $self = shift;
+    my %results;
+    my $position;
+
+    # do the calculations
+    $self->_Analyze;
+
+    foreach $position ( sort keys %{ $self->{'_signal_scores'} } ) {
+	$results{$position} = $self->{'_signal_scores'}{$position};
+    }
+    return %results;
+}
+
+
+=head1 result_count
+
+ Title     : result_count
+ Usage     : $count = $sig->result_count;
+           :
+ Purpose   : Accessor method for sigcleave results
+           : 
+ Returns   : Integer, number of results above the threshold
+           : 
+ Argument  : none.
+ Throws    : none.
+ Comments  : none.
+
+See Also   : THRESHOLD
+
+=cut
+
+#----------------
+sub result_count {
+#----------------
+    my $self = shift;
+    $self->_Analyze;
+    return keys %{ $self->{'_signal_scores'} };
+}
+
+
+=head1 pretty_print
+
+ Title     : pretty_print
+ Usage     : $output = $sig->pretty_print;
+           : print $sig->pretty_print;
+           :
+ Purpose   : Emulates the output of the EGCG Sigcleave
+           : utility.
+           : 
+ Returns   : A formatted string.
+ Argument  : none.
+ Throws    : none.
+ Comments  : none.
+See Also   : n/a
+
+=cut
+
+#----------------
+sub pretty_print {
+#----------------
+    my $self = shift;
+    my $pos;
+    my $output;
+    my $cnt = 1;
+    my %results  = $self->signals;
+    my @hits     = keys %results;
+    my $hitcount = $#hits; $hitcount++;
+    my $thresh   = $self->threshold;
+    my $seqlen   = $self->seq->length || 0;
+    my $name     = $self->seq->id || 'NONAME';
+    my $pep      = $self->seq->seq;
+    $pep      =~ tr/a-z/A-Z/;
+
+    $output = "SIGCLEAVE of $name from: 1 to $seqlen\n\n";
+
+    if ($hitcount > 0) {
+	$output .= "Report scores over $thresh\n";
+	foreach $pos ((sort { $results{$b} cmp $results{$a} } keys %results)) {
+	    my $start = $pos - 15;
+	    $start = 1 if $start < 1;
+	    my $sig = substr($pep,$start -1,$pos-$start );
+
+	    $output .= sprintf ("Maximum score %1.1f at residue %3d\n",$results{$pos},$pos);
+	    $output .= "\n";
+	    $output .= " Sequence:  ";
+	    $output .= $sig;
+	    $output .= "-" x (15- length($sig));
+	    $output .= "-";
+	    $output .= substr($pep,$pos-1,50);
+	    $output .= "\n";
+	    $output .= " " x 12;
+	    $output .= "| \(signal\)      | \(mature peptide\)\n";
+	    $output .= sprintf("          %3d             %3d\n\n",$start,$pos);
+
+	    if (($hitcount > 1) && ($cnt == 1)) {
+    		$output .= " Other entries above $thresh\n\n";
+	    }
+	    $cnt++;
+ 	}
+    }
+    $output;
+}
+
+
+1;
+__END__
+
+
+#########################################################################
+#  End of class 
+#########################################################################