diff variant_effect_predictor/Bio/Search/HSP/HSPI.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Search/HSP/HSPI.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,701 @@
+#-----------------------------------------------------------------
+# $Id: HSPI.pm,v 1.21.2.1 2003/01/22 22:51:00 jason Exp $
+#
+# BioPerl module for Bio::Search::HSP::HSPI
+#
+# Cared for by Steve Chervitz <sac@bioperl.org>
+# and Jason Stajich <jason@bioperl.org>
+#
+# You may distribute this module under the same terms as perl itself
+#-----------------------------------------------------------------
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Search::HSP::HSPI - Interface for a High Scoring Pair in a similarity search result
+
+=head1 SYNOPSIS
+
+    $r_type = $hsp->algorithm
+
+    $pvalue = $hsp->pvalue();
+
+    $evalue = $hsp->evalue();
+
+    $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
+
+    $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
+
+    $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
+
+    $qseq = $hsp->query_string;
+
+    $hseq = $hsp->hit_string;
+
+    $homology_string = $hsp->homology_string;
+
+    $len = $hsp->length( ['query'|'hit'|'total'] );
+
+    $rank = $hsp->rank;
+
+=head1 DESCRIPTION
+
+Bio::Search::HSP::HSPI objects cannot be instantiated since this
+module defines a pure interface.
+
+Given an object that implements the Bio::Search::HSP::HSPI  interface,
+you can do the following things with it:
+
+=head1 SEE ALSO
+
+This interface inherits methods from these other modules:
+
+L<Bio::SeqFeatureI>,
+L<Bio::SeqFeature::FeaturePair>
+L<Bio::SeqFeature::SimilarityPair>
+
+Please refer to these modules for documentation of the 
+many additional inherited methods.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bioperl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Steve Chervitz, Jason Stajich
+
+Email sac@bioperl.org
+Email jason@bioperl.org
+
+=head1 COPYRIGHT
+
+Copyright (c) 2001 Steve Chervitz, Jason Stajich. All Rights Reserved.
+
+=head1 DISCLAIMER
+
+This software is provided "as is" without warranty of any kind.
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+package Bio::Search::HSP::HSPI;
+use vars qw(@ISA);
+
+use Bio::Root::RootI;
+use Bio::SeqFeature::SimilarityPair;
+
+use strict;
+use Carp;
+
+@ISA = qw(Bio::SeqFeature::SimilarityPair Bio::Root::RootI);
+
+
+=head2 algorithm
+
+ Title   : algorithm
+ Usage   : my $r_type = $hsp->algorithm
+ Function: Obtain the name of the algorithm used to obtain the HSP
+ Returns : string (e.g., BLASTP)
+ Args    : none
+
+=cut
+
+sub algorithm{
+   my ($self,@args) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 pvalue
+
+ Title   : pvalue
+ Usage   : my $pvalue = $hsp->pvalue();
+ Function: Returns the P-value for this HSP or undef 
+ Returns : float or exponential (2e-10)
+           P-value is not defined with NCBI Blast2 reports.
+ Args    : none
+
+=cut
+
+sub pvalue {
+   my ($self) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 evalue
+
+ Title   : evalue
+ Usage   : my $evalue = $hsp->evalue();
+ Function: Returns the e-value for this HSP
+ Returns : float or exponential (2e-10)
+ Args    : none
+
+=cut
+
+sub evalue {
+   my ($self) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 frac_identical
+
+ Title   : frac_identical
+ Usage   : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
+ Function: Returns the fraction of identitical positions for this HSP 
+ Returns : Float in range 0.0 -> 1.0
+ Args    : 'query' = num identical / length of query seq (without gaps)
+           'hit'   = num identical / length of hit seq (without gaps)
+           'total' = num identical / length of alignment (with gaps)
+           default = 'total' 
+
+=cut
+
+sub frac_identical {
+   my ($self, $type) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 frac_conserved
+
+ Title    : frac_conserved
+ Usage    : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
+ Function : Returns the fraction of conserved positions for this HSP.
+            This is the fraction of symbols in the alignment with a 
+            positive score.
+ Returns : Float in range 0.0 -> 1.0
+ Args    : 'query' = num conserved / length of query seq (without gaps)
+           'hit'   = num conserved / length of hit seq (without gaps)
+           'total' = num conserved / length of alignment (with gaps)
+           default = 'total' 
+
+=cut
+
+sub frac_conserved {
+    my ($self, $type) = @_;
+    $self->throw_not_implemented;
+}
+
+=head2 num_identical
+
+ Title   : num_identical
+ Usage   : $obj->num_identical($newval)
+ Function: returns the number of identical residues in the alignment
+ Returns : integer
+ Args    : integer (optional)
+
+
+=cut
+
+sub num_identical{
+    shift->throw_not_implemented;
+}
+
+=head2 num_conserved
+
+ Title   : num_conserved
+ Usage   : $obj->num_conserved($newval)
+ Function: returns the number of conserved residues in the alignment
+ Returns : inetger
+ Args    : integer (optional)
+
+
+=cut
+
+sub num_conserved{
+    shift->throw_not_implemented();
+}
+
+=head2 gaps
+
+ Title    : gaps
+ Usage    : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
+ Function : Get the number of gaps in the query, hit, or total alignment.
+ Returns  : Integer, number of gaps or 0 if none
+ Args     : 'query' = num conserved / length of query seq (without gaps)
+            'hit'   = num conserved / length of hit seq (without gaps)
+            'total' = num conserved / length of alignment (with gaps)
+            default = 'total' 
+
+=cut
+
+sub gaps        {
+    my ($self, $type) = @_;
+    $self->throw_not_implemented;
+}
+
+=head2 query_string
+
+ Title   : query_string
+ Usage   : my $qseq = $hsp->query_string;
+ Function: Retrieves the query sequence of this HSP as a string
+ Returns : string
+ Args    : none
+
+
+=cut
+
+sub query_string{
+   my ($self) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 hit_string
+
+ Title   : hit_string
+ Usage   : my $hseq = $hsp->hit_string;
+ Function: Retrieves the hit sequence of this HSP as a string
+ Returns : string
+ Args    : none
+
+
+=cut
+
+sub hit_string{
+   my ($self) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 homology_string
+
+ Title   : homology_string
+ Usage   : my $homo_string = $hsp->homology_string;
+ Function: Retrieves the homology sequence for this HSP as a string.
+         : The homology sequence is the string of symbols in between the 
+         : query and hit sequences in the alignment indicating the degree
+         : of conservation (e.g., identical, similar, not similar).
+ Returns : string
+ Args    : none
+
+=cut
+
+sub homology_string{
+   my ($self) = @_;
+   $self->throw_not_implemented;
+}
+
+=head2 length
+
+ Title    : length
+ Usage    : my $len = $hsp->length( ['query'|'hit'|'total'] );
+ Function : Returns the length of the query or hit in the alignment (without gaps) 
+            or the aggregate length of the HSP (including gaps;
+            this may be greater than either hit or query )
+ Returns  : integer
+ Args     : 'query' = length of query seq (without gaps)
+            'hit'   = length of hit seq (without gaps)
+            'total' = length of alignment (with gaps)
+            default = 'total' 
+ Args    : none
+
+=cut
+
+sub length{
+    shift->throw_not_implemented();
+}
+
+=head2 percent_identity
+
+ Title   : percent_identity
+ Usage   : my $percentid = $hsp->percent_identity()
+ Function: Returns the calculated percent identity for an HSP
+ Returns : floating point between 0 and 100 
+ Args    : none
+
+
+=cut
+
+sub percent_identity{
+   my ($self) = @_;
+   return $self->frac_identical('hsp') * 100;   
+}
+
+=head2 get_aln
+
+ Title   : get_aln
+ Usage   : my $aln = $hsp->gel_aln
+ Function: Returns a Bio::SimpleAlign representing the HSP alignment
+ Returns : Bio::SimpleAlign
+ Args    : none
+
+=cut
+
+sub get_aln {
+   my ($self) = @_;
+   $self->throw_not_implemented;
+}
+
+
+=head2 seq_inds
+
+ Title   : seq_inds
+ Purpose   : Get a list of residue positions (indices) for all identical 
+           : or conserved residues in the query or sbjct sequence.
+ Example   : @s_ind = $hsp->seq_inds('query', 'identical');
+           : @h_ind = $hsp->seq_inds('hit', 'conserved');
+           : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
+ Returns   : List of integers 
+           : May include ranges if collapse is true.
+ Argument  : seq_type  = 'query' or 'hit' or 'sbjct'  (default = query)
+              ('sbjct' is synonymous with 'hit') 
+             class     = 'identical' or 'conserved' or 'nomatch' or 'gap'
+                          (default = identical)
+                          (can be shortened to 'id' or 'cons')
+
+             collapse  = boolean, if true, consecutive positions are merged
+                         using a range notation, e.g., "1 2 3 4 5 7 9 10 11" 
+                         collapses to "1-5 7 9-11". This is useful for 
+                         consolidating long lists. Default = no collapse.
+ Throws    : n/a.
+ Comments  : 
+
+See Also   : L<Bio::Search::BlastUtils::collapse_nums()|Bio::Search::BlastUtils>, L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
+
+=cut
+
+sub seq_inds {
+    shift->throw_not_implemented();
+}
+
+=head2 Inherited from Bio::SeqFeature::SimilarityPair
+
+These methods come from Bio::SeqFeature::SimilarityPair
+
+=head2 query
+
+ Title   : query
+ Usage   : my $query = $hsp->query
+ Function: Returns a SeqFeature representing the query in the HSP
+ Returns : Bio::SeqFeature::Similarity
+ Args    : [optional] new value to set
+
+
+=head2 hit
+
+ Title   : hit
+ Usage   : my $hit = $hsp->hit
+ Function: Returns a SeqFeature representing the hit in the HSP
+ Returns : Bio::SeqFeature::Similarity
+ Args    : [optional] new value to set
+
+
+=head2 significance
+
+ Title   : significance
+ Usage   : $evalue = $obj->significance();
+           $obj->significance($evalue);
+ Function: Get/Set the significance value (see Bio::SeqFeature::SimilarityPair)
+ Returns : significance value (scientific notation string)
+ Args    : significance value (sci notation string)
+
+
+=head2 score
+
+ Title   : score
+ Usage   : my $score = $hsp->score();
+ Function: Returns the score for this HSP or undef 
+ Returns : numeric           
+ Args    : [optional] numeric to set value
+
+=head2 bits
+
+ Title   : bits
+ Usage   : my $bits = $hsp->bits();
+ Function: Returns the bit value for this HSP or undef 
+ Returns : numeric
+ Args    : none
+
+=cut
+
+# override 
+
+=head2 strand
+
+ Title   : strand
+ Usage   : $hsp->strand('query')
+ Function: Retrieves the strand for the HSP component requested
+ Returns : +1 or -1 (0 if unknown)
+ Args    : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject
+           'query' to retrieve the query strand (default)
+           'list' or 'array' to retreive both query and hit together
+
+=cut
+
+sub strand {
+    my $self = shift;
+    my $val = shift;
+    $val = 'query' unless defined $val;
+    $val =~ s/^\s+//;
+
+    if( $val =~ /^q/i ) { 
+	return $self->query->strand(shift);
+    } elsif( $val =~ /^(hi|s)/i ) {
+	return $self->hit->strand(shift);
+    } elsif ( $val =~ m/^(list|array)/) {
+	return ($self->query->strand(shift), $self->hit->strand(shift));
+    } else { 
+	$self->warn("unrecognized component $val requested\n");
+    }
+    return 0;
+}
+
+=head2 start
+
+ Title   : start
+ Usage   : $hsp->start('query')
+ Function: Retrieves the start for the HSP component requested
+ Returns : integer
+ Args    : 'hit' or 'subject' or 'sbjct' to retrieve the start of the subject
+           'query' to retrieve the query start (default)
+
+=cut
+
+sub start {
+    my $self = shift;
+    my $val = shift;
+    $val = 'query' unless defined $val;
+    $val =~ s/^\s+//;
+
+    if( $val =~ /^q/i ) { 
+	return $self->query->start(shift);
+    } elsif( $val =~ /^(hi|s)/i ) {
+	return $self->hit->start(shift);
+    } else { 
+	$self->warn("unrecognized component $val requested\n");
+    }
+    return 0;
+}
+
+=head2 end
+
+ Title   : end
+ Usage   : $hsp->end('query')
+ Function: Retrieves the end for the HSP component requested
+ Returns : integer
+ Args    : 'hit' or 'subject' or 'sbjct' to retrieve the end of the subject
+           'query' to retrieve the query end (default)
+
+=cut
+
+sub end {
+    my $self = shift;
+    my $val = shift;
+    $val = 'query' unless defined $val;
+    $val =~ s/^\s+//;
+
+    if( $val =~ /^q/i ) { 
+	return $self->query->end(shift);
+    } elsif( $val =~ /^(hi|s)/i ) {
+	return $self->hit->end(shift);
+    } else { 
+	$self->warn("unrecognized component $val requested\n");
+    }
+    return 0;
+}
+
+=head2 seq_str
+
+ Usage     : $hsp->seq_str( seq_type );
+ Purpose   : Get the full query, sbjct, or 'match' sequence as a string.
+           : The 'match' sequence is the string of symbols in between the 
+           : query and sbjct sequences.
+ Example   : $str = $hsp->seq_str('query');
+ Returns   : String
+ Argument  : seq_Type = 'query' or 'hit' or 'sbjct' or 'match'
+           :  ('sbjct' is synonymous with 'hit')
+           : default is 'query'
+ Throws    : Exception if the argument does not match an accepted seq_type.
+ Comments  : 
+
+See Also   : L<seq()|seq>, L<seq_inds()|seq_inds>, B<_set_match_seq()>
+
+=cut
+
+sub seq_str {  
+    my $self = shift;
+    my $type = shift;
+    if( $type =~ /^q/i ) { return $self->query_string(shift) }
+    elsif( $type =~ /^(s|hi)/i ) { return $self->hit_string(shift)}
+    elsif ( $type =~ /^(ho|ma)/i ) { return $self->hit_string(shift) }
+    else { 
+	$self->warn("unknown sequence type $type");
+    }
+    return '';
+}
+
+
+=head2 rank
+
+ Usage     : $hsp->rank( [string] );
+ Purpose   : Get the rank of the HSP within a given Blast hit.
+ Example   : $rank = $hsp->rank;
+ Returns   : Integer (1..n) corresponding to the order in which the HSP
+             appears in the BLAST report.
+
+=cut
+
+sub rank { shift->throw_not_implemented }
+
+=head2 matches
+
+ Usage     : $hsp->matches([seq_type], [start], [stop]);
+ Purpose   : Get the total number of identical and conservative matches 
+           : in the query or sbjct sequence for the given HSP. Optionally can
+           : report data within a defined interval along the seq.
+           : (Note: 'conservative' matches are called 'positives' in the
+	   : Blast report.)
+ Example   : ($id,$cons) = $hsp_object->matches('hit');
+           : ($id,$cons) = $hsp_object->matches('query',300,400);
+ Returns   : 2-element array of integers 
+ Argument  : (1) seq_type = 'query' or 'hit' or 'sbjct' (default = query)
+           :  ('sbjct' is synonymous with 'hit') 
+           : (2) start = Starting coordinate (optional)
+           : (3) stop  = Ending coordinate (optional)
+ Throws    : Exception if the supplied coordinates are out of range.
+ Comments  : Relies on seq_str('match') to get the string of alignment symbols
+           : between the query and sbjct lines which are used for determining
+           : the number of identical and conservative matches.
+
+See Also   : L<length()|length>, L<gaps()|gaps>, L<seq_str()|seq_str>, L<Bio::Search::Hit::BlastHit::_adjust_contigs()|Bio::Search::Hit::BlastHit>
+
+=cut
+
+#-----------
+sub matches {
+#-----------
+    my( $self, %param ) = @_;
+    my(@data);
+    my($seqType, $beg, $end) = ($param{-SEQ}, $param{-START}, $param{-STOP});
+    $seqType ||= 'query';
+   $seqType = 'sbjct' if $seqType eq 'hit';
+
+    if(!defined $beg && !defined $end) {
+	## Get data for the whole alignment.
+	push @data, ($self->num_identical, $self->num_conserved);
+    } else {
+	## Get the substring representing the desired sub-section of aln.
+	$beg ||= 0;
+	$end ||= 0;
+	my($start,$stop) = $self->range($seqType);
+	if($beg == 0) { $beg = $start; $end = $beg+$end; }
+	elsif($end == 0) { $end = $stop; $beg = $end-$beg; }
+
+	if($end >= $stop) { $end = $stop; } ##ML changed from if (end >stop)
+	else { $end += 1;}   ##ML moved from commented position below, makes
+                             ##more sense here
+#	if($end > $stop) { $end = $stop; }
+	if($beg < $start) { $beg = $start; }
+#	else { $end += 1;}
+
+#	my $seq = substr($self->seq_str('match'), $beg-$start, ($end-$beg));
+
+	## ML: START fix for substr out of range error ------------------
+	my $seq = "";
+	if (($self->algorithm eq 'TBLASTN') and ($seqType eq 'sbjct'))
+	{
+	    $seq = substr($self->seq_str('match'),
+			  int(($beg-$start)/3), int(($end-$beg+1)/3));
+
+	} elsif (($self->algorithm eq 'BLASTX') and ($seqType eq 'query'))
+	{
+	    $seq = substr($self->seq_str('match'),
+			  int(($beg-$start)/3), int(($end-$beg+1)/3));
+	} else {
+	    $seq = substr($self->seq_str('match'), 
+			  $beg-$start, ($end-$beg));
+	}
+	## ML: End of fix for  substr out of range error -----------------
+
+	
+	## ML: debugging code
+	## This is where we get our exception.  Try printing out the values going
+	## into this:
+	##
+#	 print STDERR 
+#	     qq(*------------MY EXCEPTION --------------------\nSeq: ") , 
+#	     $self->seq_str("$seqType"), qq("\n),$self->rank,",(  index:";
+#	 print STDERR  $beg-$start, ", len: ", $end-$beg," ), (HSPRealLen:", 
+#	     CORE::length $self->seq_str("$seqType");
+#	 print STDERR ", HSPCalcLen: ", $stop - $start +1 ," ), 
+#	     ( beg: $beg, end: $end ), ( start: $start, stop: stop )\n";
+	 ## ML: END DEBUGGING CODE----------
+
+	if(!CORE::length $seq) {
+	    $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
+	}
+	## Get data for a substring.
+#	printf "Collecting HSP subsection data: beg,end = %d,%d; start,stop = %d,%d\n%s<---\n", $beg, $end, $start, $stop, $seq;
+#	printf "Original match seq:\n%s\n",$self->seq_str('match');
+	$seq =~ s/ //g;  # remove space (no info).
+	my $len_cons = CORE::length $seq;
+	$seq =~ s/\+//g;  # remove '+' characters (conservative substitutions)
+	my $len_id = CORE::length $seq;
+	push @data, ($len_id, $len_cons);
+#	printf "  HSP = %s\n  id = %d; cons = %d\n", $self->rank, $len_id, $len_cons; <STDIN>;
+    }
+    @data;
+}
+
+=head2 n
+
+ Usage     : $hsp_obj->n()
+ Purpose   : Get the N value (num HSPs on which P/Expect is based).
+           : This value is not defined with NCBI Blast2 with gapping.
+ Returns   : Integer or null string if not defined.
+ Argument  : n/a
+ Throws    : n/a
+ Comments  : The 'N' value is listed in parenthesis with P/Expect value:
+           : e.g., P(3) = 1.2e-30  ---> (N = 3).
+           : Not defined in NCBI Blast2 with gaps.
+           : This typically is equal to the number of HSPs but not always.
+           : To obtain the number of HSPs, use Bio::Search::Hit::HitI::num_hsps().
+
+See Also   : L<Bio::SeqFeature::SimilarityPair::score()|Bio::SeqFeature::SimilarityPair>
+
+=cut
+
+sub n { shift->throw_not_implemented }
+
+=head2 range
+
+ Usage     : $hsp->range( [seq_type] );
+ Purpose   : Gets the (start, end) coordinates for the query or sbjct sequence
+           : in the HSP alignment.
+ Example   : ($query_beg, $query_end) = $hsp->range('query');
+           : ($hit_beg, $hit_end) = $hsp->range('hit');
+ Returns   : Two-element array of integers 
+ Argument  : seq_type = string, 'query' or 'hit' or 'sbjct'  (default = 'query')
+           :  ('sbjct' is synonymous with 'hit') 
+ Throws    : n/a
+ Comments  : This is a convenience method for constructions such as
+             ($hsp->query->start, $hsp->query->end)
+
+=cut
+
+sub range { shift->throw_not_implemented }
+
+
+1;
+