Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/BPlite/HSP.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPlite/HSP.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,505 @@ +############################################################################### +# Bio::Tools::BPlite::HSP +############################################################################### +# HSP = High Scoring Pair (to all non-experts as I am) +# +# The original BPlite.pm module has been written by Ian Korf ! +# see http://sapiens.wustl.edu/~ikorf +# +# You may distribute this module under the same terms as perl itself + + +# +# BioPerl module for Bio::Tools::BPlite::HSP +# +# Cared for by Peter Schattner <schattner@alum.mit.edu> +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::BPlite::HSP - Blast report High Scoring Pair (HSP) + +=head1 SYNOPSIS + + use Bio::Tools::BPlite; + my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); + { + while(my $sbjct = $report->nextSbjct) { + while (my $hsp = $sbjct->nextHSP) { + $hsp->score; + $hsp->bits; + $hsp->percent; + $hsp->P; + $hsp->match; + $hsp->positive; + $hsp->length; + $hsp->querySeq; + $hsp->sbjctSeq; + $hsp->homologySeq; + $hsp->query->start; + $hsp->query->end; + $hsp->hit->start; + $hsp->hit->end; + $hsp->hit->seq_id; + $hsp->hit->overlaps($exon); + } + } + + # the following line takes you to the next report in the stream/file + # it will return 0 if that report is empty, + # but that is valid for an empty blast report. + # Returns -1 for EOF. + + last if ($report->_parseHeader == -1)); + + redo + } + +=head1 DESCRIPTION + +This object handles the High Scoring Pair data for a Blast report. +This is where the percent identity, query and hit sequence length, +P value, etc are stored and where most of the necessary information is located when building logic around parsing a Blast report. + +See L<Bio::Tools::BPlite> for more detailed information on the entire +BPlite Blast parsing system. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion +http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@bioperl.org + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +# Let the code begin... + +package Bio::Tools::BPlite::HSP; + +use vars qw(@ISA); +use strict; + +# to disable overloading comment this out: +#use overload '""' => '_overload'; + +# Object preamble - inheriets from Bio::SeqFeature::SimilarityPair + +use Bio::SeqFeature::SimilarityPair; +use Bio::SeqFeature::Similarity; + +@ISA = qw(Bio::SeqFeature::SimilarityPair); + +sub new { + my ($class, @args) = @_; + + # workaround to make sure frame is not set before strand is + # interpreted from query/hit info + # this workaround removes the key from the hash + # so the superclass does not try and work with it + # we'll take care of setting it in this module later on + + my %newargs = @args; + foreach ( keys %newargs ) { + if( /frame$/i ) { + delete $newargs{$_}; + } + } + # done with workaround + + my $self = $class->SUPER::new(%newargs); + + my ($score,$bits,$match,$hsplength,$positive,$gaps,$p,$exp,$qb,$qe,$sb, + $se,$qs,$ss,$hs,$qname,$sname,$qlength,$slength,$qframe,$sframe, + $blasttype) = + $self->_rearrange([qw(SCORE + BITS + MATCH + HSPLENGTH + POSITIVE + GAPS + P + EXP + QUERYBEGIN + QUERYEND + SBJCTBEGIN + SBJCTEND + QUERYSEQ + SBJCTSEQ + HOMOLOGYSEQ + QUERYNAME + SBJCTNAME + QUERYLENGTH + SBJCTLENGTH + QUERYFRAME + SBJCTFRAME + BLASTTYPE + )],@args); + + $blasttype = 'UNKNOWN' unless $blasttype; + $self->report_type($blasttype); + # Determine strand meanings + my ($queryfactor, $sbjctfactor) = (1,0); # default + if ($blasttype eq 'BLASTP' || $blasttype eq 'TBLASTN' ) { + $queryfactor = 0; + } + if ($blasttype eq 'TBLASTN' || $blasttype eq 'TBLASTX' || + $blasttype eq 'BLASTN' ) { + $sbjctfactor = 1; + } + + # Set BLAST type + $self->{'BLAST_TYPE'} = $blasttype; + + # Store the aligned query as sequence feature + my $strand; + if ($qe > $qb) { # normal query: start < end + if ($queryfactor) { $strand = 1; } else { $strand = undef; } + $self->query( Bio::SeqFeature::Similarity->new + (-start=>$qb, -end=>$qe, -strand=>$strand, + -source=>"BLAST" ) ) } + else { # reverse query (i dont know if this is possible, but feel free to correct) + if ($queryfactor) { $strand = -1; } else { $strand = undef; } + $self->query( Bio::SeqFeature::Similarity->new + (-start=>$qe, -end=>$qb, -strand=>$strand, + -source=>"BLAST" ) ) } + + # store the aligned hit as sequence feature + if ($se > $sb) { # normal hit + if ($sbjctfactor) { $strand = 1; } else { $strand = undef; } + $self->hit( Bio::SeqFeature::Similarity->new + (-start=>$sb, -end=>$se, -strand=>$strand, + -source=>"BLAST" ) ) } + else { # reverse hit: start bigger than end + if ($sbjctfactor) { $strand = -1; } else { $strand = undef; } + $self->hit( Bio::SeqFeature::Similarity->new + (-start=>$se, -end=>$sb, -strand=>$strand, + -source=>"BLAST" ) ) } + + # name the sequences + $self->query->seq_id($qname); # query name + $self->hit->seq_id($sname); # hit name + + # set lengths + $self->query->seqlength($qlength); # query length + $self->hit->seqlength($slength); # hit length + + # set object vars + $self->score($score); + $self->bits($bits); + + $self->significance($p); + $self->{'EXP'} = $exp; + + $self->query->frac_identical($match); + $self->hit->frac_identical($match); + $self->{'HSPLENGTH'} = $hsplength; + $self->{'PERCENT'} = int((1000 * $match)/$hsplength)/10; + $self->{'POSITIVE'} = $positive; + $self->{'GAPS'} = $gaps; + $self->{'QS'} = $qs; + $self->{'SS'} = $ss; + $self->{'HS'} = $hs; + + $self->frame($qframe, $sframe); + return $self; # success - we hope! +} + +# to disable overloading comment this out: +sub _overload { + my $self = shift; + return $self->start."..".$self->end." ".$self->bits; +} + +=head2 report_type + + Title : report_type + Usage : $type = $sbjct->report_type() + Function : Returns the type of report from which this hit was obtained. + This usually pertains only to BLAST and friends reports, for which + the report type denotes what type of sequence was aligned against + what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated dna-prt, + TBLASTN prt-translated dna, TBLASTX translated dna-translated dna). + Example : + Returns : A string (BLASTN, BLASTP, BLASTX, TBLASTN, TBLASTX, UNKNOWN) + Args : a string on set (you should know what you are doing) + +=cut + +sub report_type { + my ($self, $rpt) = @_; + if($rpt) { + $self->{'_report_type'} = $rpt; + } + return $self->{'_report_type'}; +} + +=head2 EXP + + Title : EXP + Usage : my $exp = $hsp->EXP; + Function: returns the EXP value for the HSP + Returns : string value + Args : none + Note : Patch provided by Sami Ashour for BTK parsing + + +=cut + +sub EXP{ + return $_[0]->{'EXP'}; +} + + +=head2 P + + Title : P + Usage : $hsp->P(); + Function : returns the P (significance) value for a HSP + Returns : (double) significance value + Args : + +=cut + +sub P { + my ($self, @args) = @_; + my $float = $self->significance(@args); + my $match = '([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?'; # Perl Cookbook 2.1 + if ($float =~ /^$match$/) { + # Is a C float + return $float; + } elsif ("1$float" =~ /^$match$/) { + # Almost C float, Jitterbug 974 + return "1$float"; + } else { + $self->warn("[HSP::P()] '$float' is not a known number format. Returning zero (0) instead."); + return 0; + } +} + +=head2 percent + + Title : percent + Usage : $hsp->percent(); + Function : returns the percent matching + Returns : (double) percent matching + Args : none + +=cut + +sub percent {shift->{'PERCENT'}} + + +=head2 match + + Title : match + Usage : $hsp->match(); + Function : returns the match + Example : + Returns : (double) frac_identical + Args : + +=cut + +sub match {shift->query->frac_identical(@_)} + +=head2 hsplength + + Title : hsplength + Usage : $hsp->hsplength(); + Function : returns the HSP length (including gaps) + Returns : (integer) HSP length + Args : none + +=cut + +sub hsplength {shift->{'HSPLENGTH'}} + +=head2 positive + + Title : positive + Usage : $hsp->positive(); + Function : returns the number of positive matches (symbols in the alignment + with a positive score) + Returns : (int) number of positive matches in the alignment + Args : none + +=cut + +sub positive {shift->{'POSITIVE'}} + +=head2 gaps + + Title : gaps + Usage : $hsp->gaps(); + Function : returns the number of gaps or 0 if none + Returns : (int) number of gaps or 0 if none + Args : none + +=cut + +sub gaps {shift->{'GAPS'}} + +=head2 querySeq + + Title : querySeq + Usage : $hsp->querySeq(); + Function : returns the query sequence + Returns : (string) the Query Sequence + Args : none + +=cut + +sub querySeq {shift->{'QS'}} + +=head2 sbjctSeq + + Title : sbjctSeq + Usage : $hsp->sbjctSeq(); + Function : returns the Sbjct sequence + Returns : (string) the Sbjct Sequence + Args : none + +=cut + +sub sbjctSeq {shift->{'SS'}} + +=head2 homologySeq + + Title : homologySeq + Usage : $hsp->homologySeq(); + Function : returns the homologous sequence + Returns : (string) homologous sequence + Args : none + +=cut + +sub homologySeq {shift->{'HS'}} + +=head2 qs + + Title : qs + Usage : $hsp->qs(); + Function : returns the Query Sequence (same as querySeq) + Returns : (string) query Sequence + Args : none + +=cut + +sub qs {shift->{'QS'}} + +=head2 ss + + Title : ss + Usage : $hsp->ss(); + Function : returns the subject sequence ( same as sbjctSeq) + Returns : (string) Sbjct Sequence + Args : none + +=cut + +sub ss {shift->{'SS'}} + +=head2 hs + + Title : hs + Usage : $hsp->hs(); + Function : returns the Homologous Sequence (same as homologySeq ) + Returns : (string) Homologous Sequence + Args : none + +=cut + +sub hs {shift->{'HS'}} + +sub frame { + my ($self, $qframe, $sframe) = @_; + if( defined $qframe ) { + if( $qframe == 0 ) { + $qframe = undef; + } elsif( $qframe !~ /^([+-])?([1-3])/ ) { + $self->warn("Specifying an invalid query frame ($qframe)"); + $qframe = undef; + } else { + if( ($1 eq '-' && $self->query->strand >= 0) || + ($1 eq '+' && $self->query->strand <= 0) ) { + $self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")"); + } + # Set frame to GFF [0-2] + $qframe = $2 - 1; + } + $self->{'QFRAME'} = $qframe; + } + if( defined $sframe ) { + if( $sframe == 0 ) { + $sframe = undef; + } elsif( $sframe !~ /^([+-])?([1-3])/ ) { + $self->warn("Specifying an invalid hit frame ($sframe)"); + $sframe = undef; + } else { + if( ($1 eq '-' && $self->hit->strand >= 0) || + ($1 eq '+' && $self->hit->strand <= 0) ) + { + $self->warn("Hit frame ($sframe) did not match strand of hit (". $self->hit->strand() . ")"); + } + + # Set frame to GFF [0-2] + $sframe = $2 - 1; + } + $self->{'SFRAME'} = $sframe; + } + + (defined $qframe && $self->SUPER::frame($qframe) && + ($self->{'FRAME'} = $qframe)) || + (defined $sframe && $self->SUPER::frame($sframe) && + ($self->{'FRAME'} = $sframe)); + + if (wantarray() && + $self->{'BLAST_TYPE'} eq 'TBLASTX') + { + return ($self->{'QFRAME'}, $self->{'SFRAME'}); + } elsif (wantarray()) { + (defined $self->{'QFRAME'} && + return ($self->{'QFRAME'}, undef)) || + (defined $self->{'SFRAME'} && + return (undef, $self->{'SFRAME'})); + } else { + (defined $self->{'QFRAME'} && + return $self->{'QFRAME'}) || + (defined $self->{'SFRAME'} && + return $self->{'SFRAME'}); + } +} + +1;