Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/SearchIO/psiblast.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SearchIO/psiblast.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1181 @@ +#------------------------------------------------------------------ +# $Id: psiblast.pm,v 1.17 2002/12/24 15:48:41 jason Exp $ +# +# BioPerl module Bio::SearchIO::psiblast +# +# Cared for by Steve Chervitz <sac@bioperl.org> +# +# You may distribute this module under the same terms as perl itself +#------------------------------------------------------------------ + +=head1 NAME + +Bio::SearchIO::psiblast - Parser for traditional BLAST and PSI-BLAST reports + +=head1 SYNOPSIS + + use Bio::SearchIO; + + my $in = Bio::SearchIO->new( -format => 'psiblast', + -file => 'report.blastp' ); + + while ( my $blast = $in->next_result() ) { + foreach my $hit ( $blast->hits ) { + print "Hit: $hit\n"; + } + } + + # BLAST hit filtering function. All hits of each BLAST report must satisfy + # this criteria to be retained. If a hit fails this test, it is ignored. + # If all hits of a report fail, the report will be considered hitless. + # But we can distinguish this from the case where there were no + # hits in the report by testing the function $blast->no_hits_found(). + + my $filt_func = sub{ my $hit=shift; + $hit->frac_identical('query') >= 0.5 + && $hit->frac_aligned_query >= 0.50 + }; + + # Not supplying a -file or -fh parameter means read from STDIN + + my $in2 = Bio::SearchIO->new( -format => 'psiblast', + -hit_filter => $filt_func + ); + + +=head1 DESCRIPTION + +This module parses BLAST and PSI-BLAST reports and acts as a factory for +objects that encapsulate BLAST results: +L<Bio::Search::Result::BlastResult>, L<Bio::Search::Hit::BlastHit>, +L<Bio::Search::HSP::BlastHSP>. + +This module does not parse XML-formatted BLAST reports. +See L<Bio::SearchIO::blastxml|Bio::SearchIO::blastxml> if you need to do that. + +To use this module, the only module you need to C<use> is +Bio::SearchIO.pm. SearchIO knows how to load this module when you +supply a C<-format =E<gt> 'psiblast'> parameters to its C<new>() +function. For more information about the SearchIO system, see +documentation in Bio::SearchIO.pm. + +=head2 PSI-BLAST Support + +In addition to BLAST1 and BLAST2 reports, this module can also handle +PSI-BLAST reports. When accessing the set of Hits in a result, hits +from different iterations are lumped together but can be distinguished by +interrogating L<Bio::Search::Hit::BlastHit::iteration> and +L<Bio::Search::Hit::BlastHit::found_again>. + +If you want to collect hits only from a certain iteration during parsing, +supply a function using the C<-HIT_FILTER> parameter. + +=head1 EXAMPLES + +To get a feel for how to use this, have look at scripts in the +B<examples/searchio> and B<examples/searchio/writer> directory of the Bioperl +distribution as well as the test script B<t/SearchIO.t>. + +=head1 SEE ALSO + +For more documentation about working with Blast result objects that are +produced by this parser, see L<Bio::Search::Result::BlastResult>, +L<Bio::Search::Hit::BlastHit>, L<Bio::Search::HSP::BlastHSP>. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Steve Chervitz E<lt>sac@bioperl.orgE<gt> + +See L<the FEEDBACK section | FEEDBACK> for where to send bug reports and comments. + +=head1 ACKNOWLEDGEMENTS + +I would like to acknowledge my colleagues at Affymetrix for useful +feedback. + +=head1 COPYRIGHT + +Copyright (c) 2001 Steve Chervitz. All Rights Reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + + +# Let the code begin... + + +package Bio::SearchIO::psiblast; + +use strict; +use vars qw( @ISA + $MAX_HSP_OVERLAP + $DEFAULT_MATRIX + $DEFAULT_SIGNIF + $DEFAULT_SCORE + $DEFAULT_BLAST_WRITER_CLASS + $DEFAULT_HIT_FACTORY_CLASS + $DEFAULT_RESULT_FACTORY_CLASS + ); + +use Bio::SearchIO; +use Bio::Search::Result::BlastResult; +use Bio::Factory::BlastHitFactory; +use Bio::Factory::BlastResultFactory; +use Bio::Tools::StateMachine::IOStateMachine qw($INITIAL_STATE $FINAL_STATE); + +@ISA = qw( Bio::SearchIO + Bio::Tools::StateMachine::IOStateMachine ); + +$MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs. +$DEFAULT_MATRIX = 'BLOSUM62'; +$DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied. +$DEFAULT_SCORE = 0; # Value used as score cutoff if none supplied. +$DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter'; +$DEFAULT_HIT_FACTORY_CLASS = 'Bio::Factory::BlastHitFactory'; +$DEFAULT_RESULT_FACTORY_CLASS = 'Bio::Factory::BlastResultFactory'; + +my %state = ( + 'header' => 'Header', + 'descr' => 'Descriptions', + 'align' => 'Alignment', + 'footer' => 'Footer', + 'iteration' => 'Iteration', # psiblast + 'nohits' => 'No Hits' + ); + +# These are the expected transitions assuming a "normal" report (Blast2 or PSI-Blast). +my @state_transitions = ( [ $INITIAL_STATE, $state{'header'}], + [ $state{'header'}, $state{'descr'} ], + [ $state{'header'}, $state{'iteration'} ], + [ $state{'iteration'}, $state{'descr'} ], + [ $state{'iteration'}, $state{'nohits'} ], + [ $state{'descr'}, $state{'align'} ], + [ $state{'align'}, $state{'align'} ], + [ $state{'align'}, $state{'footer'} ], + [ $state{'align'}, $state{'iteration'} ], # psiblast + [ $state{'nohits'}, $state{'iteration'} ], # psiblast + [ $state{'nohits'}, $state{'footer'} ], + [ $state{'footer'}, $state{'header'} ], + [ $state{'footer'}, $FINAL_STATE] + ); + +my $current_iteration; # psiblast + +=head2 new + + Usage : Bio::SearchIO::psiblast->new( %named_parameters ) + Purpose : Parse traditional BLAST or PSI-BLAST data a file or input stream. + : Handles Blast1, Blast2, NCBI and WU Blast reports. + : Populates Bio::Search:: objects with data extracted from the report. + : (The exact type of Bio::Search objects depends on the type of + : Bio::Factory::ResultFactory and Bio::Factory::HitFactory you hook up + : to the SearchIO object.) + Returns : Bio::SearchIO::psiblast object. + Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). + : These are in addition to those specified by Bio::SearchIO::new() (see). + : -SIGNIF => number (float or scientific notation number to be used + : as a P- or Expect value cutoff; default = 999.) + : -SCORE => number (integer or scientific notation number to be used + : as a score value cutoff; default = 0.) + : -HIT_FILTER => func_ref (reference to a function to be used for + : filtering out hits based on arbitrary criteria. + : This function should take a + : Bio::Search::Hit::BlastHit.pm object as its first + : argument and return a boolean value, + : true if the hit should be filtered out). + : Sample filter function: + : -HIT_FILTER => sub { $hit = shift; + : $hit->gaps == 0; }, + : Historical note: This parameter was formerly + called -FILT_FUNC in the older + Bio::Tools::Blast::parse method. Using + -FILT_FUNC will still work for backward + compatibility. + : -CHECK_ALL_HITS => boolean (check all hits for significance against + : significance criteria. Default = false. + : If false, stops processing hits after the first + : non-significant hit or the first hit that fails + : the hit_filter call. This speeds parsing, + : taking advantage of the fact that the hits + : are processed in the order they are ranked.) + : -MIN_LEN => integer (to be used as a minimum query sequence length + : sequences below this length will not be processed). + : default = no minimum length). + : -STATS => boolean (collect key statistical parameters for the report: + : matrix, filters, etc. default = false). + : This requires extra parsing + : so if you aren't interested in this info, don't + : set this parameter. Note that the unparsed + : parameter section of a Blast report is always + : accessible via $blast->raw_statistics(). + : -BEST => boolean (only process the best hit of each report; + : default = false). + : -OVERLAP => integer (the amount of overlap to permit between + : adjacent HSPs when tiling HSPs. A reasonable value is 2. + : Default = $Bio::SearchIO::psiblast::MAX_HSP_OVERLAP) + : -HOLD_RAW_DATA => boolean (store the raw alignment sections for each hit. + : used with the -SHALLOW_PARSE option). + : -SHALLOW_PARSE => boolean (only minimal parsing; does not parse HSPs. + : Hit data is limited to what can be obtained + : from the description line. + : Replaces the older NO_ALIGNS option.) + : + : + Comments : Do NOT remove the HTML from an HTML-formatted Blast report by using the + : "Save As" option of a web browser to save it as text. This renders the + : report unparsable. + Throws : An exception will be thrown if a BLAST report contains a FATAL: error. + +=cut + +sub new { + my ($class, %args) = @_; + + # TODO: Resolve this issue: + # Temporary hack to allow factory-based and non-factory based + # SearchIO objects co-exist. + # steve --- Sat Dec 22 04:41:20 2001 + $args{-USE_FACTORIES} = 1; + + my $self = $class->Bio::SearchIO::new(%args); + $self->_init_state_machine( %args, -transition_table => \@state_transitions); + + $self->_init_parse_params( %args ); + + $self->pause_between_reports( 1 ); + + $self->{'_result_count'} = 0; + + return $self; +} + + +sub default_result_factory_class { + my $self = shift; + return $DEFAULT_RESULT_FACTORY_CLASS; +} + +sub default_hit_factory_class { + my $self = shift; + return $DEFAULT_HIT_FACTORY_CLASS; +} + +sub check_for_new_state { + my ($self) = @_; + + # Ignore blank lines + my $chunk = $self->SUPER::check_for_new_state(1); + + my $newstate = undef; + + # End the run if there's no more input. + if( ! $chunk ) { + return $self->final_state; + } + $self->clear_state_change_cache; + my $curr_state = $self->current_state; + + if( $chunk =~ /^(<.*>)?T?BLAST[NPX]/ ) { + $newstate = $state{header}; + $self->state_change_cache( $chunk ); + } + + elsif ($chunk =~ /^Sequences producing/ ) { + $newstate = $state{descr}; + $self->state_change_cache( $chunk ); + } + + elsif ($chunk =~ /No hits found/i ) { + $newstate = $state{nohits}; + $self->state_change_cache( $chunk ); + } + + elsif ($chunk =~ /^\s*Searching/ ) { + $newstate = $state{iteration}; + } + + elsif ($chunk =~ /^>(.*)/ ) { + $newstate = $state{align}; + $self->state_change_cache( "$1\n" ); + } + + elsif ($chunk =~ /^(CPU time|Parameters):/ ) { + $newstate = $state{footer}; + $self->state_change_cache( $chunk ); + } + + # Necessary to distinguish " Database:" lines that start a footer section + # from those that are internal to a footer section. + elsif ($chunk =~ /^\s+Database:/ && $curr_state ne $state{'footer'}) { + $newstate = $state{footer}; + $self->state_change_cache( $chunk ); + } + + if( $curr_state ne $INITIAL_STATE and not $newstate ) { +# print "Appending input cache with ($curr_state): $chunk\n"; + $self->append_input_cache( $chunk ); + } + + return $newstate; +} + + +sub change_state { + my ($self, $state) = @_; + + my $from_state = $self->current_state; + my $verbose = $self->verbose; + $verbose and print STDERR ">>>>> Changing state from $from_state to $state\n"; + + if ( $self->validate_transition( $from_state, $state ) ) { + + # Now we know the current state is complete + # and all data from it is now in the input cache. + my @data = $self->get_input_cache(); + +# if($from_state eq $state{iteration} ) { +# do{ +# print STDERR "State change cache: ", $self->state_change_cache, "\n"; +# print STDERR "Input cache ($from_state):\n@data\n\n"; +# }; +# } + + # Now we need to process the input cache data. + # Remember, at this point, the current state is the "from" state + # of the state transition. The "to" state doesn't get set until + # the finalize_state_change() call at the end of this method. + + if($from_state eq $state{header} ) { + $self->_process_header( @data ); + } + elsif($from_state eq $state{descr} ) { + $self->_process_descriptions( @data ); + } + elsif($from_state eq $state{iteration} ) { + $self->_process_iteration( @data, $self->state_change_cache() ); + } + elsif($from_state eq $state{align} ) { + $self->_process_alignment( @data ); + } + elsif($from_state eq $state{footer} ) { + my $ok_to_pause = not $state eq $self->final_state; + $self->_process_footer( $ok_to_pause, @data ); + } + + $self->finalize_state_change( $state, 1 ); + } +} + + +sub _add_error { + my ($self, $err) = @_; + if( $err ) { + push @{$self->{'_blast_errs'}}, $err; + } +} + +sub _clear_errors { + my $self = shift; + $self->{'_blast_errs'} = undef; +} + +#--------- +sub errors { +#--------- + my $self = shift; + my @errs = (); + @errs = @{$self->{'_blast_errs'}} if ref $self->{'_blast_errs'}; + return @errs; +} + + +#---------------------- +sub _init_parse_params { +#---------------------- +#Initializes parameters used during parsing of Blast reports. + + my ( $self, @param ) = @_; + # -FILT_FUNC has been replaced by -HIT_FILTER. + # Leaving -FILT_FUNC in place for backward compatibility + my($signif, $filt_func, $hit_filter, $min_len, $check_all, + $gapped, $score, $overlap, $stats, $best, $shallow_parse, + $hold_raw) = + $self->_rearrange([qw(SIGNIF FILT_FUNC HIT_FILTER MIN_LEN + CHECK_ALL_HITS GAPPED SCORE + OVERLAP STATS BEST SHALLOW_PARSE HOLD_RAW_DATA)], @param); + + $self->{'_hit_filter'} = $hit_filter || $filt_func || 0; + $self->{'_check_all'} = $check_all || 0; + $self->{'_shallow_parse'} = $shallow_parse || 0; + $self->{'_hold_raw_data'} = $hold_raw || 0; + + $self->_set_signif($signif, $min_len, $self->{'_hit_filter'}, $score); + $self->best_hit_only($best) if $best; + $self->{'_blast_count'} = 0; + + $self->{'_collect_stats'} = defined($stats) ? $stats : 0; + + # TODO: Automatically determine whether gapping was used. + # e.g., based on version number. Otherwise, have to read params. + $self->{'_gapped'} = $gapped || 1; + + # Clear any errors from previous parse. + $self->_clear_errors; + undef $self->{'_hit_count'}; + undef $self->{'_num_hits_significant'}; +} + +#=head2 _set_signif +# +# Usage : n/a; called automatically by _init_parse_params() +# Purpose : Sets significance criteria for the BLAST object. +# Argument : Obligatory three arguments: +# : $signif = float or sci-notation number or undef +# : $min_len = integer or undef +# : $hit_filter = function reference or undef +# : +# : If $signif is undefined, a default value is set +# : (see $DEFAULT_SIGNIF; min_length = not set). +# Throws : Exception if significance value is defined but appears +# : out of range or invalid. +# : Exception if $hit_filter if defined and is not a func ref. +# Comments : The significance of a BLAST report can be based on +# : the P (or Expect) value and/or the length of the query sequence. +# : P (or Expect) values GREATER than '_max_significance' are not significant. +# : Query sequence lengths LESS than '_min_length' are not significant. +# : +# : Hits can also be screened using arbitrary significance criteria +# : as discussed in the parse() method. +# : +# : If no $signif is defined, the '_max_significance' level is set to +# : $DEFAULT_SIGNIF (999). +# +#See Also : L<signif>(), L<min_length>(), L<_init_parse_params>(), L<parse>() +# +#=cut + +#----------------- +sub _set_signif { +#----------------- + my( $self, $sig, $len, $func, $score ) = @_; + + if(defined $sig) { + $self->{'_confirm_significance'} = 1; + if( $sig =~ /[^\d.e-]/ or $sig <= 0) { + $self->throw(-class => 'Bio::Root::BadParameter', + -text => "Invalid significance value: $sig\n". + "Must be a number greater than zero."); + } + $self->{'_max_significance'} = $sig; + } else { + $self->{'_max_significance'} = $DEFAULT_SIGNIF; + } + + if(defined $score) { + $self->{'_confirm_significance'} = 1; + if( $score =~ /[^\de+]/ or $score <= 0) { + $self->throw(-class => 'Bio::Root::BadParameter', + -text => "Invalid score value: $score\n". + "Must be an integer greater than zero."); + } + $self->{'_min_score'} = $score; + } else { + $self->{'_min_score'} = $DEFAULT_SCORE; + } + + if(defined $len) { + if($len =~ /\D/ or $len <= 0) { + $self->warn("Invalid minimum length value: $len", + "Value must be an integer > 0. Value not set."); + } else { + $self->{'_min_length'} = $len; + } + } + + if(defined $func) { + $self->{'_check_all'} = 1; + $self->{'_confirm_significance'} = 1; + if($func and not ref $func eq 'CODE') { + $self->throw("Not a function reference: $func", + "The -hit_filter parameter must be function reference."); + } + } +} + +=head2 signif + +Synonym for L<max_significance()|max_significance> + +=cut + +#----------- +sub signif { shift->max_significance } + + +=head2 max_significance + + Usage : $obj->max_significance(); + Purpose : Gets the P or Expect value used as significance screening cutoff. + This is the value of the -signif parameter supplied to new(). + Hits with P or E-value above this are skipped. + Returns : Scientific notation number with this format: 1.0e-05. + Argument : n/a + Comments : Screening of significant hits uses the data provided on the + : description line. For NCBI BLAST1 and WU-BLAST, this data + : is P-value. for NCBI BLAST2 it is an Expect value. + +=cut + +#----------- +sub max_significance { +#----------- + my $self = shift; + my $sig = $self->{'_max_significance'}; + sprintf "%.1e", $sig; +} + +=head2 min_score + + Usage : $obj->min_score(); + Purpose : Gets the Blast score used as screening cutoff. + This is the value of the -score parameter supplied to new(). + Hits with scores below this are skipped. + Returns : Integer or scientific notation number. + Argument : n/a + Comments : Screening of significant hits uses the data provided on the + : description line. + +=cut + +#----------- +sub min_score { +#----------- + my $self = shift; + return $self->{'_min_score'}; +} + +=head2 min_length + + Usage : $obj->min_length(); + Purpose : Gets the query sequence length used as screening criteria. + This is the value of the -min_len parameter supplied to new(). + Hits with sequence length below this are skipped. + Returns : Integer + Argument : n/a + +See Also : L<signif()|signif> + +=cut + +#-------------- +sub min_length { +#-------------- + my $self = shift; + $self->{'_min_length'}; +} + +=head2 highest_signif + + Usage : $value = $obj->highest_signif(); + Purpose : Gets the largest significance (P- or E-value) observed in + the report. + : For NCBI BLAST1 and WU-BLAST, this is a P-value. + : For NCBI BLAST2 it is an Expect value. + Returns : Float or sci notation number + Argument : n/a + +=cut + +sub highest_signif { shift->{'_highestSignif'} } + +=head2 lowest_signif + + Usage : $value = $obj->lowest_signif(); + Purpose : Gets the largest significance (P- or E-value) observed in + the report. + : For NCBI BLAST1 and WU-BLAST, this is a P-value. + : For NCBI BLAST2 it is an Expect value. + Returns : Float or sci notation number + Argument : n/a + +=cut + +sub lowest_signif { shift->{'_lowestSignif'} } + +=head2 highest_score + + Usage : $value = $obj->highest_score(); + Purpose : Gets the largest BLAST score observed in the report. + Returns : Integer or sci notation number + Argument : n/a + +=cut + +sub highest_score { shift->{'_highestScore'} } + +=head2 lowest_score + + Usage : $value = $obj->lowest_score(); + Purpose : Gets the largest BLAST score observed in the report. + Returns : Integer or sci notation number + Argument : n/a + +=cut + +sub lowest_score { shift->{'_lowestScore'} } + + +# Throws : Exception if BLAST report contains a FATAL: error. +sub _process_header { + my ($self, @data) = @_; + +# print STDERR "Processing Header...\n"; + + $current_iteration = 0; + $self->{'_result_count'}++; + # Finish off the current Blast object, if any + my $blast = $self->{'_current_blast'} = $self->result_factory->create_result(); + + my ($set_method, $set_query, $set_db, $set_length); + my ($query_start, $query_desc); + + foreach my $line (@data) { + if( $line =~ /^(<.*>)?(T?BLAST[NPX])\s+(.*)$/ ) { + $blast->analysis_method( $2 ); + $blast->analysis_method_version( $3 ); + $set_method = 1; + } + elsif ($line =~ /^Query= *(.+)$/ ) { + $query_start = 1; + my $info = $1; + $info =~ s/TITLE //; + # Split the query line into two parts. + # Using \s instead of ' ' + $info =~ /(\S+?)\s+(.*)/; + # set name of Blast object and return. + $blast->query_name($1 || 'UNKNOWN'); + $query_desc = $2 || ''; + $set_query = 1; + } + elsif ($line =~ /^Database:\s+(.+)$/ ) { + require Bio::Search::GenericDatabase; + my $blastdb = Bio::Search::GenericDatabase->new( -name => $1 ); + $blast->analysis_subject( $blastdb ); + $set_db = 1; + } + elsif( $line =~ m/^\s+\(([\d|,]+) letters\)/ ) { + my $length = $1; + $length =~ s/,//g; + $self->_set_query_length( $length ); + $set_length = 1; + $blast->query_description( $query_desc ); + $query_start = 0; + } + elsif( $line =~ /WARNING: (.+?)/ ) { + $self->warn( $1 ); + } + elsif( $line =~ /FATAL: (.+?)/ ) { + $self->throw("FATAL BLAST Report Error: $1"); + } + # This needs to be the last elsif block. + elsif( $query_start ) { + # Handling multi-line query descriptions. + chomp( $line ); + $query_desc .= " $line"; + } + } + if (!$set_method) { + $self->throw("Can't determine type of BLAST program."); + } + if (!$set_query) { + $self->throw("Can't determine name of query sequence."); + } + if(!$set_db) { + $self->throw("Can't determine name of database."); + } + if(!$set_length) { + $self->throw("Can't determine sequence length from BLAST report."); + } + +} + +sub _process_descriptions { + my ($self, @data) = @_; +# print STDERR "Processing Descriptions...\n"; + + # Step through each line parsing out the P/Expect value + # All we really need to do is check the first one, if it doesn't + # meet the significance requirement, we can skip the report. + # BUT: we want to collect data for all hits anyway to get min/max signif. + + my $max_signif = $self->max_significance; + my $min_score = $self->min_score; + my $layout_set = $self->{'_layout'} || 0; + my ($layout, $sig, $hitid, $score, $is_p_value); + + if( $data[0] =~ /^\s*Sequences producing.*Score\s+P/i ) { + $is_p_value = 1; + } else { + $is_p_value = 0; + } + + my $hit_found_again; + + desc_loop: + foreach my $line (@data) { + last desc_loop if $line =~ / NONE |End of List|Results from round/; + next desc_loop if $line =~ /^\.\./; + + if($line =~ /^Sequences used in model/ ) { + #Sequences used in model and found again: + $hit_found_again = 1; + next; + } + elsif($line =~ /^Sequences not found previously/ ) { + #Sequences not found previously or not previously below threshold: + $hit_found_again = 0; + next; + } + + ## Checking the significance value (P- or Expect value) of the hit + ## in the description line. + + next desc_loop unless $line =~ /\d/; + + # TODO: These regexps need testing on a variety of reports. + if ( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s*$/) { + $hitid = $1; + $score = $2; + $layout = 2; + } elsif( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s{1,}\d+\s*$/) { + $hitid = $1; + $score = $2; + $layout = 1; + } else { + $self->warn("Can't parse description line\n $line"); + next desc_loop; + } + not $layout_set and ($self->_layout($layout), $layout_set = 1); + + $sig = &_parse_signif( $line, $layout, $self->{'_gapped'} ); + +# print STDERR " Parsed signif for $hitid: $sig (layout=$layout)\n"; + + $self->{'_hit_hash'}->{$hitid}->{'signif'} = $sig; + $self->{'_hit_hash'}->{$hitid}->{'score'} = $score; + $self->{'_hit_hash'}->{$hitid}->{'found_again'} = $hit_found_again; + $self->{'_hit_hash'}->{'is_pval'} = $is_p_value; + + last desc_loop if (not $self->{'_check_all'} and + ($sig > $max_signif or $score < $min_score)); + + $self->_process_significance($sig, $score); + } + +# printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'}; +} + + +#=head2 _set_query_length +# +# Usage : n/a; called automatically during Blast report parsing. +# Purpose : Sets the length of the query sequence (extracted from report). +# Returns : integer (length of the query sequence) +# Throws : Exception if cannot determine the query sequence length from +# : the BLAST report. +# : Exception if the length is below the min_length cutoff (if any). +# Comments : The logic here is a bit different from the other _set_XXX() +# : methods since the significance of the BLAST report is assessed +# : if MIN_LENGTH is set. +# +#=cut + +#--------------- +sub _set_query_length { +#--------------- + my ($self, $length) = @_; + + my($sig_len); + if(defined($self->{'_min_length'})) { + local $^W = 0; + if($length < $self->{'_min_len'}) { + $self->throw("Query sequence too short (Query= ${\$self->{'_current_blast'}->query_name}, length= $length)", + "Minimum length is $self->{'_min_len'}"); + } + } + + $self->{'_current_blast'}->query_length($length); +} + + +# Records the highest and lowest significance (P- or E-value) and +# score encountered in a given report. +sub _set_hi_low_signif_and_score { + my($self, $sig, $score) = @_; + + my $hiSig = $self->{'_highestSignif'} || 0; + my $lowSig = $self->{'_lowestSignif'} || $DEFAULT_SIGNIF; + my $hiScore = $self->{'_highestScore'} || 0; + my $lowScore = $self->{'_lowestScore'} || $DEFAULT_SIGNIF; + + $self->{'_highestSignif'} = ($sig > $hiSig) + ? $sig : $hiSig; + + $self->{'_lowestSignif'} = ($sig < $lowSig) + ? $sig : $lowSig; + + $self->{'_highestScore'} = ($score > $hiScore) + ? $score : $hiScore; + + $self->{'_lowestScore'} = ($score < $lowScore) + ? $score : $lowScore; +} + + +sub _process_significance { + my($self, $sig, $score) = @_; + + $self->_set_hi_low_signif_and_score($sig, $score); + + # Significance value assessment. + if($sig <= $self->{'_max_significance'} and $score >= $self->{'_min_score'}) { + $self->{'_num_hits_significant'}++; + } + $self->{'_num_hits'}++; + + $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'}; +} + +#=head2 _layout +# +# Usage : n/a; internal method. +# Purpose : Set/Get indicator for the layout of the report. +# Returns : Integer (1 | 2) +# : Defaults to 2 if not set. +# Comments : Blast1 and WashU-Blast2 have a layout = 1. +# : This is intended for internal use by this and closely +# : allied modules like BlastHit.pm and BlastHSP.pm. +# +#=cut + +#------------ +sub _layout { +#------------ + my $self = shift; + if(@_) { + $self->{'_layout'} = shift; + } + $self->{'_layout'} || 2; +} + +#=head2 _parse_signif +# +# Usage : $signif = _parse_signif(string, layout, gapped); +# : This is a class function. +# Purpose : Extracts the P- or Expect value from a single line of a BLAST description section. +# Example : _parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1); +# : _parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2); +# Argument : string = line from BLAST description section +# : layout = integer (1 or 2) +# : gapped = boolean (true if gapped Blast). +# Returns : Float (0.001 or 1e-03) +# Status : Static +# +#=cut + +#------------------ +sub _parse_signif { +#------------------ + my ($line, $layout, $gapped) = @_; + + local $_ = $line; + my @linedat = split(); + + # When processing both Blast1 and Blast2 reports + # in the same run, offset needs to be configured each time. + # NOTE: We likely won't be supporting mixed report streams. Not needed. + + my $offset = 0; + $offset = 1 if $layout == 1 or not $gapped; + + my $signif = $linedat[ $#linedat - $offset ]; + + # fail-safe check + if(not $signif =~ /[.-]/) { + $offset = ($offset == 0 ? 1 : 0); + $signif = $linedat[ $#linedat - $offset ]; + } + + $signif = "1$signif" if $signif =~ /^e/i; + return $signif; +} + + +=head2 best_hit_only + + Usage : print "only getting best hit.\n" if $obj->best_hit_only(); + Purpose : Set/Get the indicator for whether or not to processing only + : the best BlastHit. + Returns : Boolean (1 | 0) + Argument : n/a + +=cut + +#---------- +sub best_hit_only { +#---------- + my $self = shift; + if(@_) { $self->{'_best'} = shift; } + $self->{'_best'}; +} + +sub _process_alignment { + my ($self, @data) = @_; +# print STDERR "Processing Alignment...\n"; + + # If all of the significant hits have been parsed, + # return if we're not checking all or if we don't need to get + # the Blast stats (parameters at footer of report). + if(defined $self->{'_hit_count'} and + defined $self->{'_num_hits_significant'}) { + return if $self->{'_hit_count'} >= $self->{'_num_hits_significant'} and + not ($self->{'_check_all'} or $self->{'_collect_stats'}); + } + + # Return if we're only interested in the best hit. + # This has to occur after checking for the parameters section + # in the footer (since we may still be interested in them). + return if $self->best_hit_only and ( defined $self->{'_hit_count'} and $self->{'_hit_count'} >=1); + + push @data, 'end'; + +# print STDERR "\nALIGNMENT DATA:\n@data\n"; + + my $max_signif = $self->max_significance; + my $min_score = $self->min_score; + + my ($hitid, $score, $signif, $is_pval, $found_again); + if( $data[0] =~ /^(\S+)\s+/ ) { + $hitid = $1; + return unless defined $self->{'_hit_hash'}->{$hitid}; + $score = $self->{'_hit_hash'}->{$hitid}->{'score'}; + $signif = $self->{'_hit_hash'}->{$hitid}->{'signif'}; + $found_again = $self->{'_hit_hash'}->{$hitid}->{'found_again'}; + $is_pval = $self->{'_hit_hash'}->{'is_pval'}; +# print STDERR " Got hitid: $hitid ($signif, $score, P?=$is_pval)\n"; + } + + # Now construct the BlastHit objects from the alignment section + + # debug(1); + + $self->{'_hit_count'}++; + + # If not confirming significance, _process_descriptions will not have been run, + # so we need to count the total number of hits here. + if( not $self->{'_confirm_significance'}) { + $self->{'_num_hits'}++; + } + + my %hit_params = ( -RESULT => $self->{'_current_blast'}, + -RAW_DATA =>\@data, + -SIGNIF => $signif, + -IS_PVAL => $is_pval, + -SCORE => $score, + -RANK => $self->{'_hit_count'}, + -RANK_BY => 'order', + -OVERLAP => $self->{'_overlap'} || $MAX_HSP_OVERLAP, + -FOUND_AGAIN => $found_again, + -SHALLOW_PARSE => $self->{'_shallow_parse'}, + -HOLD_RAW_DATA => $self->{'_hold_raw_data'}, + ); + + my $hit; + $hit = $self->hit_factory->create_hit( %hit_params ); + + # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>; + # The BLAST report may have not had a description section. + if(not $self->{'_has_descriptions'}) { + $self->_process_significance($hit->signif, $score); + } + + # Collect overall signif data if we don't already have it, + # (as occurs if no -signif or -score parameter are supplied). + my $hit_signif = $hit->signif; + + if (not $self->{'_confirm_significance'} ) { + $self->_set_hi_low_signif_and_score($hit_signif, $score); + } + + # Test significance using custom function (if supplied) + my $add_hit = 0; + + my $hit_filter = $self->{'_hit_filter'} || 0; + + if($hit_filter) { + if(&$hit_filter($hit)) { + $add_hit = 1; + } + } elsif($hit_signif <= $max_signif and $score >= $min_score) { + $add_hit = 1; + } + $add_hit && $self->{'_current_blast'}->add_hit( $hit ); +} + + +sub _process_footer { + my ($self, $ok_to_pause, @data) = @_; +# print STDERR "Processing Footer...\n"; + + $self->{'_current_blast'}->raw_statistics( [@data] ); + + if($self->{'_collect_stats'}) { + foreach my $line (@data) { + if( $line =~ /^\s*Matrix:\s*(\S+)/i ) { + $self->{'_current_blast'}->matrix( $1 ); + } + elsif( $line =~ /^\s*Number of Sequences:\s*(\d+)/i ) { + $self->{'_current_blast'}->analysis_subject->entries( $1 ); + } + elsif( $line =~ /^\s*length of database:\s*(\d+)/i ) { + $self->{'_current_blast'}->analysis_subject->letters( $1 ); + } + elsif( $line =~ /^\s*Posted date:\s*(.+)$/i ) { + $self->{'_current_blast'}->analysis_subject->date( $1 ); + } + } + } + + if( $self->errors ) { + my $num_err = scalar($self->errors); + $self->warn( "$num_err Blast parsing errors occurred."); + foreach( $self->errors ) { print STDERR "$_\n"; }; + } + + if( $self->{'_pause_between_reports'} and $ok_to_pause ) { + $self->pause; + } + +} + +sub _process_nohits { + my $self = shift; +# print STDERR "Processing No Hits (iteration = $current_iteration)\n"; + $self->{'_current_blast'}->set_no_hits_found( $current_iteration ); +} + + +sub _process_iteration { + my ($self, @data) = @_; +# print STDERR "Processing Iteration\n"; +# print STDERR " Incrementing current iteration (was=$current_iteration)\n"; + $current_iteration++; + $self->{'_current_blast'}->iterations( $current_iteration ); + + foreach( @data ) { + if( /Results from round \d+/i ) { + $self->{'_current_blast'}->psiblast( 1 ); + } + elsif( /No hits found/i ) { + $self->_process_nohits(); + last; + } + elsif( /^\s*Sequences/i ) { + $self->_process_descriptions( @data ); + last; + } + } +} + +sub pause_between_reports { + my ($self, $setting) = @_; + if( defined $setting ) { + $self->{'_pause_between_reports'} = $setting; + } + $self->{'_pause_between_reports'}; +} + +sub result_count { + my $self = shift; + return $self->{'_result_count'}; +} + +# For backward compatibility: +sub report_count { shift->result_count } + +sub next_result { + my ($self) = @_; + # print STDERR "next_result() called\n"; + if( not $self->running ) { + $self->run; + } + else { + $self->resume; + } + my $blast = $self->{'_current_blast'}; + $self->{'_current_blast'} = undef; + return $blast; +} + +=head2 write_result + + Title : write_result + Usage : $stream->write_result($result_result, @other_args) + Function: Writes data from the $result_result object into the stream. + : Delegates to the to_string() method of the associated + : WriterI object. + Returns : 1 for success and 0 for error + Args : Bio::Search:Result::ResultI object, + : plus any other arguments for the Writer + Throws : Bio::Root::Exception if a Writer has not been set. + +See L<Bio::Root::Exception> + +=cut + +sub write_result { + my ($self, $blast, @args) = @_; + + if( not defined($self->writer) ) { + $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS"); + $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() ); + } + $self->SUPER::write_result( $blast, @args ); +} + + + +1; +__END__ +