diff variant_effect_predictor/Bio/SearchIO/blast.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SearchIO/blast.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,988 @@
+# $Id: blast.pm,v 1.42.2.14 2003/09/15 16:19:01 jason Exp $
+#
+# BioPerl module for Bio::SearchIO::blast
+#
+# Cared for by Jason Stajich <jason@bioperl.org>
+#
+# Copyright Jason Stajich
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SearchIO::blast - Event generator for event based parsing of blast reports 
+
+=head1 SYNOPSIS
+
+   # Do not use this object directly - it is used as part of the
+   # Bio::SearchIO system.
+
+    use Bio::SearchIO;
+    my $searchio = new Bio::SearchIO(-format => 'blast',
+				     -file   => 't/data/ecolitst.bls');
+    while( my $result = $searchio->next_result ) {
+	while( my $hit = $result->next_hit ) {
+	    while( my $hsp = $hit->next_hsp ) {
+		# ...
+	    }
+	}
+    }
+
+=head1 DESCRIPTION
+
+This object encapsulated the necessary methods for generating events
+suitable for building Bio::Search objects from a BLAST report file. 
+Read the L<Bio::SearchIO> for more information about how to use this.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bioperl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Jason Stajich
+
+Email jason@bioperl.org
+
+Describe contact details here
+
+=head1 CONTRIBUTORS
+
+Additional contributors names and emails here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+
+# Let the code begin...
+
+
+
+package Bio::SearchIO::blast;
+use strict;
+use vars qw(@ISA %MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
+use Bio::SearchIO;
+
+@ISA = qw(Bio::SearchIO );
+
+BEGIN { 
+    # mapping of NCBI Blast terms to Bioperl hash keys
+    %MODEMAP = ('BlastOutput' => 'result',
+		'Hit'         => 'hit',
+		'Hsp'         => 'hsp'
+		);
+
+    # This should really be done more intelligently, like with
+    # XSLT
+
+    %MAPPING = 
+	( 
+	  'Hsp_bit-score'  => 'HSP-bits',
+	  'Hsp_score'      => 'HSP-score',
+	  'Hsp_evalue'     => 'HSP-evalue',
+	  'Hsp_pvalue'     => 'HSP-pvalue',
+	  'Hsp_query-from' => 'HSP-query_start',
+	  'Hsp_query-to'   => 'HSP-query_end',
+	  'Hsp_hit-from'   => 'HSP-hit_start',
+	  'Hsp_hit-to'     => 'HSP-hit_end',
+	  'Hsp_positive'   => 'HSP-conserved',
+	  'Hsp_identity'   => 'HSP-identical',
+	  'Hsp_gaps'       => 'HSP-hsp_gaps',
+	  'Hsp_hitgaps'    => 'HSP-hit_gaps',
+	  'Hsp_querygaps'  => 'HSP-query_gaps',
+	  'Hsp_qseq'       => 'HSP-query_seq',
+	  'Hsp_hseq'       => 'HSP-hit_seq',
+	  'Hsp_midline'    => 'HSP-homology_seq',
+	  'Hsp_align-len'  => 'HSP-hsp_length',
+	  'Hsp_query-frame'=> 'HSP-query_frame',
+	  'Hsp_hit-frame'  => 'HSP-hit_frame',
+
+	  'Hit_id'        => 'HIT-name',
+	  'Hit_len'       => 'HIT-length',
+	  'Hit_accession' => 'HIT-accession',
+	  'Hit_def'       => 'HIT-description',
+	  'Hit_signif'    => 'HIT-significance',
+	  'Hit_score'     => 'HIT-score',
+	  'Iteration_iter-num'   => 'HIT-iteration',
+
+	  'BlastOutput_program'  => 'RESULT-algorithm_name',
+	  'BlastOutput_version'  => 'RESULT-algorithm_version',
+	  'BlastOutput_query-def'=> 'RESULT-query_name',
+	  'BlastOutput_query-len'=> 'RESULT-query_length',
+	  'BlastOutput_query-acc'=> 'RESULT-query_accession',
+	  'BlastOutput_querydesc'=> 'RESULT-query_description',
+	  'BlastOutput_db'       => 'RESULT-database_name',
+	  'BlastOutput_db-len'   => 'RESULT-database_entries',
+	  'BlastOutput_db-let'   => 'RESULT-database_letters',
+
+	  'Parameters_matrix'    => { 'RESULT-parameters' => 'matrix'},
+	  'Parameters_expect'    => { 'RESULT-parameters' => 'expect'},
+	  'Parameters_include'   => { 'RESULT-parameters' => 'include'},
+	  'Parameters_sc-match'  => { 'RESULT-parameters' => 'match'},
+	  'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'},
+	  'Parameters_gap-open'  =>   { 'RESULT-parameters' => 'gapopen'},
+	  'Parameters_gap-extend'=>   { 'RESULT-parameters' => 'gapext'},
+	  'Parameters_filter'    =>  {'RESULT-parameters' => 'filter'},
+	  'Parameters_allowgaps' =>   { 'RESULT-parameters' => 'allowgaps'},
+
+	  'Statistics_db-len'    => {'RESULT-statistics' => 'dbentries'},
+	  'Statistics_db-let'    => { 'RESULT-statistics' => 'dbletters'},
+	  'Statistics_hsp-len'   => { 'RESULT-statistics' => 'effective_hsplength'},
+	  'Statistics_query-len'   => { 'RESULT-statistics' => 'querylength'},
+	  'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'},
+	  'Statistics_eff-spaceused' => { 'RESULT-statistics' => 'effectivespaceused'},
+	  'Statistics_eff-dblen' => { 'RESULT-statistics' => 'effectivedblength'},
+	  'Statistics_kappa'     => { 'RESULT-statistics' => 'kappa' },
+	  'Statistics_lambda'    => { 'RESULT-statistics' => 'lambda' },
+	  'Statistics_entropy'   => { 'RESULT-statistics' => 'entropy'},
+	  'Statistics_framewindow'=> { 'RESULT-statistics' => 'frameshiftwindow'},
+	  'Statistics_decay'=> { 'RESULT-statistics' => 'decayconst'},
+	  
+	  'Statistics_T'=> { 'RESULT-statistics' => 'T'},
+	  'Statistics_A'=> { 'RESULT-statistics' => 'A'},
+	  'Statistics_X1'=> { 'RESULT-statistics' => 'X1'},
+	  'Statistics_X2'=> { 'RESULT-statistics' => 'X2'},
+	  'Statistics_S1'=> { 'RESULT-statistics' => 'S1'},
+	  'Statistics_S2'=> { 'RESULT-statistics' => 'S2'},
+	  'Statistics_hit_to_db' => { 'RESULT-statistics' => 'Hits_to_DB'},
+	  'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'},
+	  'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'},
+	  'Statistics_num_suc_extensions' => { 'RESULT-statistics' => 'num_successful_extensions'},
+	  'Statistics_seqs_better_than_cutoff' => { 'RESULT-statistics' => 'seqs_better_than_cutoff'},
+	  'Statistics_posted_date' => { 'RESULT-statistics' => 'posted_date'},
+	  
+	  # WU-BLAST stats
+	  'Statistics_DFA_states'=> { 'RESULT-statistics' => 'num_dfa_states'},
+	  'Statistics_DFA_size'=> { 'RESULT-statistics' => 'dfa_size'},
+
+	  'Statistics_search_cputime' => { 'RESULT-statistics' => 'search_cputime'},
+	  'Statistics_total_cputime' => { 'RESULT-statistics' => 'total_cputime'},
+	  'Statistics_search_actualtime' => { 'RESULT-statistics' => 'search_actualtime'},
+	  'Statistics_total_actualtime' => { 'RESULT-statistics' => 'total_actualtime'},
+
+	  'Statistics_noprocessors' => { 'RESULT-statistics' => 'no_of_processors'},
+	  'Statistics_neighbortime' => { 'RESULT-statistics' => 'neighborhood_generate_time'},
+	  'Statistics_starttime' => { 'RESULT-statistics' => 'start_time'},
+	  'Statistics_endtime' => { 'RESULT-statistics' => 'end_time'},
+	  );
+
+    $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
+}
+
+
+=head2 new
+
+ Title   : new
+ Usage   : my $obj = new Bio::SearchIO::blast();
+ Function: Builds a new Bio::SearchIO::blast object 
+ Returns : Bio::SearchIO::blast
+ Args    : -fh/-file => filehandle/filename to BLAST file
+           -format   => 'blast'
+
+=cut
+
+=head2 next_result
+
+ Title   : next_result
+ Usage   : my $hit = $searchio->next_result;
+ Function: Returns the next Result from a search
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+=cut
+
+sub next_result{
+   my ($self) = @_;
+   
+   my $data = '';
+   my $seentop = 0;
+   my ($reporttype,$seenquery,$reportline);
+   $self->start_document();
+   my @hit_signifs;
+   
+   while( defined ($_ = $self->_readline )) {       
+       next if( /^\s+$/); # skip empty lines
+       next if( /CPU time:/);
+       next if( /^>\s*$/);
+
+       if( /^([T]?BLAST[NPX])\s*(.+)$/i ||
+	   /^(PSITBLASTN)\s+(.+)$/i ||
+	   /^(RPS-BLAST)\s*(.+)$/i ||
+	   /^(MEGABLAST)\s*(.+)$/i 
+	   ) {
+	   if( $seentop ) {	    
+	       $self->_pushback($_);
+	       $self->in_element('hsp') && 
+		   $self->end_element({ 'Name' => 'Hsp'});
+	       $self->in_element('hit') && 
+		   $self->end_element({ 'Name' => 'Hit'});
+	       $self->end_element({ 'Name' => 'BlastOutput'});
+	       return $self->end_document();
+	   }
+	   $self->start_element({ 'Name' => 'BlastOutput' } );
+	   $self->{'_result_count'}++;
+	   $seentop = 1;
+	   $reporttype = $1;
+	   $reportline = $_; # to fix the fact that RPS-BLAST output is wrong
+	   $self->element({ 'Name' => 'BlastOutput_program',
+			    'Data' => $reporttype});
+	   
+	   $self->element({ 'Name' => 'BlastOutput_version',
+			    'Data' => $2});
+       } elsif ( /^Query=\s*(.+)$/ ) {	   
+	   my $q = $1;
+	   my $size = 0;
+	   if( defined $seenquery ) {
+	       $self->_pushback($reportline);
+	       $self->_pushback($_);
+	       $self->in_element('hsp') &&
+		   $self->end_element({'Name'=> 'Hsp'});
+	       $self->in_element('hit') &&
+		   $self->end_element({'Name'=> 'Hit'});
+	       $self->end_element({'Name' => 'BlastOutput'});
+	       return $self->end_document();
+	   } else { 
+	       if( ! defined $reporttype ) {
+		   $self->start_element({'Name' => 'BlastOutput'});
+		   $seentop = 1;
+		   $self->{'_result_count'}++;
+	       }
+	   }
+	   $seenquery = $q;
+	   $_ = $self->_readline;
+	   while( defined ($_) ) {
+               if( /^Database:/ ) {
+		   $self->_pushback($_);
+		   last;
+	       }
+               chomp;               
+               if( /\((\-?[\d,]+)\s+letters.*\)/ ) {
+                   $size = $1;
+                   $size =~ s/,//g;
+                   last;
+               } else { 
+                   $q .= " $_";
+                   $q =~ s/ +/ /g;
+                   $q =~ s/^ | $//g;
+               }
+
+               $_ = $self->_readline;
+           }
+	   chomp($q);
+	   my ($nm,$desc) = split(/\s+/,$q,2);	   
+	   $self->element({ 'Name' => 'BlastOutput_query-def',
+			    'Data' => $nm});
+	   $self->element({ 'Name' => 'BlastOutput_query-len', 
+			    'Data' => $size});
+	   defined $desc && $desc =~ s/\s+$//;
+	   $self->element({ 'Name' => 'BlastOutput_querydesc', 
+			    'Data' => $desc});
+	   
+	   if( my @pieces = split(/\|/,$nm) ) {
+	       my $acc = pop @pieces;
+	       $acc = pop @pieces if( ! defined $acc || $acc =~ /^\s+$/);
+	       $self->element({ 'Name' =>  'BlastOutput_query-acc',
+				'Data'  => $acc});
+	   }
+	   
+       } elsif( /Sequences producing significant alignments:/ ) {
+	 descline:
+           while( defined ($_ = $self->_readline() )) {
+               if( /^>/ ) {
+                   $self->_pushback($_);
+                   last descline;
+               } elsif( /(\d+)\s+([\d\.\-eE]+)(\s+\d+)?\s*$/) {
+		   # the last match is for gapped BLAST output
+		   # which will report the number of HSPs for the Hit
+                   my ($score, $evalue) = ($1, $2);
+                   # Some data clean-up so e-value will appear numeric to perl
+                   $evalue =~ s/^e/1e/i;
+		   push @hit_signifs, [ $evalue, $score ];
+               }
+	   }
+       } elsif( /Sequences producing High-scoring Segment Pairs:/ ) {
+	   # skip the next line
+	   $_ = $self->_readline();
+	   
+	    while( defined ($_ = $self->_readline() ) && 
+		  ! /^\s+$/ ) {	
+	       my @line = split;
+	       pop @line; # throw away first number which is for 'N'col
+	       push @hit_signifs, [ pop @line, pop @line];
+	   }
+       } elsif ( /^Database:\s*(.+)$/ ) {
+	   my $db = $1;
+
+	   while( defined($_ = $self->_readline) ) {
+	       if( /^\s+(\-?[\d\,]+)\s+sequences\;\s+(\-?[\d,]+)\s+total\s+letters/){
+		   my ($s,$l) = ($1,$2);
+		   $s =~ s/,//g;
+		   $l =~ s/,//g;
+		   $self->element({'Name' => 'BlastOutput_db-len',
+				   'Data' => $s});	       
+		   $self->element({'Name' => 'BlastOutput_db-let',
+				   'Data' => $l});
+		   last;
+	       } else {
+		   chomp;
+		   $db .= $_;
+	       }
+	   }	       
+	   $self->element({'Name' => 'BlastOutput_db',
+			   'Data' => $db});
+       } elsif( /^>(\S+)\s*(.*)?/ ) {
+	   chomp;
+
+	   $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'});
+	   $self->in_element('hit') && $self->end_element({ 'Name' => 'Hit'});
+	   
+	   $self->start_element({ 'Name' => 'Hit'});
+	   my $id = $1;	  
+	   my $restofline = $2;
+	   $self->element({ 'Name' => 'Hit_id',
+			    'Data' => $id});	   
+	   my ($acc, $version);
+	   if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) {
+	   ($acc, $version) = split /\./, $2; 
+	   } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) {
+	   ($acc, $version) = split /\./, $3;  
+	   } else {
+	   	#punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
+	   	#Database Name                     Identifier Syntax
+        #============================      ========================
+        #GenBank                           gb|accession|locus
+        #EMBL Data Library                 emb|accession|locus
+        #DDBJ, DNA Database of Japan       dbj|accession|locus
+        #NBRF PIR                          pir||entry
+        #Protein Research Foundation       prf||name
+        #SWISS-PROT                        sp|accession|entry name
+        #Brookhaven Protein Data Bank      pdb|entry|chain
+        #Patents                           pat|country|number 
+        #GenInfo Backbone Id               bbs|number 
+        #General database identifier	   gnl|database|identifier
+        #NCBI Reference Sequence           ref|accession|locus
+        #Local Sequence identifier         lcl|identifier
+	   	$acc=$id;
+	   }
+	   $self->element({ 'Name' =>  'Hit_accession',
+			    'Data'  => $acc});	   
+
+	   my $v = shift @hit_signifs;
+	   if( defined $v ) {
+	       $self->element({'Name' => 'Hit_signif',
+			       'Data' => $v->[0]});
+	       $self->element({'Name' => 'Hit_score',
+			       'Data' => $v->[1]});
+	   }
+	   while(defined($_ = $self->_readline()) ) {
+	       next if( /^\s+$/ );
+	       chomp;
+	       if(  /Length\s*=\s*([\d,]+)/ ) {
+		   my $l = $1;
+		   $l =~ s/\,//g;
+		   $self->element({ 'Name' => 'Hit_len',
+				    'Data' => $l });
+		   last;	       
+	       } else { 
+		   $restofline .= $_;
+	       }
+	   }
+	   $restofline =~ s/\s+/ /g;
+	   $self->element({ 'Name' => 'Hit_def',
+			    'Data' => $restofline});       
+      } elsif( /\s+(Plus|Minus) Strand HSPs:/i ) {
+	   next;
+       } elsif( ($self->in_element('hit') || 
+		 $self->in_element('hsp')) && # wublast
+	       m/Score\s*=\s*(\S+)\s*         # Bit score
+		\(([\d\.]+)\s*bits\),         # Raw score
+		\s*Expect\s*=\s*([^,\s]+),    # E-value
+		\s*(?:Sum)?\s*                # SUM
+		P(?:\(\d+\))?\s*=\s*([^,\s]+) # P-value
+		/ox 
+		  ) {	   
+	   my ($score, $bits,$evalue,$pvalue) = ($1,$2,$3,$4);
+	   $evalue =~ s/^e/1e/i;
+	   $pvalue =~ s/^e/1e/i;
+	   $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'});
+	   $self->start_element({'Name' => 'Hsp'});
+       	   $self->element( { 'Name' => 'Hsp_score',
+			     'Data' => $score});
+	   $self->element( { 'Name' => 'Hsp_bit-score',
+			     'Data' => $bits});
+	   $self->element( { 'Name' => 'Hsp_evalue',			     
+			     'Data' => $evalue});
+	   $self->element( {'Name'  => 'Hsp_pvalue',
+			    'Data'  => $pvalue});       
+       } elsif( ($self->in_element('hit') || 
+		 $self->in_element('hsp')) && # ncbi blast
+		m/Score\s*=\s*(\S+)\s*bits\s* # Bit score
+		(?:\((\d+)\))?,             # Missing for BLAT pseudo-BLAST fmt
+		\s*Expect(?:\(\d+\))?\s*=\s*(\S+) # E-value
+		/ox) {
+	   my ($bits,$score,$evalue) = ($1,$2,$3);
+	   $evalue =~ s/^e/1e/i;
+	   $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'});
+	   
+	   $self->start_element({'Name' => 'Hsp'});
+	   $self->element( { 'Name' => 'Hsp_score',
+			     'Data' => $score});
+	   $self->element( { 'Name' => 'Hsp_bit-score',
+			     'Data' => $bits});
+	   $self->element( { 'Name' => 'Hsp_evalue',
+			     'Data' => $evalue});
+       } elsif( $self->in_element('hsp') &&
+		m/Identities\s*=\s*(\d+)\s*\/\s*(\d+)\s*[\d\%\(\)]+\s*
+		(?:,\s*Positives\s*=\s*(\d+)\/(\d+)\s*[\d\%\(\)]+\s*)? # pos only valid for Protein alignments
+		(?:\,\s*Gaps\s*=\s*(\d+)\/(\d+))? # Gaps
+		/oxi 
+		) {
+	   $self->element( { 'Name' => 'Hsp_identity',
+			     'Data' => $1});
+	   $self->element( {'Name' => 'Hsp_align-len',
+			    'Data' => $2});
+	   if( defined $3 ) {
+	       $self->element( { 'Name' => 'Hsp_positive',
+				 'Data' => $3});
+	   } else { 
+	       $self->element( { 'Name' => 'Hsp_positive',
+				 'Data' => $1});
+	   }
+	   if( defined $6 ) { 	       
+	       $self->element( { 'Name' => 'Hsp_gaps',
+				 'Data' => $5});
+	   }
+	   
+	   $self->{'_Query'} = { 'begin' => 0, 'end' => 0};
+	   $self->{'_Sbjct'} = { 'begin' => 0, 'end' => 0};
+
+	   if( /(Frame\s*=\s*.+)$/ ) {
+	       # handle wu-blast Frame listing on same line
+	       $self->_pushback($1);
+	   }	   
+       } elsif( $self->in_element('hsp') &&
+		/Strand\s*=\s*(Plus|Minus)\s*\/\s*(Plus|Minus)/i ) {
+	   # consume this event ( we infer strand from start/end)
+	   next;
+       } elsif( $self->in_element('hsp') &&
+		/Frame\s*=\s*([\+\-][1-3])\s*(\/\s*([\+\-][1-3]))?/ ){
+	   my ($one,$two)= ($1,$2);
+	   my ($queryframe,$hitframe);
+	   if( $reporttype eq 'TBLASTX' ) {
+	       ($queryframe,$hitframe) = ($one,$two);
+	       $hitframe =~ s/\/\s*//g;
+	   } elsif( $reporttype =~ /^(PSI)?TBLASTN/oi ) {
+	       ($hitframe,$queryframe) = ($one,0);	       
+	   } elsif( $reporttype eq 'BLASTX' ) {	       
+	       ($queryframe,$hitframe) = ($one,0);
+	   } 
+	   $self->element({'Name' => 'Hsp_query-frame',
+			   'Data' => $queryframe});
+	   	   
+	   $self->element({'Name' => 'Hsp_hit-frame',
+			   'Data' => $hitframe});
+       } elsif(  /^Parameters:/ || /^\s+Database:\s+?/ || /^\s+Subset/ ||
+		 /^\s+Subset/ || /^\s*Lambda/ || /^\s*Histogram/ ||
+		 ( $self->in_element('hsp') &&  /WARNING|NOTE/ ) ) {
+	   $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'});
+	   $self->in_element('hit') && $self->end_element({'Name' => 'Hit'});
+	   next if /^\s+Subset/;
+	   my $blast = ( /^(\s+Database\:)|(\s*Lambda)/ ) ? 'ncbi' : 'wublast';
+	   if( /^\s*Histogram/ ) {
+	       $blast = 'btk';
+	   }
+	   my $last = '';
+	   # default is that gaps are allowed
+	   $self->element({'Name' => 'Parameters_allowgaps',
+			   'Data' => 'yes'});
+	   while( defined ($_ = $self->_readline ) ) {
+	       if( /^(PSI)?([T]?BLAST[NPX])\s*(.+)$/i ||
+		   /^(RPS-BLAST)\s*(.+)$/i ||
+		   /^(MEGABLAST)\s*(.+)$/i  ) {
+		   $self->_pushback($_);
+		   # let's handle this in the loop
+		   last;
+	       } elsif( /^Query=/ ) {	
+		   $self->_pushback($reportline);
+		   $self->_pushback($_);
+		   # -- Superfluous I think
+		   $self->in_element('hsp') && 
+		       $self->end_element({'Name' => 'Hsp'});
+		   $self->in_element('hit') && 
+		       $self->end_element({'Name' => 'Hit'});
+		   # --
+		   $self->end_element({ 'Name' => 'BlastOutput'});
+		   return $self->end_document();
+	       }
+	       
+	       # here is where difference between wublast and ncbiblast
+	       # is better handled by different logic
+	       if( /Number of Sequences:\s+([\d\,]+)/i ||
+			/of sequences in database:\s+([\d,]+)/i) {
+		   my $c = $1;
+		   $c =~ s/\,//g;
+		   $self->element({'Name' => 'Statistics_db-len',
+				   'Data' => $c});
+	       } elsif ( /letters in database:\s+(\-?[\d,]+)/i) {	   
+		   my $s = $1;
+		   $s =~ s/,//g;
+		   $self->element({'Name' => 'Statistics_db-let',
+				   'Data' => $s});
+	       } elsif( $blast eq 'btk' ) { 
+		   next;
+	       } elsif( $blast eq 'wublast' ) {
+		   if( /E=(\S+)/ ) {
+		       $self->element({'Name' => 'Parameters_expect',
+				       'Data' => $1});
+		   } elsif( /nogaps/ ) {
+		       $self->element({'Name' => 'Parameters_allowgaps',
+				       'Data' => 'no'});
+		   } elsif( $last =~ /(Frame|Strand)\s+MatID\s+Matrix name/i ){
+		       s/^\s+//;
+                       #throw away first two slots
+		       my @vals = split;
+		       splice(@vals, 0,2);
+		       my ($matrix,$lambda,$kappa,$entropy) = @vals;
+		       $self->element({'Name' => 'Parameters_matrix',
+				       'Data' => $matrix});
+		       $self->element({'Name' => 'Statistics_lambda',
+				       'Data' => $lambda});
+		       $self->element({'Name' => 'Statistics_kappa',
+				       'Data' => $kappa});
+		       $self->element({'Name' => 'Statistics_entropy',
+				       'Data' => $entropy});
+		   } elsif( m/^\s+Q=(\d+),R=(\d+)\s+/ox ) {
+		       $self->element({'Name' => 'Parameters_gap-open',
+				       'Data' => $1});
+		       $self->element({'Name' => 'Parameters_gap-extend',
+				       'Data' => $2});
+		   } elsif( /(\S+\s+\S+)\s+DFA:\s+(\S+)\s+\((.+)\)/ ) {
+		       if( $1 eq 'states in') { 
+			   $self->element({'Name' => 'Statistics_DFA_states',
+					   'Data' => "$2 $3"});
+		       } elsif( $1 eq 'size of') {
+			   $self->element({'Name' => 'Statistics_DFA_size',
+					   'Data' => "$2 $3"});
+		       }
+		   } elsif( /^\s+Time to generate neighborhood:\s+(\S+\s+\S+\s+\S+)/ ) { 
+		       $self->element({'Name' => 'Statistics_neighbortime',
+				       'Data' => $1});
+		   } elsif( /processors\s+used:\s+(\d+)/ ) {
+		          $self->element({'Name' => 'Statistics_noprocessors',
+					   'Data' => $1});
+		   } elsif( m/^\s+(\S+)\s+cpu\s+time:\s+(\S+\s+\S+\s+\S+)\s+
+			  Elapsed:\s+(\S+)/ox ) {
+		       my $cputype = lc($1);
+		       $self->element({'Name' => "Statistics_$cputype\_cputime",
+				       'Data' => $2});
+		       $self->element({'Name' => "Statistics_$cputype\_actualtime",
+				       'Data' => $3});
+		   } elsif( /^\s+Start:/ ) {
+		       my ($junk,$start,$stime,$end,$etime) = 
+			   split(/\s+(Start|End)\:\s+/,$_);
+		       chomp($stime);
+		       $self->element({'Name' => 'Statistics_starttime',
+				       'Data' => $stime});
+		       chomp($etime);
+		       $self->element({'Name' => 'Statistics_endtime',
+				       'Data' => $etime});
+		   } elsif( !/^\s+$/ ) {
+		       $self->debug( "unmatched stat $_");
+		   }
+		   
+	       } elsif ( $blast eq 'ncbi' ) {
+		   if( m/^Matrix:\s+(\S+)/oxi ) {
+		       $self->element({'Name' => 'Parameters_matrix',
+				       'Data' => $1});		       
+		   } elsif( /Lambda/ ) {
+		       $_ = $self->_readline;
+		       s/^\s+//;
+		       my ($lambda, $kappa, $entropy) = split;
+		       $self->element({'Name' => 'Statistics_lambda',
+				       'Data' => $lambda});
+		       $self->element({'Name' => 'Statistics_kappa',
+				       'Data' => $kappa});
+		       $self->element({'Name' => 'Statistics_entropy',
+				       'Data' => $entropy});
+		   } elsif( m/effective\s+search\s+space\s+used:\s+(\d+)/ox ) {
+		       $self->element({'Name' => 'Statistics_eff-spaceused',
+				       'Data' => $1});		       
+		   } elsif( m/effective\s+search\s+space:\s+(\d+)/ox ) {
+		       $self->element({'Name' => 'Statistics_eff-space',
+				       'Data' => $1});
+		   } elsif( m/Gap\s+Penalties:\s+Existence:\s+(\d+)\,
+			    \s+Extension:\s+(\d+)/ox) {		       
+		       $self->element({'Name' => 'Parameters_gap-open',
+				       'Data' => $1});
+		       $self->element({'Name' => 'Parameters_gap-extend',
+				       'Data' => $2});
+		   } elsif( /effective\s+HSP\s+length:\s+(\d+)/ ) {
+		        $self->element({'Name' => 'Statistics_hsp-len',
+					'Data' => $1});
+		   } elsif( /effective\s+length\s+of\s+query:\s+([\d\,]+)/ ) {
+		       my $c = $1;
+		       $c =~ s/\,//g;
+		       $self->element({'Name' => 'Statistics_query-len',
+				       'Data' => $c});
+		   } elsif( m/effective\s+length\s+of\s+database:\s+
+			    ([\d\,]+)/ox){
+		       my $c = $1;
+		       $c =~ s/\,//g;
+		       $self->element({'Name' => 'Statistics_eff-dblen',
+				       'Data' => $c});
+		   } elsif( m/^(T|A|X1|X2|S1|S2):\s+(.+)/ox ) {
+		       my $v = $2;
+		       chomp($v);
+		       $self->element({'Name' => "Statistics_$1",
+                                       'Data' => $v});
+		   } elsif( m/frameshift\s+window\,\s+decay\s+const:\s+
+			    (\d+)\,\s+([\.\d]+)/ox ) {
+		       $self->element({'Name'=> 'Statistics_framewindow',
+				       'Data' => $1});
+		       $self->element({'Name'=> 'Statistics_decay',
+				       'Data' => $2});
+		   } elsif( m/^Number\s+of\s+Hits\s+to\s+DB:\s+(\S+)/ox ) {
+		       $self->element({'Name' => 'Statistics_hit_to_db',
+				       'Data' => $1});
+		   } elsif( m/^Number\s+of\s+extensions:\s+(\S+)/ox ) {
+		       $self->element({'Name' => 'Statistics_num_extensions',
+				       'Data' => $1});
+		   } elsif( m/^Number\s+of\s+successful\s+extensions:\s+
+			    (\S+)/ox ) {
+		       $self->element({'Name' => 'Statistics_num_suc_extensions',
+				       'Data' => $1});
+		   } elsif( m/^Number\s+of\s+sequences\s+better\s+than\s+
+			    (\S+):\s+(\d+)/ox ) {
+		       $self->element({'Name' => 'Parameters_expect',
+				       'Data' => $1});
+		       $self->element({'Name' => 'Statistics_seqs_better_than_cutoff',
+				       'Data' => $2});
+		   } elsif( /^\s+Posted\s+date:\s+(.+)/ ) {
+		       my $d = $1;
+		       chomp($d);
+		       $self->element({'Name' => 'Statistics_posted_date',
+				       'Data' => $d});
+		   } elsif( ! /^\s+$/ ) { 
+		       $self->debug( "unmatched stat $_");
+		   }
+	       }
+	       $last = $_;
+	   }
+       } elsif( $self->in_element('hsp') ) {
+           # let's read 3 lines at a time;
+	   my %data = ( 'Query' => '',
+			'Mid' => '',
+			'Hit' => '' );
+	   my ($l,$len);
+	   for( my $i = 0; 
+		defined($_) && $i < 3; 
+		$i++ ){	       
+	       chomp;	       
+	       if( ($i == 0 &&  /^\s+$/) || ($l = /^\s*Lambda/i) ) { 
+		   $self->_pushback($_) if defined $_;
+		   # this fixes bug #1443
+		   $self->end_element({'Name' => 'Hsp'});
+		   $self->end_element({'Name' => 'Hit'}) if $l;		   
+		   last; 
+	       }
+	       if( /^((Query|Sbjct):\s+(\d+)\s*)(\S+)\s+(\d+)/ ) {
+		   $data{$2} = $4;
+		   $len = length($1);
+		   $self->{"\_$2"}->{'begin'} = $3 unless $self->{"_$2"}->{'begin'};
+		   $self->{"\_$2"}->{'end'} = $5;
+	       } else { 
+		   $self->throw("no data for midline $_") 
+		       unless (defined $_ && defined $len);
+		   $data{'Mid'} = substr($_,$len);
+	       }
+	       $_ = $self->_readline();	       
+	   }
+	   $self->characters({'Name' => 'Hsp_qseq',
+			      'Data' => $data{'Query'} });
+	   $self->characters({'Name' => 'Hsp_hseq',
+			      'Data' => $data{'Sbjct'}});
+	   $self->characters({'Name' => 'Hsp_midline',
+			      'Data' => $data{'Mid'} });
+       } else { 
+	   $self->debug( "unrecognized line $_");
+       }
+   } 
+
+   if( $seentop ) {
+       # double back check that hits and hsps are closed
+       # this in response to bug #1443 (may be uncessary due to fix
+       # above, but making double sure)
+       $self->in_element('hsp') && 
+	   $self->end_element({'Name' => 'Hsp'});
+       $self->in_element('hit') && 
+	   $self->end_element({'Name' => 'Hit'});       
+       $self->end_element({'Name' => 'BlastOutput'});
+   }
+#   $self->end_element({'Name' => 'BlastOutput'}) unless ! $seentop;
+   return $self->end_document();
+}
+
+=head2 start_element
+
+ Title   : start_element
+ Usage   : $eventgenerator->start_element
+ Function: Handles a start element event
+ Returns : none
+ Args    : hashref with at least 2 keys 'Data' and 'Name'
+
+
+=cut
+
+sub start_element{
+   my ($self,$data) = @_;
+   # we currently don't care about attributes
+   my $nm = $data->{'Name'};    
+   my $type = $MODEMAP{$nm};
+   if( $type ) {
+       if( $self->_eventHandler->will_handle($type) ) {
+	   my $func = sprintf("start_%s",lc $type);
+	   $self->_eventHandler->$func($data->{'Attributes'});
+       }
+       unshift @{$self->{'_elements'}}, $type;
+       if( $type eq 'result') {
+	   $self->{'_values'} = {};
+	   $self->{'_result'}= undef;
+       } else { 
+	   # cleanup some things
+	   if( defined $self->{'_values'} ) {
+	       foreach my $k ( grep { /^\U$type\-/ } 
+			       keys %{$self->{'_values'}} ) { 
+		   delete $self->{'_values'}->{$k};
+	       }
+	   }
+       }
+   }
+}
+
+=head2 end_element
+
+ Title   : start_element
+ Usage   : $eventgenerator->end_element
+ Function: Handles an end element event
+ Returns : none
+ Args    : hashref with at least 2 keys 'Data' and 'Name'
+
+
+=cut
+
+sub end_element {
+    my ($self,$data) = @_;
+    my $nm = $data->{'Name'};
+    my $type = $MODEMAP{$nm};
+    my $rc;
+    if($nm eq 'BlastOutput_program' &&
+       $self->{'_last_data'} =~ /(t?blast[npx])/i ) {
+	$self->{'_reporttype'} = uc $1; 	    
+    }   
+
+    # Hsp are sort of weird, in that they end when another
+    # object begins so have to detect this in end_element for now
+    if( $nm eq 'Hsp' ) {
+	foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
+	    $self->element({'Name' => $_,
+			    'Data' => $self->{'_last_hspdata'}->{$_}});
+	}
+	$self->{'_last_hspdata'} = {};
+	$self->element({'Name' => 'Hsp_query-from',
+			'Data' => $self->{'_Query'}->{'begin'}});
+	$self->element({'Name' => 'Hsp_query-to',
+			'Data' => $self->{'_Query'}->{'end'}});
+	
+	$self->element({'Name' => 'Hsp_hit-from',
+			'Data' => $self->{'_Sbjct'}->{'begin'}});
+	$self->element({'Name' => 'Hsp_hit-to',
+			'Data' => $self->{'_Sbjct'}->{'end'}});
+    }
+    if( $type = $MODEMAP{$nm} ) {
+	if( $self->_eventHandler->will_handle($type) ) {
+	    my $func = sprintf("end_%s",lc $type);
+	    $rc = $self->_eventHandler->$func($self->{'_reporttype'},
+					      $self->{'_values'});	    
+	}
+	shift @{$self->{'_elements'}};
+
+    } elsif( $MAPPING{$nm} ) { 	
+	
+	if ( ref($MAPPING{$nm}) =~ /hash/i ) {
+	    # this is where we shove in the data from the 
+	    # hashref info about params or statistics
+	    my $key = (keys %{$MAPPING{$nm}})[0];	    	    
+	    $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
+	} else {
+	    $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
+	}
+    } else { 
+	$self->debug( "unknown nm $nm, ignoring\n");
+    }
+    $self->{'_last_data'} = ''; # remove read data if we are at 
+				# end of an element
+    $self->{'_result'} = $rc if( defined $type && $type eq 'result' );
+    return $rc;
+
+}
+
+=head2 element
+
+ Title   : element
+ Usage   : $eventhandler->element({'Name' => $name, 'Data' => $str});
+ Function: Convience method that calls start_element, characters, end_element
+ Returns : none
+ Args    : Hash ref with the keys 'Name' and 'Data'
+
+
+=cut
+
+sub element{
+   my ($self,$data) = @_;
+   $self->start_element($data);       
+   $self->characters($data);
+   $self->end_element($data);
+}
+
+=head2 characters
+
+ Title   : characters
+ Usage   : $eventgenerator->characters($str)
+ Function: Send a character events
+ Returns : none
+ Args    : string
+
+
+=cut
+
+sub characters{
+   my ($self,$data) = @_;   
+   return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );   
+
+   if( $self->in_element('hsp') && 
+       $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
+       $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
+   }   
+   $self->{'_last_data'} = $data->{'Data'}; 
+}
+
+=head2 within_element
+
+ Title   : within_element
+ Usage   : if( $eventgenerator->within_element($element) ) {}
+ Function: Test if we are within a particular element
+           This is different than 'in' because within can be tested
+           for a whole block.
+ Returns : boolean
+ Args    : string element name 
+
+
+=cut
+
+sub within_element{
+   my ($self,$name) = @_;  
+   return 0 if ( ! defined $name &&
+		 ! defined  $self->{'_elements'} ||
+		 scalar @{$self->{'_elements'}} == 0) ;
+   foreach (  @{$self->{'_elements'}} ) {
+       if( $_ eq $name  ) {
+	   return 1;
+       } 
+   }
+   return 0;
+}
+
+
+=head2 in_element
+
+ Title   : in_element
+ Usage   : if( $eventgenerator->in_element($element) ) {}
+ Function: Test if we are in a particular element
+           This is different than 'in' because within can be tested
+           for a whole block.
+ Returns : boolean
+ Args    : string element name 
+
+
+=cut
+
+sub in_element{
+   my ($self,$name) = @_;  
+   return 0 if ! defined $self->{'_elements'}->[0];
+   return ( $self->{'_elements'}->[0] eq $name)
+}
+
+=head2 start_document
+
+ Title   : start_document
+ Usage   : $eventgenerator->start_document
+ Function: Handle a start document event
+ Returns : none
+ Args    : none
+
+
+=cut
+
+sub start_document{
+    my ($self) = @_;
+    $self->{'_lasttype'} = '';
+    $self->{'_values'} = {};
+    $self->{'_result'}= undef;
+    $self->{'_elements'} = [];
+}
+
+=head2 end_document
+
+ Title   : end_document
+ Usage   : $eventgenerator->end_document
+ Function: Handles an end document event
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+
+=cut
+
+sub end_document{
+   my ($self,@args) = @_;
+   return $self->{'_result'};
+}
+
+
+sub write_result {
+   my ($self, $blast, @args) = @_;
+
+   if( not defined($self->writer) ) {
+       $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS");
+       $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() );
+   }
+   $self->SUPER::write_result( $blast, @args );
+}
+
+sub result_count {
+    my $self = shift;
+    return $self->{'_result_count'};
+}
+
+sub report_count { shift->result_count }
+
+1;