diff variant_effect_predictor/Bio/SearchIO/blastxml.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/SearchIO/blastxml.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,436 @@
+# $Id: blastxml.pm,v 1.24 2002/10/26 09:32:16 sac Exp $
+#
+# BioPerl module for Bio::SearchIO::blastxml
+#
+# Cared for by Jason Stajich <jason@bioperl.org>
+#
+# Copyright Jason Stajich
+#
+# You may distribute this module under the same terms as perl itself
+
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing. 
+
+=head1 SYNOPSIS
+
+    use Bio::SearchIO;
+    my $searchin = new Bio::SearchIO(-format => 'blastxml',
+				     -file   => 't/data/plague_yeast.bls.xml');
+    while( my $result = $searchin->next_result ) {
+    }
+
+    # one can also request that the parser NOT keep the XML data in memory
+    # by using the tempfile initialization flag.
+    my $searchin = new Bio::SearchIO(-tempfile => 1,
+				     -format => 'blastxml',
+				     -file   => 't/data/plague_yeast.bls.xml');
+    while( my $result = $searchin->next_result ) {
+    }
+
+=head1 DESCRIPTION
+
+This object implements a NCBI Blast XML parser.
+
+There is one additional initialization flag from the SearchIO defaults
+- that is the -tempfile flag.  If specified as true, then the parser
+will write out each report to a temporary filehandle rather than
+holding the entire report as a string in memory.  The reason this is
+done in the first place is NCBI reports have an uncessary E<lt>?xml
+version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports
+have an additional unecessary RPS-BLAST tag at the top of each report.
+So we currently have implemented the work around by preparsing the
+file (yes it makes the process slower, but it works).
+
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to
+the Bioperl mailing list.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bioperl.org/MailList.shtml  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+of the bugs and their resolution. Bug reports can be submitted via
+email or the web:
+
+  bioperl-bugs@bioperl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Jason Stajich
+
+Email jason@bioperl.org
+
+Describe contact details here
+
+=head1 CONTRIBUTORS
+
+Additional contributors names and emails here
+
+=head1 APPENDIX
+
+The rest of the documentation details each of the object methods.
+Internal methods are usually preceded with a _
+
+=cut
+
+# Let the code begin...
+
+package Bio::SearchIO::blastxml;
+use vars qw(@ISA $DTD %MAPPING %MODEMAP $DEBUG);
+use strict;
+
+$DTD = 'ftp://ftp.ncbi.nlm.nih.gov/blast/documents/NCBI_BlastOutput.dtd';
+# Object preamble - inherits from Bio::Root::Root
+
+use Bio::Root::Root;
+use Bio::SearchIO;
+use XML::Parser::PerlSAX;
+use XML::Handler::Subs;
+use HTML::Entities;
+use IO::File;
+
+
+BEGIN { 
+    # mapping of NCBI Blast terms to Bioperl hash keys
+    %MODEMAP = ('BlastOutput' => 'result',
+		'Hit'         => 'hit',
+		'Hsp'         => 'hsp'
+		);
+
+    %MAPPING = ( 
+		 # HSP specific fields
+		 'Hsp_bit-score'  => 'HSP-bits',
+		 'Hsp_score'      => 'HSP-score',
+		 'Hsp_evalue'     => 'HSP-evalue',
+		 'Hsp_query-from' => 'HSP-query_start',
+		 'Hsp_query-to'   => 'HSP-query_end',
+		 'Hsp_hit-from'   => 'HSP-hit_start',
+		 'Hsp_hit-to'     => 'HSP-hit_end',
+		 'Hsp_positive'   => 'HSP-conserved',
+		 'Hsp_identity'   => 'HSP-identical',
+		 'Hsp_gaps'       => 'HSP-gaps',
+		 'Hsp_hitgaps'    => 'HSP-hit_gaps',
+		 'Hsp_querygaps'  => 'HSP-query_gaps',
+		 'Hsp_qseq'       => 'HSP-query_seq',
+		 'Hsp_hseq'       => 'HSP-hit_seq',
+		 'Hsp_midline'    => 'HSP-homology_seq',
+		 'Hsp_align-len'  => 'HSP-hsp_length',
+		 'Hsp_query-frame'=> 'HSP-query_frame',
+		 'Hsp_hit-frame'  => 'HSP-hit_frame',
+
+		 # these are ignored for now
+		 'Hsp_num'          => 'HSP-order',
+		 'Hsp_pattern-from' => 'patternend',
+		 'Hsp_pattern-to'   => 'patternstart',
+		 'Hsp_density'      => 'hspdensity',
+
+		 # Hit specific fields
+		 'Hit_id'               => 'HIT-name',
+		 'Hit_len'              => 'HIT-length',
+		 'Hit_accession'        => 'HIT-accession',
+		 'Hit_def'              => 'HIT-description',
+		 'Hit_num'              => 'HIT-order',
+		 'Iteration_iter-num'   => 'HIT-iteration',
+		 'Iteration_stat'       => 'HIT-iteration_statistic',
+		 
+		 'BlastOutput_program'   => 'RESULT-algorithm_name',
+		 'BlastOutput_version'   => 'RESULT-algorithm_version',
+		 'BlastOutput_query-def' => 'RESULT-query_description',
+		 'BlastOutput_query-len' => 'RESULT-query_length',
+		 'BlastOutput_db'        => 'RESULT-database_name',
+		 'BlastOutput_reference' => 'RESULT-program_reference',
+		 'BlastOutput_query-ID'  => 'runid',
+		 
+		 'Parameters_matrix'    => { 'RESULT-parameters' => 'matrix'},
+		 'Parameters_expect'    => { 'RESULT-parameters' => 'expect'},
+		 'Parameters_include'   => { 'RESULT-parameters' => 'include'},
+		 'Parameters_sc-match'  => { 'RESULT-parameters' => 'match'},
+		 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'},
+		 'Parameters_gap-open'  => { 'RESULT-parameters' => 'gapopen'},
+		 'Parameters_gap-extend'=> { 'RESULT-parameters' => 'gapext'},
+		 'Parameters_filter'    => {'RESULT-parameters' => 'filter'},
+		 'Statistics_db-num'    => 'RESULT-database_entries',
+		 'Statistics_db-len'    => 'RESULT-database_letters',
+		 'Statistics_hsp-len'   => { 'RESULT-statistics' => 'hsplength'},
+		 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'},
+		 'Statistics_kappa'     => { 'RESULT-statistics' => 'kappa' },
+		 'Statistics_lambda'    => { 'RESULT-statistics' => 'lambda' },
+		 'Statistics_entropy'   => { 'RESULT-statistics' => 'entropy'},
+		 );
+    eval {  require Time::HiRes };	
+    if( $@ ) { $DEBUG = 0; }
+}
+
+
+@ISA = qw(Bio::SearchIO );
+
+=head2 new
+
+ Title   : new
+ Usage   : my $searchio = new Bio::SearchIO(-format => 'blastxml',
+					    -file   => 'filename',
+					    -tempfile => 1);
+ Function: Initializes the object - this is chained through new in SearchIO
+ Returns : Bio::SearchIO::blastxml object
+ Args    : One additional argument from the format and file/fh parameters.
+           -tempfile => boolean.  Defaults to false.  Write out XML data
+                                  to a temporary filehandle to send to 
+                                  PerlSAX parser.
+=cut
+
+=head2 _initialize
+
+ Title   : _initialize
+ Usage   : private
+ Function: Initializes the object - this is chained through new in SearchIO
+
+=cut
+
+sub _initialize{
+   my ($self,@args) = @_;   
+   $self->SUPER::_initialize(@args);
+   my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)],@args);
+   defined $usetempfile && $self->use_tempfile($usetempfile);
+   $self->{'_xmlparser'} = new XML::Parser::PerlSAX();
+   $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0);
+}
+
+=head2 next_result
+
+ Title   : next_result
+ Usage   : my $hit = $searchio->next_result;
+ Function: Returns the next Result from a search
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+=cut
+
+sub next_result {
+    my ($self) = @_;
+ 
+    my $data = '';
+    my $firstline = 1;
+    my ($tfh);
+    if( $self->use_tempfile ) {
+	$tfh = IO::File->new_tmpfile or $self->throw("Unable to open temp file: $!");	
+	$tfh->autoflush(1);
+    }
+    my $okaytoprocess;
+    while( defined( $_ = $self->_readline) ) {
+	if( /^RPS-BLAST/i ) {
+	    $self->{'_type'} = 'RPSBLAST';
+	    next;
+	}
+	if( /^<\?xml version/ && ! $firstline) { 
+	    $self->_pushback($_);
+	    last;
+	}
+	$_ = decode_entities($_);
+#	s/\&apos;/\`/g;	
+#	s/\&gt;/\>/g;
+#	s/\&lt;/\</g;
+	$okaytoprocess = 1;
+	if( defined $tfh ) {
+	    print $tfh $_;
+	} else {
+	    $data .= $_;
+	}
+	$firstline = 0;
+    }
+
+    return undef unless( $okaytoprocess);
+    
+    my %parser_args;
+    if( defined $tfh ) {
+	seek($tfh,0,0);
+	%parser_args = ('Source' => { 'ByteStream' => $tfh },
+			'Handler' => $self);
+    } else {
+	%parser_args = ('Source' => { 'String' => $data },
+			'Handler' => $self);
+    }
+    my $result;
+    my $starttime;
+    if(  $DEBUG ) {  $starttime = [ Time::HiRes::gettimeofday() ]; }
+
+    eval { 
+	$result = $self->{'_xmlparser'}->parse(%parser_args);
+        $self->{'_result_count'}++;
+    };
+    if( $@ ) {
+	$self->warn("error in parsing a report:\n $@");
+	$result = undef;
+    }    
+    if( $DEBUG ) {
+	$self->debug( sprintf("parsing took %f seconds\n", Time::HiRes::tv_interval($starttime)));
+    }
+    # parsing magic here - but we call event handlers rather than 
+    # instantiating things 
+    return $result;
+}
+
+=head2 SAX methods
+
+=cut
+
+=head2 start_document
+
+ Title   : start_document
+ Usage   : $parser->start_document;
+ Function: SAX method to indicate starting to parse a new document
+ Returns : none
+ Args    : none
+
+
+=cut
+
+sub start_document{
+    my ($self) = @_;
+    $self->{'_lasttype'} = '';
+    $self->{'_values'} = {};
+    $self->{'_result'}= undef;
+}
+
+=head2 end_document
+
+ Title   : end_document
+ Usage   : $parser->end_document;
+ Function: SAX method to indicate finishing parsing a new document
+ Returns : Bio::Search::Result::ResultI object
+ Args    : none
+
+=cut
+
+sub end_document{
+   my ($self,@args) = @_;
+   return $self->{'_result'};
+}
+
+=head2 start_element
+
+ Title   : start_element
+ Usage   : $parser->start_element($data)
+ Function: SAX method to indicate starting a new element
+ Returns : none
+ Args    : hash ref for data
+
+=cut
+
+sub start_element{
+    my ($self,$data) = @_;
+    # we currently don't care about attributes
+    my $nm = $data->{'Name'};    
+
+    if( my $type = $MODEMAP{$nm} ) {
+	if( $self->_eventHandler->will_handle($type) ) {
+	    my $func = sprintf("start_%s",lc $type);
+	    $self->_eventHandler->$func($data->{'Attributes'});
+	}						     
+    }
+
+    if($nm eq 'BlastOutput') {
+	$self->{'_values'} = {};
+	$self->{'_result'}= undef;
+    }
+}
+
+=head2 end_element
+
+ Title   : end_element
+ Usage   : $parser->end_element($data)
+ Function: Signals finishing an element
+ Returns : Bio::Search object dpending on what type of element
+ Args    : hash ref for data
+
+=cut
+
+sub end_element{
+    my ($self,$data) = @_;
+
+    my $nm = $data->{'Name'};
+    my $rc;
+    if($nm eq 'BlastOutput_program' &&
+       $self->{'_last_data'} =~ /(t?blast[npx])/i ) {
+	$self->{'_type'} = uc $1; 
+    }
+
+    if( my $type = $MODEMAP{$nm} ) {
+	if( $self->_eventHandler->will_handle($type) ) {
+	    my $func = sprintf("end_%s",lc $type);
+	    $rc = $self->_eventHandler->$func($self->{'_type'},
+					      $self->{'_values'});
+	}
+    } elsif( $MAPPING{$nm} ) { 
+	if ( ref($MAPPING{$nm}) =~ /hash/i ) {
+	    my $key = (keys %{$MAPPING{$nm}})[0];
+	    $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
+	} else {
+	    $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
+	}
+    } elsif( $nm eq 'Iteration' || $nm eq 'Hit_hsps' || $nm eq 'Parameters' ||
+	     $nm eq 'BlastOutput_param' || $nm eq 'Iteration_hits' || 
+	     $nm eq 'Statistics' || $nm eq 'BlastOutput_iterations' ){
+    
+    } else { 	
+	
+	$self->debug("ignoring unrecognized element type $nm\n");
+    }
+    $self->{'_last_data'} = ''; # remove read data if we are at 
+				# end of an element
+    $self->{'_result'} = $rc if( $nm eq 'BlastOutput' );
+    return $rc;
+}
+
+=head2 characters
+
+ Title   : characters
+ Usage   : $parser->characters($data)
+ Function: Signals new characters to be processed
+ Returns : characters read
+ Args    : hash ref with the key 'Data'
+
+
+=cut
+
+sub characters{
+   my ($self,$data) = @_;   
+   return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );
+   
+   $self->{'_last_data'} = $data->{'Data'}; 
+}
+
+=head2 use_tempfile
+
+ Title   : use_tempfile
+ Usage   : $obj->use_tempfile($newval)
+ Function: Get/Set boolean flag on whether or not use a tempfile
+ Example : 
+ Returns : value of use_tempfile
+ Args    : newvalue (optional)
+
+
+=cut
+
+sub use_tempfile{
+   my ($self,$value) = @_;
+   if( defined $value) {
+      $self->{'_use_tempfile'} = $value;
+    }
+    return $self->{'_use_tempfile'};
+}
+
+sub result_count {
+    my $self = shift;
+    return $self->{'_result_count'};
+}
+
+1;