diff variant_effect_predictor/Bio/Tools/BPlite.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Tools/BPlite.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,448 @@
+# $Id: BPlite.pm,v 1.36.2.2 2003/02/20 00:39:03 jason Exp $
+##############################################################################
+# Bioperl module Bio::Tools::BPlite
+##############################################################################
+#
+# The original BPlite.pm module has been written by Ian Korf !
+# see http://sapiens.wustl.edu/~ikorf
+#
+# You may distribute this module under the same terms as perl itself
+
+=head1 NAME
+
+Bio::Tools::BPlite - Lightweight BLAST parser
+
+=head1 SYNOPSIS
+
+ use Bio::Tools::BPlite;
+ my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
+
+  {
+    $report->query;
+    $report->database;
+    while(my $sbjct = $report->nextSbjct) {
+	$sbjct->name;
+	while (my $hsp = $sbjct->nextHSP) {
+	    $hsp->score;
+	    $hsp->bits;
+	    $hsp->percent;
+	    $hsp->P;
+            $hsp->EXP;
+	    $hsp->match;
+	    $hsp->positive;
+	    $hsp->length;
+	    $hsp->querySeq;
+	    $hsp->sbjctSeq;
+	    $hsp->homologySeq;
+	    $hsp->query->start;
+	    $hsp->query->end;
+	    $hsp->hit->start;
+	    $hsp->hit->end;
+	    $hsp->hit->seq_id;
+	    $hsp->hit->overlaps($exon);
+	}
+    }
+
+    # the following line takes you to the next report in the stream/file
+    # it will return 0 if that report is empty,
+    # but that is valid for an empty blast report.
+    # Returns -1 for EOF.
+
+    last if ($report->_parseHeader == -1);
+    redo;
+  }
+
+
+=head1 DESCRIPTION
+
+BPlite is a package for parsing BLAST reports. The BLAST programs are a family
+of widely used algorithms for sequence database searches. The reports are
+non-trivial to parse, and there are differences in the formats of the various
+flavors of BLAST. BPlite parses BLASTN, BLASTP, BLASTX, TBLASTN, and TBLASTX
+reports from both the high performance WU-BLAST, and the more generic
+NCBI-BLAST.
+
+Many people have developed BLAST parsers (I myself have made at least three).
+BPlite is for those people who would rather not have a giant object
+specification, but rather a simple handle to a BLAST report that works well
+in pipes.
+
+=head2 Object
+
+BPlite has three kinds of objects, the report, the subject, and the HSP. To
+create a new report, you pass a filehandle reference to the BPlite constructor.
+
+ my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); # or any other filehandle
+
+The report has two attributes (query and database), and one method (nextSbjct).
+
+ $report->query;     # access to the query name
+ $report->database;  # access to the database name
+ $report->nextSbjct; # gets the next subject
+ while(my $sbjct = $report->nextSbjct) {
+     # canonical form of use is in a while loop
+ }
+
+A subject is a BLAST hit, which should not be confused with an HSP (below). A
+BLAST hit may have several alignments associated with it. A useful way of
+thinking about it is that a subject is a gene and HSPs are the exons. Subjects
+have one attribute (name) and one method (nextHSP).
+
+ $sbjct->name;    # access to the subject name
+ $sbjct->nextHSP; # gets the next HSP from the sbjct
+ while(my $hsp = $sbjct->nextHSP) {
+     # canonical form is again a while loop
+ }
+
+An HSP is a high scoring pair, or simply an alignment.  HSP objects
+inherit all the useful methods from RangeI/SeqFeatureI/FeaturePair,
+but provide an additional set of attributes (score, bits, percent, P,
+match, EXP, positive, length, querySeq, sbjctSeq, homologySeq) that
+should be familiar to anyone who has seen a blast report.
+
+For lazy/efficient coders, two-letter abbreviations are available for the 
+attributes with long names (qs, ss, hs). Ranges of the aligned sequences in
+query/subject and other information (like seqname) are stored
+in SeqFeature objects (i.e.: $hsp-E<gt>query, $hsp-E<gt>subject which is equal to
+$hsp-E<gt>feature1, $hsp-E<gt>feature2). querySeq, sbjctSeq and homologySeq do only
+contain the alignment sequences from the blast report.
+
+ $hsp->score;
+ $hsp->bits;
+ $hsp->percent;
+ $hsp->P;
+ $hsp->match;
+ $hsp->positive;
+ $hsp->length;
+ $hsp->querySeq;      $hsp->qs;
+ $hsp->sbjctSeq;      $hsp->ss;
+ $hsp->homologySeq;   $hsp->hs;
+ $hsp->query->start;
+ $hsp->query->end;
+ $hsp->query->seq_id;
+ $hsp->hit->primary_tag; # "similarity"
+ $hsp->hit->source_tag;  # "BLAST"
+ $hsp->hit->start;
+ $hsp->hit->end;
+ ...
+
+So a very simple look into a BLAST report might look like this.
+
+ my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
+ while(my $sbjct = $report->nextSbjct) {
+     print ">",$sbjct->name,"\n";
+     while(my $hsp = $sbjct->nextHSP) {
+	 	print "\t",$hsp->start,"..",$hsp->end," ",$hsp->bits,"\n";
+     }
+ }
+
+The output of such code might look like this:
+
+ >foo
+     100..155 29.5
+     268..300 20.1
+ >bar
+     100..153 28.5
+     265..290 22.1
+
+
+=head1 AUTHORS
+
+Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), 
+Lorenz Pollak (lorenz@ist.org, bioperl port)
+
+=head1 ACKNOWLEDGEMENTS
+
+This software was developed at the Genome Sequencing Center at Washington
+Univeristy, St. Louis, MO.
+
+=head1 CONTRIBUTORS
+
+Jason Stajich, jason@bioperl.org
+
+=head1 COPYRIGHT
+
+Copyright (C) 1999 Ian Korf. All Rights Reserved.
+
+=head1 DISCLAIMER
+
+This software is provided "as is" without warranty of any kind.
+
+=cut
+
+package Bio::Tools::BPlite;
+
+use strict;
+use vars qw(@ISA);
+
+use Bio::Root::Root;
+use Bio::Root::IO;
+use Bio::Tools::BPlite::Sbjct; # we want to use Sbjct
+use Bio::SeqAnalysisParserI;
+use Symbol;
+
+@ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
+
+# new comes from a RootI now
+
+=head2 new
+
+ Title   : new
+ Function: Create a new Bio::Tools::BPlite object
+ Returns : Bio::Tools::BPlite
+ Args    : -file     input file (alternative to -fh)
+           -fh       input stream (alternative to -file)
+
+=cut
+
+sub new {
+  my ($class, @args) = @_; 
+  my $self = $class->SUPER::new(@args);
+
+  # initialize IO
+  $self->_initialize_io(@args);
+
+  $self->{'QPATLOCATION'} = [];  # Anonymous array of query pattern locations for PHIBLAST
+
+  if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments
+  else                     {$self->{'REPORT_DONE'} = 1} # empty report
+  
+  return $self; # success - we hope!
+}
+
+# for SeqAnalysisParserI compliance
+
+=head2 next_feature
+
+ Title   : next_feature
+ Usage   : while( my $feat = $res->next_feature ) { # do something }
+ Function: SeqAnalysisParserI implementing function. This implementation
+           iterates over all HSPs. If the HSPs of the current subject match
+           are exhausted, it will automatically call nextSbjct().
+ Example :
+ Returns : A Bio::SeqFeatureI compliant object, in this case a
+           Bio::Tools::BPlite::HSP object, and FALSE if there are no more
+           HSPs.
+ Args    : None
+
+=cut
+
+sub next_feature{
+   my ($self) = @_;
+   my ($sbjct, $hsp);
+   $sbjct = $self->{'_current_sbjct'};
+   unless( defined $sbjct ) {
+       $sbjct = $self->{'_current_sbjct'} = $self->nextSbjct;
+       return undef unless defined $sbjct;
+   }   
+   $hsp = $sbjct->nextHSP;
+   unless( defined $hsp ) {
+       $self->{'_current_sbjct'} = undef;
+       return $self->next_feature;
+   }
+   return $hsp || undef;
+}
+
+=head2 query
+
+ Title    : query
+ Usage    : $query = $obj->query();
+ Function : returns the query object
+ Example  :
+ Returns  : query object
+ Args     :
+
+=cut
+
+sub query    {shift->{'QUERY'}}
+
+=head2 qlength
+
+ Title    : qlength
+ Usage    : $len = $obj->qlength();
+ Function : returns the length of the query 
+ Example  :
+ Returns  : length of query
+ Args     :
+
+=cut
+
+sub qlength  {shift->{'LENGTH'}}
+
+=head2 pattern
+
+ Title    : pattern
+ Usage    : $pattern = $obj->pattern();
+ Function : returns the pattern used in a PHIBLAST search
+
+=cut
+
+sub pattern {shift->{'PATTERN'}}
+
+=head2 query_pattern_location
+
+ Title    : query_pattern_location
+ Usage    : $qpl = $obj->query_pattern_location();
+ Function : returns reference to array of locations in the query sequence
+            of pattern used in a PHIBLAST search
+
+=cut
+
+sub query_pattern_location {shift->{'QPATLOCATION'}}
+
+=head2 database
+
+ Title    : database
+ Usage    : $db = $obj->database();
+ Function : returns the database used in this search
+ Example  :
+ Returns  : database used for search
+ Args     :
+
+=cut
+
+sub database {shift->{'DATABASE'}}
+
+=head2 nextSbjct
+
+ Title    : nextSbjct
+ Usage    : $sbjct = $obj->nextSbjct();
+ Function : Method of iterating through all the Sbjct retrieved 
+            from parsing the report 
+ Example  : while ( my $sbjct = $obj->nextSbjct ) {}
+ Returns  : next Sbjct object or null if finished
+ Args     :
+
+=cut
+
+sub nextSbjct {
+  my ($self) = @_;
+  
+  $self->_fastForward or return undef;
+  
+  #######################
+  # get all sbjct lines #
+  #######################
+  my $def = $self->_readline();  
+  while(defined ($_ = $self->_readline() ) ) {
+    if    ($_ !~ /\w/)            {next}
+    elsif ($_ =~ /Strand HSP/)    {next} # WU-BLAST non-data
+    elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback($_); last}
+    elsif ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) {
+	$self->_pushback($_); 
+	last;
+    }
+    else                          {$def .= $_}
+  }
+  $def =~ s/\s+/ /g;
+  $def =~ s/\s+$//g;
+  $def =~ s/Length = ([\d,]+)$//g;
+  my $length = $1;
+  return undef unless $def =~ /^>/;
+  $def =~ s/^>//;
+
+  ####################
+  # the Sbjct object #
+  ####################
+  my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def,
+					    '-length'=>$length,
+                                            '-parent'=>$self);
+  return $sbjct;
+}
+
+# begin private routines
+
+sub _parseHeader {
+  my ($self) = @_;
+
+  # normally, _parseHeader will break out of the parse as soon as it
+  # reaches a new Subject (i.e. the first one after the header) if you
+  # call _parseHeader twice in a row, with nothing in between, all you
+  # accomplish is a ->nextSubject call..  so we need a flag to
+  # indicate that we have *entered* a header, before we are allowed to
+  # leave it!
+
+  my $header_flag = 0; # here is the flag/ It is "false" at first, and
+                       # is set to "true" when any valid header element
+                       # is encountered
+
+  $self->{'REPORT_DONE'} = 0;  # reset this bit for a new report
+  while(defined($_ = $self->_readline() ) ) {
+      s/\(\s*\)//;
+      if ($_ =~ /^Query=(?:\s+([^\(]+))?/) {
+	  $header_flag = 1;	# valid header element found
+	  my $query = $1;
+	  while( defined($_ = $self->_readline() ) ) {
+	      # Continue reading query name until encountering either
+	      # a line that starts with "Database" or a blank line.
+	      # The latter condition is needed in order to be able to
+	      # parse megablast output correctly, since Database comes
+	      # before (not after) the query.
+	      if( ($_ =~ /^Database/) || ($_ =~ /^$/) ) {
+		  $self->_pushback($_); last;
+	      }	      
+	      $query .= $_;
+	  }
+	  $query =~ s/\s+/ /g;
+	  $query =~ s/^>//;
+
+	  my $length = 0;
+	  if( $query =~ /\(([\d,]+)\s+\S+\)\s*$/ ) {      
+	      $length = $1;
+	      $length =~ s/,//g;
+	  } else { 
+	      $self->debug("length is 0 for '$query'\n");
+	  }
+	  $self->{'QUERY'} = $query;
+	  $self->{'LENGTH'} = $length;
+      }
+      elsif ($_ =~ /^(<b>)?(T?BLAST[NPX])\s+([\w\.-]+)\s+(\[[\w-]*\])/) { 
+	  $self->{'BLAST_TYPE'} = $2; 
+	  $self->{'BLAST_VERSION'} = $3;
+      }				# BLAST report type - not a valid header element # JB949
+      
+      # Support Paracel BTK output
+      elsif ( $_ =~ /(^[A-Z0-9_]+)\s+BTK\s+/ ) { 
+	  $self->{'BLAST_TYPE'} = $1;
+	  $self->{'BTK'} = 1;
+     } 
+      elsif ($_ =~ /^Database:\s+(.+)/) {$header_flag = 1;$self->{'DATABASE'} = $1} # valid header element found
+      elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) {   
+	  # For PHIBLAST reports
+	  $header_flag = 1;	# valid header element found
+	  $self->{'PATTERN'} = $1;
+	  push (@{$self->{'QPATLOCATION'}}, $2);
+      } 
+      elsif (($_ =~ /^>/) && ($header_flag==1)) {$self->_pushback($_); return 1} # only leave if we have actually parsed a valid header!
+      elsif (($_ =~ /^Parameters|^\s+Database:/) && ($header_flag==1)) { # if we entered a header, and saw nothing before the stats at the end, then it was empty
+	  $self->_pushback($_);
+	  return 0;		# there's nothing in the report
+      }
+      # bug fix suggested by MI Sadowski via Martin Lomas
+      # see bug report #1118
+      if( ref($self->_fh()) !~ /GLOB/ && $self->_fh()->can('EOF') && eof($self->_fh()) ) {
+	  $self->warn("unexpected EOF in file\n");
+	  return -1;
+      }
+  }
+  return -1; # EOF
+}
+
+sub _fastForward {
+    my ($self) = @_;
+    return 0 if $self->{'REPORT_DONE'}; # empty report
+    while(defined( $_ = $self->_readline() ) ) {
+	if ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) {
+	    return 0;
+	} elsif( $_ =~ /^>/ ) {
+	    $self->_pushback($_);	
+	    return 1;
+	}
+    }
+    unless( $self->{'BTK'} ) { # Paracel BTK reports have no footer
+	$self->warn("Possible error (1) while parsing BLAST report!");
+    }
+}
+
+1;
+__END__