diff variant_effect_predictor/Bio/Tools/BPpsilite.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/Tools/BPpsilite.pm	Thu Apr 11 06:29:17 2013 -0400
@@ -0,0 +1,357 @@
+# $Id: BPpsilite.pm,v 1.22 2002/10/22 07:38:45 lapp Exp $
+# Bioperl module Bio::Tools::BPpsilite
+############################################################
+#	based closely on the Bio::Tools::BPlite modules
+#	Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), 
+#	Lorenz Pollak (lorenz@ist.org, bioperl port)
+#
+#
+# Copyright Peter Schattner
+#
+# You may distribute this module under the same terms as perl itself
+# _history
+# October 20, 2000
+# POD documentation - main docs before the code
+
+=head1 NAME
+
+Bio::Tools::BPpsilite - Lightweight BLAST parser for (iterated) psiblast reports
+
+=head1 SYNOPSIS
+
+  use Bio::Tools::BPpsilite;
+  open FH, "t/psiblastreport.out";
+  $report = Bio::Tools::BPpsilite->new(-fh=>\*FH);
+
+  # determine number of iterations executed by psiblast
+  $total_iterations = $report->number_of_iterations;
+  $last_iteration = $report->round($total_iterations);
+
+  # Process only hits found in last iteration ...
+   $oldhitarray_ref = $last_iteration->oldhits;
+   HIT: while($sbjct = $last_iteration->nextSbjct) {
+  	  $id = $sbjct->name;
+  	  $is_old =  grep  /\Q$id\E/, @$oldhitarray_ref;
+  	  if ($is_old ){next HIT;}
+  #  do something with new hit...
+  }
+
+
+=head1 DESCRIPTION
+
+BPpsilite is a package for parsing multiple iteration PSIBLAST
+reports.  It is based closely on Ian Korf's BPlite.pm module for
+parsing single iteration BLAST reports (as modified by Lorenz Pollak).
+
+Two of the four basic objects of BPpsilite.pm are identical to the
+corresponding objects in BPlite - the "HSP.pm" and "Sbjct.pm" objects.
+This DESCRIPTION documents only the one new object, the "iteration",
+as well as the additional methods that are implemented in BPpsilite
+that are not in BPlite.  See the BPlite documentation for information
+on the BPlite, SBJCT and HSP objects.
+
+The essential difference between PSIBLAST and the other BLAST programs
+(in terms of report parsing) is that PSIBLAST performs multiple
+iterations of the BLASTing of the database and the results of all of
+these iterations are stored in a single PSIBLAST report.  (For general
+information on PSIBLAST see the README.bla file in the standalone
+BLAST distribution and references therein). PSIBLAST's use of multiple
+iterations imposes additional demands on the report parser: * There
+are several iterations of hits.  Many of those hits will be repeated
+in more than one iteration.  Often only the last iteration will be of
+interest.  * Each iteration will list two different kinds of hits -
+repeated hits that were used in the model and newly identified hits -
+which may need to be processed in different manners * The total number
+of iterations performed is not displayed in the report until (almost)
+the very end of the report.  (The user can specify a maximum number of
+iterations for the PSIBLAST search, but the program may perform fewer
+iterations if convergence is reached)
+
+BPpsilite addresses these issues by offering the following methods:
+
+* The total number of iteration used is given by the method
+   number_of_iterations as in:
+
+	$total_iterations = $report->number_of_iterations;
+
+* Results from an arbitrary iteration round can be accessed by using
+  the 'round' method:
+
+	$iteration3_report = $report->round(3);
+
+* The ids of the sequences which passed the significance threshold for
+  the first time in the "nth" iteration can be identified by using the
+  newhits method.  Previously identified hits are identified by using
+  the oldhits method, as in:
+
+ 	$oldhitarray_ref = $iteration3_report->oldhits;
+ 	$newhitarray_ref = $iteration3_report->newhits;
+
+BPpsilite.pm should work equally well on reports generated by the
+StandAloneBlast.pm local BLAST module as with reports generated by
+remote psiblast searches. For examples of usage of BPpsilite.pm, the
+user is referred to the BPpsilite.t script in the "t" directory.
+
+=head1 FEEDBACK
+
+=head2 Mailing Lists
+
+User feedback is an integral part of the evolution of this and other
+Bioperl modules. Send your comments and suggestions preferably to one
+of the Bioperl mailing lists.  Your participation is much appreciated.
+
+  bioperl-l@bioperl.org              - General discussion
+  http://bio.perl.org/MailList.html  - About the mailing lists
+
+=head2 Reporting Bugs
+
+Report bugs to the Bioperl bug tracking system to help us keep track
+the bugs and their resolution.  Bug reports can be submitted via email
+or the web:
+
+  bioperl-bugs@bio.perl.org
+  http://bugzilla.bioperl.org/
+
+=head1 AUTHOR - Peter Schattner
+
+Email: schattner@alum.mit.edu
+
+=head1 CONTRIBUTORS
+
+Jason Stajich, jason@cgt.mc.duke.edu
+
+=head1 ACKNOWLEDGEMENTS
+
+Based on work of:
+Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), 
+Lorenz Pollak (lorenz@ist.org, bioperl port)
+
+=head1 COPYRIGHT
+
+BPlite.pm is copyright (C) 1999 by Ian Korf. 
+
+=head1 DISCLAIMER
+
+This software is provided "as is" without warranty of any kind.
+
+=cut
+
+package Bio::Tools::BPpsilite;
+
+use strict;
+use vars qw(@ISA);
+use Bio::Tools::BPlite::Iteration; #
+use Bio::Tools::BPlite::Sbjct; #   Debug code
+use Bio::Root::Root; # root interface to inherit from
+use Bio::Root::IO;
+use Bio::Tools::BPlite; 
+
+@ISA = qw(Bio::Root::Root Bio::Root::IO);
+
+sub new {
+  my ($class, @args) = @_; 
+  my $self = $class->SUPER::new(@args);
+  
+  # initialize IO
+  $self->_initialize_io(@args);
+  $self->{'_tempdir'} = $self->tempdir('CLEANUP' => 1);
+  $self->{'QPATLOCATION'} = [];  # Anonymous array of query pattern locations for PHIBLAST
+  $self->{'NEXT_ITERATION_NUMBER'} = 1;
+  $self->{'TOTAL_ITERATION_NUMBER'} = -1;  # -1 indicates preprocessing not yet done
+
+  if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments
+  else                     {$self->{'REPORT_DONE'} = 1} # empty report
+  
+  return $self; # success - we hope!
+}
+
+=head2 query
+
+ Title    : query
+ Usage    : $query = $obj->query();
+ Function : returns the query object
+ Returns  : query object
+ Args     :
+
+=cut
+
+sub query    {shift->{'QUERY'}}
+
+=head2 qlength
+
+ Title    : qlength
+ Usage    : $len = $obj->qlength();
+ Function : returns the length of the query 
+ Returns  : length of query
+ Args     :
+
+=cut
+
+sub qlength  {shift->{'LENGTH'}}
+
+=head2 database
+
+ Title    : database
+ Usage    : $db = $obj->database();
+ Function : returns the database used in this search
+ Returns  : database used for search
+ Args     :
+
+=cut
+
+sub database {shift->{'DATABASE'}}
+
+=head2 number_of_iterations
+
+ Title    : number_of_iterations
+ Usage    : $total_iterations = $obj-> number_of_iterations();
+ Function : returns the total number of iterations used in this search
+ Returns  : total number of iterations used for search
+ Args     : none
+
+=cut
+
+
+=head2 pattern
+
+ Title    : database
+ Usage    : $pattern = $obj->pattern();
+ Function : returns the pattern used in a PHIBLAST search
+
+=cut
+
+sub pattern {shift->{'PATTERN'}}
+
+=head2 query_pattern_location
+
+ Title    : query_pattern_location
+ Usage    : $qpl = $obj->query_pattern_location();
+ Function : returns reference to array of locations in the query sequence
+            of pattern used in a PHIBLAST search
+
+=cut
+
+sub query_pattern_location {shift->{'QPATLOCATION'}}
+
+
+
+
+sub number_of_iterations {
+	my $self = shift;
+  	if ($self->{'TOTAL_ITERATION_NUMBER'} == -1){&_preprocess($self);}
+	$self->{'TOTAL_ITERATION_NUMBER'};
+}
+
+=head2 round
+
+ Title    : round
+ Usage    : $Iteration3 = $report->round(3);
+ Function : Method of retrieving data from a specific iteration 
+ Example  :  
+ Returns  : reference to requested Iteration object or null if argument
+		is greater than total number of iterations
+ Args     : number of the requested iteration
+
+=cut
+
+sub round {
+  my $self = shift;
+  my $iter_num = shift;
+  $self->_initialize_io(-file => Bio::Root::IO->catfile
+			($self->{'_tempdir'},"iteration".$iter_num.".tmp"));
+  if( ! $self->_fh ) {
+      $self->throw("unable to re-open iteration file for round ".$iter_num);
+  }
+  return Bio::Tools::BPlite::Iteration->new(-round=>$iter_num,
+					    -parent=>$self);
+}
+
+# begin private routines
+
+sub _parseHeader {
+  my ($self) = @_;
+
+  
+  while(defined ($_ = $self->_readline) ) {
+    if ($_ =~ /^Query=\s+([^\(]+)/)    {
+      my $query = $1;
+      while(defined ($_ = $self->_readline)) {
+        last if $_ !~ /\S/;
+	$query .= $_;
+      }
+      $query =~ s/\s+/ /g;
+      $query =~ s/^>//;
+      $query =~ /\((\d+)\s+\S+\)\s*$/;
+      my $length = $1;
+      $self->{'QUERY'} = $query;
+      $self->{'LENGTH'} = $length;
+    }
+    elsif ($_ =~ /^Database:\s+(.+)/) {$self->{'DATABASE'} = $1}
+    elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) 
+    {   # For PHIBLAST reports
+	$self->{'PATTERN'} = $1;
+	push (@{$self->{'QPATLOCATION'}}, $2);
+    } elsif ($_ =~ /^>|^Results from round 1/)    {
+	$self->_pushback($_); 
+	return 1;
+    } elsif ($_ =~ /^Parameters|^\s+Database:/) {
+	$self->_pushback($_); 
+	return 0; # there's nothing in the report
+    }
+  }
+}
+
+=head2 _preprocess
+
+ Title    : _preprocess
+ Usage    : internal routine, not called directly
+ Function : determines number of iterations in report and prepares
+	    data so individual iterations canbe parsed in non-sequential 
+            order 
+ Example  :  
+ Returns  : nothing. Sets TOTAL_ITERATION_NUMBER in object's hash
+ Args     : reference to calling object
+
+=cut
+
+#'
+sub _preprocess {
+    my $self = shift;
+#	$self->throw(" PSIBLAST report preprocessing not implemented yet!");
+
+    my  $oldround = 0;
+    my ($currentline, $currentfile, $round);
+
+# open output file for data from iteration round #1
+    $round = 1;
+    $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'}, 
+					  "iteration$round.tmp");
+    open (FILEHANDLE, ">$currentfile") || 
+	$self->throw("cannot open filehandle to write to file $currentfile");
+
+    while(defined ($currentline = $self->_readline()) ) {
+	if ($currentline =~ /^Results from round\s+(\d+)/) {
+	    if ($oldround) { close (FILEHANDLE) ;}
+	    $round = $1;
+	    $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'}, 
+						  "iteration$round.tmp");
+
+	    close FILEHANDLE;
+	    open (FILEHANDLE, ">$currentfile") || 
+		$self->throw("cannot open filehandle to write to file $currentfile");
+	    $oldround = $round;
+	}elsif ($currentline =~ /CONVERGED/){ # This is a fix for psiblast parsing with -m 6 /AE
+	    $round--;
+	}
+	print FILEHANDLE $currentline ;
+	
+    }
+    $self->{'TOTAL_ITERATION_NUMBER'}= $round;
+# It is necessary to close filehandle otherwise the whole
+# file will not be read later !!
+    close FILEHANDLE;
+}
+
+1;
+
+__END__