Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/Tools/BPpsilite.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPpsilite.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,357 @@ +# $Id: BPpsilite.pm,v 1.22 2002/10/22 07:38:45 lapp Exp $ +# Bioperl module Bio::Tools::BPpsilite +############################################################ +# based closely on the Bio::Tools::BPlite modules +# Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +# Lorenz Pollak (lorenz@ist.org, bioperl port) +# +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# October 20, 2000 +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Tools::BPpsilite - Lightweight BLAST parser for (iterated) psiblast reports + +=head1 SYNOPSIS + + use Bio::Tools::BPpsilite; + open FH, "t/psiblastreport.out"; + $report = Bio::Tools::BPpsilite->new(-fh=>\*FH); + + # determine number of iterations executed by psiblast + $total_iterations = $report->number_of_iterations; + $last_iteration = $report->round($total_iterations); + + # Process only hits found in last iteration ... + $oldhitarray_ref = $last_iteration->oldhits; + HIT: while($sbjct = $last_iteration->nextSbjct) { + $id = $sbjct->name; + $is_old = grep /\Q$id\E/, @$oldhitarray_ref; + if ($is_old ){next HIT;} + # do something with new hit... + } + + +=head1 DESCRIPTION + +BPpsilite is a package for parsing multiple iteration PSIBLAST +reports. It is based closely on Ian Korf's BPlite.pm module for +parsing single iteration BLAST reports (as modified by Lorenz Pollak). + +Two of the four basic objects of BPpsilite.pm are identical to the +corresponding objects in BPlite - the "HSP.pm" and "Sbjct.pm" objects. +This DESCRIPTION documents only the one new object, the "iteration", +as well as the additional methods that are implemented in BPpsilite +that are not in BPlite. See the BPlite documentation for information +on the BPlite, SBJCT and HSP objects. + +The essential difference between PSIBLAST and the other BLAST programs +(in terms of report parsing) is that PSIBLAST performs multiple +iterations of the BLASTing of the database and the results of all of +these iterations are stored in a single PSIBLAST report. (For general +information on PSIBLAST see the README.bla file in the standalone +BLAST distribution and references therein). PSIBLAST's use of multiple +iterations imposes additional demands on the report parser: * There +are several iterations of hits. Many of those hits will be repeated +in more than one iteration. Often only the last iteration will be of +interest. * Each iteration will list two different kinds of hits - +repeated hits that were used in the model and newly identified hits - +which may need to be processed in different manners * The total number +of iterations performed is not displayed in the report until (almost) +the very end of the report. (The user can specify a maximum number of +iterations for the PSIBLAST search, but the program may perform fewer +iterations if convergence is reached) + +BPpsilite addresses these issues by offering the following methods: + +* The total number of iteration used is given by the method + number_of_iterations as in: + + $total_iterations = $report->number_of_iterations; + +* Results from an arbitrary iteration round can be accessed by using + the 'round' method: + + $iteration3_report = $report->round(3); + +* The ids of the sequences which passed the significance threshold for + the first time in the "nth" iteration can be identified by using the + newhits method. Previously identified hits are identified by using + the oldhits method, as in: + + $oldhitarray_ref = $iteration3_report->oldhits; + $newhitarray_ref = $iteration3_report->newhits; + +BPpsilite.pm should work equally well on reports generated by the +StandAloneBlast.pm local BLAST module as with reports generated by +remote psiblast searches. For examples of usage of BPpsilite.pm, the +user is referred to the BPpsilite.t script in the "t" directory. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@cgt.mc.duke.edu + +=head1 ACKNOWLEDGEMENTS + +Based on work of: +Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +Lorenz Pollak (lorenz@ist.org, bioperl port) + +=head1 COPYRIGHT + +BPlite.pm is copyright (C) 1999 by Ian Korf. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + +package Bio::Tools::BPpsilite; + +use strict; +use vars qw(@ISA); +use Bio::Tools::BPlite::Iteration; # +use Bio::Tools::BPlite::Sbjct; # Debug code +use Bio::Root::Root; # root interface to inherit from +use Bio::Root::IO; +use Bio::Tools::BPlite; + +@ISA = qw(Bio::Root::Root Bio::Root::IO); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + # initialize IO + $self->_initialize_io(@args); + $self->{'_tempdir'} = $self->tempdir('CLEANUP' => 1); + $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST + $self->{'NEXT_ITERATION_NUMBER'} = 1; + $self->{'TOTAL_ITERATION_NUMBER'} = -1; # -1 indicates preprocessing not yet done + + if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments + else {$self->{'REPORT_DONE'} = 1} # empty report + + return $self; # success - we hope! +} + +=head2 query + + Title : query + Usage : $query = $obj->query(); + Function : returns the query object + Returns : query object + Args : + +=cut + +sub query {shift->{'QUERY'}} + +=head2 qlength + + Title : qlength + Usage : $len = $obj->qlength(); + Function : returns the length of the query + Returns : length of query + Args : + +=cut + +sub qlength {shift->{'LENGTH'}} + +=head2 database + + Title : database + Usage : $db = $obj->database(); + Function : returns the database used in this search + Returns : database used for search + Args : + +=cut + +sub database {shift->{'DATABASE'}} + +=head2 number_of_iterations + + Title : number_of_iterations + Usage : $total_iterations = $obj-> number_of_iterations(); + Function : returns the total number of iterations used in this search + Returns : total number of iterations used for search + Args : none + +=cut + + +=head2 pattern + + Title : database + Usage : $pattern = $obj->pattern(); + Function : returns the pattern used in a PHIBLAST search + +=cut + +sub pattern {shift->{'PATTERN'}} + +=head2 query_pattern_location + + Title : query_pattern_location + Usage : $qpl = $obj->query_pattern_location(); + Function : returns reference to array of locations in the query sequence + of pattern used in a PHIBLAST search + +=cut + +sub query_pattern_location {shift->{'QPATLOCATION'}} + + + + +sub number_of_iterations { + my $self = shift; + if ($self->{'TOTAL_ITERATION_NUMBER'} == -1){&_preprocess($self);} + $self->{'TOTAL_ITERATION_NUMBER'}; +} + +=head2 round + + Title : round + Usage : $Iteration3 = $report->round(3); + Function : Method of retrieving data from a specific iteration + Example : + Returns : reference to requested Iteration object or null if argument + is greater than total number of iterations + Args : number of the requested iteration + +=cut + +sub round { + my $self = shift; + my $iter_num = shift; + $self->_initialize_io(-file => Bio::Root::IO->catfile + ($self->{'_tempdir'},"iteration".$iter_num.".tmp")); + if( ! $self->_fh ) { + $self->throw("unable to re-open iteration file for round ".$iter_num); + } + return Bio::Tools::BPlite::Iteration->new(-round=>$iter_num, + -parent=>$self); +} + +# begin private routines + +sub _parseHeader { + my ($self) = @_; + + + while(defined ($_ = $self->_readline) ) { + if ($_ =~ /^Query=\s+([^\(]+)/) { + my $query = $1; + while(defined ($_ = $self->_readline)) { + last if $_ !~ /\S/; + $query .= $_; + } + $query =~ s/\s+/ /g; + $query =~ s/^>//; + $query =~ /\((\d+)\s+\S+\)\s*$/; + my $length = $1; + $self->{'QUERY'} = $query; + $self->{'LENGTH'} = $length; + } + elsif ($_ =~ /^Database:\s+(.+)/) {$self->{'DATABASE'} = $1} + elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) + { # For PHIBLAST reports + $self->{'PATTERN'} = $1; + push (@{$self->{'QPATLOCATION'}}, $2); + } elsif ($_ =~ /^>|^Results from round 1/) { + $self->_pushback($_); + return 1; + } elsif ($_ =~ /^Parameters|^\s+Database:/) { + $self->_pushback($_); + return 0; # there's nothing in the report + } + } +} + +=head2 _preprocess + + Title : _preprocess + Usage : internal routine, not called directly + Function : determines number of iterations in report and prepares + data so individual iterations canbe parsed in non-sequential + order + Example : + Returns : nothing. Sets TOTAL_ITERATION_NUMBER in object's hash + Args : reference to calling object + +=cut + +#' +sub _preprocess { + my $self = shift; +# $self->throw(" PSIBLAST report preprocessing not implemented yet!"); + + my $oldround = 0; + my ($currentline, $currentfile, $round); + +# open output file for data from iteration round #1 + $round = 1; + $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'}, + "iteration$round.tmp"); + open (FILEHANDLE, ">$currentfile") || + $self->throw("cannot open filehandle to write to file $currentfile"); + + while(defined ($currentline = $self->_readline()) ) { + if ($currentline =~ /^Results from round\s+(\d+)/) { + if ($oldround) { close (FILEHANDLE) ;} + $round = $1; + $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'}, + "iteration$round.tmp"); + + close FILEHANDLE; + open (FILEHANDLE, ">$currentfile") || + $self->throw("cannot open filehandle to write to file $currentfile"); + $oldround = $round; + }elsif ($currentline =~ /CONVERGED/){ # This is a fix for psiblast parsing with -m 6 /AE + $round--; + } + print FILEHANDLE $currentline ; + + } + $self->{'TOTAL_ITERATION_NUMBER'}= $round; +# It is necessary to close filehandle otherwise the whole +# file will not be read later !! + close FILEHANDLE; +} + +1; + +__END__