Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Tools/BPlite/Iteration.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Tools/BPlite/Iteration.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,394 @@ +# $Id: Iteration.pm,v 1.15 2002/06/19 00:27:49 jason Exp $ +# Bioperl module Bio::Tools::BPlite::Iteration +# based closely on the Bio::Tools::BPlite modules +# Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +# Lorenz Pollak (lorenz@ist.org, bioperl port) +# +# Copyright Peter Schattner +# +# You may distribute this module under the same terms as perl itself +# _history +# October 20, 2000 +# POD documentation - main docs before the code +# +# Added to get a simple_align object for a psiblast run with the -m 6 flag /AE +# + +=head1 NAME + +Bio::Tools::BPlite::Iteration - object for parsing single iteration +of a PSIBLAST report + +=head1 SYNOPSIS + + use Bio::Tools:: BPpsilite; + + open FH, "t/psiblastreport.out"; + $report = Bio::Tools::BPpsilite->new(-fh=>\*FH); + + # determine number of iterations executed by psiblast + $total_iterations = $report->number_of_iterations; + $last_iteration = $report->round($total_iterations); + + # Process only hits found in last iteration ... + $oldhitarray_ref = $last_iteration->oldhits; + HIT: while($sbjct = $last_iteration->nextSbjct) { + $id = $sbjct->name; + $is_old = grep /\Q$id\E/, @$oldhitarray_ref; + if ($is_old ){next HIT;} + # do something with new hit... + } + +=head2 ALIGNMENTS + + # This assumed that you have $db pointing to a database, $out to an output file + # $slxdir to a directory and $psiout + # note the alignments can only be obtained if the flag "-m 6" is run. + # It might also be necessary to use the flag -v to get all alignments + # + my @psiparams = ('database' => $db , 'output' => $out, 'j' => 3, 'm' => 6, + 'h' => 1.e-3 , 'F' => 'T' , 'Q' => $psiout ); + my $factory = Bio::Tools::Run::StandAloneBlast->new(@psiparams); + my $report = $factory->blastpgp($seq); + my $total_iterations = $report->number_of_iterations(); + my $last_iteration = $report->round($total_iterations); + my $align=$last_iteration->Align; + my $slxfile=$slxdir.$id.".slx"; + my $slx = Bio::AlignIO->new('-format' => 'selex','-file' => ">".$slxfile ); + $slx->write_aln($align); + +=head1 DESCRIPTION + +See the documentation for BPpsilite.pm for a description of the +Iteration.pm module. + +=head1 AUTHORS - Peter Schattner + +Email: schattner@alum.mit.edu + +=head1 CONTRIBUTORS + +Jason Stajich, jason@cgt.mc.duke.edu + +=head1 ACKNOWLEDGEMENTS + +Based on work of: +Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), +Lorenz Pollak (lorenz@ist.org, bioperl port) + +=head1 COPYRIGHT + +BPlite.pm is copyright (C) 1999 by Ian Korf. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=cut + +package Bio::Tools::BPlite::Iteration; + +use strict; +use vars qw(@ISA); +use Bio::Root::Root; # root object to inherit from +use Bio::Tools::BPlite; # +use Bio::Tools::BPlite::Sbjct; + +@ISA = qw(Bio::Root::Root); + +sub new { + my ($class, @args) = @_; + my $self = $class->SUPER::new(@args); + + ($self->{'PARENT'},$self->{'ROUND'}) = + $self->_rearrange([qw(PARENT + ROUND + )],@args); + + $self->{'QUERY'} = $self->{'PARENT'}->{'QUERY'}; + $self->{'LENGTH'} = $self->{'PARENT'}->{'LENGTH'}; + + if($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments + else {$self->{'REPORT_DONE'} = 1} # empty report + + return $self; # success - we hope! +} + +=head2 query + + Title : query + Usage : $query = $obj->query(); + Function : returns the query object + Example : + Returns : query object + Args : + +=cut + +sub query {shift->{'QUERY'}} + +=head2 qlength + + Title : qlength + Usage : $len = $obj->qlength(); + Returns : length of query + Args : none + +=cut + +sub qlength {shift->{'LENGTH'}} + +=head2 newhits + + Title : newhits + Usage : $newhits = $obj->newhits(); + Returns : reference to an array listing all the hits + from the current iteration which were not identified + in the previous iteration + Args : none + +=cut + +sub newhits {shift->{'NEWHITS'}} + +=head2 oldhits + + Title : oldhits + Usage : $oldhits = $obj->oldhits(); + Returns : reference to an array listing all the hits from + the current iteration which were identified and + above threshold in the previous iteration + Args : none + +=cut + +sub oldhits {shift->{'OLDHITS'}} + + +=head2 nextSbjct + + Title : nextSbjct + Usage : $sbjct = $obj->nextSbjct(); + Function : Method of iterating through all the Sbjct retrieved + from parsing the report +#Example : while ( my $sbjct = $obj->nextSbjct ) {} + Returns : next Sbjct object or undef if finished + Args : + +=cut + +sub nextSbjct { + my ($self) = @_; + $self->_fastForward or return undef; + + ####################### + # get all sbjct lines # + ####################### + my $def = $self->_readline(); + + while(defined ($_ = $self->_readline) ) { + if ($_ !~ /\w/) {next} + elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data + elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback( $_); last} + elsif ($_ =~ /^(\d+) .* \d+$/) { # This is not correct at all + $self->_pushback($_); # 1: HSP does not work for -m 6 flag + $def = $1; # 2: length/name are incorrect + my $length = undef; # 3: Names are repeated many times. + my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, + '-length'=>$length, + '-parent'=>$self); + return $sbjct; + } # m-6 + elsif ($_ =~ /^Parameters|^\s+Database:|^\s+Posted date:/) { + $self->_pushback( $_); + last; + } else {$def .= $_} +} + $def =~ s/\s+/ /g; + $def =~ s/\s+$//g; + $def =~ s/Length = ([\d,]+)$//g; + my $length = $1; + return 0 unless $def =~ /^>/; + $def =~ s/^>//; + + #################### + # the Sbjct object # + #################### + my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, + '-length'=>$length, + '-parent'=>$self); + return $sbjct; +} + + +# This is added by /AE + +=head2 Align + + Title : Align + Usage : $SimpleAlign = $obj->Align(); + Function : Method to obtain a simpleAlign object from psiblast + Example : $SimpleAlign = $obj->Align(); + Returns : SimpleAlign object or undef if not found. + BUG : Only works if psiblast has been run with m 6 flag + Args : + +=cut + +sub Align { + use Bio::SimpleAlign; + my ($self) = @_; + $self->_fastForward or return undef; + my $lastline = $self->_readline(); + return undef unless $lastline =~ /^QUERY/; # If psiblast not run correctly + my (%sequence,%first,%last,$num); + + if ( $lastline =~ /^QUERY\s+(\d*)\s*([-\w]+)\s*(\d*)\s*$/){ + my $name='QUERY'; + my $start=$1; + my $seq=$2; + my $stop=$3; + $seq =~ s/-/\./g; + $start =~ s/ //g; + $stop =~ s/ //g; + $sequence{$name} .= $seq; + if ($first{$name} eq ''){$first{$name}=$start;} + if ($stop ne ''){$last{$name}=$stop;} +# print "FOUND:\t$seq\t$start\t$stop\n"; + $num=0; + } + while(defined($_ = $self->_readline()) ){ + chomp($_); + if ( $_ =~ /^QUERY\s+(\d+)\s*([\-A-Z]+)\s*(\+)\s*$/){ + my $name='QUERY'; + my $start=$1; + my $seq=$2; + my $stop=$3; + $seq =~ s/-/\./g; + $start =~ s/ //g; + $stop =~ s/ //g; + $sequence{$name} .= $seq; + if ($first{$name} eq '') { $first{$name} = $start;} + if ($stop ne '') { $last{$name}=$stop;} + $num=0; + } elsif ( $_ =~ /^(\d+)\s+(\d+)\s*([\-A-Z]+)\s*(\d+)\s*$/ ){ + my $name=$1.".".$num; + my $start=$2; + my $seq=$3; + my $stop=$4; + $seq =~ s/-/\./g; + $start =~ s/ //g; + $stop =~ s/ //g; + $sequence{$name} .= $seq; + if ($first{$name} eq ''){$first{$name}=$start;} + if ($stop ne ''){$last{$name}=$stop;} + $num++; + } + } + my $align = new Bio::SimpleAlign(); + my @keys=sort keys(%sequence); + foreach my $name (@keys){ + my $nse = $name."/".$first{$name}."-".$last{$name}; + my $seqobj = Bio::LocatableSeq->new( -seq => $sequence{$name}, + -id => $name, + -name => $nse, + -start => $first{$name}, + -end => $last{$name} + ); + + $align->add_seq($seqobj); + } + return $align; +} + +# Start of internal subroutines. + +sub _parseHeader { + my ($self) = @_; + my (@old_hits, @new_hits); + + my $newhits_true = ($self->{'ROUND'} < 2) ? 1 : 0 ; + while(defined($_ = $self->_readline()) ) { + if ($_ =~ /(\w\w|.*|\w+.*)\s\s+(\d+)\s+([-\.e\d]+)$/) { + my $id = $1; + my $score= $2; #not used currently + my $evalue= $3; #not used currently + if ($newhits_true) { push ( @new_hits, $id);} + else { push (@old_hits, $id);} + } + elsif ($_ =~ /^Sequences not found previously/) {$newhits_true = 1 ;} +# This is changed for "-m 6" option /AE + elsif ($_ =~ /^>/ || $_ =~ /^QUERY/) + { + $self->_pushback($_); + $self->{'OLDHITS'} = \@old_hits; + $self->{'NEWHITS'} = \@new_hits; + return 1; + } + elsif ($_ =~ /^Parameters|^\s+Database:|^\s*Results from round\s+(d+)/) { + $self->_pushback($_); + return 0; # no sequences found in this iteration + } + } + return 0; # no sequences found in this iteration +} + +sub _fastForward { + my ($self) = @_; + return 0 if $self->{'REPORT_DONE'}; # empty report + + while(defined($_ = $self->_readline()) ) { + if( $_ =~ /^>/ || + $_ =~ /^QUERY|^\d+ .* \d+$/ ) { # Changed to also handle "-m 6" /AE + $self->_pushback($_); + return 1; + } +# print "FASTFORWARD",$_,"\n"; + if ($_ =~ /^>|^Parameters|^\s+Database:/) { + $self->_pushback($_); + return 1; + } + } + $self->warn("Possible error (2) while parsing BLAST report!"); +} + + +=head2 _readline + + Title : _readline + Usage : $obj->_readline + Function: Reads a line of input. + + Note that this method implicitely uses the value of $/ that is + in effect when called. + + Note also that the current implementation does not handle pushed + back input correctly unless the pushed back input ends with the + value of $/. + Example : + Returns : + +=cut + +sub _readline{ + my ($self) = @_; + return $self->{'PARENT'}->_readline(); +} + +=head2 _pushback + + Title : _pushback + Usage : $obj->_pushback($newvalue) + Function: puts a line previously read with _readline back into a buffer + Example : + Returns : + Args : newvalue + +=cut + +sub _pushback { + my ($self, $arg) = @_; + return $self->{'PARENT'}->_pushback($arg); +} +1; +__END__