comparison variant_effect_predictor/Bio/SearchIO/psiblast.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #------------------------------------------------------------------
2 # $Id: psiblast.pm,v 1.17 2002/12/24 15:48:41 jason Exp $
3 #
4 # BioPerl module Bio::SearchIO::psiblast
5 #
6 # Cared for by Steve Chervitz <sac@bioperl.org>
7 #
8 # You may distribute this module under the same terms as perl itself
9 #------------------------------------------------------------------
10
11 =head1 NAME
12
13 Bio::SearchIO::psiblast - Parser for traditional BLAST and PSI-BLAST reports
14
15 =head1 SYNOPSIS
16
17 use Bio::SearchIO;
18
19 my $in = Bio::SearchIO->new( -format => 'psiblast',
20 -file => 'report.blastp' );
21
22 while ( my $blast = $in->next_result() ) {
23 foreach my $hit ( $blast->hits ) {
24 print "Hit: $hit\n";
25 }
26 }
27
28 # BLAST hit filtering function. All hits of each BLAST report must satisfy
29 # this criteria to be retained. If a hit fails this test, it is ignored.
30 # If all hits of a report fail, the report will be considered hitless.
31 # But we can distinguish this from the case where there were no
32 # hits in the report by testing the function $blast->no_hits_found().
33
34 my $filt_func = sub{ my $hit=shift;
35 $hit->frac_identical('query') >= 0.5
36 && $hit->frac_aligned_query >= 0.50
37 };
38
39 # Not supplying a -file or -fh parameter means read from STDIN
40
41 my $in2 = Bio::SearchIO->new( -format => 'psiblast',
42 -hit_filter => $filt_func
43 );
44
45
46 =head1 DESCRIPTION
47
48 This module parses BLAST and PSI-BLAST reports and acts as a factory for
49 objects that encapsulate BLAST results:
50 L<Bio::Search::Result::BlastResult>, L<Bio::Search::Hit::BlastHit>,
51 L<Bio::Search::HSP::BlastHSP>.
52
53 This module does not parse XML-formatted BLAST reports.
54 See L<Bio::SearchIO::blastxml|Bio::SearchIO::blastxml> if you need to do that.
55
56 To use this module, the only module you need to C<use> is
57 Bio::SearchIO.pm. SearchIO knows how to load this module when you
58 supply a C<-format =E<gt> 'psiblast'> parameters to its C<new>()
59 function. For more information about the SearchIO system, see
60 documentation in Bio::SearchIO.pm.
61
62 =head2 PSI-BLAST Support
63
64 In addition to BLAST1 and BLAST2 reports, this module can also handle
65 PSI-BLAST reports. When accessing the set of Hits in a result, hits
66 from different iterations are lumped together but can be distinguished by
67 interrogating L<Bio::Search::Hit::BlastHit::iteration> and
68 L<Bio::Search::Hit::BlastHit::found_again>.
69
70 If you want to collect hits only from a certain iteration during parsing,
71 supply a function using the C<-HIT_FILTER> parameter.
72
73 =head1 EXAMPLES
74
75 To get a feel for how to use this, have look at scripts in the
76 B<examples/searchio> and B<examples/searchio/writer> directory of the Bioperl
77 distribution as well as the test script B<t/SearchIO.t>.
78
79 =head1 SEE ALSO
80
81 For more documentation about working with Blast result objects that are
82 produced by this parser, see L<Bio::Search::Result::BlastResult>,
83 L<Bio::Search::Hit::BlastHit>, L<Bio::Search::HSP::BlastHSP>.
84
85 =head1 FEEDBACK
86
87 =head2 Mailing Lists
88
89 User feedback is an integral part of the evolution of this and other
90 Bioperl modules. Send your comments and suggestions preferably to
91 the Bioperl mailing list. Your participation is much appreciated.
92
93 bioperl-l@bioperl.org - General discussion
94 http://bioperl.org/MailList.shtml - About the mailing lists
95
96 =head2 Reporting Bugs
97
98 Report bugs to the Bioperl bug tracking system to help us keep track
99 of the bugs and their resolution. Bug reports can be submitted via
100 email or the web:
101
102 bioperl-bugs@bioperl.org
103 http://bugzilla.bioperl.org/
104
105 =head1 AUTHOR
106
107 Steve Chervitz E<lt>sac@bioperl.orgE<gt>
108
109 See L<the FEEDBACK section | FEEDBACK> for where to send bug reports and comments.
110
111 =head1 ACKNOWLEDGEMENTS
112
113 I would like to acknowledge my colleagues at Affymetrix for useful
114 feedback.
115
116 =head1 COPYRIGHT
117
118 Copyright (c) 2001 Steve Chervitz. All Rights Reserved.
119
120 This library is free software; you can redistribute it and/or modify
121 it under the same terms as Perl itself.
122
123 =head1 DISCLAIMER
124
125 This software is provided "as is" without warranty of any kind.
126
127 =head1 APPENDIX
128
129 The rest of the documentation details each of the object methods.
130 Internal methods are usually preceded with a _
131
132 =cut
133
134
135 # Let the code begin...
136
137
138 package Bio::SearchIO::psiblast;
139
140 use strict;
141 use vars qw( @ISA
142 $MAX_HSP_OVERLAP
143 $DEFAULT_MATRIX
144 $DEFAULT_SIGNIF
145 $DEFAULT_SCORE
146 $DEFAULT_BLAST_WRITER_CLASS
147 $DEFAULT_HIT_FACTORY_CLASS
148 $DEFAULT_RESULT_FACTORY_CLASS
149 );
150
151 use Bio::SearchIO;
152 use Bio::Search::Result::BlastResult;
153 use Bio::Factory::BlastHitFactory;
154 use Bio::Factory::BlastResultFactory;
155 use Bio::Tools::StateMachine::IOStateMachine qw($INITIAL_STATE $FINAL_STATE);
156
157 @ISA = qw( Bio::SearchIO
158 Bio::Tools::StateMachine::IOStateMachine );
159
160 $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
161 $DEFAULT_MATRIX = 'BLOSUM62';
162 $DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied.
163 $DEFAULT_SCORE = 0; # Value used as score cutoff if none supplied.
164 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
165 $DEFAULT_HIT_FACTORY_CLASS = 'Bio::Factory::BlastHitFactory';
166 $DEFAULT_RESULT_FACTORY_CLASS = 'Bio::Factory::BlastResultFactory';
167
168 my %state = (
169 'header' => 'Header',
170 'descr' => 'Descriptions',
171 'align' => 'Alignment',
172 'footer' => 'Footer',
173 'iteration' => 'Iteration', # psiblast
174 'nohits' => 'No Hits'
175 );
176
177 # These are the expected transitions assuming a "normal" report (Blast2 or PSI-Blast).
178 my @state_transitions = ( [ $INITIAL_STATE, $state{'header'}],
179 [ $state{'header'}, $state{'descr'} ],
180 [ $state{'header'}, $state{'iteration'} ],
181 [ $state{'iteration'}, $state{'descr'} ],
182 [ $state{'iteration'}, $state{'nohits'} ],
183 [ $state{'descr'}, $state{'align'} ],
184 [ $state{'align'}, $state{'align'} ],
185 [ $state{'align'}, $state{'footer'} ],
186 [ $state{'align'}, $state{'iteration'} ], # psiblast
187 [ $state{'nohits'}, $state{'iteration'} ], # psiblast
188 [ $state{'nohits'}, $state{'footer'} ],
189 [ $state{'footer'}, $state{'header'} ],
190 [ $state{'footer'}, $FINAL_STATE]
191 );
192
193 my $current_iteration; # psiblast
194
195 =head2 new
196
197 Usage : Bio::SearchIO::psiblast->new( %named_parameters )
198 Purpose : Parse traditional BLAST or PSI-BLAST data a file or input stream.
199 : Handles Blast1, Blast2, NCBI and WU Blast reports.
200 : Populates Bio::Search:: objects with data extracted from the report.
201 : (The exact type of Bio::Search objects depends on the type of
202 : Bio::Factory::ResultFactory and Bio::Factory::HitFactory you hook up
203 : to the SearchIO object.)
204 Returns : Bio::SearchIO::psiblast object.
205 Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE).
206 : These are in addition to those specified by Bio::SearchIO::new() (see).
207 : -SIGNIF => number (float or scientific notation number to be used
208 : as a P- or Expect value cutoff; default = 999.)
209 : -SCORE => number (integer or scientific notation number to be used
210 : as a score value cutoff; default = 0.)
211 : -HIT_FILTER => func_ref (reference to a function to be used for
212 : filtering out hits based on arbitrary criteria.
213 : This function should take a
214 : Bio::Search::Hit::BlastHit.pm object as its first
215 : argument and return a boolean value,
216 : true if the hit should be filtered out).
217 : Sample filter function:
218 : -HIT_FILTER => sub { $hit = shift;
219 : $hit->gaps == 0; },
220 : Historical note: This parameter was formerly
221 called -FILT_FUNC in the older
222 Bio::Tools::Blast::parse method. Using
223 -FILT_FUNC will still work for backward
224 compatibility.
225 : -CHECK_ALL_HITS => boolean (check all hits for significance against
226 : significance criteria. Default = false.
227 : If false, stops processing hits after the first
228 : non-significant hit or the first hit that fails
229 : the hit_filter call. This speeds parsing,
230 : taking advantage of the fact that the hits
231 : are processed in the order they are ranked.)
232 : -MIN_LEN => integer (to be used as a minimum query sequence length
233 : sequences below this length will not be processed).
234 : default = no minimum length).
235 : -STATS => boolean (collect key statistical parameters for the report:
236 : matrix, filters, etc. default = false).
237 : This requires extra parsing
238 : so if you aren't interested in this info, don't
239 : set this parameter. Note that the unparsed
240 : parameter section of a Blast report is always
241 : accessible via $blast->raw_statistics().
242 : -BEST => boolean (only process the best hit of each report;
243 : default = false).
244 : -OVERLAP => integer (the amount of overlap to permit between
245 : adjacent HSPs when tiling HSPs. A reasonable value is 2.
246 : Default = $Bio::SearchIO::psiblast::MAX_HSP_OVERLAP)
247 : -HOLD_RAW_DATA => boolean (store the raw alignment sections for each hit.
248 : used with the -SHALLOW_PARSE option).
249 : -SHALLOW_PARSE => boolean (only minimal parsing; does not parse HSPs.
250 : Hit data is limited to what can be obtained
251 : from the description line.
252 : Replaces the older NO_ALIGNS option.)
253 :
254 :
255 Comments : Do NOT remove the HTML from an HTML-formatted Blast report by using the
256 : "Save As" option of a web browser to save it as text. This renders the
257 : report unparsable.
258 Throws : An exception will be thrown if a BLAST report contains a FATAL: error.
259
260 =cut
261
262 sub new {
263 my ($class, %args) = @_;
264
265 # TODO: Resolve this issue:
266 # Temporary hack to allow factory-based and non-factory based
267 # SearchIO objects co-exist.
268 # steve --- Sat Dec 22 04:41:20 2001
269 $args{-USE_FACTORIES} = 1;
270
271 my $self = $class->Bio::SearchIO::new(%args);
272 $self->_init_state_machine( %args, -transition_table => \@state_transitions);
273
274 $self->_init_parse_params( %args );
275
276 $self->pause_between_reports( 1 );
277
278 $self->{'_result_count'} = 0;
279
280 return $self;
281 }
282
283
284 sub default_result_factory_class {
285 my $self = shift;
286 return $DEFAULT_RESULT_FACTORY_CLASS;
287 }
288
289 sub default_hit_factory_class {
290 my $self = shift;
291 return $DEFAULT_HIT_FACTORY_CLASS;
292 }
293
294 sub check_for_new_state {
295 my ($self) = @_;
296
297 # Ignore blank lines
298 my $chunk = $self->SUPER::check_for_new_state(1);
299
300 my $newstate = undef;
301
302 # End the run if there's no more input.
303 if( ! $chunk ) {
304 return $self->final_state;
305 }
306 $self->clear_state_change_cache;
307 my $curr_state = $self->current_state;
308
309 if( $chunk =~ /^(<.*>)?T?BLAST[NPX]/ ) {
310 $newstate = $state{header};
311 $self->state_change_cache( $chunk );
312 }
313
314 elsif ($chunk =~ /^Sequences producing/ ) {
315 $newstate = $state{descr};
316 $self->state_change_cache( $chunk );
317 }
318
319 elsif ($chunk =~ /No hits found/i ) {
320 $newstate = $state{nohits};
321 $self->state_change_cache( $chunk );
322 }
323
324 elsif ($chunk =~ /^\s*Searching/ ) {
325 $newstate = $state{iteration};
326 }
327
328 elsif ($chunk =~ /^>(.*)/ ) {
329 $newstate = $state{align};
330 $self->state_change_cache( "$1\n" );
331 }
332
333 elsif ($chunk =~ /^(CPU time|Parameters):/ ) {
334 $newstate = $state{footer};
335 $self->state_change_cache( $chunk );
336 }
337
338 # Necessary to distinguish " Database:" lines that start a footer section
339 # from those that are internal to a footer section.
340 elsif ($chunk =~ /^\s+Database:/ && $curr_state ne $state{'footer'}) {
341 $newstate = $state{footer};
342 $self->state_change_cache( $chunk );
343 }
344
345 if( $curr_state ne $INITIAL_STATE and not $newstate ) {
346 # print "Appending input cache with ($curr_state): $chunk\n";
347 $self->append_input_cache( $chunk );
348 }
349
350 return $newstate;
351 }
352
353
354 sub change_state {
355 my ($self, $state) = @_;
356
357 my $from_state = $self->current_state;
358 my $verbose = $self->verbose;
359 $verbose and print STDERR ">>>>> Changing state from $from_state to $state\n";
360
361 if ( $self->validate_transition( $from_state, $state ) ) {
362
363 # Now we know the current state is complete
364 # and all data from it is now in the input cache.
365 my @data = $self->get_input_cache();
366
367 # if($from_state eq $state{iteration} ) {
368 # do{
369 # print STDERR "State change cache: ", $self->state_change_cache, "\n";
370 # print STDERR "Input cache ($from_state):\n@data\n\n";
371 # };
372 # }
373
374 # Now we need to process the input cache data.
375 # Remember, at this point, the current state is the "from" state
376 # of the state transition. The "to" state doesn't get set until
377 # the finalize_state_change() call at the end of this method.
378
379 if($from_state eq $state{header} ) {
380 $self->_process_header( @data );
381 }
382 elsif($from_state eq $state{descr} ) {
383 $self->_process_descriptions( @data );
384 }
385 elsif($from_state eq $state{iteration} ) {
386 $self->_process_iteration( @data, $self->state_change_cache() );
387 }
388 elsif($from_state eq $state{align} ) {
389 $self->_process_alignment( @data );
390 }
391 elsif($from_state eq $state{footer} ) {
392 my $ok_to_pause = not $state eq $self->final_state;
393 $self->_process_footer( $ok_to_pause, @data );
394 }
395
396 $self->finalize_state_change( $state, 1 );
397 }
398 }
399
400
401 sub _add_error {
402 my ($self, $err) = @_;
403 if( $err ) {
404 push @{$self->{'_blast_errs'}}, $err;
405 }
406 }
407
408 sub _clear_errors {
409 my $self = shift;
410 $self->{'_blast_errs'} = undef;
411 }
412
413 #---------
414 sub errors {
415 #---------
416 my $self = shift;
417 my @errs = ();
418 @errs = @{$self->{'_blast_errs'}} if ref $self->{'_blast_errs'};
419 return @errs;
420 }
421
422
423 #----------------------
424 sub _init_parse_params {
425 #----------------------
426 #Initializes parameters used during parsing of Blast reports.
427
428 my ( $self, @param ) = @_;
429 # -FILT_FUNC has been replaced by -HIT_FILTER.
430 # Leaving -FILT_FUNC in place for backward compatibility
431 my($signif, $filt_func, $hit_filter, $min_len, $check_all,
432 $gapped, $score, $overlap, $stats, $best, $shallow_parse,
433 $hold_raw) =
434 $self->_rearrange([qw(SIGNIF FILT_FUNC HIT_FILTER MIN_LEN
435 CHECK_ALL_HITS GAPPED SCORE
436 OVERLAP STATS BEST SHALLOW_PARSE HOLD_RAW_DATA)], @param);
437
438 $self->{'_hit_filter'} = $hit_filter || $filt_func || 0;
439 $self->{'_check_all'} = $check_all || 0;
440 $self->{'_shallow_parse'} = $shallow_parse || 0;
441 $self->{'_hold_raw_data'} = $hold_raw || 0;
442
443 $self->_set_signif($signif, $min_len, $self->{'_hit_filter'}, $score);
444 $self->best_hit_only($best) if $best;
445 $self->{'_blast_count'} = 0;
446
447 $self->{'_collect_stats'} = defined($stats) ? $stats : 0;
448
449 # TODO: Automatically determine whether gapping was used.
450 # e.g., based on version number. Otherwise, have to read params.
451 $self->{'_gapped'} = $gapped || 1;
452
453 # Clear any errors from previous parse.
454 $self->_clear_errors;
455 undef $self->{'_hit_count'};
456 undef $self->{'_num_hits_significant'};
457 }
458
459 #=head2 _set_signif
460 #
461 # Usage : n/a; called automatically by _init_parse_params()
462 # Purpose : Sets significance criteria for the BLAST object.
463 # Argument : Obligatory three arguments:
464 # : $signif = float or sci-notation number or undef
465 # : $min_len = integer or undef
466 # : $hit_filter = function reference or undef
467 # :
468 # : If $signif is undefined, a default value is set
469 # : (see $DEFAULT_SIGNIF; min_length = not set).
470 # Throws : Exception if significance value is defined but appears
471 # : out of range or invalid.
472 # : Exception if $hit_filter if defined and is not a func ref.
473 # Comments : The significance of a BLAST report can be based on
474 # : the P (or Expect) value and/or the length of the query sequence.
475 # : P (or Expect) values GREATER than '_max_significance' are not significant.
476 # : Query sequence lengths LESS than '_min_length' are not significant.
477 # :
478 # : Hits can also be screened using arbitrary significance criteria
479 # : as discussed in the parse() method.
480 # :
481 # : If no $signif is defined, the '_max_significance' level is set to
482 # : $DEFAULT_SIGNIF (999).
483 #
484 #See Also : L<signif>(), L<min_length>(), L<_init_parse_params>(), L<parse>()
485 #
486 #=cut
487
488 #-----------------
489 sub _set_signif {
490 #-----------------
491 my( $self, $sig, $len, $func, $score ) = @_;
492
493 if(defined $sig) {
494 $self->{'_confirm_significance'} = 1;
495 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
496 $self->throw(-class => 'Bio::Root::BadParameter',
497 -text => "Invalid significance value: $sig\n".
498 "Must be a number greater than zero.");
499 }
500 $self->{'_max_significance'} = $sig;
501 } else {
502 $self->{'_max_significance'} = $DEFAULT_SIGNIF;
503 }
504
505 if(defined $score) {
506 $self->{'_confirm_significance'} = 1;
507 if( $score =~ /[^\de+]/ or $score <= 0) {
508 $self->throw(-class => 'Bio::Root::BadParameter',
509 -text => "Invalid score value: $score\n".
510 "Must be an integer greater than zero.");
511 }
512 $self->{'_min_score'} = $score;
513 } else {
514 $self->{'_min_score'} = $DEFAULT_SCORE;
515 }
516
517 if(defined $len) {
518 if($len =~ /\D/ or $len <= 0) {
519 $self->warn("Invalid minimum length value: $len",
520 "Value must be an integer > 0. Value not set.");
521 } else {
522 $self->{'_min_length'} = $len;
523 }
524 }
525
526 if(defined $func) {
527 $self->{'_check_all'} = 1;
528 $self->{'_confirm_significance'} = 1;
529 if($func and not ref $func eq 'CODE') {
530 $self->throw("Not a function reference: $func",
531 "The -hit_filter parameter must be function reference.");
532 }
533 }
534 }
535
536 =head2 signif
537
538 Synonym for L<max_significance()|max_significance>
539
540 =cut
541
542 #-----------
543 sub signif { shift->max_significance }
544
545
546 =head2 max_significance
547
548 Usage : $obj->max_significance();
549 Purpose : Gets the P or Expect value used as significance screening cutoff.
550 This is the value of the -signif parameter supplied to new().
551 Hits with P or E-value above this are skipped.
552 Returns : Scientific notation number with this format: 1.0e-05.
553 Argument : n/a
554 Comments : Screening of significant hits uses the data provided on the
555 : description line. For NCBI BLAST1 and WU-BLAST, this data
556 : is P-value. for NCBI BLAST2 it is an Expect value.
557
558 =cut
559
560 #-----------
561 sub max_significance {
562 #-----------
563 my $self = shift;
564 my $sig = $self->{'_max_significance'};
565 sprintf "%.1e", $sig;
566 }
567
568 =head2 min_score
569
570 Usage : $obj->min_score();
571 Purpose : Gets the Blast score used as screening cutoff.
572 This is the value of the -score parameter supplied to new().
573 Hits with scores below this are skipped.
574 Returns : Integer or scientific notation number.
575 Argument : n/a
576 Comments : Screening of significant hits uses the data provided on the
577 : description line.
578
579 =cut
580
581 #-----------
582 sub min_score {
583 #-----------
584 my $self = shift;
585 return $self->{'_min_score'};
586 }
587
588 =head2 min_length
589
590 Usage : $obj->min_length();
591 Purpose : Gets the query sequence length used as screening criteria.
592 This is the value of the -min_len parameter supplied to new().
593 Hits with sequence length below this are skipped.
594 Returns : Integer
595 Argument : n/a
596
597 See Also : L<signif()|signif>
598
599 =cut
600
601 #--------------
602 sub min_length {
603 #--------------
604 my $self = shift;
605 $self->{'_min_length'};
606 }
607
608 =head2 highest_signif
609
610 Usage : $value = $obj->highest_signif();
611 Purpose : Gets the largest significance (P- or E-value) observed in
612 the report.
613 : For NCBI BLAST1 and WU-BLAST, this is a P-value.
614 : For NCBI BLAST2 it is an Expect value.
615 Returns : Float or sci notation number
616 Argument : n/a
617
618 =cut
619
620 sub highest_signif { shift->{'_highestSignif'} }
621
622 =head2 lowest_signif
623
624 Usage : $value = $obj->lowest_signif();
625 Purpose : Gets the largest significance (P- or E-value) observed in
626 the report.
627 : For NCBI BLAST1 and WU-BLAST, this is a P-value.
628 : For NCBI BLAST2 it is an Expect value.
629 Returns : Float or sci notation number
630 Argument : n/a
631
632 =cut
633
634 sub lowest_signif { shift->{'_lowestSignif'} }
635
636 =head2 highest_score
637
638 Usage : $value = $obj->highest_score();
639 Purpose : Gets the largest BLAST score observed in the report.
640 Returns : Integer or sci notation number
641 Argument : n/a
642
643 =cut
644
645 sub highest_score { shift->{'_highestScore'} }
646
647 =head2 lowest_score
648
649 Usage : $value = $obj->lowest_score();
650 Purpose : Gets the largest BLAST score observed in the report.
651 Returns : Integer or sci notation number
652 Argument : n/a
653
654 =cut
655
656 sub lowest_score { shift->{'_lowestScore'} }
657
658
659 # Throws : Exception if BLAST report contains a FATAL: error.
660 sub _process_header {
661 my ($self, @data) = @_;
662
663 # print STDERR "Processing Header...\n";
664
665 $current_iteration = 0;
666 $self->{'_result_count'}++;
667 # Finish off the current Blast object, if any
668 my $blast = $self->{'_current_blast'} = $self->result_factory->create_result();
669
670 my ($set_method, $set_query, $set_db, $set_length);
671 my ($query_start, $query_desc);
672
673 foreach my $line (@data) {
674 if( $line =~ /^(<.*>)?(T?BLAST[NPX])\s+(.*)$/ ) {
675 $blast->analysis_method( $2 );
676 $blast->analysis_method_version( $3 );
677 $set_method = 1;
678 }
679 elsif ($line =~ /^Query= *(.+)$/ ) {
680 $query_start = 1;
681 my $info = $1;
682 $info =~ s/TITLE //;
683 # Split the query line into two parts.
684 # Using \s instead of ' '
685 $info =~ /(\S+?)\s+(.*)/;
686 # set name of Blast object and return.
687 $blast->query_name($1 || 'UNKNOWN');
688 $query_desc = $2 || '';
689 $set_query = 1;
690 }
691 elsif ($line =~ /^Database:\s+(.+)$/ ) {
692 require Bio::Search::GenericDatabase;
693 my $blastdb = Bio::Search::GenericDatabase->new( -name => $1 );
694 $blast->analysis_subject( $blastdb );
695 $set_db = 1;
696 }
697 elsif( $line =~ m/^\s+\(([\d|,]+) letters\)/ ) {
698 my $length = $1;
699 $length =~ s/,//g;
700 $self->_set_query_length( $length );
701 $set_length = 1;
702 $blast->query_description( $query_desc );
703 $query_start = 0;
704 }
705 elsif( $line =~ /WARNING: (.+?)/ ) {
706 $self->warn( $1 );
707 }
708 elsif( $line =~ /FATAL: (.+?)/ ) {
709 $self->throw("FATAL BLAST Report Error: $1");
710 }
711 # This needs to be the last elsif block.
712 elsif( $query_start ) {
713 # Handling multi-line query descriptions.
714 chomp( $line );
715 $query_desc .= " $line";
716 }
717 }
718 if (!$set_method) {
719 $self->throw("Can't determine type of BLAST program.");
720 }
721 if (!$set_query) {
722 $self->throw("Can't determine name of query sequence.");
723 }
724 if(!$set_db) {
725 $self->throw("Can't determine name of database.");
726 }
727 if(!$set_length) {
728 $self->throw("Can't determine sequence length from BLAST report.");
729 }
730
731 }
732
733 sub _process_descriptions {
734 my ($self, @data) = @_;
735 # print STDERR "Processing Descriptions...\n";
736
737 # Step through each line parsing out the P/Expect value
738 # All we really need to do is check the first one, if it doesn't
739 # meet the significance requirement, we can skip the report.
740 # BUT: we want to collect data for all hits anyway to get min/max signif.
741
742 my $max_signif = $self->max_significance;
743 my $min_score = $self->min_score;
744 my $layout_set = $self->{'_layout'} || 0;
745 my ($layout, $sig, $hitid, $score, $is_p_value);
746
747 if( $data[0] =~ /^\s*Sequences producing.*Score\s+P/i ) {
748 $is_p_value = 1;
749 } else {
750 $is_p_value = 0;
751 }
752
753 my $hit_found_again;
754
755 desc_loop:
756 foreach my $line (@data) {
757 last desc_loop if $line =~ / NONE |End of List|Results from round/;
758 next desc_loop if $line =~ /^\.\./;
759
760 if($line =~ /^Sequences used in model/ ) {
761 #Sequences used in model and found again:
762 $hit_found_again = 1;
763 next;
764 }
765 elsif($line =~ /^Sequences not found previously/ ) {
766 #Sequences not found previously or not previously below threshold:
767 $hit_found_again = 0;
768 next;
769 }
770
771 ## Checking the significance value (P- or Expect value) of the hit
772 ## in the description line.
773
774 next desc_loop unless $line =~ /\d/;
775
776 # TODO: These regexps need testing on a variety of reports.
777 if ( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s*$/) {
778 $hitid = $1;
779 $score = $2;
780 $layout = 2;
781 } elsif( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s{1,}\d+\s*$/) {
782 $hitid = $1;
783 $score = $2;
784 $layout = 1;
785 } else {
786 $self->warn("Can't parse description line\n $line");
787 next desc_loop;
788 }
789 not $layout_set and ($self->_layout($layout), $layout_set = 1);
790
791 $sig = &_parse_signif( $line, $layout, $self->{'_gapped'} );
792
793 # print STDERR " Parsed signif for $hitid: $sig (layout=$layout)\n";
794
795 $self->{'_hit_hash'}->{$hitid}->{'signif'} = $sig;
796 $self->{'_hit_hash'}->{$hitid}->{'score'} = $score;
797 $self->{'_hit_hash'}->{$hitid}->{'found_again'} = $hit_found_again;
798 $self->{'_hit_hash'}->{'is_pval'} = $is_p_value;
799
800 last desc_loop if (not $self->{'_check_all'} and
801 ($sig > $max_signif or $score < $min_score));
802
803 $self->_process_significance($sig, $score);
804 }
805
806 # printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'};
807 }
808
809
810 #=head2 _set_query_length
811 #
812 # Usage : n/a; called automatically during Blast report parsing.
813 # Purpose : Sets the length of the query sequence (extracted from report).
814 # Returns : integer (length of the query sequence)
815 # Throws : Exception if cannot determine the query sequence length from
816 # : the BLAST report.
817 # : Exception if the length is below the min_length cutoff (if any).
818 # Comments : The logic here is a bit different from the other _set_XXX()
819 # : methods since the significance of the BLAST report is assessed
820 # : if MIN_LENGTH is set.
821 #
822 #=cut
823
824 #---------------
825 sub _set_query_length {
826 #---------------
827 my ($self, $length) = @_;
828
829 my($sig_len);
830 if(defined($self->{'_min_length'})) {
831 local $^W = 0;
832 if($length < $self->{'_min_len'}) {
833 $self->throw("Query sequence too short (Query= ${\$self->{'_current_blast'}->query_name}, length= $length)",
834 "Minimum length is $self->{'_min_len'}");
835 }
836 }
837
838 $self->{'_current_blast'}->query_length($length);
839 }
840
841
842 # Records the highest and lowest significance (P- or E-value) and
843 # score encountered in a given report.
844 sub _set_hi_low_signif_and_score {
845 my($self, $sig, $score) = @_;
846
847 my $hiSig = $self->{'_highestSignif'} || 0;
848 my $lowSig = $self->{'_lowestSignif'} || $DEFAULT_SIGNIF;
849 my $hiScore = $self->{'_highestScore'} || 0;
850 my $lowScore = $self->{'_lowestScore'} || $DEFAULT_SIGNIF;
851
852 $self->{'_highestSignif'} = ($sig > $hiSig)
853 ? $sig : $hiSig;
854
855 $self->{'_lowestSignif'} = ($sig < $lowSig)
856 ? $sig : $lowSig;
857
858 $self->{'_highestScore'} = ($score > $hiScore)
859 ? $score : $hiScore;
860
861 $self->{'_lowestScore'} = ($score < $lowScore)
862 ? $score : $lowScore;
863 }
864
865
866 sub _process_significance {
867 my($self, $sig, $score) = @_;
868
869 $self->_set_hi_low_signif_and_score($sig, $score);
870
871 # Significance value assessment.
872 if($sig <= $self->{'_max_significance'} and $score >= $self->{'_min_score'}) {
873 $self->{'_num_hits_significant'}++;
874 }
875 $self->{'_num_hits'}++;
876
877 $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'};
878 }
879
880 #=head2 _layout
881 #
882 # Usage : n/a; internal method.
883 # Purpose : Set/Get indicator for the layout of the report.
884 # Returns : Integer (1 | 2)
885 # : Defaults to 2 if not set.
886 # Comments : Blast1 and WashU-Blast2 have a layout = 1.
887 # : This is intended for internal use by this and closely
888 # : allied modules like BlastHit.pm and BlastHSP.pm.
889 #
890 #=cut
891
892 #------------
893 sub _layout {
894 #------------
895 my $self = shift;
896 if(@_) {
897 $self->{'_layout'} = shift;
898 }
899 $self->{'_layout'} || 2;
900 }
901
902 #=head2 _parse_signif
903 #
904 # Usage : $signif = _parse_signif(string, layout, gapped);
905 # : This is a class function.
906 # Purpose : Extracts the P- or Expect value from a single line of a BLAST description section.
907 # Example : _parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1);
908 # : _parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2);
909 # Argument : string = line from BLAST description section
910 # : layout = integer (1 or 2)
911 # : gapped = boolean (true if gapped Blast).
912 # Returns : Float (0.001 or 1e-03)
913 # Status : Static
914 #
915 #=cut
916
917 #------------------
918 sub _parse_signif {
919 #------------------
920 my ($line, $layout, $gapped) = @_;
921
922 local $_ = $line;
923 my @linedat = split();
924
925 # When processing both Blast1 and Blast2 reports
926 # in the same run, offset needs to be configured each time.
927 # NOTE: We likely won't be supporting mixed report streams. Not needed.
928
929 my $offset = 0;
930 $offset = 1 if $layout == 1 or not $gapped;
931
932 my $signif = $linedat[ $#linedat - $offset ];
933
934 # fail-safe check
935 if(not $signif =~ /[.-]/) {
936 $offset = ($offset == 0 ? 1 : 0);
937 $signif = $linedat[ $#linedat - $offset ];
938 }
939
940 $signif = "1$signif" if $signif =~ /^e/i;
941 return $signif;
942 }
943
944
945 =head2 best_hit_only
946
947 Usage : print "only getting best hit.\n" if $obj->best_hit_only();
948 Purpose : Set/Get the indicator for whether or not to processing only
949 : the best BlastHit.
950 Returns : Boolean (1 | 0)
951 Argument : n/a
952
953 =cut
954
955 #----------
956 sub best_hit_only {
957 #----------
958 my $self = shift;
959 if(@_) { $self->{'_best'} = shift; }
960 $self->{'_best'};
961 }
962
963 sub _process_alignment {
964 my ($self, @data) = @_;
965 # print STDERR "Processing Alignment...\n";
966
967 # If all of the significant hits have been parsed,
968 # return if we're not checking all or if we don't need to get
969 # the Blast stats (parameters at footer of report).
970 if(defined $self->{'_hit_count'} and
971 defined $self->{'_num_hits_significant'}) {
972 return if $self->{'_hit_count'} >= $self->{'_num_hits_significant'} and
973 not ($self->{'_check_all'} or $self->{'_collect_stats'});
974 }
975
976 # Return if we're only interested in the best hit.
977 # This has to occur after checking for the parameters section
978 # in the footer (since we may still be interested in them).
979 return if $self->best_hit_only and ( defined $self->{'_hit_count'} and $self->{'_hit_count'} >=1);
980
981 push @data, 'end';
982
983 # print STDERR "\nALIGNMENT DATA:\n@data\n";
984
985 my $max_signif = $self->max_significance;
986 my $min_score = $self->min_score;
987
988 my ($hitid, $score, $signif, $is_pval, $found_again);
989 if( $data[0] =~ /^(\S+)\s+/ ) {
990 $hitid = $1;
991 return unless defined $self->{'_hit_hash'}->{$hitid};
992 $score = $self->{'_hit_hash'}->{$hitid}->{'score'};
993 $signif = $self->{'_hit_hash'}->{$hitid}->{'signif'};
994 $found_again = $self->{'_hit_hash'}->{$hitid}->{'found_again'};
995 $is_pval = $self->{'_hit_hash'}->{'is_pval'};
996 # print STDERR " Got hitid: $hitid ($signif, $score, P?=$is_pval)\n";
997 }
998
999 # Now construct the BlastHit objects from the alignment section
1000
1001 # debug(1);
1002
1003 $self->{'_hit_count'}++;
1004
1005 # If not confirming significance, _process_descriptions will not have been run,
1006 # so we need to count the total number of hits here.
1007 if( not $self->{'_confirm_significance'}) {
1008 $self->{'_num_hits'}++;
1009 }
1010
1011 my %hit_params = ( -RESULT => $self->{'_current_blast'},
1012 -RAW_DATA =>\@data,
1013 -SIGNIF => $signif,
1014 -IS_PVAL => $is_pval,
1015 -SCORE => $score,
1016 -RANK => $self->{'_hit_count'},
1017 -RANK_BY => 'order',
1018 -OVERLAP => $self->{'_overlap'} || $MAX_HSP_OVERLAP,
1019 -FOUND_AGAIN => $found_again,
1020 -SHALLOW_PARSE => $self->{'_shallow_parse'},
1021 -HOLD_RAW_DATA => $self->{'_hold_raw_data'},
1022 );
1023
1024 my $hit;
1025 $hit = $self->hit_factory->create_hit( %hit_params );
1026
1027 # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>;
1028 # The BLAST report may have not had a description section.
1029 if(not $self->{'_has_descriptions'}) {
1030 $self->_process_significance($hit->signif, $score);
1031 }
1032
1033 # Collect overall signif data if we don't already have it,
1034 # (as occurs if no -signif or -score parameter are supplied).
1035 my $hit_signif = $hit->signif;
1036
1037 if (not $self->{'_confirm_significance'} ) {
1038 $self->_set_hi_low_signif_and_score($hit_signif, $score);
1039 }
1040
1041 # Test significance using custom function (if supplied)
1042 my $add_hit = 0;
1043
1044 my $hit_filter = $self->{'_hit_filter'} || 0;
1045
1046 if($hit_filter) {
1047 if(&$hit_filter($hit)) {
1048 $add_hit = 1;
1049 }
1050 } elsif($hit_signif <= $max_signif and $score >= $min_score) {
1051 $add_hit = 1;
1052 }
1053 $add_hit && $self->{'_current_blast'}->add_hit( $hit );
1054 }
1055
1056
1057 sub _process_footer {
1058 my ($self, $ok_to_pause, @data) = @_;
1059 # print STDERR "Processing Footer...\n";
1060
1061 $self->{'_current_blast'}->raw_statistics( [@data] );
1062
1063 if($self->{'_collect_stats'}) {
1064 foreach my $line (@data) {
1065 if( $line =~ /^\s*Matrix:\s*(\S+)/i ) {
1066 $self->{'_current_blast'}->matrix( $1 );
1067 }
1068 elsif( $line =~ /^\s*Number of Sequences:\s*(\d+)/i ) {
1069 $self->{'_current_blast'}->analysis_subject->entries( $1 );
1070 }
1071 elsif( $line =~ /^\s*length of database:\s*(\d+)/i ) {
1072 $self->{'_current_blast'}->analysis_subject->letters( $1 );
1073 }
1074 elsif( $line =~ /^\s*Posted date:\s*(.+)$/i ) {
1075 $self->{'_current_blast'}->analysis_subject->date( $1 );
1076 }
1077 }
1078 }
1079
1080 if( $self->errors ) {
1081 my $num_err = scalar($self->errors);
1082 $self->warn( "$num_err Blast parsing errors occurred.");
1083 foreach( $self->errors ) { print STDERR "$_\n"; };
1084 }
1085
1086 if( $self->{'_pause_between_reports'} and $ok_to_pause ) {
1087 $self->pause;
1088 }
1089
1090 }
1091
1092 sub _process_nohits {
1093 my $self = shift;
1094 # print STDERR "Processing No Hits (iteration = $current_iteration)\n";
1095 $self->{'_current_blast'}->set_no_hits_found( $current_iteration );
1096 }
1097
1098
1099 sub _process_iteration {
1100 my ($self, @data) = @_;
1101 # print STDERR "Processing Iteration\n";
1102 # print STDERR " Incrementing current iteration (was=$current_iteration)\n";
1103 $current_iteration++;
1104 $self->{'_current_blast'}->iterations( $current_iteration );
1105
1106 foreach( @data ) {
1107 if( /Results from round \d+/i ) {
1108 $self->{'_current_blast'}->psiblast( 1 );
1109 }
1110 elsif( /No hits found/i ) {
1111 $self->_process_nohits();
1112 last;
1113 }
1114 elsif( /^\s*Sequences/i ) {
1115 $self->_process_descriptions( @data );
1116 last;
1117 }
1118 }
1119 }
1120
1121 sub pause_between_reports {
1122 my ($self, $setting) = @_;
1123 if( defined $setting ) {
1124 $self->{'_pause_between_reports'} = $setting;
1125 }
1126 $self->{'_pause_between_reports'};
1127 }
1128
1129 sub result_count {
1130 my $self = shift;
1131 return $self->{'_result_count'};
1132 }
1133
1134 # For backward compatibility:
1135 sub report_count { shift->result_count }
1136
1137 sub next_result {
1138 my ($self) = @_;
1139 # print STDERR "next_result() called\n";
1140 if( not $self->running ) {
1141 $self->run;
1142 }
1143 else {
1144 $self->resume;
1145 }
1146 my $blast = $self->{'_current_blast'};
1147 $self->{'_current_blast'} = undef;
1148 return $blast;
1149 }
1150
1151 =head2 write_result
1152
1153 Title : write_result
1154 Usage : $stream->write_result($result_result, @other_args)
1155 Function: Writes data from the $result_result object into the stream.
1156 : Delegates to the to_string() method of the associated
1157 : WriterI object.
1158 Returns : 1 for success and 0 for error
1159 Args : Bio::Search:Result::ResultI object,
1160 : plus any other arguments for the Writer
1161 Throws : Bio::Root::Exception if a Writer has not been set.
1162
1163 See L<Bio::Root::Exception>
1164
1165 =cut
1166
1167 sub write_result {
1168 my ($self, $blast, @args) = @_;
1169
1170 if( not defined($self->writer) ) {
1171 $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS");
1172 $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() );
1173 }
1174 $self->SUPER::write_result( $blast, @args );
1175 }
1176
1177
1178
1179 1;
1180 __END__
1181