Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SearchIO/psiblast.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 #------------------------------------------------------------------ | |
2 # $Id: psiblast.pm,v 1.17 2002/12/24 15:48:41 jason Exp $ | |
3 # | |
4 # BioPerl module Bio::SearchIO::psiblast | |
5 # | |
6 # Cared for by Steve Chervitz <sac@bioperl.org> | |
7 # | |
8 # You may distribute this module under the same terms as perl itself | |
9 #------------------------------------------------------------------ | |
10 | |
11 =head1 NAME | |
12 | |
13 Bio::SearchIO::psiblast - Parser for traditional BLAST and PSI-BLAST reports | |
14 | |
15 =head1 SYNOPSIS | |
16 | |
17 use Bio::SearchIO; | |
18 | |
19 my $in = Bio::SearchIO->new( -format => 'psiblast', | |
20 -file => 'report.blastp' ); | |
21 | |
22 while ( my $blast = $in->next_result() ) { | |
23 foreach my $hit ( $blast->hits ) { | |
24 print "Hit: $hit\n"; | |
25 } | |
26 } | |
27 | |
28 # BLAST hit filtering function. All hits of each BLAST report must satisfy | |
29 # this criteria to be retained. If a hit fails this test, it is ignored. | |
30 # If all hits of a report fail, the report will be considered hitless. | |
31 # But we can distinguish this from the case where there were no | |
32 # hits in the report by testing the function $blast->no_hits_found(). | |
33 | |
34 my $filt_func = sub{ my $hit=shift; | |
35 $hit->frac_identical('query') >= 0.5 | |
36 && $hit->frac_aligned_query >= 0.50 | |
37 }; | |
38 | |
39 # Not supplying a -file or -fh parameter means read from STDIN | |
40 | |
41 my $in2 = Bio::SearchIO->new( -format => 'psiblast', | |
42 -hit_filter => $filt_func | |
43 ); | |
44 | |
45 | |
46 =head1 DESCRIPTION | |
47 | |
48 This module parses BLAST and PSI-BLAST reports and acts as a factory for | |
49 objects that encapsulate BLAST results: | |
50 L<Bio::Search::Result::BlastResult>, L<Bio::Search::Hit::BlastHit>, | |
51 L<Bio::Search::HSP::BlastHSP>. | |
52 | |
53 This module does not parse XML-formatted BLAST reports. | |
54 See L<Bio::SearchIO::blastxml|Bio::SearchIO::blastxml> if you need to do that. | |
55 | |
56 To use this module, the only module you need to C<use> is | |
57 Bio::SearchIO.pm. SearchIO knows how to load this module when you | |
58 supply a C<-format =E<gt> 'psiblast'> parameters to its C<new>() | |
59 function. For more information about the SearchIO system, see | |
60 documentation in Bio::SearchIO.pm. | |
61 | |
62 =head2 PSI-BLAST Support | |
63 | |
64 In addition to BLAST1 and BLAST2 reports, this module can also handle | |
65 PSI-BLAST reports. When accessing the set of Hits in a result, hits | |
66 from different iterations are lumped together but can be distinguished by | |
67 interrogating L<Bio::Search::Hit::BlastHit::iteration> and | |
68 L<Bio::Search::Hit::BlastHit::found_again>. | |
69 | |
70 If you want to collect hits only from a certain iteration during parsing, | |
71 supply a function using the C<-HIT_FILTER> parameter. | |
72 | |
73 =head1 EXAMPLES | |
74 | |
75 To get a feel for how to use this, have look at scripts in the | |
76 B<examples/searchio> and B<examples/searchio/writer> directory of the Bioperl | |
77 distribution as well as the test script B<t/SearchIO.t>. | |
78 | |
79 =head1 SEE ALSO | |
80 | |
81 For more documentation about working with Blast result objects that are | |
82 produced by this parser, see L<Bio::Search::Result::BlastResult>, | |
83 L<Bio::Search::Hit::BlastHit>, L<Bio::Search::HSP::BlastHSP>. | |
84 | |
85 =head1 FEEDBACK | |
86 | |
87 =head2 Mailing Lists | |
88 | |
89 User feedback is an integral part of the evolution of this and other | |
90 Bioperl modules. Send your comments and suggestions preferably to | |
91 the Bioperl mailing list. Your participation is much appreciated. | |
92 | |
93 bioperl-l@bioperl.org - General discussion | |
94 http://bioperl.org/MailList.shtml - About the mailing lists | |
95 | |
96 =head2 Reporting Bugs | |
97 | |
98 Report bugs to the Bioperl bug tracking system to help us keep track | |
99 of the bugs and their resolution. Bug reports can be submitted via | |
100 email or the web: | |
101 | |
102 bioperl-bugs@bioperl.org | |
103 http://bugzilla.bioperl.org/ | |
104 | |
105 =head1 AUTHOR | |
106 | |
107 Steve Chervitz E<lt>sac@bioperl.orgE<gt> | |
108 | |
109 See L<the FEEDBACK section | FEEDBACK> for where to send bug reports and comments. | |
110 | |
111 =head1 ACKNOWLEDGEMENTS | |
112 | |
113 I would like to acknowledge my colleagues at Affymetrix for useful | |
114 feedback. | |
115 | |
116 =head1 COPYRIGHT | |
117 | |
118 Copyright (c) 2001 Steve Chervitz. All Rights Reserved. | |
119 | |
120 This library is free software; you can redistribute it and/or modify | |
121 it under the same terms as Perl itself. | |
122 | |
123 =head1 DISCLAIMER | |
124 | |
125 This software is provided "as is" without warranty of any kind. | |
126 | |
127 =head1 APPENDIX | |
128 | |
129 The rest of the documentation details each of the object methods. | |
130 Internal methods are usually preceded with a _ | |
131 | |
132 =cut | |
133 | |
134 | |
135 # Let the code begin... | |
136 | |
137 | |
138 package Bio::SearchIO::psiblast; | |
139 | |
140 use strict; | |
141 use vars qw( @ISA | |
142 $MAX_HSP_OVERLAP | |
143 $DEFAULT_MATRIX | |
144 $DEFAULT_SIGNIF | |
145 $DEFAULT_SCORE | |
146 $DEFAULT_BLAST_WRITER_CLASS | |
147 $DEFAULT_HIT_FACTORY_CLASS | |
148 $DEFAULT_RESULT_FACTORY_CLASS | |
149 ); | |
150 | |
151 use Bio::SearchIO; | |
152 use Bio::Search::Result::BlastResult; | |
153 use Bio::Factory::BlastHitFactory; | |
154 use Bio::Factory::BlastResultFactory; | |
155 use Bio::Tools::StateMachine::IOStateMachine qw($INITIAL_STATE $FINAL_STATE); | |
156 | |
157 @ISA = qw( Bio::SearchIO | |
158 Bio::Tools::StateMachine::IOStateMachine ); | |
159 | |
160 $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs. | |
161 $DEFAULT_MATRIX = 'BLOSUM62'; | |
162 $DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied. | |
163 $DEFAULT_SCORE = 0; # Value used as score cutoff if none supplied. | |
164 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter'; | |
165 $DEFAULT_HIT_FACTORY_CLASS = 'Bio::Factory::BlastHitFactory'; | |
166 $DEFAULT_RESULT_FACTORY_CLASS = 'Bio::Factory::BlastResultFactory'; | |
167 | |
168 my %state = ( | |
169 'header' => 'Header', | |
170 'descr' => 'Descriptions', | |
171 'align' => 'Alignment', | |
172 'footer' => 'Footer', | |
173 'iteration' => 'Iteration', # psiblast | |
174 'nohits' => 'No Hits' | |
175 ); | |
176 | |
177 # These are the expected transitions assuming a "normal" report (Blast2 or PSI-Blast). | |
178 my @state_transitions = ( [ $INITIAL_STATE, $state{'header'}], | |
179 [ $state{'header'}, $state{'descr'} ], | |
180 [ $state{'header'}, $state{'iteration'} ], | |
181 [ $state{'iteration'}, $state{'descr'} ], | |
182 [ $state{'iteration'}, $state{'nohits'} ], | |
183 [ $state{'descr'}, $state{'align'} ], | |
184 [ $state{'align'}, $state{'align'} ], | |
185 [ $state{'align'}, $state{'footer'} ], | |
186 [ $state{'align'}, $state{'iteration'} ], # psiblast | |
187 [ $state{'nohits'}, $state{'iteration'} ], # psiblast | |
188 [ $state{'nohits'}, $state{'footer'} ], | |
189 [ $state{'footer'}, $state{'header'} ], | |
190 [ $state{'footer'}, $FINAL_STATE] | |
191 ); | |
192 | |
193 my $current_iteration; # psiblast | |
194 | |
195 =head2 new | |
196 | |
197 Usage : Bio::SearchIO::psiblast->new( %named_parameters ) | |
198 Purpose : Parse traditional BLAST or PSI-BLAST data a file or input stream. | |
199 : Handles Blast1, Blast2, NCBI and WU Blast reports. | |
200 : Populates Bio::Search:: objects with data extracted from the report. | |
201 : (The exact type of Bio::Search objects depends on the type of | |
202 : Bio::Factory::ResultFactory and Bio::Factory::HitFactory you hook up | |
203 : to the SearchIO object.) | |
204 Returns : Bio::SearchIO::psiblast object. | |
205 Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). | |
206 : These are in addition to those specified by Bio::SearchIO::new() (see). | |
207 : -SIGNIF => number (float or scientific notation number to be used | |
208 : as a P- or Expect value cutoff; default = 999.) | |
209 : -SCORE => number (integer or scientific notation number to be used | |
210 : as a score value cutoff; default = 0.) | |
211 : -HIT_FILTER => func_ref (reference to a function to be used for | |
212 : filtering out hits based on arbitrary criteria. | |
213 : This function should take a | |
214 : Bio::Search::Hit::BlastHit.pm object as its first | |
215 : argument and return a boolean value, | |
216 : true if the hit should be filtered out). | |
217 : Sample filter function: | |
218 : -HIT_FILTER => sub { $hit = shift; | |
219 : $hit->gaps == 0; }, | |
220 : Historical note: This parameter was formerly | |
221 called -FILT_FUNC in the older | |
222 Bio::Tools::Blast::parse method. Using | |
223 -FILT_FUNC will still work for backward | |
224 compatibility. | |
225 : -CHECK_ALL_HITS => boolean (check all hits for significance against | |
226 : significance criteria. Default = false. | |
227 : If false, stops processing hits after the first | |
228 : non-significant hit or the first hit that fails | |
229 : the hit_filter call. This speeds parsing, | |
230 : taking advantage of the fact that the hits | |
231 : are processed in the order they are ranked.) | |
232 : -MIN_LEN => integer (to be used as a minimum query sequence length | |
233 : sequences below this length will not be processed). | |
234 : default = no minimum length). | |
235 : -STATS => boolean (collect key statistical parameters for the report: | |
236 : matrix, filters, etc. default = false). | |
237 : This requires extra parsing | |
238 : so if you aren't interested in this info, don't | |
239 : set this parameter. Note that the unparsed | |
240 : parameter section of a Blast report is always | |
241 : accessible via $blast->raw_statistics(). | |
242 : -BEST => boolean (only process the best hit of each report; | |
243 : default = false). | |
244 : -OVERLAP => integer (the amount of overlap to permit between | |
245 : adjacent HSPs when tiling HSPs. A reasonable value is 2. | |
246 : Default = $Bio::SearchIO::psiblast::MAX_HSP_OVERLAP) | |
247 : -HOLD_RAW_DATA => boolean (store the raw alignment sections for each hit. | |
248 : used with the -SHALLOW_PARSE option). | |
249 : -SHALLOW_PARSE => boolean (only minimal parsing; does not parse HSPs. | |
250 : Hit data is limited to what can be obtained | |
251 : from the description line. | |
252 : Replaces the older NO_ALIGNS option.) | |
253 : | |
254 : | |
255 Comments : Do NOT remove the HTML from an HTML-formatted Blast report by using the | |
256 : "Save As" option of a web browser to save it as text. This renders the | |
257 : report unparsable. | |
258 Throws : An exception will be thrown if a BLAST report contains a FATAL: error. | |
259 | |
260 =cut | |
261 | |
262 sub new { | |
263 my ($class, %args) = @_; | |
264 | |
265 # TODO: Resolve this issue: | |
266 # Temporary hack to allow factory-based and non-factory based | |
267 # SearchIO objects co-exist. | |
268 # steve --- Sat Dec 22 04:41:20 2001 | |
269 $args{-USE_FACTORIES} = 1; | |
270 | |
271 my $self = $class->Bio::SearchIO::new(%args); | |
272 $self->_init_state_machine( %args, -transition_table => \@state_transitions); | |
273 | |
274 $self->_init_parse_params( %args ); | |
275 | |
276 $self->pause_between_reports( 1 ); | |
277 | |
278 $self->{'_result_count'} = 0; | |
279 | |
280 return $self; | |
281 } | |
282 | |
283 | |
284 sub default_result_factory_class { | |
285 my $self = shift; | |
286 return $DEFAULT_RESULT_FACTORY_CLASS; | |
287 } | |
288 | |
289 sub default_hit_factory_class { | |
290 my $self = shift; | |
291 return $DEFAULT_HIT_FACTORY_CLASS; | |
292 } | |
293 | |
294 sub check_for_new_state { | |
295 my ($self) = @_; | |
296 | |
297 # Ignore blank lines | |
298 my $chunk = $self->SUPER::check_for_new_state(1); | |
299 | |
300 my $newstate = undef; | |
301 | |
302 # End the run if there's no more input. | |
303 if( ! $chunk ) { | |
304 return $self->final_state; | |
305 } | |
306 $self->clear_state_change_cache; | |
307 my $curr_state = $self->current_state; | |
308 | |
309 if( $chunk =~ /^(<.*>)?T?BLAST[NPX]/ ) { | |
310 $newstate = $state{header}; | |
311 $self->state_change_cache( $chunk ); | |
312 } | |
313 | |
314 elsif ($chunk =~ /^Sequences producing/ ) { | |
315 $newstate = $state{descr}; | |
316 $self->state_change_cache( $chunk ); | |
317 } | |
318 | |
319 elsif ($chunk =~ /No hits found/i ) { | |
320 $newstate = $state{nohits}; | |
321 $self->state_change_cache( $chunk ); | |
322 } | |
323 | |
324 elsif ($chunk =~ /^\s*Searching/ ) { | |
325 $newstate = $state{iteration}; | |
326 } | |
327 | |
328 elsif ($chunk =~ /^>(.*)/ ) { | |
329 $newstate = $state{align}; | |
330 $self->state_change_cache( "$1\n" ); | |
331 } | |
332 | |
333 elsif ($chunk =~ /^(CPU time|Parameters):/ ) { | |
334 $newstate = $state{footer}; | |
335 $self->state_change_cache( $chunk ); | |
336 } | |
337 | |
338 # Necessary to distinguish " Database:" lines that start a footer section | |
339 # from those that are internal to a footer section. | |
340 elsif ($chunk =~ /^\s+Database:/ && $curr_state ne $state{'footer'}) { | |
341 $newstate = $state{footer}; | |
342 $self->state_change_cache( $chunk ); | |
343 } | |
344 | |
345 if( $curr_state ne $INITIAL_STATE and not $newstate ) { | |
346 # print "Appending input cache with ($curr_state): $chunk\n"; | |
347 $self->append_input_cache( $chunk ); | |
348 } | |
349 | |
350 return $newstate; | |
351 } | |
352 | |
353 | |
354 sub change_state { | |
355 my ($self, $state) = @_; | |
356 | |
357 my $from_state = $self->current_state; | |
358 my $verbose = $self->verbose; | |
359 $verbose and print STDERR ">>>>> Changing state from $from_state to $state\n"; | |
360 | |
361 if ( $self->validate_transition( $from_state, $state ) ) { | |
362 | |
363 # Now we know the current state is complete | |
364 # and all data from it is now in the input cache. | |
365 my @data = $self->get_input_cache(); | |
366 | |
367 # if($from_state eq $state{iteration} ) { | |
368 # do{ | |
369 # print STDERR "State change cache: ", $self->state_change_cache, "\n"; | |
370 # print STDERR "Input cache ($from_state):\n@data\n\n"; | |
371 # }; | |
372 # } | |
373 | |
374 # Now we need to process the input cache data. | |
375 # Remember, at this point, the current state is the "from" state | |
376 # of the state transition. The "to" state doesn't get set until | |
377 # the finalize_state_change() call at the end of this method. | |
378 | |
379 if($from_state eq $state{header} ) { | |
380 $self->_process_header( @data ); | |
381 } | |
382 elsif($from_state eq $state{descr} ) { | |
383 $self->_process_descriptions( @data ); | |
384 } | |
385 elsif($from_state eq $state{iteration} ) { | |
386 $self->_process_iteration( @data, $self->state_change_cache() ); | |
387 } | |
388 elsif($from_state eq $state{align} ) { | |
389 $self->_process_alignment( @data ); | |
390 } | |
391 elsif($from_state eq $state{footer} ) { | |
392 my $ok_to_pause = not $state eq $self->final_state; | |
393 $self->_process_footer( $ok_to_pause, @data ); | |
394 } | |
395 | |
396 $self->finalize_state_change( $state, 1 ); | |
397 } | |
398 } | |
399 | |
400 | |
401 sub _add_error { | |
402 my ($self, $err) = @_; | |
403 if( $err ) { | |
404 push @{$self->{'_blast_errs'}}, $err; | |
405 } | |
406 } | |
407 | |
408 sub _clear_errors { | |
409 my $self = shift; | |
410 $self->{'_blast_errs'} = undef; | |
411 } | |
412 | |
413 #--------- | |
414 sub errors { | |
415 #--------- | |
416 my $self = shift; | |
417 my @errs = (); | |
418 @errs = @{$self->{'_blast_errs'}} if ref $self->{'_blast_errs'}; | |
419 return @errs; | |
420 } | |
421 | |
422 | |
423 #---------------------- | |
424 sub _init_parse_params { | |
425 #---------------------- | |
426 #Initializes parameters used during parsing of Blast reports. | |
427 | |
428 my ( $self, @param ) = @_; | |
429 # -FILT_FUNC has been replaced by -HIT_FILTER. | |
430 # Leaving -FILT_FUNC in place for backward compatibility | |
431 my($signif, $filt_func, $hit_filter, $min_len, $check_all, | |
432 $gapped, $score, $overlap, $stats, $best, $shallow_parse, | |
433 $hold_raw) = | |
434 $self->_rearrange([qw(SIGNIF FILT_FUNC HIT_FILTER MIN_LEN | |
435 CHECK_ALL_HITS GAPPED SCORE | |
436 OVERLAP STATS BEST SHALLOW_PARSE HOLD_RAW_DATA)], @param); | |
437 | |
438 $self->{'_hit_filter'} = $hit_filter || $filt_func || 0; | |
439 $self->{'_check_all'} = $check_all || 0; | |
440 $self->{'_shallow_parse'} = $shallow_parse || 0; | |
441 $self->{'_hold_raw_data'} = $hold_raw || 0; | |
442 | |
443 $self->_set_signif($signif, $min_len, $self->{'_hit_filter'}, $score); | |
444 $self->best_hit_only($best) if $best; | |
445 $self->{'_blast_count'} = 0; | |
446 | |
447 $self->{'_collect_stats'} = defined($stats) ? $stats : 0; | |
448 | |
449 # TODO: Automatically determine whether gapping was used. | |
450 # e.g., based on version number. Otherwise, have to read params. | |
451 $self->{'_gapped'} = $gapped || 1; | |
452 | |
453 # Clear any errors from previous parse. | |
454 $self->_clear_errors; | |
455 undef $self->{'_hit_count'}; | |
456 undef $self->{'_num_hits_significant'}; | |
457 } | |
458 | |
459 #=head2 _set_signif | |
460 # | |
461 # Usage : n/a; called automatically by _init_parse_params() | |
462 # Purpose : Sets significance criteria for the BLAST object. | |
463 # Argument : Obligatory three arguments: | |
464 # : $signif = float or sci-notation number or undef | |
465 # : $min_len = integer or undef | |
466 # : $hit_filter = function reference or undef | |
467 # : | |
468 # : If $signif is undefined, a default value is set | |
469 # : (see $DEFAULT_SIGNIF; min_length = not set). | |
470 # Throws : Exception if significance value is defined but appears | |
471 # : out of range or invalid. | |
472 # : Exception if $hit_filter if defined and is not a func ref. | |
473 # Comments : The significance of a BLAST report can be based on | |
474 # : the P (or Expect) value and/or the length of the query sequence. | |
475 # : P (or Expect) values GREATER than '_max_significance' are not significant. | |
476 # : Query sequence lengths LESS than '_min_length' are not significant. | |
477 # : | |
478 # : Hits can also be screened using arbitrary significance criteria | |
479 # : as discussed in the parse() method. | |
480 # : | |
481 # : If no $signif is defined, the '_max_significance' level is set to | |
482 # : $DEFAULT_SIGNIF (999). | |
483 # | |
484 #See Also : L<signif>(), L<min_length>(), L<_init_parse_params>(), L<parse>() | |
485 # | |
486 #=cut | |
487 | |
488 #----------------- | |
489 sub _set_signif { | |
490 #----------------- | |
491 my( $self, $sig, $len, $func, $score ) = @_; | |
492 | |
493 if(defined $sig) { | |
494 $self->{'_confirm_significance'} = 1; | |
495 if( $sig =~ /[^\d.e-]/ or $sig <= 0) { | |
496 $self->throw(-class => 'Bio::Root::BadParameter', | |
497 -text => "Invalid significance value: $sig\n". | |
498 "Must be a number greater than zero."); | |
499 } | |
500 $self->{'_max_significance'} = $sig; | |
501 } else { | |
502 $self->{'_max_significance'} = $DEFAULT_SIGNIF; | |
503 } | |
504 | |
505 if(defined $score) { | |
506 $self->{'_confirm_significance'} = 1; | |
507 if( $score =~ /[^\de+]/ or $score <= 0) { | |
508 $self->throw(-class => 'Bio::Root::BadParameter', | |
509 -text => "Invalid score value: $score\n". | |
510 "Must be an integer greater than zero."); | |
511 } | |
512 $self->{'_min_score'} = $score; | |
513 } else { | |
514 $self->{'_min_score'} = $DEFAULT_SCORE; | |
515 } | |
516 | |
517 if(defined $len) { | |
518 if($len =~ /\D/ or $len <= 0) { | |
519 $self->warn("Invalid minimum length value: $len", | |
520 "Value must be an integer > 0. Value not set."); | |
521 } else { | |
522 $self->{'_min_length'} = $len; | |
523 } | |
524 } | |
525 | |
526 if(defined $func) { | |
527 $self->{'_check_all'} = 1; | |
528 $self->{'_confirm_significance'} = 1; | |
529 if($func and not ref $func eq 'CODE') { | |
530 $self->throw("Not a function reference: $func", | |
531 "The -hit_filter parameter must be function reference."); | |
532 } | |
533 } | |
534 } | |
535 | |
536 =head2 signif | |
537 | |
538 Synonym for L<max_significance()|max_significance> | |
539 | |
540 =cut | |
541 | |
542 #----------- | |
543 sub signif { shift->max_significance } | |
544 | |
545 | |
546 =head2 max_significance | |
547 | |
548 Usage : $obj->max_significance(); | |
549 Purpose : Gets the P or Expect value used as significance screening cutoff. | |
550 This is the value of the -signif parameter supplied to new(). | |
551 Hits with P or E-value above this are skipped. | |
552 Returns : Scientific notation number with this format: 1.0e-05. | |
553 Argument : n/a | |
554 Comments : Screening of significant hits uses the data provided on the | |
555 : description line. For NCBI BLAST1 and WU-BLAST, this data | |
556 : is P-value. for NCBI BLAST2 it is an Expect value. | |
557 | |
558 =cut | |
559 | |
560 #----------- | |
561 sub max_significance { | |
562 #----------- | |
563 my $self = shift; | |
564 my $sig = $self->{'_max_significance'}; | |
565 sprintf "%.1e", $sig; | |
566 } | |
567 | |
568 =head2 min_score | |
569 | |
570 Usage : $obj->min_score(); | |
571 Purpose : Gets the Blast score used as screening cutoff. | |
572 This is the value of the -score parameter supplied to new(). | |
573 Hits with scores below this are skipped. | |
574 Returns : Integer or scientific notation number. | |
575 Argument : n/a | |
576 Comments : Screening of significant hits uses the data provided on the | |
577 : description line. | |
578 | |
579 =cut | |
580 | |
581 #----------- | |
582 sub min_score { | |
583 #----------- | |
584 my $self = shift; | |
585 return $self->{'_min_score'}; | |
586 } | |
587 | |
588 =head2 min_length | |
589 | |
590 Usage : $obj->min_length(); | |
591 Purpose : Gets the query sequence length used as screening criteria. | |
592 This is the value of the -min_len parameter supplied to new(). | |
593 Hits with sequence length below this are skipped. | |
594 Returns : Integer | |
595 Argument : n/a | |
596 | |
597 See Also : L<signif()|signif> | |
598 | |
599 =cut | |
600 | |
601 #-------------- | |
602 sub min_length { | |
603 #-------------- | |
604 my $self = shift; | |
605 $self->{'_min_length'}; | |
606 } | |
607 | |
608 =head2 highest_signif | |
609 | |
610 Usage : $value = $obj->highest_signif(); | |
611 Purpose : Gets the largest significance (P- or E-value) observed in | |
612 the report. | |
613 : For NCBI BLAST1 and WU-BLAST, this is a P-value. | |
614 : For NCBI BLAST2 it is an Expect value. | |
615 Returns : Float or sci notation number | |
616 Argument : n/a | |
617 | |
618 =cut | |
619 | |
620 sub highest_signif { shift->{'_highestSignif'} } | |
621 | |
622 =head2 lowest_signif | |
623 | |
624 Usage : $value = $obj->lowest_signif(); | |
625 Purpose : Gets the largest significance (P- or E-value) observed in | |
626 the report. | |
627 : For NCBI BLAST1 and WU-BLAST, this is a P-value. | |
628 : For NCBI BLAST2 it is an Expect value. | |
629 Returns : Float or sci notation number | |
630 Argument : n/a | |
631 | |
632 =cut | |
633 | |
634 sub lowest_signif { shift->{'_lowestSignif'} } | |
635 | |
636 =head2 highest_score | |
637 | |
638 Usage : $value = $obj->highest_score(); | |
639 Purpose : Gets the largest BLAST score observed in the report. | |
640 Returns : Integer or sci notation number | |
641 Argument : n/a | |
642 | |
643 =cut | |
644 | |
645 sub highest_score { shift->{'_highestScore'} } | |
646 | |
647 =head2 lowest_score | |
648 | |
649 Usage : $value = $obj->lowest_score(); | |
650 Purpose : Gets the largest BLAST score observed in the report. | |
651 Returns : Integer or sci notation number | |
652 Argument : n/a | |
653 | |
654 =cut | |
655 | |
656 sub lowest_score { shift->{'_lowestScore'} } | |
657 | |
658 | |
659 # Throws : Exception if BLAST report contains a FATAL: error. | |
660 sub _process_header { | |
661 my ($self, @data) = @_; | |
662 | |
663 # print STDERR "Processing Header...\n"; | |
664 | |
665 $current_iteration = 0; | |
666 $self->{'_result_count'}++; | |
667 # Finish off the current Blast object, if any | |
668 my $blast = $self->{'_current_blast'} = $self->result_factory->create_result(); | |
669 | |
670 my ($set_method, $set_query, $set_db, $set_length); | |
671 my ($query_start, $query_desc); | |
672 | |
673 foreach my $line (@data) { | |
674 if( $line =~ /^(<.*>)?(T?BLAST[NPX])\s+(.*)$/ ) { | |
675 $blast->analysis_method( $2 ); | |
676 $blast->analysis_method_version( $3 ); | |
677 $set_method = 1; | |
678 } | |
679 elsif ($line =~ /^Query= *(.+)$/ ) { | |
680 $query_start = 1; | |
681 my $info = $1; | |
682 $info =~ s/TITLE //; | |
683 # Split the query line into two parts. | |
684 # Using \s instead of ' ' | |
685 $info =~ /(\S+?)\s+(.*)/; | |
686 # set name of Blast object and return. | |
687 $blast->query_name($1 || 'UNKNOWN'); | |
688 $query_desc = $2 || ''; | |
689 $set_query = 1; | |
690 } | |
691 elsif ($line =~ /^Database:\s+(.+)$/ ) { | |
692 require Bio::Search::GenericDatabase; | |
693 my $blastdb = Bio::Search::GenericDatabase->new( -name => $1 ); | |
694 $blast->analysis_subject( $blastdb ); | |
695 $set_db = 1; | |
696 } | |
697 elsif( $line =~ m/^\s+\(([\d|,]+) letters\)/ ) { | |
698 my $length = $1; | |
699 $length =~ s/,//g; | |
700 $self->_set_query_length( $length ); | |
701 $set_length = 1; | |
702 $blast->query_description( $query_desc ); | |
703 $query_start = 0; | |
704 } | |
705 elsif( $line =~ /WARNING: (.+?)/ ) { | |
706 $self->warn( $1 ); | |
707 } | |
708 elsif( $line =~ /FATAL: (.+?)/ ) { | |
709 $self->throw("FATAL BLAST Report Error: $1"); | |
710 } | |
711 # This needs to be the last elsif block. | |
712 elsif( $query_start ) { | |
713 # Handling multi-line query descriptions. | |
714 chomp( $line ); | |
715 $query_desc .= " $line"; | |
716 } | |
717 } | |
718 if (!$set_method) { | |
719 $self->throw("Can't determine type of BLAST program."); | |
720 } | |
721 if (!$set_query) { | |
722 $self->throw("Can't determine name of query sequence."); | |
723 } | |
724 if(!$set_db) { | |
725 $self->throw("Can't determine name of database."); | |
726 } | |
727 if(!$set_length) { | |
728 $self->throw("Can't determine sequence length from BLAST report."); | |
729 } | |
730 | |
731 } | |
732 | |
733 sub _process_descriptions { | |
734 my ($self, @data) = @_; | |
735 # print STDERR "Processing Descriptions...\n"; | |
736 | |
737 # Step through each line parsing out the P/Expect value | |
738 # All we really need to do is check the first one, if it doesn't | |
739 # meet the significance requirement, we can skip the report. | |
740 # BUT: we want to collect data for all hits anyway to get min/max signif. | |
741 | |
742 my $max_signif = $self->max_significance; | |
743 my $min_score = $self->min_score; | |
744 my $layout_set = $self->{'_layout'} || 0; | |
745 my ($layout, $sig, $hitid, $score, $is_p_value); | |
746 | |
747 if( $data[0] =~ /^\s*Sequences producing.*Score\s+P/i ) { | |
748 $is_p_value = 1; | |
749 } else { | |
750 $is_p_value = 0; | |
751 } | |
752 | |
753 my $hit_found_again; | |
754 | |
755 desc_loop: | |
756 foreach my $line (@data) { | |
757 last desc_loop if $line =~ / NONE |End of List|Results from round/; | |
758 next desc_loop if $line =~ /^\.\./; | |
759 | |
760 if($line =~ /^Sequences used in model/ ) { | |
761 #Sequences used in model and found again: | |
762 $hit_found_again = 1; | |
763 next; | |
764 } | |
765 elsif($line =~ /^Sequences not found previously/ ) { | |
766 #Sequences not found previously or not previously below threshold: | |
767 $hit_found_again = 0; | |
768 next; | |
769 } | |
770 | |
771 ## Checking the significance value (P- or Expect value) of the hit | |
772 ## in the description line. | |
773 | |
774 next desc_loop unless $line =~ /\d/; | |
775 | |
776 # TODO: These regexps need testing on a variety of reports. | |
777 if ( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s*$/) { | |
778 $hitid = $1; | |
779 $score = $2; | |
780 $layout = 2; | |
781 } elsif( $line =~ /^(\S+)\s+.*\s+([\de.+-]+)\s{1,5}[\de.-]+\s{1,}\d+\s*$/) { | |
782 $hitid = $1; | |
783 $score = $2; | |
784 $layout = 1; | |
785 } else { | |
786 $self->warn("Can't parse description line\n $line"); | |
787 next desc_loop; | |
788 } | |
789 not $layout_set and ($self->_layout($layout), $layout_set = 1); | |
790 | |
791 $sig = &_parse_signif( $line, $layout, $self->{'_gapped'} ); | |
792 | |
793 # print STDERR " Parsed signif for $hitid: $sig (layout=$layout)\n"; | |
794 | |
795 $self->{'_hit_hash'}->{$hitid}->{'signif'} = $sig; | |
796 $self->{'_hit_hash'}->{$hitid}->{'score'} = $score; | |
797 $self->{'_hit_hash'}->{$hitid}->{'found_again'} = $hit_found_again; | |
798 $self->{'_hit_hash'}->{'is_pval'} = $is_p_value; | |
799 | |
800 last desc_loop if (not $self->{'_check_all'} and | |
801 ($sig > $max_signif or $score < $min_score)); | |
802 | |
803 $self->_process_significance($sig, $score); | |
804 } | |
805 | |
806 # printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'}; | |
807 } | |
808 | |
809 | |
810 #=head2 _set_query_length | |
811 # | |
812 # Usage : n/a; called automatically during Blast report parsing. | |
813 # Purpose : Sets the length of the query sequence (extracted from report). | |
814 # Returns : integer (length of the query sequence) | |
815 # Throws : Exception if cannot determine the query sequence length from | |
816 # : the BLAST report. | |
817 # : Exception if the length is below the min_length cutoff (if any). | |
818 # Comments : The logic here is a bit different from the other _set_XXX() | |
819 # : methods since the significance of the BLAST report is assessed | |
820 # : if MIN_LENGTH is set. | |
821 # | |
822 #=cut | |
823 | |
824 #--------------- | |
825 sub _set_query_length { | |
826 #--------------- | |
827 my ($self, $length) = @_; | |
828 | |
829 my($sig_len); | |
830 if(defined($self->{'_min_length'})) { | |
831 local $^W = 0; | |
832 if($length < $self->{'_min_len'}) { | |
833 $self->throw("Query sequence too short (Query= ${\$self->{'_current_blast'}->query_name}, length= $length)", | |
834 "Minimum length is $self->{'_min_len'}"); | |
835 } | |
836 } | |
837 | |
838 $self->{'_current_blast'}->query_length($length); | |
839 } | |
840 | |
841 | |
842 # Records the highest and lowest significance (P- or E-value) and | |
843 # score encountered in a given report. | |
844 sub _set_hi_low_signif_and_score { | |
845 my($self, $sig, $score) = @_; | |
846 | |
847 my $hiSig = $self->{'_highestSignif'} || 0; | |
848 my $lowSig = $self->{'_lowestSignif'} || $DEFAULT_SIGNIF; | |
849 my $hiScore = $self->{'_highestScore'} || 0; | |
850 my $lowScore = $self->{'_lowestScore'} || $DEFAULT_SIGNIF; | |
851 | |
852 $self->{'_highestSignif'} = ($sig > $hiSig) | |
853 ? $sig : $hiSig; | |
854 | |
855 $self->{'_lowestSignif'} = ($sig < $lowSig) | |
856 ? $sig : $lowSig; | |
857 | |
858 $self->{'_highestScore'} = ($score > $hiScore) | |
859 ? $score : $hiScore; | |
860 | |
861 $self->{'_lowestScore'} = ($score < $lowScore) | |
862 ? $score : $lowScore; | |
863 } | |
864 | |
865 | |
866 sub _process_significance { | |
867 my($self, $sig, $score) = @_; | |
868 | |
869 $self->_set_hi_low_signif_and_score($sig, $score); | |
870 | |
871 # Significance value assessment. | |
872 if($sig <= $self->{'_max_significance'} and $score >= $self->{'_min_score'}) { | |
873 $self->{'_num_hits_significant'}++; | |
874 } | |
875 $self->{'_num_hits'}++; | |
876 | |
877 $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'}; | |
878 } | |
879 | |
880 #=head2 _layout | |
881 # | |
882 # Usage : n/a; internal method. | |
883 # Purpose : Set/Get indicator for the layout of the report. | |
884 # Returns : Integer (1 | 2) | |
885 # : Defaults to 2 if not set. | |
886 # Comments : Blast1 and WashU-Blast2 have a layout = 1. | |
887 # : This is intended for internal use by this and closely | |
888 # : allied modules like BlastHit.pm and BlastHSP.pm. | |
889 # | |
890 #=cut | |
891 | |
892 #------------ | |
893 sub _layout { | |
894 #------------ | |
895 my $self = shift; | |
896 if(@_) { | |
897 $self->{'_layout'} = shift; | |
898 } | |
899 $self->{'_layout'} || 2; | |
900 } | |
901 | |
902 #=head2 _parse_signif | |
903 # | |
904 # Usage : $signif = _parse_signif(string, layout, gapped); | |
905 # : This is a class function. | |
906 # Purpose : Extracts the P- or Expect value from a single line of a BLAST description section. | |
907 # Example : _parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1); | |
908 # : _parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2); | |
909 # Argument : string = line from BLAST description section | |
910 # : layout = integer (1 or 2) | |
911 # : gapped = boolean (true if gapped Blast). | |
912 # Returns : Float (0.001 or 1e-03) | |
913 # Status : Static | |
914 # | |
915 #=cut | |
916 | |
917 #------------------ | |
918 sub _parse_signif { | |
919 #------------------ | |
920 my ($line, $layout, $gapped) = @_; | |
921 | |
922 local $_ = $line; | |
923 my @linedat = split(); | |
924 | |
925 # When processing both Blast1 and Blast2 reports | |
926 # in the same run, offset needs to be configured each time. | |
927 # NOTE: We likely won't be supporting mixed report streams. Not needed. | |
928 | |
929 my $offset = 0; | |
930 $offset = 1 if $layout == 1 or not $gapped; | |
931 | |
932 my $signif = $linedat[ $#linedat - $offset ]; | |
933 | |
934 # fail-safe check | |
935 if(not $signif =~ /[.-]/) { | |
936 $offset = ($offset == 0 ? 1 : 0); | |
937 $signif = $linedat[ $#linedat - $offset ]; | |
938 } | |
939 | |
940 $signif = "1$signif" if $signif =~ /^e/i; | |
941 return $signif; | |
942 } | |
943 | |
944 | |
945 =head2 best_hit_only | |
946 | |
947 Usage : print "only getting best hit.\n" if $obj->best_hit_only(); | |
948 Purpose : Set/Get the indicator for whether or not to processing only | |
949 : the best BlastHit. | |
950 Returns : Boolean (1 | 0) | |
951 Argument : n/a | |
952 | |
953 =cut | |
954 | |
955 #---------- | |
956 sub best_hit_only { | |
957 #---------- | |
958 my $self = shift; | |
959 if(@_) { $self->{'_best'} = shift; } | |
960 $self->{'_best'}; | |
961 } | |
962 | |
963 sub _process_alignment { | |
964 my ($self, @data) = @_; | |
965 # print STDERR "Processing Alignment...\n"; | |
966 | |
967 # If all of the significant hits have been parsed, | |
968 # return if we're not checking all or if we don't need to get | |
969 # the Blast stats (parameters at footer of report). | |
970 if(defined $self->{'_hit_count'} and | |
971 defined $self->{'_num_hits_significant'}) { | |
972 return if $self->{'_hit_count'} >= $self->{'_num_hits_significant'} and | |
973 not ($self->{'_check_all'} or $self->{'_collect_stats'}); | |
974 } | |
975 | |
976 # Return if we're only interested in the best hit. | |
977 # This has to occur after checking for the parameters section | |
978 # in the footer (since we may still be interested in them). | |
979 return if $self->best_hit_only and ( defined $self->{'_hit_count'} and $self->{'_hit_count'} >=1); | |
980 | |
981 push @data, 'end'; | |
982 | |
983 # print STDERR "\nALIGNMENT DATA:\n@data\n"; | |
984 | |
985 my $max_signif = $self->max_significance; | |
986 my $min_score = $self->min_score; | |
987 | |
988 my ($hitid, $score, $signif, $is_pval, $found_again); | |
989 if( $data[0] =~ /^(\S+)\s+/ ) { | |
990 $hitid = $1; | |
991 return unless defined $self->{'_hit_hash'}->{$hitid}; | |
992 $score = $self->{'_hit_hash'}->{$hitid}->{'score'}; | |
993 $signif = $self->{'_hit_hash'}->{$hitid}->{'signif'}; | |
994 $found_again = $self->{'_hit_hash'}->{$hitid}->{'found_again'}; | |
995 $is_pval = $self->{'_hit_hash'}->{'is_pval'}; | |
996 # print STDERR " Got hitid: $hitid ($signif, $score, P?=$is_pval)\n"; | |
997 } | |
998 | |
999 # Now construct the BlastHit objects from the alignment section | |
1000 | |
1001 # debug(1); | |
1002 | |
1003 $self->{'_hit_count'}++; | |
1004 | |
1005 # If not confirming significance, _process_descriptions will not have been run, | |
1006 # so we need to count the total number of hits here. | |
1007 if( not $self->{'_confirm_significance'}) { | |
1008 $self->{'_num_hits'}++; | |
1009 } | |
1010 | |
1011 my %hit_params = ( -RESULT => $self->{'_current_blast'}, | |
1012 -RAW_DATA =>\@data, | |
1013 -SIGNIF => $signif, | |
1014 -IS_PVAL => $is_pval, | |
1015 -SCORE => $score, | |
1016 -RANK => $self->{'_hit_count'}, | |
1017 -RANK_BY => 'order', | |
1018 -OVERLAP => $self->{'_overlap'} || $MAX_HSP_OVERLAP, | |
1019 -FOUND_AGAIN => $found_again, | |
1020 -SHALLOW_PARSE => $self->{'_shallow_parse'}, | |
1021 -HOLD_RAW_DATA => $self->{'_hold_raw_data'}, | |
1022 ); | |
1023 | |
1024 my $hit; | |
1025 $hit = $self->hit_factory->create_hit( %hit_params ); | |
1026 | |
1027 # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>; | |
1028 # The BLAST report may have not had a description section. | |
1029 if(not $self->{'_has_descriptions'}) { | |
1030 $self->_process_significance($hit->signif, $score); | |
1031 } | |
1032 | |
1033 # Collect overall signif data if we don't already have it, | |
1034 # (as occurs if no -signif or -score parameter are supplied). | |
1035 my $hit_signif = $hit->signif; | |
1036 | |
1037 if (not $self->{'_confirm_significance'} ) { | |
1038 $self->_set_hi_low_signif_and_score($hit_signif, $score); | |
1039 } | |
1040 | |
1041 # Test significance using custom function (if supplied) | |
1042 my $add_hit = 0; | |
1043 | |
1044 my $hit_filter = $self->{'_hit_filter'} || 0; | |
1045 | |
1046 if($hit_filter) { | |
1047 if(&$hit_filter($hit)) { | |
1048 $add_hit = 1; | |
1049 } | |
1050 } elsif($hit_signif <= $max_signif and $score >= $min_score) { | |
1051 $add_hit = 1; | |
1052 } | |
1053 $add_hit && $self->{'_current_blast'}->add_hit( $hit ); | |
1054 } | |
1055 | |
1056 | |
1057 sub _process_footer { | |
1058 my ($self, $ok_to_pause, @data) = @_; | |
1059 # print STDERR "Processing Footer...\n"; | |
1060 | |
1061 $self->{'_current_blast'}->raw_statistics( [@data] ); | |
1062 | |
1063 if($self->{'_collect_stats'}) { | |
1064 foreach my $line (@data) { | |
1065 if( $line =~ /^\s*Matrix:\s*(\S+)/i ) { | |
1066 $self->{'_current_blast'}->matrix( $1 ); | |
1067 } | |
1068 elsif( $line =~ /^\s*Number of Sequences:\s*(\d+)/i ) { | |
1069 $self->{'_current_blast'}->analysis_subject->entries( $1 ); | |
1070 } | |
1071 elsif( $line =~ /^\s*length of database:\s*(\d+)/i ) { | |
1072 $self->{'_current_blast'}->analysis_subject->letters( $1 ); | |
1073 } | |
1074 elsif( $line =~ /^\s*Posted date:\s*(.+)$/i ) { | |
1075 $self->{'_current_blast'}->analysis_subject->date( $1 ); | |
1076 } | |
1077 } | |
1078 } | |
1079 | |
1080 if( $self->errors ) { | |
1081 my $num_err = scalar($self->errors); | |
1082 $self->warn( "$num_err Blast parsing errors occurred."); | |
1083 foreach( $self->errors ) { print STDERR "$_\n"; }; | |
1084 } | |
1085 | |
1086 if( $self->{'_pause_between_reports'} and $ok_to_pause ) { | |
1087 $self->pause; | |
1088 } | |
1089 | |
1090 } | |
1091 | |
1092 sub _process_nohits { | |
1093 my $self = shift; | |
1094 # print STDERR "Processing No Hits (iteration = $current_iteration)\n"; | |
1095 $self->{'_current_blast'}->set_no_hits_found( $current_iteration ); | |
1096 } | |
1097 | |
1098 | |
1099 sub _process_iteration { | |
1100 my ($self, @data) = @_; | |
1101 # print STDERR "Processing Iteration\n"; | |
1102 # print STDERR " Incrementing current iteration (was=$current_iteration)\n"; | |
1103 $current_iteration++; | |
1104 $self->{'_current_blast'}->iterations( $current_iteration ); | |
1105 | |
1106 foreach( @data ) { | |
1107 if( /Results from round \d+/i ) { | |
1108 $self->{'_current_blast'}->psiblast( 1 ); | |
1109 } | |
1110 elsif( /No hits found/i ) { | |
1111 $self->_process_nohits(); | |
1112 last; | |
1113 } | |
1114 elsif( /^\s*Sequences/i ) { | |
1115 $self->_process_descriptions( @data ); | |
1116 last; | |
1117 } | |
1118 } | |
1119 } | |
1120 | |
1121 sub pause_between_reports { | |
1122 my ($self, $setting) = @_; | |
1123 if( defined $setting ) { | |
1124 $self->{'_pause_between_reports'} = $setting; | |
1125 } | |
1126 $self->{'_pause_between_reports'}; | |
1127 } | |
1128 | |
1129 sub result_count { | |
1130 my $self = shift; | |
1131 return $self->{'_result_count'}; | |
1132 } | |
1133 | |
1134 # For backward compatibility: | |
1135 sub report_count { shift->result_count } | |
1136 | |
1137 sub next_result { | |
1138 my ($self) = @_; | |
1139 # print STDERR "next_result() called\n"; | |
1140 if( not $self->running ) { | |
1141 $self->run; | |
1142 } | |
1143 else { | |
1144 $self->resume; | |
1145 } | |
1146 my $blast = $self->{'_current_blast'}; | |
1147 $self->{'_current_blast'} = undef; | |
1148 return $blast; | |
1149 } | |
1150 | |
1151 =head2 write_result | |
1152 | |
1153 Title : write_result | |
1154 Usage : $stream->write_result($result_result, @other_args) | |
1155 Function: Writes data from the $result_result object into the stream. | |
1156 : Delegates to the to_string() method of the associated | |
1157 : WriterI object. | |
1158 Returns : 1 for success and 0 for error | |
1159 Args : Bio::Search:Result::ResultI object, | |
1160 : plus any other arguments for the Writer | |
1161 Throws : Bio::Root::Exception if a Writer has not been set. | |
1162 | |
1163 See L<Bio::Root::Exception> | |
1164 | |
1165 =cut | |
1166 | |
1167 sub write_result { | |
1168 my ($self, $blast, @args) = @_; | |
1169 | |
1170 if( not defined($self->writer) ) { | |
1171 $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS"); | |
1172 $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() ); | |
1173 } | |
1174 $self->SUPER::write_result( $blast, @args ); | |
1175 } | |
1176 | |
1177 | |
1178 | |
1179 1; | |
1180 __END__ | |
1181 |