comparison variant_effect_predictor/Bio/SearchIO/blast.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: blast.pm,v 1.42.2.14 2003/09/15 16:19:01 jason Exp $
2 #
3 # BioPerl module for Bio::SearchIO::blast
4 #
5 # Cared for by Jason Stajich <jason@bioperl.org>
6 #
7 # Copyright Jason Stajich
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::SearchIO::blast - Event generator for event based parsing of blast reports
16
17 =head1 SYNOPSIS
18
19 # Do not use this object directly - it is used as part of the
20 # Bio::SearchIO system.
21
22 use Bio::SearchIO;
23 my $searchio = new Bio::SearchIO(-format => 'blast',
24 -file => 't/data/ecolitst.bls');
25 while( my $result = $searchio->next_result ) {
26 while( my $hit = $result->next_hit ) {
27 while( my $hsp = $hit->next_hsp ) {
28 # ...
29 }
30 }
31 }
32
33 =head1 DESCRIPTION
34
35 This object encapsulated the necessary methods for generating events
36 suitable for building Bio::Search objects from a BLAST report file.
37 Read the L<Bio::SearchIO> for more information about how to use this.
38
39 =head1 FEEDBACK
40
41 =head2 Mailing Lists
42
43 User feedback is an integral part of the evolution of this and other
44 Bioperl modules. Send your comments and suggestions preferably to
45 the Bioperl mailing list. Your participation is much appreciated.
46
47 bioperl-l@bioperl.org - General discussion
48 http://bioperl.org/MailList.shtml - About the mailing lists
49
50 =head2 Reporting Bugs
51
52 Report bugs to the Bioperl bug tracking system to help us keep track
53 of the bugs and their resolution. Bug reports can be submitted via
54 email or the web:
55
56 bioperl-bugs@bioperl.org
57 http://bugzilla.bioperl.org/
58
59 =head1 AUTHOR - Jason Stajich
60
61 Email jason@bioperl.org
62
63 Describe contact details here
64
65 =head1 CONTRIBUTORS
66
67 Additional contributors names and emails here
68
69 =head1 APPENDIX
70
71 The rest of the documentation details each of the object methods.
72 Internal methods are usually preceded with a _
73
74 =cut
75
76
77 # Let the code begin...
78
79
80
81 package Bio::SearchIO::blast;
82 use strict;
83 use vars qw(@ISA %MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS);
84 use Bio::SearchIO;
85
86 @ISA = qw(Bio::SearchIO );
87
88 BEGIN {
89 # mapping of NCBI Blast terms to Bioperl hash keys
90 %MODEMAP = ('BlastOutput' => 'result',
91 'Hit' => 'hit',
92 'Hsp' => 'hsp'
93 );
94
95 # This should really be done more intelligently, like with
96 # XSLT
97
98 %MAPPING =
99 (
100 'Hsp_bit-score' => 'HSP-bits',
101 'Hsp_score' => 'HSP-score',
102 'Hsp_evalue' => 'HSP-evalue',
103 'Hsp_pvalue' => 'HSP-pvalue',
104 'Hsp_query-from' => 'HSP-query_start',
105 'Hsp_query-to' => 'HSP-query_end',
106 'Hsp_hit-from' => 'HSP-hit_start',
107 'Hsp_hit-to' => 'HSP-hit_end',
108 'Hsp_positive' => 'HSP-conserved',
109 'Hsp_identity' => 'HSP-identical',
110 'Hsp_gaps' => 'HSP-hsp_gaps',
111 'Hsp_hitgaps' => 'HSP-hit_gaps',
112 'Hsp_querygaps' => 'HSP-query_gaps',
113 'Hsp_qseq' => 'HSP-query_seq',
114 'Hsp_hseq' => 'HSP-hit_seq',
115 'Hsp_midline' => 'HSP-homology_seq',
116 'Hsp_align-len' => 'HSP-hsp_length',
117 'Hsp_query-frame'=> 'HSP-query_frame',
118 'Hsp_hit-frame' => 'HSP-hit_frame',
119
120 'Hit_id' => 'HIT-name',
121 'Hit_len' => 'HIT-length',
122 'Hit_accession' => 'HIT-accession',
123 'Hit_def' => 'HIT-description',
124 'Hit_signif' => 'HIT-significance',
125 'Hit_score' => 'HIT-score',
126 'Iteration_iter-num' => 'HIT-iteration',
127
128 'BlastOutput_program' => 'RESULT-algorithm_name',
129 'BlastOutput_version' => 'RESULT-algorithm_version',
130 'BlastOutput_query-def'=> 'RESULT-query_name',
131 'BlastOutput_query-len'=> 'RESULT-query_length',
132 'BlastOutput_query-acc'=> 'RESULT-query_accession',
133 'BlastOutput_querydesc'=> 'RESULT-query_description',
134 'BlastOutput_db' => 'RESULT-database_name',
135 'BlastOutput_db-len' => 'RESULT-database_entries',
136 'BlastOutput_db-let' => 'RESULT-database_letters',
137
138 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix'},
139 'Parameters_expect' => { 'RESULT-parameters' => 'expect'},
140 'Parameters_include' => { 'RESULT-parameters' => 'include'},
141 'Parameters_sc-match' => { 'RESULT-parameters' => 'match'},
142 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'},
143 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen'},
144 'Parameters_gap-extend'=> { 'RESULT-parameters' => 'gapext'},
145 'Parameters_filter' => {'RESULT-parameters' => 'filter'},
146 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps'},
147
148 'Statistics_db-len' => {'RESULT-statistics' => 'dbentries'},
149 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters'},
150 'Statistics_hsp-len' => { 'RESULT-statistics' => 'effective_hsplength'},
151 'Statistics_query-len' => { 'RESULT-statistics' => 'querylength'},
152 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'},
153 'Statistics_eff-spaceused' => { 'RESULT-statistics' => 'effectivespaceused'},
154 'Statistics_eff-dblen' => { 'RESULT-statistics' => 'effectivedblength'},
155 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' },
156 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' },
157 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy'},
158 'Statistics_framewindow'=> { 'RESULT-statistics' => 'frameshiftwindow'},
159 'Statistics_decay'=> { 'RESULT-statistics' => 'decayconst'},
160
161 'Statistics_T'=> { 'RESULT-statistics' => 'T'},
162 'Statistics_A'=> { 'RESULT-statistics' => 'A'},
163 'Statistics_X1'=> { 'RESULT-statistics' => 'X1'},
164 'Statistics_X2'=> { 'RESULT-statistics' => 'X2'},
165 'Statistics_S1'=> { 'RESULT-statistics' => 'S1'},
166 'Statistics_S2'=> { 'RESULT-statistics' => 'S2'},
167 'Statistics_hit_to_db' => { 'RESULT-statistics' => 'Hits_to_DB'},
168 'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'},
169 'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'},
170 'Statistics_num_suc_extensions' => { 'RESULT-statistics' => 'num_successful_extensions'},
171 'Statistics_seqs_better_than_cutoff' => { 'RESULT-statistics' => 'seqs_better_than_cutoff'},
172 'Statistics_posted_date' => { 'RESULT-statistics' => 'posted_date'},
173
174 # WU-BLAST stats
175 'Statistics_DFA_states'=> { 'RESULT-statistics' => 'num_dfa_states'},
176 'Statistics_DFA_size'=> { 'RESULT-statistics' => 'dfa_size'},
177
178 'Statistics_search_cputime' => { 'RESULT-statistics' => 'search_cputime'},
179 'Statistics_total_cputime' => { 'RESULT-statistics' => 'total_cputime'},
180 'Statistics_search_actualtime' => { 'RESULT-statistics' => 'search_actualtime'},
181 'Statistics_total_actualtime' => { 'RESULT-statistics' => 'total_actualtime'},
182
183 'Statistics_noprocessors' => { 'RESULT-statistics' => 'no_of_processors'},
184 'Statistics_neighbortime' => { 'RESULT-statistics' => 'neighborhood_generate_time'},
185 'Statistics_starttime' => { 'RESULT-statistics' => 'start_time'},
186 'Statistics_endtime' => { 'RESULT-statistics' => 'end_time'},
187 );
188
189 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
190 }
191
192
193 =head2 new
194
195 Title : new
196 Usage : my $obj = new Bio::SearchIO::blast();
197 Function: Builds a new Bio::SearchIO::blast object
198 Returns : Bio::SearchIO::blast
199 Args : -fh/-file => filehandle/filename to BLAST file
200 -format => 'blast'
201
202 =cut
203
204 =head2 next_result
205
206 Title : next_result
207 Usage : my $hit = $searchio->next_result;
208 Function: Returns the next Result from a search
209 Returns : Bio::Search::Result::ResultI object
210 Args : none
211
212 =cut
213
214 sub next_result{
215 my ($self) = @_;
216
217 my $data = '';
218 my $seentop = 0;
219 my ($reporttype,$seenquery,$reportline);
220 $self->start_document();
221 my @hit_signifs;
222
223 while( defined ($_ = $self->_readline )) {
224 next if( /^\s+$/); # skip empty lines
225 next if( /CPU time:/);
226 next if( /^>\s*$/);
227
228 if( /^([T]?BLAST[NPX])\s*(.+)$/i ||
229 /^(PSITBLASTN)\s+(.+)$/i ||
230 /^(RPS-BLAST)\s*(.+)$/i ||
231 /^(MEGABLAST)\s*(.+)$/i
232 ) {
233 if( $seentop ) {
234 $self->_pushback($_);
235 $self->in_element('hsp') &&
236 $self->end_element({ 'Name' => 'Hsp'});
237 $self->in_element('hit') &&
238 $self->end_element({ 'Name' => 'Hit'});
239 $self->end_element({ 'Name' => 'BlastOutput'});
240 return $self->end_document();
241 }
242 $self->start_element({ 'Name' => 'BlastOutput' } );
243 $self->{'_result_count'}++;
244 $seentop = 1;
245 $reporttype = $1;
246 $reportline = $_; # to fix the fact that RPS-BLAST output is wrong
247 $self->element({ 'Name' => 'BlastOutput_program',
248 'Data' => $reporttype});
249
250 $self->element({ 'Name' => 'BlastOutput_version',
251 'Data' => $2});
252 } elsif ( /^Query=\s*(.+)$/ ) {
253 my $q = $1;
254 my $size = 0;
255 if( defined $seenquery ) {
256 $self->_pushback($reportline);
257 $self->_pushback($_);
258 $self->in_element('hsp') &&
259 $self->end_element({'Name'=> 'Hsp'});
260 $self->in_element('hit') &&
261 $self->end_element({'Name'=> 'Hit'});
262 $self->end_element({'Name' => 'BlastOutput'});
263 return $self->end_document();
264 } else {
265 if( ! defined $reporttype ) {
266 $self->start_element({'Name' => 'BlastOutput'});
267 $seentop = 1;
268 $self->{'_result_count'}++;
269 }
270 }
271 $seenquery = $q;
272 $_ = $self->_readline;
273 while( defined ($_) ) {
274 if( /^Database:/ ) {
275 $self->_pushback($_);
276 last;
277 }
278 chomp;
279 if( /\((\-?[\d,]+)\s+letters.*\)/ ) {
280 $size = $1;
281 $size =~ s/,//g;
282 last;
283 } else {
284 $q .= " $_";
285 $q =~ s/ +/ /g;
286 $q =~ s/^ | $//g;
287 }
288
289 $_ = $self->_readline;
290 }
291 chomp($q);
292 my ($nm,$desc) = split(/\s+/,$q,2);
293 $self->element({ 'Name' => 'BlastOutput_query-def',
294 'Data' => $nm});
295 $self->element({ 'Name' => 'BlastOutput_query-len',
296 'Data' => $size});
297 defined $desc && $desc =~ s/\s+$//;
298 $self->element({ 'Name' => 'BlastOutput_querydesc',
299 'Data' => $desc});
300
301 if( my @pieces = split(/\|/,$nm) ) {
302 my $acc = pop @pieces;
303 $acc = pop @pieces if( ! defined $acc || $acc =~ /^\s+$/);
304 $self->element({ 'Name' => 'BlastOutput_query-acc',
305 'Data' => $acc});
306 }
307
308 } elsif( /Sequences producing significant alignments:/ ) {
309 descline:
310 while( defined ($_ = $self->_readline() )) {
311 if( /^>/ ) {
312 $self->_pushback($_);
313 last descline;
314 } elsif( /(\d+)\s+([\d\.\-eE]+)(\s+\d+)?\s*$/) {
315 # the last match is for gapped BLAST output
316 # which will report the number of HSPs for the Hit
317 my ($score, $evalue) = ($1, $2);
318 # Some data clean-up so e-value will appear numeric to perl
319 $evalue =~ s/^e/1e/i;
320 push @hit_signifs, [ $evalue, $score ];
321 }
322 }
323 } elsif( /Sequences producing High-scoring Segment Pairs:/ ) {
324 # skip the next line
325 $_ = $self->_readline();
326
327 while( defined ($_ = $self->_readline() ) &&
328 ! /^\s+$/ ) {
329 my @line = split;
330 pop @line; # throw away first number which is for 'N'col
331 push @hit_signifs, [ pop @line, pop @line];
332 }
333 } elsif ( /^Database:\s*(.+)$/ ) {
334 my $db = $1;
335
336 while( defined($_ = $self->_readline) ) {
337 if( /^\s+(\-?[\d\,]+)\s+sequences\;\s+(\-?[\d,]+)\s+total\s+letters/){
338 my ($s,$l) = ($1,$2);
339 $s =~ s/,//g;
340 $l =~ s/,//g;
341 $self->element({'Name' => 'BlastOutput_db-len',
342 'Data' => $s});
343 $self->element({'Name' => 'BlastOutput_db-let',
344 'Data' => $l});
345 last;
346 } else {
347 chomp;
348 $db .= $_;
349 }
350 }
351 $self->element({'Name' => 'BlastOutput_db',
352 'Data' => $db});
353 } elsif( /^>(\S+)\s*(.*)?/ ) {
354 chomp;
355
356 $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'});
357 $self->in_element('hit') && $self->end_element({ 'Name' => 'Hit'});
358
359 $self->start_element({ 'Name' => 'Hit'});
360 my $id = $1;
361 my $restofline = $2;
362 $self->element({ 'Name' => 'Hit_id',
363 'Data' => $id});
364 my ($acc, $version);
365 if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) {
366 ($acc, $version) = split /\./, $2;
367 } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) {
368 ($acc, $version) = split /\./, $3;
369 } else {
370 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
371 #Database Name Identifier Syntax
372 #============================ ========================
373 #GenBank gb|accession|locus
374 #EMBL Data Library emb|accession|locus
375 #DDBJ, DNA Database of Japan dbj|accession|locus
376 #NBRF PIR pir||entry
377 #Protein Research Foundation prf||name
378 #SWISS-PROT sp|accession|entry name
379 #Brookhaven Protein Data Bank pdb|entry|chain
380 #Patents pat|country|number
381 #GenInfo Backbone Id bbs|number
382 #General database identifier gnl|database|identifier
383 #NCBI Reference Sequence ref|accession|locus
384 #Local Sequence identifier lcl|identifier
385 $acc=$id;
386 }
387 $self->element({ 'Name' => 'Hit_accession',
388 'Data' => $acc});
389
390 my $v = shift @hit_signifs;
391 if( defined $v ) {
392 $self->element({'Name' => 'Hit_signif',
393 'Data' => $v->[0]});
394 $self->element({'Name' => 'Hit_score',
395 'Data' => $v->[1]});
396 }
397 while(defined($_ = $self->_readline()) ) {
398 next if( /^\s+$/ );
399 chomp;
400 if( /Length\s*=\s*([\d,]+)/ ) {
401 my $l = $1;
402 $l =~ s/\,//g;
403 $self->element({ 'Name' => 'Hit_len',
404 'Data' => $l });
405 last;
406 } else {
407 $restofline .= $_;
408 }
409 }
410 $restofline =~ s/\s+/ /g;
411 $self->element({ 'Name' => 'Hit_def',
412 'Data' => $restofline});
413 } elsif( /\s+(Plus|Minus) Strand HSPs:/i ) {
414 next;
415 } elsif( ($self->in_element('hit') ||
416 $self->in_element('hsp')) && # wublast
417 m/Score\s*=\s*(\S+)\s* # Bit score
418 \(([\d\.]+)\s*bits\), # Raw score
419 \s*Expect\s*=\s*([^,\s]+), # E-value
420 \s*(?:Sum)?\s* # SUM
421 P(?:\(\d+\))?\s*=\s*([^,\s]+) # P-value
422 /ox
423 ) {
424 my ($score, $bits,$evalue,$pvalue) = ($1,$2,$3,$4);
425 $evalue =~ s/^e/1e/i;
426 $pvalue =~ s/^e/1e/i;
427 $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'});
428 $self->start_element({'Name' => 'Hsp'});
429 $self->element( { 'Name' => 'Hsp_score',
430 'Data' => $score});
431 $self->element( { 'Name' => 'Hsp_bit-score',
432 'Data' => $bits});
433 $self->element( { 'Name' => 'Hsp_evalue',
434 'Data' => $evalue});
435 $self->element( {'Name' => 'Hsp_pvalue',
436 'Data' => $pvalue});
437 } elsif( ($self->in_element('hit') ||
438 $self->in_element('hsp')) && # ncbi blast
439 m/Score\s*=\s*(\S+)\s*bits\s* # Bit score
440 (?:\((\d+)\))?, # Missing for BLAT pseudo-BLAST fmt
441 \s*Expect(?:\(\d+\))?\s*=\s*(\S+) # E-value
442 /ox) {
443 my ($bits,$score,$evalue) = ($1,$2,$3);
444 $evalue =~ s/^e/1e/i;
445 $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'});
446
447 $self->start_element({'Name' => 'Hsp'});
448 $self->element( { 'Name' => 'Hsp_score',
449 'Data' => $score});
450 $self->element( { 'Name' => 'Hsp_bit-score',
451 'Data' => $bits});
452 $self->element( { 'Name' => 'Hsp_evalue',
453 'Data' => $evalue});
454 } elsif( $self->in_element('hsp') &&
455 m/Identities\s*=\s*(\d+)\s*\/\s*(\d+)\s*[\d\%\(\)]+\s*
456 (?:,\s*Positives\s*=\s*(\d+)\/(\d+)\s*[\d\%\(\)]+\s*)? # pos only valid for Protein alignments
457 (?:\,\s*Gaps\s*=\s*(\d+)\/(\d+))? # Gaps
458 /oxi
459 ) {
460 $self->element( { 'Name' => 'Hsp_identity',
461 'Data' => $1});
462 $self->element( {'Name' => 'Hsp_align-len',
463 'Data' => $2});
464 if( defined $3 ) {
465 $self->element( { 'Name' => 'Hsp_positive',
466 'Data' => $3});
467 } else {
468 $self->element( { 'Name' => 'Hsp_positive',
469 'Data' => $1});
470 }
471 if( defined $6 ) {
472 $self->element( { 'Name' => 'Hsp_gaps',
473 'Data' => $5});
474 }
475
476 $self->{'_Query'} = { 'begin' => 0, 'end' => 0};
477 $self->{'_Sbjct'} = { 'begin' => 0, 'end' => 0};
478
479 if( /(Frame\s*=\s*.+)$/ ) {
480 # handle wu-blast Frame listing on same line
481 $self->_pushback($1);
482 }
483 } elsif( $self->in_element('hsp') &&
484 /Strand\s*=\s*(Plus|Minus)\s*\/\s*(Plus|Minus)/i ) {
485 # consume this event ( we infer strand from start/end)
486 next;
487 } elsif( $self->in_element('hsp') &&
488 /Frame\s*=\s*([\+\-][1-3])\s*(\/\s*([\+\-][1-3]))?/ ){
489 my ($one,$two)= ($1,$2);
490 my ($queryframe,$hitframe);
491 if( $reporttype eq 'TBLASTX' ) {
492 ($queryframe,$hitframe) = ($one,$two);
493 $hitframe =~ s/\/\s*//g;
494 } elsif( $reporttype =~ /^(PSI)?TBLASTN/oi ) {
495 ($hitframe,$queryframe) = ($one,0);
496 } elsif( $reporttype eq 'BLASTX' ) {
497 ($queryframe,$hitframe) = ($one,0);
498 }
499 $self->element({'Name' => 'Hsp_query-frame',
500 'Data' => $queryframe});
501
502 $self->element({'Name' => 'Hsp_hit-frame',
503 'Data' => $hitframe});
504 } elsif( /^Parameters:/ || /^\s+Database:\s+?/ || /^\s+Subset/ ||
505 /^\s+Subset/ || /^\s*Lambda/ || /^\s*Histogram/ ||
506 ( $self->in_element('hsp') && /WARNING|NOTE/ ) ) {
507 $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'});
508 $self->in_element('hit') && $self->end_element({'Name' => 'Hit'});
509 next if /^\s+Subset/;
510 my $blast = ( /^(\s+Database\:)|(\s*Lambda)/ ) ? 'ncbi' : 'wublast';
511 if( /^\s*Histogram/ ) {
512 $blast = 'btk';
513 }
514 my $last = '';
515 # default is that gaps are allowed
516 $self->element({'Name' => 'Parameters_allowgaps',
517 'Data' => 'yes'});
518 while( defined ($_ = $self->_readline ) ) {
519 if( /^(PSI)?([T]?BLAST[NPX])\s*(.+)$/i ||
520 /^(RPS-BLAST)\s*(.+)$/i ||
521 /^(MEGABLAST)\s*(.+)$/i ) {
522 $self->_pushback($_);
523 # let's handle this in the loop
524 last;
525 } elsif( /^Query=/ ) {
526 $self->_pushback($reportline);
527 $self->_pushback($_);
528 # -- Superfluous I think
529 $self->in_element('hsp') &&
530 $self->end_element({'Name' => 'Hsp'});
531 $self->in_element('hit') &&
532 $self->end_element({'Name' => 'Hit'});
533 # --
534 $self->end_element({ 'Name' => 'BlastOutput'});
535 return $self->end_document();
536 }
537
538 # here is where difference between wublast and ncbiblast
539 # is better handled by different logic
540 if( /Number of Sequences:\s+([\d\,]+)/i ||
541 /of sequences in database:\s+([\d,]+)/i) {
542 my $c = $1;
543 $c =~ s/\,//g;
544 $self->element({'Name' => 'Statistics_db-len',
545 'Data' => $c});
546 } elsif ( /letters in database:\s+(\-?[\d,]+)/i) {
547 my $s = $1;
548 $s =~ s/,//g;
549 $self->element({'Name' => 'Statistics_db-let',
550 'Data' => $s});
551 } elsif( $blast eq 'btk' ) {
552 next;
553 } elsif( $blast eq 'wublast' ) {
554 if( /E=(\S+)/ ) {
555 $self->element({'Name' => 'Parameters_expect',
556 'Data' => $1});
557 } elsif( /nogaps/ ) {
558 $self->element({'Name' => 'Parameters_allowgaps',
559 'Data' => 'no'});
560 } elsif( $last =~ /(Frame|Strand)\s+MatID\s+Matrix name/i ){
561 s/^\s+//;
562 #throw away first two slots
563 my @vals = split;
564 splice(@vals, 0,2);
565 my ($matrix,$lambda,$kappa,$entropy) = @vals;
566 $self->element({'Name' => 'Parameters_matrix',
567 'Data' => $matrix});
568 $self->element({'Name' => 'Statistics_lambda',
569 'Data' => $lambda});
570 $self->element({'Name' => 'Statistics_kappa',
571 'Data' => $kappa});
572 $self->element({'Name' => 'Statistics_entropy',
573 'Data' => $entropy});
574 } elsif( m/^\s+Q=(\d+),R=(\d+)\s+/ox ) {
575 $self->element({'Name' => 'Parameters_gap-open',
576 'Data' => $1});
577 $self->element({'Name' => 'Parameters_gap-extend',
578 'Data' => $2});
579 } elsif( /(\S+\s+\S+)\s+DFA:\s+(\S+)\s+\((.+)\)/ ) {
580 if( $1 eq 'states in') {
581 $self->element({'Name' => 'Statistics_DFA_states',
582 'Data' => "$2 $3"});
583 } elsif( $1 eq 'size of') {
584 $self->element({'Name' => 'Statistics_DFA_size',
585 'Data' => "$2 $3"});
586 }
587 } elsif( /^\s+Time to generate neighborhood:\s+(\S+\s+\S+\s+\S+)/ ) {
588 $self->element({'Name' => 'Statistics_neighbortime',
589 'Data' => $1});
590 } elsif( /processors\s+used:\s+(\d+)/ ) {
591 $self->element({'Name' => 'Statistics_noprocessors',
592 'Data' => $1});
593 } elsif( m/^\s+(\S+)\s+cpu\s+time:\s+(\S+\s+\S+\s+\S+)\s+
594 Elapsed:\s+(\S+)/ox ) {
595 my $cputype = lc($1);
596 $self->element({'Name' => "Statistics_$cputype\_cputime",
597 'Data' => $2});
598 $self->element({'Name' => "Statistics_$cputype\_actualtime",
599 'Data' => $3});
600 } elsif( /^\s+Start:/ ) {
601 my ($junk,$start,$stime,$end,$etime) =
602 split(/\s+(Start|End)\:\s+/,$_);
603 chomp($stime);
604 $self->element({'Name' => 'Statistics_starttime',
605 'Data' => $stime});
606 chomp($etime);
607 $self->element({'Name' => 'Statistics_endtime',
608 'Data' => $etime});
609 } elsif( !/^\s+$/ ) {
610 $self->debug( "unmatched stat $_");
611 }
612
613 } elsif ( $blast eq 'ncbi' ) {
614 if( m/^Matrix:\s+(\S+)/oxi ) {
615 $self->element({'Name' => 'Parameters_matrix',
616 'Data' => $1});
617 } elsif( /Lambda/ ) {
618 $_ = $self->_readline;
619 s/^\s+//;
620 my ($lambda, $kappa, $entropy) = split;
621 $self->element({'Name' => 'Statistics_lambda',
622 'Data' => $lambda});
623 $self->element({'Name' => 'Statistics_kappa',
624 'Data' => $kappa});
625 $self->element({'Name' => 'Statistics_entropy',
626 'Data' => $entropy});
627 } elsif( m/effective\s+search\s+space\s+used:\s+(\d+)/ox ) {
628 $self->element({'Name' => 'Statistics_eff-spaceused',
629 'Data' => $1});
630 } elsif( m/effective\s+search\s+space:\s+(\d+)/ox ) {
631 $self->element({'Name' => 'Statistics_eff-space',
632 'Data' => $1});
633 } elsif( m/Gap\s+Penalties:\s+Existence:\s+(\d+)\,
634 \s+Extension:\s+(\d+)/ox) {
635 $self->element({'Name' => 'Parameters_gap-open',
636 'Data' => $1});
637 $self->element({'Name' => 'Parameters_gap-extend',
638 'Data' => $2});
639 } elsif( /effective\s+HSP\s+length:\s+(\d+)/ ) {
640 $self->element({'Name' => 'Statistics_hsp-len',
641 'Data' => $1});
642 } elsif( /effective\s+length\s+of\s+query:\s+([\d\,]+)/ ) {
643 my $c = $1;
644 $c =~ s/\,//g;
645 $self->element({'Name' => 'Statistics_query-len',
646 'Data' => $c});
647 } elsif( m/effective\s+length\s+of\s+database:\s+
648 ([\d\,]+)/ox){
649 my $c = $1;
650 $c =~ s/\,//g;
651 $self->element({'Name' => 'Statistics_eff-dblen',
652 'Data' => $c});
653 } elsif( m/^(T|A|X1|X2|S1|S2):\s+(.+)/ox ) {
654 my $v = $2;
655 chomp($v);
656 $self->element({'Name' => "Statistics_$1",
657 'Data' => $v});
658 } elsif( m/frameshift\s+window\,\s+decay\s+const:\s+
659 (\d+)\,\s+([\.\d]+)/ox ) {
660 $self->element({'Name'=> 'Statistics_framewindow',
661 'Data' => $1});
662 $self->element({'Name'=> 'Statistics_decay',
663 'Data' => $2});
664 } elsif( m/^Number\s+of\s+Hits\s+to\s+DB:\s+(\S+)/ox ) {
665 $self->element({'Name' => 'Statistics_hit_to_db',
666 'Data' => $1});
667 } elsif( m/^Number\s+of\s+extensions:\s+(\S+)/ox ) {
668 $self->element({'Name' => 'Statistics_num_extensions',
669 'Data' => $1});
670 } elsif( m/^Number\s+of\s+successful\s+extensions:\s+
671 (\S+)/ox ) {
672 $self->element({'Name' => 'Statistics_num_suc_extensions',
673 'Data' => $1});
674 } elsif( m/^Number\s+of\s+sequences\s+better\s+than\s+
675 (\S+):\s+(\d+)/ox ) {
676 $self->element({'Name' => 'Parameters_expect',
677 'Data' => $1});
678 $self->element({'Name' => 'Statistics_seqs_better_than_cutoff',
679 'Data' => $2});
680 } elsif( /^\s+Posted\s+date:\s+(.+)/ ) {
681 my $d = $1;
682 chomp($d);
683 $self->element({'Name' => 'Statistics_posted_date',
684 'Data' => $d});
685 } elsif( ! /^\s+$/ ) {
686 $self->debug( "unmatched stat $_");
687 }
688 }
689 $last = $_;
690 }
691 } elsif( $self->in_element('hsp') ) {
692 # let's read 3 lines at a time;
693 my %data = ( 'Query' => '',
694 'Mid' => '',
695 'Hit' => '' );
696 my ($l,$len);
697 for( my $i = 0;
698 defined($_) && $i < 3;
699 $i++ ){
700 chomp;
701 if( ($i == 0 && /^\s+$/) || ($l = /^\s*Lambda/i) ) {
702 $self->_pushback($_) if defined $_;
703 # this fixes bug #1443
704 $self->end_element({'Name' => 'Hsp'});
705 $self->end_element({'Name' => 'Hit'}) if $l;
706 last;
707 }
708 if( /^((Query|Sbjct):\s+(\d+)\s*)(\S+)\s+(\d+)/ ) {
709 $data{$2} = $4;
710 $len = length($1);
711 $self->{"\_$2"}->{'begin'} = $3 unless $self->{"_$2"}->{'begin'};
712 $self->{"\_$2"}->{'end'} = $5;
713 } else {
714 $self->throw("no data for midline $_")
715 unless (defined $_ && defined $len);
716 $data{'Mid'} = substr($_,$len);
717 }
718 $_ = $self->_readline();
719 }
720 $self->characters({'Name' => 'Hsp_qseq',
721 'Data' => $data{'Query'} });
722 $self->characters({'Name' => 'Hsp_hseq',
723 'Data' => $data{'Sbjct'}});
724 $self->characters({'Name' => 'Hsp_midline',
725 'Data' => $data{'Mid'} });
726 } else {
727 $self->debug( "unrecognized line $_");
728 }
729 }
730
731 if( $seentop ) {
732 # double back check that hits and hsps are closed
733 # this in response to bug #1443 (may be uncessary due to fix
734 # above, but making double sure)
735 $self->in_element('hsp') &&
736 $self->end_element({'Name' => 'Hsp'});
737 $self->in_element('hit') &&
738 $self->end_element({'Name' => 'Hit'});
739 $self->end_element({'Name' => 'BlastOutput'});
740 }
741 # $self->end_element({'Name' => 'BlastOutput'}) unless ! $seentop;
742 return $self->end_document();
743 }
744
745 =head2 start_element
746
747 Title : start_element
748 Usage : $eventgenerator->start_element
749 Function: Handles a start element event
750 Returns : none
751 Args : hashref with at least 2 keys 'Data' and 'Name'
752
753
754 =cut
755
756 sub start_element{
757 my ($self,$data) = @_;
758 # we currently don't care about attributes
759 my $nm = $data->{'Name'};
760 my $type = $MODEMAP{$nm};
761 if( $type ) {
762 if( $self->_eventHandler->will_handle($type) ) {
763 my $func = sprintf("start_%s",lc $type);
764 $self->_eventHandler->$func($data->{'Attributes'});
765 }
766 unshift @{$self->{'_elements'}}, $type;
767 if( $type eq 'result') {
768 $self->{'_values'} = {};
769 $self->{'_result'}= undef;
770 } else {
771 # cleanup some things
772 if( defined $self->{'_values'} ) {
773 foreach my $k ( grep { /^\U$type\-/ }
774 keys %{$self->{'_values'}} ) {
775 delete $self->{'_values'}->{$k};
776 }
777 }
778 }
779 }
780 }
781
782 =head2 end_element
783
784 Title : start_element
785 Usage : $eventgenerator->end_element
786 Function: Handles an end element event
787 Returns : none
788 Args : hashref with at least 2 keys 'Data' and 'Name'
789
790
791 =cut
792
793 sub end_element {
794 my ($self,$data) = @_;
795 my $nm = $data->{'Name'};
796 my $type = $MODEMAP{$nm};
797 my $rc;
798 if($nm eq 'BlastOutput_program' &&
799 $self->{'_last_data'} =~ /(t?blast[npx])/i ) {
800 $self->{'_reporttype'} = uc $1;
801 }
802
803 # Hsp are sort of weird, in that they end when another
804 # object begins so have to detect this in end_element for now
805 if( $nm eq 'Hsp' ) {
806 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
807 $self->element({'Name' => $_,
808 'Data' => $self->{'_last_hspdata'}->{$_}});
809 }
810 $self->{'_last_hspdata'} = {};
811 $self->element({'Name' => 'Hsp_query-from',
812 'Data' => $self->{'_Query'}->{'begin'}});
813 $self->element({'Name' => 'Hsp_query-to',
814 'Data' => $self->{'_Query'}->{'end'}});
815
816 $self->element({'Name' => 'Hsp_hit-from',
817 'Data' => $self->{'_Sbjct'}->{'begin'}});
818 $self->element({'Name' => 'Hsp_hit-to',
819 'Data' => $self->{'_Sbjct'}->{'end'}});
820 }
821 if( $type = $MODEMAP{$nm} ) {
822 if( $self->_eventHandler->will_handle($type) ) {
823 my $func = sprintf("end_%s",lc $type);
824 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
825 $self->{'_values'});
826 }
827 shift @{$self->{'_elements'}};
828
829 } elsif( $MAPPING{$nm} ) {
830
831 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
832 # this is where we shove in the data from the
833 # hashref info about params or statistics
834 my $key = (keys %{$MAPPING{$nm}})[0];
835 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
836 } else {
837 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
838 }
839 } else {
840 $self->debug( "unknown nm $nm, ignoring\n");
841 }
842 $self->{'_last_data'} = ''; # remove read data if we are at
843 # end of an element
844 $self->{'_result'} = $rc if( defined $type && $type eq 'result' );
845 return $rc;
846
847 }
848
849 =head2 element
850
851 Title : element
852 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
853 Function: Convience method that calls start_element, characters, end_element
854 Returns : none
855 Args : Hash ref with the keys 'Name' and 'Data'
856
857
858 =cut
859
860 sub element{
861 my ($self,$data) = @_;
862 $self->start_element($data);
863 $self->characters($data);
864 $self->end_element($data);
865 }
866
867 =head2 characters
868
869 Title : characters
870 Usage : $eventgenerator->characters($str)
871 Function: Send a character events
872 Returns : none
873 Args : string
874
875
876 =cut
877
878 sub characters{
879 my ($self,$data) = @_;
880 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );
881
882 if( $self->in_element('hsp') &&
883 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
884 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
885 }
886 $self->{'_last_data'} = $data->{'Data'};
887 }
888
889 =head2 within_element
890
891 Title : within_element
892 Usage : if( $eventgenerator->within_element($element) ) {}
893 Function: Test if we are within a particular element
894 This is different than 'in' because within can be tested
895 for a whole block.
896 Returns : boolean
897 Args : string element name
898
899
900 =cut
901
902 sub within_element{
903 my ($self,$name) = @_;
904 return 0 if ( ! defined $name &&
905 ! defined $self->{'_elements'} ||
906 scalar @{$self->{'_elements'}} == 0) ;
907 foreach ( @{$self->{'_elements'}} ) {
908 if( $_ eq $name ) {
909 return 1;
910 }
911 }
912 return 0;
913 }
914
915
916 =head2 in_element
917
918 Title : in_element
919 Usage : if( $eventgenerator->in_element($element) ) {}
920 Function: Test if we are in a particular element
921 This is different than 'in' because within can be tested
922 for a whole block.
923 Returns : boolean
924 Args : string element name
925
926
927 =cut
928
929 sub in_element{
930 my ($self,$name) = @_;
931 return 0 if ! defined $self->{'_elements'}->[0];
932 return ( $self->{'_elements'}->[0] eq $name)
933 }
934
935 =head2 start_document
936
937 Title : start_document
938 Usage : $eventgenerator->start_document
939 Function: Handle a start document event
940 Returns : none
941 Args : none
942
943
944 =cut
945
946 sub start_document{
947 my ($self) = @_;
948 $self->{'_lasttype'} = '';
949 $self->{'_values'} = {};
950 $self->{'_result'}= undef;
951 $self->{'_elements'} = [];
952 }
953
954 =head2 end_document
955
956 Title : end_document
957 Usage : $eventgenerator->end_document
958 Function: Handles an end document event
959 Returns : Bio::Search::Result::ResultI object
960 Args : none
961
962
963 =cut
964
965 sub end_document{
966 my ($self,@args) = @_;
967 return $self->{'_result'};
968 }
969
970
971 sub write_result {
972 my ($self, $blast, @args) = @_;
973
974 if( not defined($self->writer) ) {
975 $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS");
976 $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() );
977 }
978 $self->SUPER::write_result( $blast, @args );
979 }
980
981 sub result_count {
982 my $self = shift;
983 return $self->{'_result_count'};
984 }
985
986 sub report_count { shift->result_count }
987
988 1;