Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SearchIO/blast.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: blast.pm,v 1.42.2.14 2003/09/15 16:19:01 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SearchIO::blast | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SearchIO::blast - Event generator for event based parsing of blast reports | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # Do not use this object directly - it is used as part of the | |
20 # Bio::SearchIO system. | |
21 | |
22 use Bio::SearchIO; | |
23 my $searchio = new Bio::SearchIO(-format => 'blast', | |
24 -file => 't/data/ecolitst.bls'); | |
25 while( my $result = $searchio->next_result ) { | |
26 while( my $hit = $result->next_hit ) { | |
27 while( my $hsp = $hit->next_hsp ) { | |
28 # ... | |
29 } | |
30 } | |
31 } | |
32 | |
33 =head1 DESCRIPTION | |
34 | |
35 This object encapsulated the necessary methods for generating events | |
36 suitable for building Bio::Search objects from a BLAST report file. | |
37 Read the L<Bio::SearchIO> for more information about how to use this. | |
38 | |
39 =head1 FEEDBACK | |
40 | |
41 =head2 Mailing Lists | |
42 | |
43 User feedback is an integral part of the evolution of this and other | |
44 Bioperl modules. Send your comments and suggestions preferably to | |
45 the Bioperl mailing list. Your participation is much appreciated. | |
46 | |
47 bioperl-l@bioperl.org - General discussion | |
48 http://bioperl.org/MailList.shtml - About the mailing lists | |
49 | |
50 =head2 Reporting Bugs | |
51 | |
52 Report bugs to the Bioperl bug tracking system to help us keep track | |
53 of the bugs and their resolution. Bug reports can be submitted via | |
54 email or the web: | |
55 | |
56 bioperl-bugs@bioperl.org | |
57 http://bugzilla.bioperl.org/ | |
58 | |
59 =head1 AUTHOR - Jason Stajich | |
60 | |
61 Email jason@bioperl.org | |
62 | |
63 Describe contact details here | |
64 | |
65 =head1 CONTRIBUTORS | |
66 | |
67 Additional contributors names and emails here | |
68 | |
69 =head1 APPENDIX | |
70 | |
71 The rest of the documentation details each of the object methods. | |
72 Internal methods are usually preceded with a _ | |
73 | |
74 =cut | |
75 | |
76 | |
77 # Let the code begin... | |
78 | |
79 | |
80 | |
81 package Bio::SearchIO::blast; | |
82 use strict; | |
83 use vars qw(@ISA %MAPPING %MODEMAP $DEFAULT_BLAST_WRITER_CLASS); | |
84 use Bio::SearchIO; | |
85 | |
86 @ISA = qw(Bio::SearchIO ); | |
87 | |
88 BEGIN { | |
89 # mapping of NCBI Blast terms to Bioperl hash keys | |
90 %MODEMAP = ('BlastOutput' => 'result', | |
91 'Hit' => 'hit', | |
92 'Hsp' => 'hsp' | |
93 ); | |
94 | |
95 # This should really be done more intelligently, like with | |
96 # XSLT | |
97 | |
98 %MAPPING = | |
99 ( | |
100 'Hsp_bit-score' => 'HSP-bits', | |
101 'Hsp_score' => 'HSP-score', | |
102 'Hsp_evalue' => 'HSP-evalue', | |
103 'Hsp_pvalue' => 'HSP-pvalue', | |
104 'Hsp_query-from' => 'HSP-query_start', | |
105 'Hsp_query-to' => 'HSP-query_end', | |
106 'Hsp_hit-from' => 'HSP-hit_start', | |
107 'Hsp_hit-to' => 'HSP-hit_end', | |
108 'Hsp_positive' => 'HSP-conserved', | |
109 'Hsp_identity' => 'HSP-identical', | |
110 'Hsp_gaps' => 'HSP-hsp_gaps', | |
111 'Hsp_hitgaps' => 'HSP-hit_gaps', | |
112 'Hsp_querygaps' => 'HSP-query_gaps', | |
113 'Hsp_qseq' => 'HSP-query_seq', | |
114 'Hsp_hseq' => 'HSP-hit_seq', | |
115 'Hsp_midline' => 'HSP-homology_seq', | |
116 'Hsp_align-len' => 'HSP-hsp_length', | |
117 'Hsp_query-frame'=> 'HSP-query_frame', | |
118 'Hsp_hit-frame' => 'HSP-hit_frame', | |
119 | |
120 'Hit_id' => 'HIT-name', | |
121 'Hit_len' => 'HIT-length', | |
122 'Hit_accession' => 'HIT-accession', | |
123 'Hit_def' => 'HIT-description', | |
124 'Hit_signif' => 'HIT-significance', | |
125 'Hit_score' => 'HIT-score', | |
126 'Iteration_iter-num' => 'HIT-iteration', | |
127 | |
128 'BlastOutput_program' => 'RESULT-algorithm_name', | |
129 'BlastOutput_version' => 'RESULT-algorithm_version', | |
130 'BlastOutput_query-def'=> 'RESULT-query_name', | |
131 'BlastOutput_query-len'=> 'RESULT-query_length', | |
132 'BlastOutput_query-acc'=> 'RESULT-query_accession', | |
133 'BlastOutput_querydesc'=> 'RESULT-query_description', | |
134 'BlastOutput_db' => 'RESULT-database_name', | |
135 'BlastOutput_db-len' => 'RESULT-database_entries', | |
136 'BlastOutput_db-let' => 'RESULT-database_letters', | |
137 | |
138 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix'}, | |
139 'Parameters_expect' => { 'RESULT-parameters' => 'expect'}, | |
140 'Parameters_include' => { 'RESULT-parameters' => 'include'}, | |
141 'Parameters_sc-match' => { 'RESULT-parameters' => 'match'}, | |
142 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'}, | |
143 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen'}, | |
144 'Parameters_gap-extend'=> { 'RESULT-parameters' => 'gapext'}, | |
145 'Parameters_filter' => {'RESULT-parameters' => 'filter'}, | |
146 'Parameters_allowgaps' => { 'RESULT-parameters' => 'allowgaps'}, | |
147 | |
148 'Statistics_db-len' => {'RESULT-statistics' => 'dbentries'}, | |
149 'Statistics_db-let' => { 'RESULT-statistics' => 'dbletters'}, | |
150 'Statistics_hsp-len' => { 'RESULT-statistics' => 'effective_hsplength'}, | |
151 'Statistics_query-len' => { 'RESULT-statistics' => 'querylength'}, | |
152 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'}, | |
153 'Statistics_eff-spaceused' => { 'RESULT-statistics' => 'effectivespaceused'}, | |
154 'Statistics_eff-dblen' => { 'RESULT-statistics' => 'effectivedblength'}, | |
155 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' }, | |
156 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' }, | |
157 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy'}, | |
158 'Statistics_framewindow'=> { 'RESULT-statistics' => 'frameshiftwindow'}, | |
159 'Statistics_decay'=> { 'RESULT-statistics' => 'decayconst'}, | |
160 | |
161 'Statistics_T'=> { 'RESULT-statistics' => 'T'}, | |
162 'Statistics_A'=> { 'RESULT-statistics' => 'A'}, | |
163 'Statistics_X1'=> { 'RESULT-statistics' => 'X1'}, | |
164 'Statistics_X2'=> { 'RESULT-statistics' => 'X2'}, | |
165 'Statistics_S1'=> { 'RESULT-statistics' => 'S1'}, | |
166 'Statistics_S2'=> { 'RESULT-statistics' => 'S2'}, | |
167 'Statistics_hit_to_db' => { 'RESULT-statistics' => 'Hits_to_DB'}, | |
168 'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'}, | |
169 'Statistics_num_extensions' => { 'RESULT-statistics' => 'num_extensions'}, | |
170 'Statistics_num_suc_extensions' => { 'RESULT-statistics' => 'num_successful_extensions'}, | |
171 'Statistics_seqs_better_than_cutoff' => { 'RESULT-statistics' => 'seqs_better_than_cutoff'}, | |
172 'Statistics_posted_date' => { 'RESULT-statistics' => 'posted_date'}, | |
173 | |
174 # WU-BLAST stats | |
175 'Statistics_DFA_states'=> { 'RESULT-statistics' => 'num_dfa_states'}, | |
176 'Statistics_DFA_size'=> { 'RESULT-statistics' => 'dfa_size'}, | |
177 | |
178 'Statistics_search_cputime' => { 'RESULT-statistics' => 'search_cputime'}, | |
179 'Statistics_total_cputime' => { 'RESULT-statistics' => 'total_cputime'}, | |
180 'Statistics_search_actualtime' => { 'RESULT-statistics' => 'search_actualtime'}, | |
181 'Statistics_total_actualtime' => { 'RESULT-statistics' => 'total_actualtime'}, | |
182 | |
183 'Statistics_noprocessors' => { 'RESULT-statistics' => 'no_of_processors'}, | |
184 'Statistics_neighbortime' => { 'RESULT-statistics' => 'neighborhood_generate_time'}, | |
185 'Statistics_starttime' => { 'RESULT-statistics' => 'start_time'}, | |
186 'Statistics_endtime' => { 'RESULT-statistics' => 'end_time'}, | |
187 ); | |
188 | |
189 $DEFAULT_BLAST_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter'; | |
190 } | |
191 | |
192 | |
193 =head2 new | |
194 | |
195 Title : new | |
196 Usage : my $obj = new Bio::SearchIO::blast(); | |
197 Function: Builds a new Bio::SearchIO::blast object | |
198 Returns : Bio::SearchIO::blast | |
199 Args : -fh/-file => filehandle/filename to BLAST file | |
200 -format => 'blast' | |
201 | |
202 =cut | |
203 | |
204 =head2 next_result | |
205 | |
206 Title : next_result | |
207 Usage : my $hit = $searchio->next_result; | |
208 Function: Returns the next Result from a search | |
209 Returns : Bio::Search::Result::ResultI object | |
210 Args : none | |
211 | |
212 =cut | |
213 | |
214 sub next_result{ | |
215 my ($self) = @_; | |
216 | |
217 my $data = ''; | |
218 my $seentop = 0; | |
219 my ($reporttype,$seenquery,$reportline); | |
220 $self->start_document(); | |
221 my @hit_signifs; | |
222 | |
223 while( defined ($_ = $self->_readline )) { | |
224 next if( /^\s+$/); # skip empty lines | |
225 next if( /CPU time:/); | |
226 next if( /^>\s*$/); | |
227 | |
228 if( /^([T]?BLAST[NPX])\s*(.+)$/i || | |
229 /^(PSITBLASTN)\s+(.+)$/i || | |
230 /^(RPS-BLAST)\s*(.+)$/i || | |
231 /^(MEGABLAST)\s*(.+)$/i | |
232 ) { | |
233 if( $seentop ) { | |
234 $self->_pushback($_); | |
235 $self->in_element('hsp') && | |
236 $self->end_element({ 'Name' => 'Hsp'}); | |
237 $self->in_element('hit') && | |
238 $self->end_element({ 'Name' => 'Hit'}); | |
239 $self->end_element({ 'Name' => 'BlastOutput'}); | |
240 return $self->end_document(); | |
241 } | |
242 $self->start_element({ 'Name' => 'BlastOutput' } ); | |
243 $self->{'_result_count'}++; | |
244 $seentop = 1; | |
245 $reporttype = $1; | |
246 $reportline = $_; # to fix the fact that RPS-BLAST output is wrong | |
247 $self->element({ 'Name' => 'BlastOutput_program', | |
248 'Data' => $reporttype}); | |
249 | |
250 $self->element({ 'Name' => 'BlastOutput_version', | |
251 'Data' => $2}); | |
252 } elsif ( /^Query=\s*(.+)$/ ) { | |
253 my $q = $1; | |
254 my $size = 0; | |
255 if( defined $seenquery ) { | |
256 $self->_pushback($reportline); | |
257 $self->_pushback($_); | |
258 $self->in_element('hsp') && | |
259 $self->end_element({'Name'=> 'Hsp'}); | |
260 $self->in_element('hit') && | |
261 $self->end_element({'Name'=> 'Hit'}); | |
262 $self->end_element({'Name' => 'BlastOutput'}); | |
263 return $self->end_document(); | |
264 } else { | |
265 if( ! defined $reporttype ) { | |
266 $self->start_element({'Name' => 'BlastOutput'}); | |
267 $seentop = 1; | |
268 $self->{'_result_count'}++; | |
269 } | |
270 } | |
271 $seenquery = $q; | |
272 $_ = $self->_readline; | |
273 while( defined ($_) ) { | |
274 if( /^Database:/ ) { | |
275 $self->_pushback($_); | |
276 last; | |
277 } | |
278 chomp; | |
279 if( /\((\-?[\d,]+)\s+letters.*\)/ ) { | |
280 $size = $1; | |
281 $size =~ s/,//g; | |
282 last; | |
283 } else { | |
284 $q .= " $_"; | |
285 $q =~ s/ +/ /g; | |
286 $q =~ s/^ | $//g; | |
287 } | |
288 | |
289 $_ = $self->_readline; | |
290 } | |
291 chomp($q); | |
292 my ($nm,$desc) = split(/\s+/,$q,2); | |
293 $self->element({ 'Name' => 'BlastOutput_query-def', | |
294 'Data' => $nm}); | |
295 $self->element({ 'Name' => 'BlastOutput_query-len', | |
296 'Data' => $size}); | |
297 defined $desc && $desc =~ s/\s+$//; | |
298 $self->element({ 'Name' => 'BlastOutput_querydesc', | |
299 'Data' => $desc}); | |
300 | |
301 if( my @pieces = split(/\|/,$nm) ) { | |
302 my $acc = pop @pieces; | |
303 $acc = pop @pieces if( ! defined $acc || $acc =~ /^\s+$/); | |
304 $self->element({ 'Name' => 'BlastOutput_query-acc', | |
305 'Data' => $acc}); | |
306 } | |
307 | |
308 } elsif( /Sequences producing significant alignments:/ ) { | |
309 descline: | |
310 while( defined ($_ = $self->_readline() )) { | |
311 if( /^>/ ) { | |
312 $self->_pushback($_); | |
313 last descline; | |
314 } elsif( /(\d+)\s+([\d\.\-eE]+)(\s+\d+)?\s*$/) { | |
315 # the last match is for gapped BLAST output | |
316 # which will report the number of HSPs for the Hit | |
317 my ($score, $evalue) = ($1, $2); | |
318 # Some data clean-up so e-value will appear numeric to perl | |
319 $evalue =~ s/^e/1e/i; | |
320 push @hit_signifs, [ $evalue, $score ]; | |
321 } | |
322 } | |
323 } elsif( /Sequences producing High-scoring Segment Pairs:/ ) { | |
324 # skip the next line | |
325 $_ = $self->_readline(); | |
326 | |
327 while( defined ($_ = $self->_readline() ) && | |
328 ! /^\s+$/ ) { | |
329 my @line = split; | |
330 pop @line; # throw away first number which is for 'N'col | |
331 push @hit_signifs, [ pop @line, pop @line]; | |
332 } | |
333 } elsif ( /^Database:\s*(.+)$/ ) { | |
334 my $db = $1; | |
335 | |
336 while( defined($_ = $self->_readline) ) { | |
337 if( /^\s+(\-?[\d\,]+)\s+sequences\;\s+(\-?[\d,]+)\s+total\s+letters/){ | |
338 my ($s,$l) = ($1,$2); | |
339 $s =~ s/,//g; | |
340 $l =~ s/,//g; | |
341 $self->element({'Name' => 'BlastOutput_db-len', | |
342 'Data' => $s}); | |
343 $self->element({'Name' => 'BlastOutput_db-let', | |
344 'Data' => $l}); | |
345 last; | |
346 } else { | |
347 chomp; | |
348 $db .= $_; | |
349 } | |
350 } | |
351 $self->element({'Name' => 'BlastOutput_db', | |
352 'Data' => $db}); | |
353 } elsif( /^>(\S+)\s*(.*)?/ ) { | |
354 chomp; | |
355 | |
356 $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'}); | |
357 $self->in_element('hit') && $self->end_element({ 'Name' => 'Hit'}); | |
358 | |
359 $self->start_element({ 'Name' => 'Hit'}); | |
360 my $id = $1; | |
361 my $restofline = $2; | |
362 $self->element({ 'Name' => 'Hit_id', | |
363 'Data' => $id}); | |
364 my ($acc, $version); | |
365 if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) { | |
366 ($acc, $version) = split /\./, $2; | |
367 } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) { | |
368 ($acc, $version) = split /\./, $3; | |
369 } else { | |
370 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README | |
371 #Database Name Identifier Syntax | |
372 #============================ ======================== | |
373 #GenBank gb|accession|locus | |
374 #EMBL Data Library emb|accession|locus | |
375 #DDBJ, DNA Database of Japan dbj|accession|locus | |
376 #NBRF PIR pir||entry | |
377 #Protein Research Foundation prf||name | |
378 #SWISS-PROT sp|accession|entry name | |
379 #Brookhaven Protein Data Bank pdb|entry|chain | |
380 #Patents pat|country|number | |
381 #GenInfo Backbone Id bbs|number | |
382 #General database identifier gnl|database|identifier | |
383 #NCBI Reference Sequence ref|accession|locus | |
384 #Local Sequence identifier lcl|identifier | |
385 $acc=$id; | |
386 } | |
387 $self->element({ 'Name' => 'Hit_accession', | |
388 'Data' => $acc}); | |
389 | |
390 my $v = shift @hit_signifs; | |
391 if( defined $v ) { | |
392 $self->element({'Name' => 'Hit_signif', | |
393 'Data' => $v->[0]}); | |
394 $self->element({'Name' => 'Hit_score', | |
395 'Data' => $v->[1]}); | |
396 } | |
397 while(defined($_ = $self->_readline()) ) { | |
398 next if( /^\s+$/ ); | |
399 chomp; | |
400 if( /Length\s*=\s*([\d,]+)/ ) { | |
401 my $l = $1; | |
402 $l =~ s/\,//g; | |
403 $self->element({ 'Name' => 'Hit_len', | |
404 'Data' => $l }); | |
405 last; | |
406 } else { | |
407 $restofline .= $_; | |
408 } | |
409 } | |
410 $restofline =~ s/\s+/ /g; | |
411 $self->element({ 'Name' => 'Hit_def', | |
412 'Data' => $restofline}); | |
413 } elsif( /\s+(Plus|Minus) Strand HSPs:/i ) { | |
414 next; | |
415 } elsif( ($self->in_element('hit') || | |
416 $self->in_element('hsp')) && # wublast | |
417 m/Score\s*=\s*(\S+)\s* # Bit score | |
418 \(([\d\.]+)\s*bits\), # Raw score | |
419 \s*Expect\s*=\s*([^,\s]+), # E-value | |
420 \s*(?:Sum)?\s* # SUM | |
421 P(?:\(\d+\))?\s*=\s*([^,\s]+) # P-value | |
422 /ox | |
423 ) { | |
424 my ($score, $bits,$evalue,$pvalue) = ($1,$2,$3,$4); | |
425 $evalue =~ s/^e/1e/i; | |
426 $pvalue =~ s/^e/1e/i; | |
427 $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'}); | |
428 $self->start_element({'Name' => 'Hsp'}); | |
429 $self->element( { 'Name' => 'Hsp_score', | |
430 'Data' => $score}); | |
431 $self->element( { 'Name' => 'Hsp_bit-score', | |
432 'Data' => $bits}); | |
433 $self->element( { 'Name' => 'Hsp_evalue', | |
434 'Data' => $evalue}); | |
435 $self->element( {'Name' => 'Hsp_pvalue', | |
436 'Data' => $pvalue}); | |
437 } elsif( ($self->in_element('hit') || | |
438 $self->in_element('hsp')) && # ncbi blast | |
439 m/Score\s*=\s*(\S+)\s*bits\s* # Bit score | |
440 (?:\((\d+)\))?, # Missing for BLAT pseudo-BLAST fmt | |
441 \s*Expect(?:\(\d+\))?\s*=\s*(\S+) # E-value | |
442 /ox) { | |
443 my ($bits,$score,$evalue) = ($1,$2,$3); | |
444 $evalue =~ s/^e/1e/i; | |
445 $self->in_element('hsp') && $self->end_element({ 'Name' => 'Hsp'}); | |
446 | |
447 $self->start_element({'Name' => 'Hsp'}); | |
448 $self->element( { 'Name' => 'Hsp_score', | |
449 'Data' => $score}); | |
450 $self->element( { 'Name' => 'Hsp_bit-score', | |
451 'Data' => $bits}); | |
452 $self->element( { 'Name' => 'Hsp_evalue', | |
453 'Data' => $evalue}); | |
454 } elsif( $self->in_element('hsp') && | |
455 m/Identities\s*=\s*(\d+)\s*\/\s*(\d+)\s*[\d\%\(\)]+\s* | |
456 (?:,\s*Positives\s*=\s*(\d+)\/(\d+)\s*[\d\%\(\)]+\s*)? # pos only valid for Protein alignments | |
457 (?:\,\s*Gaps\s*=\s*(\d+)\/(\d+))? # Gaps | |
458 /oxi | |
459 ) { | |
460 $self->element( { 'Name' => 'Hsp_identity', | |
461 'Data' => $1}); | |
462 $self->element( {'Name' => 'Hsp_align-len', | |
463 'Data' => $2}); | |
464 if( defined $3 ) { | |
465 $self->element( { 'Name' => 'Hsp_positive', | |
466 'Data' => $3}); | |
467 } else { | |
468 $self->element( { 'Name' => 'Hsp_positive', | |
469 'Data' => $1}); | |
470 } | |
471 if( defined $6 ) { | |
472 $self->element( { 'Name' => 'Hsp_gaps', | |
473 'Data' => $5}); | |
474 } | |
475 | |
476 $self->{'_Query'} = { 'begin' => 0, 'end' => 0}; | |
477 $self->{'_Sbjct'} = { 'begin' => 0, 'end' => 0}; | |
478 | |
479 if( /(Frame\s*=\s*.+)$/ ) { | |
480 # handle wu-blast Frame listing on same line | |
481 $self->_pushback($1); | |
482 } | |
483 } elsif( $self->in_element('hsp') && | |
484 /Strand\s*=\s*(Plus|Minus)\s*\/\s*(Plus|Minus)/i ) { | |
485 # consume this event ( we infer strand from start/end) | |
486 next; | |
487 } elsif( $self->in_element('hsp') && | |
488 /Frame\s*=\s*([\+\-][1-3])\s*(\/\s*([\+\-][1-3]))?/ ){ | |
489 my ($one,$two)= ($1,$2); | |
490 my ($queryframe,$hitframe); | |
491 if( $reporttype eq 'TBLASTX' ) { | |
492 ($queryframe,$hitframe) = ($one,$two); | |
493 $hitframe =~ s/\/\s*//g; | |
494 } elsif( $reporttype =~ /^(PSI)?TBLASTN/oi ) { | |
495 ($hitframe,$queryframe) = ($one,0); | |
496 } elsif( $reporttype eq 'BLASTX' ) { | |
497 ($queryframe,$hitframe) = ($one,0); | |
498 } | |
499 $self->element({'Name' => 'Hsp_query-frame', | |
500 'Data' => $queryframe}); | |
501 | |
502 $self->element({'Name' => 'Hsp_hit-frame', | |
503 'Data' => $hitframe}); | |
504 } elsif( /^Parameters:/ || /^\s+Database:\s+?/ || /^\s+Subset/ || | |
505 /^\s+Subset/ || /^\s*Lambda/ || /^\s*Histogram/ || | |
506 ( $self->in_element('hsp') && /WARNING|NOTE/ ) ) { | |
507 $self->in_element('hsp') && $self->end_element({'Name' => 'Hsp'}); | |
508 $self->in_element('hit') && $self->end_element({'Name' => 'Hit'}); | |
509 next if /^\s+Subset/; | |
510 my $blast = ( /^(\s+Database\:)|(\s*Lambda)/ ) ? 'ncbi' : 'wublast'; | |
511 if( /^\s*Histogram/ ) { | |
512 $blast = 'btk'; | |
513 } | |
514 my $last = ''; | |
515 # default is that gaps are allowed | |
516 $self->element({'Name' => 'Parameters_allowgaps', | |
517 'Data' => 'yes'}); | |
518 while( defined ($_ = $self->_readline ) ) { | |
519 if( /^(PSI)?([T]?BLAST[NPX])\s*(.+)$/i || | |
520 /^(RPS-BLAST)\s*(.+)$/i || | |
521 /^(MEGABLAST)\s*(.+)$/i ) { | |
522 $self->_pushback($_); | |
523 # let's handle this in the loop | |
524 last; | |
525 } elsif( /^Query=/ ) { | |
526 $self->_pushback($reportline); | |
527 $self->_pushback($_); | |
528 # -- Superfluous I think | |
529 $self->in_element('hsp') && | |
530 $self->end_element({'Name' => 'Hsp'}); | |
531 $self->in_element('hit') && | |
532 $self->end_element({'Name' => 'Hit'}); | |
533 # -- | |
534 $self->end_element({ 'Name' => 'BlastOutput'}); | |
535 return $self->end_document(); | |
536 } | |
537 | |
538 # here is where difference between wublast and ncbiblast | |
539 # is better handled by different logic | |
540 if( /Number of Sequences:\s+([\d\,]+)/i || | |
541 /of sequences in database:\s+([\d,]+)/i) { | |
542 my $c = $1; | |
543 $c =~ s/\,//g; | |
544 $self->element({'Name' => 'Statistics_db-len', | |
545 'Data' => $c}); | |
546 } elsif ( /letters in database:\s+(\-?[\d,]+)/i) { | |
547 my $s = $1; | |
548 $s =~ s/,//g; | |
549 $self->element({'Name' => 'Statistics_db-let', | |
550 'Data' => $s}); | |
551 } elsif( $blast eq 'btk' ) { | |
552 next; | |
553 } elsif( $blast eq 'wublast' ) { | |
554 if( /E=(\S+)/ ) { | |
555 $self->element({'Name' => 'Parameters_expect', | |
556 'Data' => $1}); | |
557 } elsif( /nogaps/ ) { | |
558 $self->element({'Name' => 'Parameters_allowgaps', | |
559 'Data' => 'no'}); | |
560 } elsif( $last =~ /(Frame|Strand)\s+MatID\s+Matrix name/i ){ | |
561 s/^\s+//; | |
562 #throw away first two slots | |
563 my @vals = split; | |
564 splice(@vals, 0,2); | |
565 my ($matrix,$lambda,$kappa,$entropy) = @vals; | |
566 $self->element({'Name' => 'Parameters_matrix', | |
567 'Data' => $matrix}); | |
568 $self->element({'Name' => 'Statistics_lambda', | |
569 'Data' => $lambda}); | |
570 $self->element({'Name' => 'Statistics_kappa', | |
571 'Data' => $kappa}); | |
572 $self->element({'Name' => 'Statistics_entropy', | |
573 'Data' => $entropy}); | |
574 } elsif( m/^\s+Q=(\d+),R=(\d+)\s+/ox ) { | |
575 $self->element({'Name' => 'Parameters_gap-open', | |
576 'Data' => $1}); | |
577 $self->element({'Name' => 'Parameters_gap-extend', | |
578 'Data' => $2}); | |
579 } elsif( /(\S+\s+\S+)\s+DFA:\s+(\S+)\s+\((.+)\)/ ) { | |
580 if( $1 eq 'states in') { | |
581 $self->element({'Name' => 'Statistics_DFA_states', | |
582 'Data' => "$2 $3"}); | |
583 } elsif( $1 eq 'size of') { | |
584 $self->element({'Name' => 'Statistics_DFA_size', | |
585 'Data' => "$2 $3"}); | |
586 } | |
587 } elsif( /^\s+Time to generate neighborhood:\s+(\S+\s+\S+\s+\S+)/ ) { | |
588 $self->element({'Name' => 'Statistics_neighbortime', | |
589 'Data' => $1}); | |
590 } elsif( /processors\s+used:\s+(\d+)/ ) { | |
591 $self->element({'Name' => 'Statistics_noprocessors', | |
592 'Data' => $1}); | |
593 } elsif( m/^\s+(\S+)\s+cpu\s+time:\s+(\S+\s+\S+\s+\S+)\s+ | |
594 Elapsed:\s+(\S+)/ox ) { | |
595 my $cputype = lc($1); | |
596 $self->element({'Name' => "Statistics_$cputype\_cputime", | |
597 'Data' => $2}); | |
598 $self->element({'Name' => "Statistics_$cputype\_actualtime", | |
599 'Data' => $3}); | |
600 } elsif( /^\s+Start:/ ) { | |
601 my ($junk,$start,$stime,$end,$etime) = | |
602 split(/\s+(Start|End)\:\s+/,$_); | |
603 chomp($stime); | |
604 $self->element({'Name' => 'Statistics_starttime', | |
605 'Data' => $stime}); | |
606 chomp($etime); | |
607 $self->element({'Name' => 'Statistics_endtime', | |
608 'Data' => $etime}); | |
609 } elsif( !/^\s+$/ ) { | |
610 $self->debug( "unmatched stat $_"); | |
611 } | |
612 | |
613 } elsif ( $blast eq 'ncbi' ) { | |
614 if( m/^Matrix:\s+(\S+)/oxi ) { | |
615 $self->element({'Name' => 'Parameters_matrix', | |
616 'Data' => $1}); | |
617 } elsif( /Lambda/ ) { | |
618 $_ = $self->_readline; | |
619 s/^\s+//; | |
620 my ($lambda, $kappa, $entropy) = split; | |
621 $self->element({'Name' => 'Statistics_lambda', | |
622 'Data' => $lambda}); | |
623 $self->element({'Name' => 'Statistics_kappa', | |
624 'Data' => $kappa}); | |
625 $self->element({'Name' => 'Statistics_entropy', | |
626 'Data' => $entropy}); | |
627 } elsif( m/effective\s+search\s+space\s+used:\s+(\d+)/ox ) { | |
628 $self->element({'Name' => 'Statistics_eff-spaceused', | |
629 'Data' => $1}); | |
630 } elsif( m/effective\s+search\s+space:\s+(\d+)/ox ) { | |
631 $self->element({'Name' => 'Statistics_eff-space', | |
632 'Data' => $1}); | |
633 } elsif( m/Gap\s+Penalties:\s+Existence:\s+(\d+)\, | |
634 \s+Extension:\s+(\d+)/ox) { | |
635 $self->element({'Name' => 'Parameters_gap-open', | |
636 'Data' => $1}); | |
637 $self->element({'Name' => 'Parameters_gap-extend', | |
638 'Data' => $2}); | |
639 } elsif( /effective\s+HSP\s+length:\s+(\d+)/ ) { | |
640 $self->element({'Name' => 'Statistics_hsp-len', | |
641 'Data' => $1}); | |
642 } elsif( /effective\s+length\s+of\s+query:\s+([\d\,]+)/ ) { | |
643 my $c = $1; | |
644 $c =~ s/\,//g; | |
645 $self->element({'Name' => 'Statistics_query-len', | |
646 'Data' => $c}); | |
647 } elsif( m/effective\s+length\s+of\s+database:\s+ | |
648 ([\d\,]+)/ox){ | |
649 my $c = $1; | |
650 $c =~ s/\,//g; | |
651 $self->element({'Name' => 'Statistics_eff-dblen', | |
652 'Data' => $c}); | |
653 } elsif( m/^(T|A|X1|X2|S1|S2):\s+(.+)/ox ) { | |
654 my $v = $2; | |
655 chomp($v); | |
656 $self->element({'Name' => "Statistics_$1", | |
657 'Data' => $v}); | |
658 } elsif( m/frameshift\s+window\,\s+decay\s+const:\s+ | |
659 (\d+)\,\s+([\.\d]+)/ox ) { | |
660 $self->element({'Name'=> 'Statistics_framewindow', | |
661 'Data' => $1}); | |
662 $self->element({'Name'=> 'Statistics_decay', | |
663 'Data' => $2}); | |
664 } elsif( m/^Number\s+of\s+Hits\s+to\s+DB:\s+(\S+)/ox ) { | |
665 $self->element({'Name' => 'Statistics_hit_to_db', | |
666 'Data' => $1}); | |
667 } elsif( m/^Number\s+of\s+extensions:\s+(\S+)/ox ) { | |
668 $self->element({'Name' => 'Statistics_num_extensions', | |
669 'Data' => $1}); | |
670 } elsif( m/^Number\s+of\s+successful\s+extensions:\s+ | |
671 (\S+)/ox ) { | |
672 $self->element({'Name' => 'Statistics_num_suc_extensions', | |
673 'Data' => $1}); | |
674 } elsif( m/^Number\s+of\s+sequences\s+better\s+than\s+ | |
675 (\S+):\s+(\d+)/ox ) { | |
676 $self->element({'Name' => 'Parameters_expect', | |
677 'Data' => $1}); | |
678 $self->element({'Name' => 'Statistics_seqs_better_than_cutoff', | |
679 'Data' => $2}); | |
680 } elsif( /^\s+Posted\s+date:\s+(.+)/ ) { | |
681 my $d = $1; | |
682 chomp($d); | |
683 $self->element({'Name' => 'Statistics_posted_date', | |
684 'Data' => $d}); | |
685 } elsif( ! /^\s+$/ ) { | |
686 $self->debug( "unmatched stat $_"); | |
687 } | |
688 } | |
689 $last = $_; | |
690 } | |
691 } elsif( $self->in_element('hsp') ) { | |
692 # let's read 3 lines at a time; | |
693 my %data = ( 'Query' => '', | |
694 'Mid' => '', | |
695 'Hit' => '' ); | |
696 my ($l,$len); | |
697 for( my $i = 0; | |
698 defined($_) && $i < 3; | |
699 $i++ ){ | |
700 chomp; | |
701 if( ($i == 0 && /^\s+$/) || ($l = /^\s*Lambda/i) ) { | |
702 $self->_pushback($_) if defined $_; | |
703 # this fixes bug #1443 | |
704 $self->end_element({'Name' => 'Hsp'}); | |
705 $self->end_element({'Name' => 'Hit'}) if $l; | |
706 last; | |
707 } | |
708 if( /^((Query|Sbjct):\s+(\d+)\s*)(\S+)\s+(\d+)/ ) { | |
709 $data{$2} = $4; | |
710 $len = length($1); | |
711 $self->{"\_$2"}->{'begin'} = $3 unless $self->{"_$2"}->{'begin'}; | |
712 $self->{"\_$2"}->{'end'} = $5; | |
713 } else { | |
714 $self->throw("no data for midline $_") | |
715 unless (defined $_ && defined $len); | |
716 $data{'Mid'} = substr($_,$len); | |
717 } | |
718 $_ = $self->_readline(); | |
719 } | |
720 $self->characters({'Name' => 'Hsp_qseq', | |
721 'Data' => $data{'Query'} }); | |
722 $self->characters({'Name' => 'Hsp_hseq', | |
723 'Data' => $data{'Sbjct'}}); | |
724 $self->characters({'Name' => 'Hsp_midline', | |
725 'Data' => $data{'Mid'} }); | |
726 } else { | |
727 $self->debug( "unrecognized line $_"); | |
728 } | |
729 } | |
730 | |
731 if( $seentop ) { | |
732 # double back check that hits and hsps are closed | |
733 # this in response to bug #1443 (may be uncessary due to fix | |
734 # above, but making double sure) | |
735 $self->in_element('hsp') && | |
736 $self->end_element({'Name' => 'Hsp'}); | |
737 $self->in_element('hit') && | |
738 $self->end_element({'Name' => 'Hit'}); | |
739 $self->end_element({'Name' => 'BlastOutput'}); | |
740 } | |
741 # $self->end_element({'Name' => 'BlastOutput'}) unless ! $seentop; | |
742 return $self->end_document(); | |
743 } | |
744 | |
745 =head2 start_element | |
746 | |
747 Title : start_element | |
748 Usage : $eventgenerator->start_element | |
749 Function: Handles a start element event | |
750 Returns : none | |
751 Args : hashref with at least 2 keys 'Data' and 'Name' | |
752 | |
753 | |
754 =cut | |
755 | |
756 sub start_element{ | |
757 my ($self,$data) = @_; | |
758 # we currently don't care about attributes | |
759 my $nm = $data->{'Name'}; | |
760 my $type = $MODEMAP{$nm}; | |
761 if( $type ) { | |
762 if( $self->_eventHandler->will_handle($type) ) { | |
763 my $func = sprintf("start_%s",lc $type); | |
764 $self->_eventHandler->$func($data->{'Attributes'}); | |
765 } | |
766 unshift @{$self->{'_elements'}}, $type; | |
767 if( $type eq 'result') { | |
768 $self->{'_values'} = {}; | |
769 $self->{'_result'}= undef; | |
770 } else { | |
771 # cleanup some things | |
772 if( defined $self->{'_values'} ) { | |
773 foreach my $k ( grep { /^\U$type\-/ } | |
774 keys %{$self->{'_values'}} ) { | |
775 delete $self->{'_values'}->{$k}; | |
776 } | |
777 } | |
778 } | |
779 } | |
780 } | |
781 | |
782 =head2 end_element | |
783 | |
784 Title : start_element | |
785 Usage : $eventgenerator->end_element | |
786 Function: Handles an end element event | |
787 Returns : none | |
788 Args : hashref with at least 2 keys 'Data' and 'Name' | |
789 | |
790 | |
791 =cut | |
792 | |
793 sub end_element { | |
794 my ($self,$data) = @_; | |
795 my $nm = $data->{'Name'}; | |
796 my $type = $MODEMAP{$nm}; | |
797 my $rc; | |
798 if($nm eq 'BlastOutput_program' && | |
799 $self->{'_last_data'} =~ /(t?blast[npx])/i ) { | |
800 $self->{'_reporttype'} = uc $1; | |
801 } | |
802 | |
803 # Hsp are sort of weird, in that they end when another | |
804 # object begins so have to detect this in end_element for now | |
805 if( $nm eq 'Hsp' ) { | |
806 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { | |
807 $self->element({'Name' => $_, | |
808 'Data' => $self->{'_last_hspdata'}->{$_}}); | |
809 } | |
810 $self->{'_last_hspdata'} = {}; | |
811 $self->element({'Name' => 'Hsp_query-from', | |
812 'Data' => $self->{'_Query'}->{'begin'}}); | |
813 $self->element({'Name' => 'Hsp_query-to', | |
814 'Data' => $self->{'_Query'}->{'end'}}); | |
815 | |
816 $self->element({'Name' => 'Hsp_hit-from', | |
817 'Data' => $self->{'_Sbjct'}->{'begin'}}); | |
818 $self->element({'Name' => 'Hsp_hit-to', | |
819 'Data' => $self->{'_Sbjct'}->{'end'}}); | |
820 } | |
821 if( $type = $MODEMAP{$nm} ) { | |
822 if( $self->_eventHandler->will_handle($type) ) { | |
823 my $func = sprintf("end_%s",lc $type); | |
824 $rc = $self->_eventHandler->$func($self->{'_reporttype'}, | |
825 $self->{'_values'}); | |
826 } | |
827 shift @{$self->{'_elements'}}; | |
828 | |
829 } elsif( $MAPPING{$nm} ) { | |
830 | |
831 if ( ref($MAPPING{$nm}) =~ /hash/i ) { | |
832 # this is where we shove in the data from the | |
833 # hashref info about params or statistics | |
834 my $key = (keys %{$MAPPING{$nm}})[0]; | |
835 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; | |
836 } else { | |
837 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; | |
838 } | |
839 } else { | |
840 $self->debug( "unknown nm $nm, ignoring\n"); | |
841 } | |
842 $self->{'_last_data'} = ''; # remove read data if we are at | |
843 # end of an element | |
844 $self->{'_result'} = $rc if( defined $type && $type eq 'result' ); | |
845 return $rc; | |
846 | |
847 } | |
848 | |
849 =head2 element | |
850 | |
851 Title : element | |
852 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); | |
853 Function: Convience method that calls start_element, characters, end_element | |
854 Returns : none | |
855 Args : Hash ref with the keys 'Name' and 'Data' | |
856 | |
857 | |
858 =cut | |
859 | |
860 sub element{ | |
861 my ($self,$data) = @_; | |
862 $self->start_element($data); | |
863 $self->characters($data); | |
864 $self->end_element($data); | |
865 } | |
866 | |
867 =head2 characters | |
868 | |
869 Title : characters | |
870 Usage : $eventgenerator->characters($str) | |
871 Function: Send a character events | |
872 Returns : none | |
873 Args : string | |
874 | |
875 | |
876 =cut | |
877 | |
878 sub characters{ | |
879 my ($self,$data) = @_; | |
880 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ ); | |
881 | |
882 if( $self->in_element('hsp') && | |
883 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) { | |
884 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; | |
885 } | |
886 $self->{'_last_data'} = $data->{'Data'}; | |
887 } | |
888 | |
889 =head2 within_element | |
890 | |
891 Title : within_element | |
892 Usage : if( $eventgenerator->within_element($element) ) {} | |
893 Function: Test if we are within a particular element | |
894 This is different than 'in' because within can be tested | |
895 for a whole block. | |
896 Returns : boolean | |
897 Args : string element name | |
898 | |
899 | |
900 =cut | |
901 | |
902 sub within_element{ | |
903 my ($self,$name) = @_; | |
904 return 0 if ( ! defined $name && | |
905 ! defined $self->{'_elements'} || | |
906 scalar @{$self->{'_elements'}} == 0) ; | |
907 foreach ( @{$self->{'_elements'}} ) { | |
908 if( $_ eq $name ) { | |
909 return 1; | |
910 } | |
911 } | |
912 return 0; | |
913 } | |
914 | |
915 | |
916 =head2 in_element | |
917 | |
918 Title : in_element | |
919 Usage : if( $eventgenerator->in_element($element) ) {} | |
920 Function: Test if we are in a particular element | |
921 This is different than 'in' because within can be tested | |
922 for a whole block. | |
923 Returns : boolean | |
924 Args : string element name | |
925 | |
926 | |
927 =cut | |
928 | |
929 sub in_element{ | |
930 my ($self,$name) = @_; | |
931 return 0 if ! defined $self->{'_elements'}->[0]; | |
932 return ( $self->{'_elements'}->[0] eq $name) | |
933 } | |
934 | |
935 =head2 start_document | |
936 | |
937 Title : start_document | |
938 Usage : $eventgenerator->start_document | |
939 Function: Handle a start document event | |
940 Returns : none | |
941 Args : none | |
942 | |
943 | |
944 =cut | |
945 | |
946 sub start_document{ | |
947 my ($self) = @_; | |
948 $self->{'_lasttype'} = ''; | |
949 $self->{'_values'} = {}; | |
950 $self->{'_result'}= undef; | |
951 $self->{'_elements'} = []; | |
952 } | |
953 | |
954 =head2 end_document | |
955 | |
956 Title : end_document | |
957 Usage : $eventgenerator->end_document | |
958 Function: Handles an end document event | |
959 Returns : Bio::Search::Result::ResultI object | |
960 Args : none | |
961 | |
962 | |
963 =cut | |
964 | |
965 sub end_document{ | |
966 my ($self,@args) = @_; | |
967 return $self->{'_result'}; | |
968 } | |
969 | |
970 | |
971 sub write_result { | |
972 my ($self, $blast, @args) = @_; | |
973 | |
974 if( not defined($self->writer) ) { | |
975 $self->warn("Writer not defined. Using a $DEFAULT_BLAST_WRITER_CLASS"); | |
976 $self->writer( $DEFAULT_BLAST_WRITER_CLASS->new() ); | |
977 } | |
978 $self->SUPER::write_result( $blast, @args ); | |
979 } | |
980 | |
981 sub result_count { | |
982 my $self = shift; | |
983 return $self->{'_result_count'}; | |
984 } | |
985 | |
986 sub report_count { shift->result_count } | |
987 | |
988 1; |