0
|
1 # $Id: blastxml.pm,v 1.24 2002/10/26 09:32:16 sac Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SearchIO::blastxml
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing.
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 use Bio::SearchIO;
|
|
20 my $searchin = new Bio::SearchIO(-format => 'blastxml',
|
|
21 -file => 't/data/plague_yeast.bls.xml');
|
|
22 while( my $result = $searchin->next_result ) {
|
|
23 }
|
|
24
|
|
25 # one can also request that the parser NOT keep the XML data in memory
|
|
26 # by using the tempfile initialization flag.
|
|
27 my $searchin = new Bio::SearchIO(-tempfile => 1,
|
|
28 -format => 'blastxml',
|
|
29 -file => 't/data/plague_yeast.bls.xml');
|
|
30 while( my $result = $searchin->next_result ) {
|
|
31 }
|
|
32
|
|
33 =head1 DESCRIPTION
|
|
34
|
|
35 This object implements a NCBI Blast XML parser.
|
|
36
|
|
37 There is one additional initialization flag from the SearchIO defaults
|
|
38 - that is the -tempfile flag. If specified as true, then the parser
|
|
39 will write out each report to a temporary filehandle rather than
|
|
40 holding the entire report as a string in memory. The reason this is
|
|
41 done in the first place is NCBI reports have an unnecessary E<lt>?xml
|
|
42 version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports
|
|
43 have an additional unnecessary RPS-BLAST tag at the top of each report.
|
|
44 So we currently have implemented the work around by preparsing the
|
|
45 file (yes it makes the process slower, but it works).
|
|
46
|
|
47
|
|
48 =head1 FEEDBACK
|
|
49
|
|
50 =head2 Mailing Lists
|
|
51
|
|
52 User feedback is an integral part of the evolution of this and other
|
|
53 Bioperl modules. Send your comments and suggestions preferably to
|
|
54 the Bioperl mailing list. Your participation is much appreciated.
|
|
55
|
|
56 bioperl-l@bioperl.org - General discussion
|
|
57 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
58
|
|
59 =head2 Reporting Bugs
|
|
60
|
|
61 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
62 of the bugs and their resolution. Bug reports can be submitted via
|
|
63 email or the web:
|
|
64
|
|
65 bioperl-bugs@bioperl.org
|
|
66 http://bugzilla.bioperl.org/
|
|
67
|
|
68 =head1 AUTHOR - Jason Stajich
|
|
69
|
|
70 Email jason@bioperl.org
|
|
71
|
|
72 Describe contact details here
|
|
73
|
|
74 =head1 CONTRIBUTORS
|
|
75
|
|
76 Additional contributors names and emails here
|
|
77
|
|
78 =head1 APPENDIX
|
|
79
|
|
80 The rest of the documentation details each of the object methods.
|
|
81 Internal methods are usually preceded with a _
|
|
82
|
|
83 =cut
|
|
84
|
|
85 # Let the code begin...
|
|
86
|
|
87 package Bio::SearchIO::blastxml;
|
|
88 use vars qw(@ISA $DTD %MAPPING %MODEMAP $DEBUG);
|
|
89 use strict;
|
|
90
|
|
91 $DTD = 'ftp://ftp.ncbi.nlm.nih.gov/blast/documents/NCBI_BlastOutput.dtd';
|
|
92 # Object preamble - inherits from Bio::Root::Root
|
|
93
|
|
94 use Bio::Root::Root;
|
|
95 use Bio::SearchIO;
|
|
96 use XML::Parser::PerlSAX;
|
|
97 use XML::Handler::Subs;
|
|
98 use HTML::Entities;
|
|
99 use IO::File;
|
|
100
|
|
101
|
|
BEGIN {
    # XML elements whose start/end tags are turned into start_<type> /
    # end_<type> calls on the event handler (see start_element/end_element).
    %MODEMAP = (
        'BlastOutput' => 'result',
        'Hit'         => 'hit',
        'Hsp'         => 'hsp',
    );

    # Mapping of NCBI Blast XML leaf elements to Bioperl hash keys.
    # A plain string value is used directly as the key in the value hash;
    # a single-pair hash ref { GROUP => NAME } stores the element text
    # under $values->{GROUP}->{NAME}.
    %MAPPING = (
        # HSP specific fields
        'Hsp_bit-score'   => 'HSP-bits',
        'Hsp_score'       => 'HSP-score',
        'Hsp_evalue'      => 'HSP-evalue',
        'Hsp_query-from'  => 'HSP-query_start',
        'Hsp_query-to'    => 'HSP-query_end',
        'Hsp_hit-from'    => 'HSP-hit_start',
        'Hsp_hit-to'      => 'HSP-hit_end',
        'Hsp_positive'    => 'HSP-conserved',
        'Hsp_identity'    => 'HSP-identical',
        'Hsp_gaps'        => 'HSP-gaps',
        'Hsp_hitgaps'     => 'HSP-hit_gaps',
        'Hsp_querygaps'   => 'HSP-query_gaps',
        'Hsp_qseq'        => 'HSP-query_seq',
        'Hsp_hseq'        => 'HSP-hit_seq',
        'Hsp_midline'     => 'HSP-homology_seq',
        'Hsp_align-len'   => 'HSP-hsp_length',
        'Hsp_query-frame' => 'HSP-query_frame',
        'Hsp_hit-frame'   => 'HSP-hit_frame',

        # these are ignored for now
        'Hsp_num'          => 'HSP-order',
        # pattern-from is the start and pattern-to the end of the pattern
        # (the two target keys were previously swapped)
        'Hsp_pattern-from' => 'patternstart',
        'Hsp_pattern-to'   => 'patternend',
        'Hsp_density'      => 'hspdensity',

        # Hit specific fields
        'Hit_id'             => 'HIT-name',
        'Hit_len'            => 'HIT-length',
        'Hit_accession'      => 'HIT-accession',
        'Hit_def'            => 'HIT-description',
        'Hit_num'            => 'HIT-order',
        'Iteration_iter-num' => 'HIT-iteration',
        'Iteration_stat'     => 'HIT-iteration_statistic',

        # Result (whole report) fields
        'BlastOutput_program'   => 'RESULT-algorithm_name',
        'BlastOutput_version'   => 'RESULT-algorithm_version',
        'BlastOutput_query-def' => 'RESULT-query_description',
        'BlastOutput_query-len' => 'RESULT-query_length',
        'BlastOutput_db'        => 'RESULT-database_name',
        'BlastOutput_reference' => 'RESULT-program_reference',
        'BlastOutput_query-ID'  => 'runid',

        # Search parameters
        'Parameters_matrix'      => { 'RESULT-parameters' => 'matrix' },
        'Parameters_expect'      => { 'RESULT-parameters' => 'expect' },
        'Parameters_include'     => { 'RESULT-parameters' => 'include' },
        'Parameters_sc-match'    => { 'RESULT-parameters' => 'match' },
        'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
        'Parameters_gap-open'    => { 'RESULT-parameters' => 'gapopen' },
        'Parameters_gap-extend'  => { 'RESULT-parameters' => 'gapext' },
        'Parameters_filter'      => { 'RESULT-parameters' => 'filter' },

        # Database / search statistics
        'Statistics_db-num'    => 'RESULT-database_entries',
        'Statistics_db-len'    => 'RESULT-database_letters',
        'Statistics_hsp-len'   => { 'RESULT-statistics' => 'hsplength' },
        'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace' },
        'Statistics_kappa'     => { 'RESULT-statistics' => 'kappa' },
        'Statistics_lambda'    => { 'RESULT-statistics' => 'lambda' },
        'Statistics_entropy'   => { 'RESULT-statistics' => 'entropy' },
    );

    # Time::HiRes is only used for the optional parse timing that runs when
    # $DEBUG is true.  If it cannot be loaded, pin $DEBUG to a defined false
    # value so the timing code is never reached.
    eval { require Time::HiRes };
    if ($@) { $DEBUG = 0; }
}
|
|
172
|
|
173
|
|
# This parser is a Bio::SearchIO implementation.
@ISA = ('Bio::SearchIO');
|
|
175
|
|
176 =head2 new
|
|
177
|
|
178 Title : new
|
|
179 Usage : my $searchio = new Bio::SearchIO(-format => 'blastxml',
|
|
180 -file => 'filename',
|
|
181 -tempfile => 1);
|
|
182 Function: Initializes the object - this is chained through new in SearchIO
|
|
183 Returns : Bio::SearchIO::blastxml object
|
|
184 Args : One additional argument from the format and file/fh parameters.
|
|
185 -tempfile => boolean. Defaults to false. Write out XML data
|
|
186 to a temporary filehandle to send to
|
|
187 PerlSAX parser.
|
|
188 =cut
|
|
189
|
|
190 =head2 _initialize
|
|
191
|
|
192 Title : _initialize
|
|
193 Usage : private
|
|
194 Function: Initializes the object - this is chained through new in SearchIO
|
|
195
|
|
196 =cut
|
|
197
|
|
# Private initializer, chained from the SearchIO constructor.
#
# Args    : the constructor argument list; the only argument consumed here
#           is -tempfile (boolean), everything else is handled by the
#           parent class initializer.
# Effects : stores the tempfile flag (when supplied), creates the PerlSAX
#           parser used by next_result, and switches the package-wide
#           $DEBUG flag on when it is still undefined and this object is
#           verbose.
sub _initialize {
    my ($self, @args) = @_;
    $self->SUPER::_initialize(@args);

    my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)], @args);
    if ( defined $usetempfile ) {
        $self->use_tempfile($usetempfile);
    }

    # Direct method call instead of indirect object syntax ("new Class()"),
    # which can be mis-parsed if a sub named "new" is ever in scope.
    $self->{'_xmlparser'} = XML::Parser::PerlSAX->new();

    # $DEBUG is left alone when already defined (the BEGIN block sets it to
    # 0 when Time::HiRes could not be loaded).
    $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0);
}
|
|
206
|
|
207 =head2 next_result
|
|
208
|
|
209 Title : next_result
|
|
210 Usage : my $result = $searchio->next_result;
|
|
211 Function: Returns the next Result from a search
|
|
212 Returns : Bio::Search::Result::ResultI object
|
|
213 Args : none
|
|
214
|
|
215 =cut
|
|
216
|
|
# Read the next report from the input stream and parse it.
#
# The input may contain several concatenated XML documents; lines are
# collected until the next "<?xml version" declaration (which is pushed
# back for the following call) or end of input.  The collected text is
# held in memory, or written to a temporary file when use_tempfile is
# true, and then handed to the PerlSAX parser with $self as the handler.
#
# Returns : whatever the SAX parse returns (the value returned by
#           end_document), or undef when there is no more input or the
#           parse failed (a warning is issued in the latter case).
# Throws  : via $self->throw when the temporary file cannot be created.
sub next_result {
    my ($self) = @_;

    my $data      = '';
    my $firstline = 1;
    my $tfh;
    if ( $self->use_tempfile ) {
        $tfh = IO::File->new_tmpfile
            or $self->throw("Unable to open temp file: $!");
        $tfh->autoflush(1);
    }

    my $okaytoprocess;
    # A lexical is used for the current line: assigning to the global $_
    # here would clobber the caller's $_ (and, inside a for/map loop, the
    # element it is aliased to).
    while ( defined( my $line = $self->_readline ) ) {
        if ( $line =~ /^RPS-BLAST/i ) {
            # extra non-XML banner line emitted at the top of RPS-BLAST reports
            $self->{'_type'} = 'RPSBLAST';
            next;
        }
        if ( $line =~ /^<\?xml version/ && !$firstline ) {
            # start of the next report - leave it for the next call
            $self->_pushback($line);
            last;
        }
        # NOTE(review): entities are decoded before the XML parser sees the
        # text, so escaped markup characters such as &lt; or &amp; reach the
        # parser unescaped - confirm this is intended.
        $line = decode_entities($line);
        $okaytoprocess = 1;
        if ( defined $tfh ) {
            print {$tfh} $line;
        }
        else {
            $data .= $line;
        }
        $firstline = 0;
    }

    # nothing was read: end of input
    return undef unless ($okaytoprocess);

    my %parser_args;
    if ( defined $tfh ) {
        seek( $tfh, 0, 0 );    # rewind so the parser reads from the start
        %parser_args = (
            'Source'  => { 'ByteStream' => $tfh },
            'Handler' => $self,
        );
    }
    else {
        %parser_args = (
            'Source'  => { 'String' => $data },
            'Handler' => $self,
        );
    }

    my $result;
    my $starttime;
    if ($DEBUG) { $starttime = [ Time::HiRes::gettimeofday() ]; }

    # The parser drives the SAX callbacks on $self, which in turn call the
    # event handler; the result object is built there, not here.
    eval {
        $result = $self->{'_xmlparser'}->parse(%parser_args);
        $self->{'_result_count'}++;
    };
    if ($@) {
        $self->warn("error in parsing a report:\n $@");
        $result = undef;
    }
    if ($DEBUG) {
        $self->debug( sprintf( "parsing took %f seconds\n",
                               Time::HiRes::tv_interval($starttime) ) );
    }
    return $result;
}
|
|
280
|
|
281 =head2 SAX methods
|
|
282
|
|
283 =cut
|
|
284
|
|
285 =head2 start_document
|
|
286
|
|
287 Title : start_document
|
|
288 Usage : $parser->start_document;
|
|
289 Function: SAX method to indicate starting to parse a new document
|
|
290 Returns : none
|
|
291 Args : none
|
|
292
|
|
293
|
|
294 =cut
|
|
295
|
|
# SAX callback: a new document is about to be parsed.
# Clears the per-document state kept on the object: the last element
# type, the collected value hash and the previously built result.
sub start_document {
    my $self = shift;
    @{$self}{ '_lasttype', '_values' } = ( '', {} );
    $self->{'_result'} = undef;
}
|
|
302
|
|
303 =head2 end_document
|
|
304
|
|
305 Title : end_document
|
|
306 Usage : $parser->end_document;
|
|
307 Function: SAX method to indicate finishing parsing a new document
|
|
308 Returns : Bio::Search::Result::ResultI object
|
|
309 Args : none
|
|
310
|
|
311 =cut
|
|
312
|
|
# SAX callback: the document has been completely parsed.
# Returns the result stored by end_element when the closing BlastOutput
# tag was seen (undef if none was stored).
sub end_document {
    my $self = shift;
    return $self->{'_result'};
}
|
|
317
|
|
318 =head2 start_element
|
|
319
|
|
320 Title : start_element
|
|
321 Usage : $parser->start_element($data)
|
|
322 Function: SAX method to indicate starting a new element
|
|
323 Returns : none
|
|
324 Args : hash ref for data
|
|
325
|
|
326 =cut
|
|
327
|
|
# SAX callback: an element has been opened.
#
# Args : hash ref with the element 'Name' and its 'Attributes'.
#
# For elements listed in %MODEMAP the matching start_<type> method is
# invoked on the event handler (if it handles that type), receiving the
# element attributes.  Opening a BlastOutput element additionally resets
# the collected values and the stored result.
sub start_element {
    my ($self, $data) = @_;

    # attributes are only passed through, never inspected here
    my $name = $data->{'Name'};

    my $mode = $MODEMAP{$name};
    if ($mode) {
        if ( $self->_eventHandler->will_handle($mode) ) {
            my $method = 'start_' . lc $mode;
            $self->_eventHandler->$method( $data->{'Attributes'} );
        }
    }

    # a new report begins: drop anything left over from a previous one
    if ( $name eq 'BlastOutput' ) {
        $self->{'_values'} = {};
        $self->{'_result'} = undef;
    }
}
|
|
345
|
|
346 =head2 end_element
|
|
347
|
|
348 Title : end_element
|
|
349 Usage : $parser->end_element($data)
|
|
350 Function: Signals finishing an element
|
|
351 Returns : Bio::Search object depending on what type of element
|
|
352 Args : hash ref for data
|
|
353
|
|
354 =cut
|
|
355
|
|
# SAX callback: an element has been closed.
#
# Args    : hash ref with the element 'Name'.
# Returns : the value returned by the event handler's end_<type> method
#           for elements listed in %MODEMAP, otherwise undef.
#
# Leaf elements listed in %MAPPING have their text (the last character
# data seen) stored in the value hash; known container elements are
# silently skipped; anything else is reported through debug().  The
# buffered character data is cleared after every element.
sub end_element {
    my ($self, $data) = @_;

    my $name = $data->{'Name'};
    my $retval;

    # remember which BLAST flavour produced this report
    if (   $name eq 'BlastOutput_program'
        && $self->{'_last_data'} =~ /(t?blast[npx])/i )
    {
        $self->{'_type'} = uc $1;
    }

    # structural elements that carry no data of their own
    my @containers = qw(
        Iteration Hit_hsps Parameters BlastOutput_param
        Iteration_hits Statistics BlastOutput_iterations
    );

    if ( my $mode = $MODEMAP{$name} ) {
        if ( $self->_eventHandler->will_handle($mode) ) {
            my $method = 'end_' . lc $mode;
            $retval = $self->_eventHandler->$method( $self->{'_type'},
                                                     $self->{'_values'} );
        }
    }
    elsif ( my $target = $MAPPING{$name} ) {
        if ( ref($target) =~ /hash/i ) {
            # { GROUP => NAME }: store the text under values{GROUP}{NAME}
            my ($group) = keys %{$target};
            $self->{'_values'}->{$group}->{ $target->{$group} } =
                $self->{'_last_data'};
        }
        else {
            $self->{'_values'}->{$target} = $self->{'_last_data'};
        }
    }
    elsif ( !grep { $name eq $_ } @containers ) {
        $self->debug("ignoring unrecognized element type $name\n");
    }

    # the buffered text belongs to the element that just ended
    $self->{'_last_data'} = '';

    # keep the finished result so end_document can hand it back
    $self->{'_result'} = $retval if ( $name eq 'BlastOutput' );
    return $retval;
}
|
|
392
|
|
393 =head2 characters
|
|
394
|
|
395 Title : characters
|
|
396 Usage : $parser->characters($data)
|
|
397 Function: Signals new characters to be processed
|
|
398 Returns : characters read
|
|
399 Args : hash ref with the key 'Data'
|
|
400
|
|
401
|
|
402 =cut
|
|
403
|
|
# SAX callback: character data has been read.
#
# Args    : hash ref with the key 'Data'.
# Returns : the characters read, or nothing when the data is undefined or
#           consists only of whitespace (such chunks are ignored).
#
# The text is appended to the buffer rather than assigned: a SAX parser
# may deliver the text of a single element in several calls, and plain
# assignment kept only the last chunk.  end_element empties the buffer
# after every element, so text never leaks from one element to the next.
sub characters {
    my ($self, $data) = @_;

    my $text = $data->{'Data'};
    return unless ( defined $text && $text !~ /^\s+$/ );

    $self->{'_last_data'} .= $text;
    return $text;
}
|
|
410
|
|
411 =head2 use_tempfile
|
|
412
|
|
413 Title : use_tempfile
|
|
414 Usage : $obj->use_tempfile($newval)
|
|
415 Function: Get/Set boolean flag on whether or not use a tempfile
|
|
416 Example :
|
|
417 Returns : value of use_tempfile
|
|
418 Args : newvalue (optional)
|
|
419
|
|
420
|
|
421 =cut
|
|
422
|
|
# Get/set the flag that makes next_result spool each report to a
# temporary file instead of holding it in memory.
#
# Args    : new value (optional); an undefined value leaves the flag as is.
# Returns : the current value of the flag.
sub use_tempfile {
    my $self     = shift;
    my $newvalue = shift;

    $self->{'_use_tempfile'} = $newvalue if defined $newvalue;
    return $self->{'_use_tempfile'};
}
|
|
430
|
|
# Returns the number of reports parsed so far by next_result
# (undef before the first successful parse).
sub result_count {
    my ($self) = @_;
    return $self->{'_result_count'};
}
|
|
435
|
|
436 1;
|