comparison variant_effect_predictor/Bio/SearchIO/blastxml.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: blastxml.pm,v 1.24 2002/10/26 09:32:16 sac Exp $
2 #
3 # BioPerl module for Bio::SearchIO::blastxml
4 #
5 # Cared for by Jason Stajich <jason@bioperl.org>
6 #
7 # Copyright Jason Stajich
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing.
16
17 =head1 SYNOPSIS
18
19 use Bio::SearchIO;
20 my $searchin = new Bio::SearchIO(-format => 'blastxml',
21 -file => 't/data/plague_yeast.bls.xml');
22 while( my $result = $searchin->next_result ) {
23 }
24
25 # one can also request that the parser NOT keep the XML data in memory
26 # by using the tempfile initialization flag.
27 my $searchin = new Bio::SearchIO(-tempfile => 1,
28 -format => 'blastxml',
29 -file => 't/data/plague_yeast.bls.xml');
30 while( my $result = $searchin->next_result ) {
31 }
32
33 =head1 DESCRIPTION
34
35 This object implements a NCBI Blast XML parser.
36
37 There is one additional initialization flag from the SearchIO defaults
38 - that is the -tempfile flag. If specified as true, then the parser
39 will write out each report to a temporary filehandle rather than
40 holding the entire report as a string in memory. The reason this is
41 done in the first place is NCBI reports have an unnecessary E<lt>?xml
42 version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports
43 have an additional unnecessary RPS-BLAST tag at the top of each report.
44 So we currently have implemented the work around by preparsing the
45 file (yes it makes the process slower, but it works).
46
47
48 =head1 FEEDBACK
49
50 =head2 Mailing Lists
51
52 User feedback is an integral part of the evolution of this and other
53 Bioperl modules. Send your comments and suggestions preferably to
54 the Bioperl mailing list. Your participation is much appreciated.
55
56 bioperl-l@bioperl.org - General discussion
57 http://bioperl.org/MailList.shtml - About the mailing lists
58
59 =head2 Reporting Bugs
60
61 Report bugs to the Bioperl bug tracking system to help us keep track
62 of the bugs and their resolution. Bug reports can be submitted via
63 email or the web:
64
65 bioperl-bugs@bioperl.org
66 http://bugzilla.bioperl.org/
67
68 =head1 AUTHOR - Jason Stajich
69
70 Email jason@bioperl.org
71
72 Describe contact details here
73
74 =head1 CONTRIBUTORS
75
76 Additional contributors names and emails here
77
78 =head1 APPENDIX
79
80 The rest of the documentation details each of the object methods.
81 Internal methods are usually preceded with a _
82
83 =cut
84
85 # Let the code begin...
86
87 package Bio::SearchIO::blastxml;
88 use vars qw(@ISA $DTD %MAPPING %MODEMAP $DEBUG);
89 use strict;
90
91 $DTD = 'ftp://ftp.ncbi.nlm.nih.gov/blast/documents/NCBI_BlastOutput.dtd';
92 # Object preamble - inherits from Bio::SearchIO
93
94 use Bio::Root::Root;
95 use Bio::SearchIO;
96 use XML::Parser::PerlSAX;
97 use XML::Handler::Subs;
98 use HTML::Entities;
99 use IO::File;
100
101
# Populate the lookup tables used by the SAX callbacks at compile time.
BEGIN {
    # NCBI Blast XML elements whose start and end tags are forwarded to
    # the event handler as start_<type> / end_<type> calls
    # (see start_element and end_element).
    %MODEMAP = (
        'BlastOutput' => 'result',
        'Hit'         => 'hit',
        'Hsp'         => 'hsp',
    );

    # Mapping of NCBI Blast terms to Bioperl hash keys.
    # end_element stores the text of an element under the mapped key:
    #   - a plain string is used directly as the key;
    #   - a one-entry hash ref { outer => inner } stores the text under
    #     the nested key {outer}{inner}.
    %MAPPING = (
        # HSP specific fields
        'Hsp_bit-score'   => 'HSP-bits',
        'Hsp_score'       => 'HSP-score',
        'Hsp_evalue'      => 'HSP-evalue',
        'Hsp_query-from'  => 'HSP-query_start',
        'Hsp_query-to'    => 'HSP-query_end',
        'Hsp_hit-from'    => 'HSP-hit_start',
        'Hsp_hit-to'      => 'HSP-hit_end',
        'Hsp_positive'    => 'HSP-conserved',
        'Hsp_identity'    => 'HSP-identical',
        'Hsp_gaps'        => 'HSP-gaps',
        'Hsp_hitgaps'     => 'HSP-hit_gaps',
        'Hsp_querygaps'   => 'HSP-query_gaps',
        'Hsp_qseq'        => 'HSP-query_seq',
        'Hsp_hseq'        => 'HSP-hit_seq',
        'Hsp_midline'     => 'HSP-homology_seq',
        'Hsp_align-len'   => 'HSP-hsp_length',
        'Hsp_query-frame' => 'HSP-query_frame',
        'Hsp_hit-frame'   => 'HSP-hit_frame',

        # these are ignored for now
        'Hsp_num'          => 'HSP-order',
        # "from" is the start of the pattern and "to" its end; the two
        # target keys used to be swapped.
        'Hsp_pattern-from' => 'patternstart',
        'Hsp_pattern-to'   => 'patternend',
        'Hsp_density'      => 'hspdensity',

        # Hit specific fields
        'Hit_id'             => 'HIT-name',
        'Hit_len'            => 'HIT-length',
        'Hit_accession'      => 'HIT-accession',
        'Hit_def'            => 'HIT-description',
        'Hit_num'            => 'HIT-order',
        'Iteration_iter-num' => 'HIT-iteration',
        'Iteration_stat'     => 'HIT-iteration_statistic',

        # Result (report header) fields
        'BlastOutput_program'   => 'RESULT-algorithm_name',
        'BlastOutput_version'   => 'RESULT-algorithm_version',
        'BlastOutput_query-def' => 'RESULT-query_description',
        'BlastOutput_query-len' => 'RESULT-query_length',
        'BlastOutput_db'        => 'RESULT-database_name',
        'BlastOutput_reference' => 'RESULT-program_reference',
        'BlastOutput_query-ID'  => 'runid',

        # Search parameters and statistics
        'Parameters_matrix'      => { 'RESULT-parameters' => 'matrix' },
        'Parameters_expect'      => { 'RESULT-parameters' => 'expect' },
        'Parameters_include'     => { 'RESULT-parameters' => 'include' },
        'Parameters_sc-match'    => { 'RESULT-parameters' => 'match' },
        'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
        'Parameters_gap-open'    => { 'RESULT-parameters' => 'gapopen' },
        'Parameters_gap-extend'  => { 'RESULT-parameters' => 'gapext' },
        'Parameters_filter'      => { 'RESULT-parameters' => 'filter' },
        'Statistics_db-num'      => 'RESULT-database_entries',
        'Statistics_db-len'      => 'RESULT-database_letters',
        'Statistics_hsp-len'     => { 'RESULT-statistics' => 'hsplength' },
        'Statistics_eff-space'   => { 'RESULT-statistics' => 'effectivespace' },
        'Statistics_kappa'       => { 'RESULT-statistics' => 'kappa' },
        'Statistics_lambda'      => { 'RESULT-statistics' => 'lambda' },
        'Statistics_entropy'     => { 'RESULT-statistics' => 'entropy' },
    );

    # Time::HiRes is only used for the optional parse timing in
    # next_result; when it cannot be loaded, force $DEBUG off so the
    # timing code is never reached.
    eval { require Time::HiRes };
    if ($@) { $DEBUG = 0; }
}
172
173
# This parser is a Bio::SearchIO implementation.
@ISA = ('Bio::SearchIO');
175
176 =head2 new
177
178 Title : new
179 Usage : my $searchio = new Bio::SearchIO(-format => 'blastxml',
180 -file => 'filename',
181 -tempfile => 1);
182 Function: Initializes the object - this is chained through new in SearchIO
183 Returns : Bio::SearchIO::blastxml object
184 Args : One additional argument from the format and file/fh parameters.
185 -tempfile => boolean. Defaults to false. Write out XML data
186 to a temporary filehandle to send to
187 PerlSAX parser.
188 =cut
189
190 =head2 _initialize
191
192 Title : _initialize
193 Usage : private
194 Function: Initializes the object - this is chained through new in SearchIO
195
196 =cut
197
# Private initializer, chained from the constructor.
#
# Args : the constructor arguments; in addition to whatever the parent
#        class consumes, a defined -tempfile value is passed on to
#        use_tempfile().
#
# Side effects: creates the XML::Parser::PerlSAX instance kept in
# $self->{'_xmlparser'} and may switch the package-wide $DEBUG flag on.
sub _initialize {
    my ($self, @args) = @_;
    $self->SUPER::_initialize(@args);

    my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)], @args);
    if (defined $usetempfile) {
        $self->use_tempfile($usetempfile);
    }

    # Direct method call instead of the ambiguous indirect-object form
    # "new XML::Parser::PerlSAX()".
    $self->{'_xmlparser'} = XML::Parser::PerlSAX->new();

    # Only turn debugging on when nobody decided otherwise: the BEGIN
    # block sets $DEBUG to 0 when Time::HiRes could not be loaded.
    $DEBUG = 1 if !defined $DEBUG && $self->verbose > 0;
}
206
207 =head2 next_result
208
209 Title : next_result
210 Usage : my $result = $searchio->next_result;
211 Function: Returns the next Result from a search
212 Returns : Bio::Search::Result::ResultI object
213 Args : none
214
215 =cut
216
# Read the next report from the input stream and run it through the
# PerlSAX parser, using this object as the SAX handler.
#
# The input is pre-scanned line by line so that one report at a time is
# handed to the parser: a line starting with "RPS-BLAST" is dropped
# (and remembered in $self->{'_type'}), and an "<?xml version" line that
# is not the first line of the current report marks the start of the
# next report and is pushed back.
#
# Returns : whatever the parser's parse() call returns for the report;
#           undef when there is no more input or when parsing failed
#           (a warning is issued in that case).
# Args    : none
sub next_result {
    my ($self) = @_;

    my $data      = '';
    my $firstline = 1;

    # With use_tempfile set the report is spooled to an anonymous
    # temporary file instead of being accumulated in memory.
    my $tfh;
    if ( $self->use_tempfile ) {
        $tfh = IO::File->new_tmpfile
            or $self->throw("Unable to open temp file: $!");
        $tfh->autoflush(1);
    }

    my $okaytoprocess;

    # A lexical is used for the current line: assigning to the global $_
    # here would clobber (or, when aliased, overwrite) the caller's $_.
    while ( defined( my $line = $self->_readline ) ) {
        if ( $line =~ /^RPS-BLAST/i ) {
            $self->{'_type'} = 'RPSBLAST';
            next;
        }
        if ( $line =~ /^<\?xml version/ && !$firstline ) {
            # start of the following report - keep it for the next call
            $self->_pushback($line);
            last;
        }

        # NOTE(review): entities are decoded before the XML parser sees
        # the text, so an escaped '<' or '&' in a report reaches the
        # parser unescaped - confirm this is still wanted.
        $line = decode_entities($line);

        $okaytoprocess = 1;
        if ( defined $tfh ) {
            print $tfh $line;
        }
        else {
            $data .= $line;
        }
        $firstline = 0;
    }

    # nothing was read: end of input
    return undef unless $okaytoprocess;

    my %parser_args = ( 'Handler' => $self );
    if ( defined $tfh ) {
        seek( $tfh, 0, 0 );    # rewind so the parser reads from the top
        $parser_args{'Source'} = { 'ByteStream' => $tfh };
    }
    else {
        $parser_args{'Source'} = { 'String' => $data };
    }

    my $result;
    my $starttime;
    if ($DEBUG) { $starttime = [ Time::HiRes::gettimeofday() ]; }

    # parsing magic here - but we call event handlers rather than
    # instantiating things
    eval {
        $result = $self->{'_xmlparser'}->parse(%parser_args);
        $self->{'_result_count'}++;
    };
    if ($@) {
        $self->warn("error in parsing a report:\n $@");
        $result = undef;
    }

    if ($DEBUG) {
        $self->debug(
            sprintf( "parsing took %f seconds\n",
                Time::HiRes::tv_interval($starttime) )
        );
    }
    return $result;
}
280
281 =head2 SAX methods
282
283 =cut
284
285 =head2 start_document
286
287 Title : start_document
288 Usage : $parser->start_document;
289 Function: SAX method to indicate starting to parse a new document
290 Returns : none
291 Args : none
292
293
294 =cut
295
# SAX callback: a new document is about to be parsed.
# Resets all per-document parser state held in the object.
#
# Returns : none
# Args    : none
sub start_document {
    my ($self) = @_;
    $self->{'_lasttype'} = '';
    $self->{'_values'}   = {};

    # Discard character data left over from a previous parse that
    # aborted in the middle of an element, so it cannot be picked up by
    # the first element of this document.
    $self->{'_last_data'} = '';

    $self->{'_result'} = undef;
}
302
303 =head2 end_document
304
305 Title : end_document
306 Usage : $parser->end_document;
307 Function: SAX method to indicate finishing parsing a new document
308 Returns : Bio::Search::Result::ResultI object
309 Args : none
310
311 =cut
312
# SAX callback: the document has been parsed completely.
# Hands back the result stored in $self->{'_result'} by end_element.
sub end_document {
    my $self = shift;
    return $self->{'_result'};
}
317
318 =head2 start_element
319
320 Title : start_element
321 Usage : $parser->start_element($data)
322 Function: SAX method to indicate starting a new element
323 Returns : none
324 Args : hash ref for data
325
326 =cut
327
# SAX callback: an element has been opened.
#
# Elements listed in %MODEMAP are forwarded to the event handler as a
# start_<type> call (attributes are passed through untouched), provided
# the handler declares it will handle that type. Opening the top-level
# BlastOutput element also clears the collected values and the result.
#
# Args : hash ref with the keys 'Name' and 'Attributes'
sub start_element {
    my ($self, $data) = @_;

    # we currently don't care about attributes
    my $element = $data->{'Name'};
    my $mode    = $MODEMAP{$element};

    if ( $mode && $self->_eventHandler->will_handle($mode) ) {
        my $method = 'start_' . lc $mode;
        $self->_eventHandler->$method( $data->{'Attributes'} );
    }

    if ( $element eq 'BlastOutput' ) {
        $self->{'_values'} = {};
        $self->{'_result'} = undef;
    }
}
345
346 =head2 end_element
347
348 Title : end_element
349 Usage : $parser->end_element($data)
350 Function: Signals finishing an element
351 Returns : Bio::Search object depending on what type of element
352 Args : hash ref for data
353
354 =cut
355
# SAX callback: an element has been closed.
#
# - BlastOutput_program: the program name found in the element text is
#   remembered (upper-cased) in $self->{'_type'}.
# - Elements in %MODEMAP: forwarded to the event handler as an
#   end_<type> call with the report type and the values collected so far.
# - Elements in %MAPPING: the element text is stored in
#   $self->{'_values'} under the mapped (possibly nested) key.
# - Known container elements are skipped silently; anything else is
#   reported through debug().
#
# Returns : whatever the event handler returned, undef otherwise
# Args    : hash ref with the key 'Name'
sub end_element {
    my ($self, $data) = @_;

    my $element = $data->{'Name'};
    my $retval;

    if (   $element eq 'BlastOutput_program'
        && $self->{'_last_data'} =~ /(t?blast[npx])/i )
    {
        $self->{'_type'} = uc $1;
    }

    # pure container elements that carry no text of their own
    my %is_container = map { $_ => 1 } qw(
        Iteration Hit_hsps Parameters BlastOutput_param
        Iteration_hits Statistics BlastOutput_iterations
    );

    my $mode   = $MODEMAP{$element};
    my $target = $MAPPING{$element};

    if ($mode) {
        if ( $self->_eventHandler->will_handle($mode) ) {
            my $method = 'end_' . lc $mode;
            $retval = $self->_eventHandler->$method( $self->{'_type'},
                $self->{'_values'} );
        }
    }
    elsif ($target) {
        my $text = $self->{'_last_data'};
        if ( ref($target) =~ /hash/i ) {
            # { outer => inner }: store under the nested key
            my ($outer) = keys %{$target};
            $self->{'_values'}->{$outer}->{ $target->{$outer} } = $text;
        }
        else {
            $self->{'_values'}->{$target} = $text;
        }
    }
    elsif ( !$is_container{$element} ) {
        $self->debug("ignoring unrecognized element type $element\n");
    }

    # remove read data if we are at end of an element
    $self->{'_last_data'} = '';

    if ( $element eq 'BlastOutput' ) {
        $self->{'_result'} = $retval;
    }
    return $retval;
}
392
393 =head2 characters
394
395 Title : characters
396 Usage : $parser->characters($data)
397 Function: Signals new characters to be processed
398 Returns : characters read
399 Args : hash ref with the key 'Data'
400
401
402 =cut
403
# SAX callback: character data inside the current element.
#
# A parser may deliver the text of a single element in several calls
# (for example around entity references), so the data is appended to
# $self->{'_last_data'} rather than replacing it; end_element clears the
# buffer once the element is complete. Undefined and whitespace-only
# chunks are ignored.
#
# Returns : the characters read, or nothing when the chunk was ignored
# Args    : hash ref with the key 'Data'
sub characters {
    my ($self, $data) = @_;

    my $chunk = $data->{'Data'};
    return unless ( defined $chunk && $chunk !~ /^\s+$/ );

    $self->{'_last_data'} = '' unless defined $self->{'_last_data'};
    $self->{'_last_data'} .= $chunk;
    return $chunk;
}
410
411 =head2 use_tempfile
412
413 Title : use_tempfile
414 Usage : $obj->use_tempfile($newval)
415 Function: Get/Set boolean flag on whether or not use a tempfile
416 Example :
417 Returns : value of use_tempfile
418 Args : newvalue (optional)
419
420
421 =cut
422
# Get/set the boolean flag that makes next_result spool each report to
# a temporary file instead of holding it in memory. The flag is only
# changed when a defined value is passed in.
#
# Returns : current value of the flag
# Args    : new value (optional)
sub use_tempfile {
    my $self = shift;
    my $flag = shift;

    $self->{'_use_tempfile'} = $flag if defined $flag;
    return $self->{'_use_tempfile'};
}
430
# Number of reports parsed so far, as counted by next_result
# (undefined until the first report has been parsed).
sub result_count {
    my ($self) = @_;
    return $self->{'_result_count'};
}
435
436 1;