Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/SearchIO/blastxml.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: blastxml.pm,v 1.24 2002/10/26 09:32:16 sac Exp $ | |
2 # | |
3 # BioPerl module for Bio::SearchIO::blastxml | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing. | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 use Bio::SearchIO; | |
20 my $searchin = new Bio::SearchIO(-format => 'blastxml', | |
21 -file => 't/data/plague_yeast.bls.xml'); | |
22 while( my $result = $searchin->next_result ) { | |
23 } | |
24 | |
25 # one can also request that the parser NOT keep the XML data in memory | |
26 # by using the tempfile initialization flag. | |
27 my $searchin = new Bio::SearchIO(-tempfile => 1, | |
28 -format => 'blastxml', | |
29 -file => 't/data/plague_yeast.bls.xml'); | |
30 while( my $result = $searchin->next_result ) { | |
31 } | |
32 | |
33 =head1 DESCRIPTION | |
34 | |
35 This object implements a NCBI Blast XML parser. | |
36 | |
37 There is one additional initialization flag from the SearchIO defaults | |
38 - that is the -tempfile flag. If specified as true, then the parser | |
39 will write out each report to a temporary filehandle rather than | |
40 holding the entire report as a string in memory. The reason this is | |
41 done in the first place is NCBI reports have an unnecessary E<lt>?xml | |
42 version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports | |
43 have an additional unnecessary RPS-BLAST tag at the top of each report. | |
44 So we currently have implemented the work around by preparsing the | |
45 file (yes it makes the process slower, but it works). | |
46 | |
47 | |
48 =head1 FEEDBACK | |
49 | |
50 =head2 Mailing Lists | |
51 | |
52 User feedback is an integral part of the evolution of this and other | |
53 Bioperl modules. Send your comments and suggestions preferably to | |
54 the Bioperl mailing list. Your participation is much appreciated. | |
55 | |
56 bioperl-l@bioperl.org - General discussion | |
57 http://bioperl.org/MailList.shtml - About the mailing lists | |
58 | |
59 =head2 Reporting Bugs | |
60 | |
61 Report bugs to the Bioperl bug tracking system to help us keep track | |
62 of the bugs and their resolution. Bug reports can be submitted via | |
63 email or the web: | |
64 | |
65 bioperl-bugs@bioperl.org | |
66 http://bugzilla.bioperl.org/ | |
67 | |
68 =head1 AUTHOR - Jason Stajich | |
69 | |
70 Email jason@bioperl.org | |
71 | |
72 Describe contact details here | |
73 | |
74 =head1 CONTRIBUTORS | |
75 | |
76 Additional contributors names and emails here | |
77 | |
78 =head1 APPENDIX | |
79 | |
80 The rest of the documentation details each of the object methods. | |
81 Internal methods are usually preceded with a _ | |
82 | |
83 =cut | |
84 | |
85 # Let the code begin... | |
86 | |
87 package Bio::SearchIO::blastxml; | |
88 use vars qw(@ISA $DTD %MAPPING %MODEMAP $DEBUG); | |
89 use strict; | |
90 | |
91 $DTD = 'ftp://ftp.ncbi.nlm.nih.gov/blast/documents/NCBI_BlastOutput.dtd'; | |
92 # Object preamble - inherits from Bio::Root::Root | |
93 | |
94 use Bio::Root::Root; | |
95 use Bio::SearchIO; | |
96 use XML::Parser::PerlSAX; | |
97 use XML::Handler::Subs; | |
98 use HTML::Entities; | |
99 use IO::File; | |
100 | |
101 | |
BEGIN {
    # XML elements that open/close a parsing "mode".  The value is the event
    # type: start_element/end_element ask the event handler whether it
    # will_handle() that type and then call its start_<type>/end_<type>
    # method.
    %MODEMAP = (
        'BlastOutput' => 'result',
        'Hit'         => 'hit',
        'Hsp'         => 'hsp',
    );

    # Mapping of NCBI Blast XML leaf elements to Bioperl hash keys.
    #   - a plain string value is used directly as the key in the collected
    #     values hash;
    #   - a single-pair hash ref { OUTER => INNER } stores the value one
    #     level down, i.e. under {OUTER}{INNER}.
    %MAPPING = (
        # HSP specific fields
        'Hsp_bit-score'   => 'HSP-bits',
        'Hsp_score'       => 'HSP-score',
        'Hsp_evalue'      => 'HSP-evalue',
        'Hsp_query-from'  => 'HSP-query_start',
        'Hsp_query-to'    => 'HSP-query_end',
        'Hsp_hit-from'    => 'HSP-hit_start',
        'Hsp_hit-to'      => 'HSP-hit_end',
        'Hsp_positive'    => 'HSP-conserved',
        'Hsp_identity'    => 'HSP-identical',
        'Hsp_gaps'        => 'HSP-gaps',
        'Hsp_hitgaps'     => 'HSP-hit_gaps',
        'Hsp_querygaps'   => 'HSP-query_gaps',
        'Hsp_qseq'        => 'HSP-query_seq',
        'Hsp_hseq'        => 'HSP-hit_seq',
        'Hsp_midline'     => 'HSP-homology_seq',
        'Hsp_align-len'   => 'HSP-hsp_length',
        'Hsp_query-frame' => 'HSP-query_frame',
        'Hsp_hit-frame'   => 'HSP-hit_frame',

        # these are ignored for now
        'Hsp_num'          => 'HSP-order',
        # "from" is the start of the pattern and "to" its end; the two
        # target keys used to be swapped.
        'Hsp_pattern-from' => 'patternstart',
        'Hsp_pattern-to'   => 'patternend',
        'Hsp_density'      => 'hspdensity',

        # Hit specific fields
        'Hit_id'             => 'HIT-name',
        'Hit_len'            => 'HIT-length',
        'Hit_accession'      => 'HIT-accession',
        'Hit_def'            => 'HIT-description',
        'Hit_num'            => 'HIT-order',
        'Iteration_iter-num' => 'HIT-iteration',
        'Iteration_stat'     => 'HIT-iteration_statistic',

        # Result (report header) fields
        'BlastOutput_program'   => 'RESULT-algorithm_name',
        'BlastOutput_version'   => 'RESULT-algorithm_version',
        'BlastOutput_query-def' => 'RESULT-query_description',
        'BlastOutput_query-len' => 'RESULT-query_length',
        'BlastOutput_db'        => 'RESULT-database_name',
        'BlastOutput_reference' => 'RESULT-program_reference',
        'BlastOutput_query-ID'  => 'runid',

        # Search parameters and statistics
        'Parameters_matrix'      => { 'RESULT-parameters' => 'matrix' },
        'Parameters_expect'      => { 'RESULT-parameters' => 'expect' },
        'Parameters_include'     => { 'RESULT-parameters' => 'include' },
        'Parameters_sc-match'    => { 'RESULT-parameters' => 'match' },
        'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
        'Parameters_gap-open'    => { 'RESULT-parameters' => 'gapopen' },
        'Parameters_gap-extend'  => { 'RESULT-parameters' => 'gapext' },
        'Parameters_filter'      => { 'RESULT-parameters' => 'filter' },
        'Statistics_db-num'      => 'RESULT-database_entries',
        'Statistics_db-len'      => 'RESULT-database_letters',
        'Statistics_hsp-len'     => { 'RESULT-statistics' => 'hsplength' },
        'Statistics_eff-space'   => { 'RESULT-statistics' => 'effectivespace' },
        'Statistics_kappa'       => { 'RESULT-statistics' => 'kappa' },
        'Statistics_lambda'      => { 'RESULT-statistics' => 'lambda' },
        'Statistics_entropy'     => { 'RESULT-statistics' => 'entropy' },
    );

    # Time::HiRes is only used for the optional parse timing in next_result.
    # If it cannot be loaded, pin $DEBUG to 0 so that timing is never
    # attempted (_initialize only enables $DEBUG while it is still undefined).
    eval { require Time::HiRes };
    if ($@) { $DEBUG = 0; }
}
172 | |
173 | |
174 @ISA = qw(Bio::SearchIO ); | |
175 | |
176 =head2 new | |
177 | |
178 Title : new | |
179 Usage : my $searchio = new Bio::SearchIO(-format => 'blastxml', | |
180 -file => 'filename', | |
181 -tempfile => 1); | |
182 Function: Initializes the object - this is chained through new in SearchIO | |
183 Returns : Bio::SearchIO::blastxml object | |
184 Args : One additional argument from the format and file/fh parameters. | |
185 -tempfile => boolean. Defaults to false. Write out XML data | |
186 to a temporary filehandle to send to | |
187 PerlSAX parser. | |
188 =cut | |
189 | |
190 =head2 _initialize | |
191 | |
192 Title : _initialize | |
193 Usage : private | |
194 Function: Initializes the object - this is chained through new in SearchIO | |
195 | |
196 =cut | |
197 | |
# Private initializer: runs the parent class initializer, honours the
# optional -tempfile argument, creates the PerlSAX parser used by
# next_result and decides whether debug timing is enabled.
#
# Args: the named-argument list passed on from the constructor.
sub _initialize {
    my ($self, @args) = @_;
    $self->SUPER::_initialize(@args);

    # Only call the setter when -tempfile was actually supplied.
    my ($usetempfile) = $self->_rearrange([qw(TEMPFILE)], @args);
    defined $usetempfile && $self->use_tempfile($usetempfile);

    # Direct class-method call; the indirect form "new Class()" is parsed
    # heuristically by perl and is best avoided.
    $self->{'_xmlparser'} = XML::Parser::PerlSAX->new();

    # $DEBUG is a package-wide flag: it is switched on only while still
    # undefined, so the 0 set at load time when Time::HiRes is missing wins.
    $DEBUG = 1 if( ! defined $DEBUG && $self->verbose > 0);
}
206 | |
207 =head2 next_result | |
208 | |
209 Title : next_result | |
210 Usage : my $result = $searchio->next_result; | |
211 Function: Returns the next Result from a search | |
212 Returns : Bio::Search::Result::ResultI object | |
213 Args : none | |
214 | |
215 =cut | |
216 | |
# Reads the next report from the input stream, runs it through the PerlSAX
# parser with $self as the SAX handler, and returns whatever the parser
# returns.
#
# The input is pre-read line by line because several reports may be
# concatenated in one stream: a new "<?xml version" line that is not the
# first line read marks the start of the following report and is pushed
# back for the next call.  Lines starting with "RPS-BLAST" are not passed
# to the parser; they only set the report type.
#
# Returns undef when no line was available to parse, or when parsing died
# (a warning is issued in that case).
sub next_result {
    my ($self) = @_;

    my $data      = '';
    my $firstline = 1;
    my ($tfh);
    if( $self->use_tempfile ) {
        $tfh = IO::File->new_tmpfile
            or $self->throw("Unable to open temp file: $!");
        $tfh->autoflush(1);
    }

    my $okaytoprocess;
    # A lexical is used for the current line: assigning to the global $_
    # would clobber the caller's $_ (or the element it is aliased to when
    # the caller is inside a for/map/grep).
    while( defined( my $line = $self->_readline ) ) {
        if( $line =~ /^RPS-BLAST/i ) {
            $self->{'_type'} = 'RPSBLAST';
            next;
        }
        if( $line =~ /^<\?xml version/ && ! $firstline ) {
            $self->_pushback($line);
            last;
        }
        # NOTE(review): entities are decoded before the XML parser sees the
        # text; this looks like it could reintroduce raw markup characters
        # into the document - confirm that this is intended.
        $line = decode_entities($line);
        $okaytoprocess = 1;
        if( defined $tfh ) {
            print $tfh $line;
        } else {
            $data .= $line;
        }
        $firstline = 0;
    }

    return undef unless( $okaytoprocess );

    my %parser_args;
    if( defined $tfh ) {
        # rewind so the parser reads the report from its beginning
        seek($tfh,0,0);
        %parser_args = ('Source'  => { 'ByteStream' => $tfh },
                        'Handler' => $self);
    } else {
        %parser_args = ('Source'  => { 'String' => $data },
                        'Handler' => $self);
    }

    my $result;
    my $starttime;
    if( $DEBUG ) { $starttime = [ Time::HiRes::gettimeofday() ]; }

    # The counter is bumped inside the eval, so it only counts reports
    # that were parsed without dying.
    eval {
        $result = $self->{'_xmlparser'}->parse(%parser_args);
        $self->{'_result_count'}++;
    };
    if( $@ ) {
        $self->warn("error in parsing a report:\n $@");
        $result = undef;
    }
    if( $DEBUG ) {
        $self->debug( sprintf("parsing took %f seconds\n",
                              Time::HiRes::tv_interval($starttime)));
    }
    # parsing magic here - but we call event handlers rather than
    # instantiating things
    return $result;
}
280 | |
281 =head2 SAX methods | |
282 | |
283 =cut | |
284 | |
285 =head2 start_document | |
286 | |
287 Title : start_document | |
288 Usage : $parser->start_document; | |
289 Function: SAX method to indicate starting to parse a new document | |
290 Returns : none | |
291 Args : none | |
292 | |
293 | |
294 =cut | |
295 | |
# SAX callback fired when the parser starts a new document.  Clears all
# per-document parser state kept on the object.
sub start_document {
    my ($self) = @_;
    # Drop any character data left behind if a previous parse stopped
    # before reaching an end tag, so it cannot leak into this document.
    $self->{'_last_data'} = '';
    $self->{'_lasttype'}  = '';
    $self->{'_values'}    = {};
    $self->{'_result'}    = undef;
}
302 | |
303 =head2 end_document | |
304 | |
305 Title : end_document | |
306 Usage : $parser->end_document; | |
307 Function: SAX method to indicate finishing parsing a new document | |
308 Returns : Bio::Search::Result::ResultI object | |
309 Args : none | |
310 | |
311 =cut | |
312 | |
# SAX callback fired at the end of a document.  Hands back the value that
# end_element stored under '_result' when the BlastOutput element closed.
# Any extra arguments are ignored.
sub end_document {
    my $self = shift;
    return $self->{'_result'};
}
317 | |
318 =head2 start_element | |
319 | |
320 Title : start_element | |
321 Usage : $parser->start_element($data) | |
322 Function: SAX method to indicate starting a new element | |
323 Returns : none | |
324 Args : hash ref for data | |
325 | |
326 =cut | |
327 | |
# SAX callback fired for every opening tag.
#
# If the element name is one of the %MODEMAP entries and the event handler
# is willing to handle that event type, the handler's start_<type> method
# is called with the element's attributes.  Opening a BlastOutput element
# additionally clears the collected values and the stored result.
#
# Args: hash ref with at least 'Name' (and 'Attributes', passed through).
sub start_element {
    my ($self, $data) = @_;
    # we currently don't care about attributes
    my $element = $data->{'Name'};

    my $event_type = $MODEMAP{$element};
    if( $event_type && $self->_eventHandler->will_handle($event_type) ) {
        my $method = 'start_' . lc($event_type);
        $self->_eventHandler->$method($data->{'Attributes'});
    }

    if( $element eq 'BlastOutput' ) {
        $self->{'_values'} = {};
        $self->{'_result'} = undef;
    }
}
345 | |
346 =head2 end_element | |
347 | |
348 Title : end_element | |
349 Usage : $parser->end_element($data) | |
350 Function: Signals finishing an element | |
351 Returns : Bio::Search object depending on what type of element | |
352 Args : hash ref for data | |
353 | |
354 =cut | |
355 | |
# SAX callback fired for every closing tag.
#
#   - BlastOutput_program: the report type is taken from the element text
#     when it names a blast program.
#   - Elements listed in %MODEMAP: the event handler's end_<type> method is
#     called with the report type and the values collected so far; its
#     return value is returned to the caller.
#   - Elements listed in %MAPPING: the element text is stored in the
#     collected values, one level deep for hash-ref mappings.
#   - Known container elements are skipped silently; anything else is
#     reported through debug().
#
# The buffered element text is cleared on every call.  When the closing
# element is BlastOutput the handler's return value is also kept under
# '_result'.
#
# Args   : hash ref with at least 'Name'
# Returns: the event handler's return value, or undef
sub end_element {
    my ($self, $data) = @_;

    my $nm = $data->{'Name'};
    my $rc;

    # Guard against '_last_data' never having been set, so the match is
    # not run against undef.
    if( $nm eq 'BlastOutput_program' &&
        defined $self->{'_last_data'} &&
        $self->{'_last_data'} =~ /(t?blast[npx])/i ) {
        $self->{'_type'} = uc $1;
    }

    if( my $type = $MODEMAP{$nm} ) {
        if( $self->_eventHandler->will_handle($type) ) {
            my $func = sprintf("end_%s", lc $type);
            $rc = $self->_eventHandler->$func($self->{'_type'},
                                              $self->{'_values'});
        }
    } elsif( my $target = $MAPPING{$nm} ) {
        # Exact type test rather than a case-insensitive pattern match on
        # the ref() string.
        if( ref($target) eq 'HASH' ) {
            # single-pair hash: { outer key => inner key }
            my ($key) = keys %{$target};
            $self->{'_values'}->{$key}->{$target->{$key}} = $self->{'_last_data'};
        } else {
            $self->{'_values'}->{$target} = $self->{'_last_data'};
        }
    } elsif( $nm eq 'Iteration' || $nm eq 'Hit_hsps' || $nm eq 'Parameters' ||
             $nm eq 'BlastOutput_param' || $nm eq 'Iteration_hits' ||
             $nm eq 'Statistics' || $nm eq 'BlastOutput_iterations' ) {
        # pure container elements - nothing to record
    } else {
        $self->debug("ignoring unrecognized element type $nm\n");
    }

    $self->{'_last_data'} = ''; # remove read data if we are at
                                # end of an element
    $self->{'_result'} = $rc if( $nm eq 'BlastOutput' );
    return $rc;
}
392 | |
393 =head2 characters | |
394 | |
395 Title : characters | |
396 Usage : $parser->characters($data) | |
397 Function: Signals new characters to be processed | |
398 Returns : characters read | |
399 Args : hash ref with the key 'Data' | |
400 | |
401 | |
402 =cut | |
403 | |
# SAX callback fired for character data.
#
# The text is appended to the '_last_data' buffer, which end_element
# consumes and clears.  Appending (rather than overwriting) matters because
# a parser may deliver the text of a single element in several calls;
# keeping only the last chunk would truncate the value.
#
# Undefined and whitespace-only chunks are ignored.
#
# Args   : hash ref with the key 'Data'
# Returns: the characters read, or nothing when the chunk was ignored
sub characters {
    my ($self, $data) = @_;
    return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );

    $self->{'_last_data'} = '' unless defined $self->{'_last_data'};
    $self->{'_last_data'} .= $data->{'Data'};
    return $data->{'Data'};
}
410 | |
411 =head2 use_tempfile | |
412 | |
413 Title : use_tempfile | |
414 Usage : $obj->use_tempfile($newval) | |
415 Function: Get/Set boolean flag on whether or not use a tempfile | |
416 Example : | |
417 Returns : value of use_tempfile | |
418 Args : newvalue (optional) | |
419 | |
420 | |
421 =cut | |
422 | |
# Combined getter/setter for the "use a temporary file" flag.
#
# Args   : optional new value; an undefined value leaves the flag untouched
# Returns: the current value of the flag
sub use_tempfile {
    my $self = shift;
    my $flag = shift;
    $self->{'_use_tempfile'} = $flag if defined $flag;
    return $self->{'_use_tempfile'};
}
430 | |
# Read-only accessor for the '_result_count' counter, which next_result
# increments after each parse that completes without dying.
sub result_count {
    my ($self) = @_;
    return $self->{'_result_count'};
}
435 | |
436 1; |