Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SearchIO/blastxml.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: blastxml.pm,v 1.24 2002/10/26 09:32:16 sac Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::SearchIO::blastxml | |
| 4 # | |
| 5 # Cared for by Jason Stajich <jason@bioperl.org> | |
| 6 # | |
| 7 # Copyright Jason Stajich | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::SearchIO::blastxml - A SearchIO implementation of NCBI Blast XML parsing. | |
| 16 | |
| 17 =head1 SYNOPSIS | |
| 18 | |
| 19 use Bio::SearchIO; | |
| 20 my $searchin = new Bio::SearchIO(-format => 'blastxml', | |
| 21 -file => 't/data/plague_yeast.bls.xml'); | |
| 22 while( my $result = $searchin->next_result ) { | |
| 23 } | |
| 24 | |
| 25 # one can also request that the parser NOT keep the XML data in memory | |
| 26 # by using the tempfile initialization flag. | |
| 27 my $searchin = new Bio::SearchIO(-tempfile => 1, | |
| 28 -format => 'blastxml', | |
| 29 -file => 't/data/plague_yeast.bls.xml'); | |
| 30 while( my $result = $searchin->next_result ) { | |
| 31 } | |
| 32 | |
| 33 =head1 DESCRIPTION | |
| 34 | |
| 35 This object implements a NCBI Blast XML parser. | |
| 36 | |
| 37 There is one additional initialization flag from the SearchIO defaults | |
| 38 - that is the -tempfile flag. If specified as true, then the parser | |
| 39 will write out each report to a temporary filehandle rather than | |
| 40 holding the entire report as a string in memory. The reason this is | |
| 41 done in the first place is NCBI reports have an unnecessary E<lt>?xml | |
| 42 version="1.0"?E<gt> at the beginning of each report and RPS-BLAST reports | |
| 43 have an additional unnecessary RPS-BLAST tag at the top of each report. | |
| 44 So we currently have implemented the work around by preparsing the | |
| 45 file (yes it makes the process slower, but it works). | |
| 46 | |
| 47 | |
| 48 =head1 FEEDBACK | |
| 49 | |
| 50 =head2 Mailing Lists | |
| 51 | |
| 52 User feedback is an integral part of the evolution of this and other | |
| 53 Bioperl modules. Send your comments and suggestions preferably to | |
| 54 the Bioperl mailing list. Your participation is much appreciated. | |
| 55 | |
| 56 bioperl-l@bioperl.org - General discussion | |
| 57 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 58 | |
| 59 =head2 Reporting Bugs | |
| 60 | |
| 61 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 62 of the bugs and their resolution. Bug reports can be submitted via | |
| 63 email or the web: | |
| 64 | |
| 65 bioperl-bugs@bioperl.org | |
| 66 http://bugzilla.bioperl.org/ | |
| 67 | |
| 68 =head1 AUTHOR - Jason Stajich | |
| 69 | |
| 70 Email jason@bioperl.org | |
| 71 | |
| 72 Describe contact details here | |
| 73 | |
| 74 =head1 CONTRIBUTORS | |
| 75 | |
| 76 Additional contributors names and emails here | |
| 77 | |
| 78 =head1 APPENDIX | |
| 79 | |
| 80 The rest of the documentation details each of the object methods. | |
| 81 Internal methods are usually preceded with a _ | |
| 82 | |
| 83 =cut | |
| 84 | |
| 85 # Let the code begin... | |
| 86 | |
| 87 package Bio::SearchIO::blastxml; | |
| 88 use vars qw(@ISA $DTD %MAPPING %MODEMAP $DEBUG); | |
| 89 use strict; | |
| 90 | |
| 91 $DTD = 'ftp://ftp.ncbi.nlm.nih.gov/blast/documents/NCBI_BlastOutput.dtd'; | |
| 92 # Object preamble - inherits from Bio::Root::Root | |
| 93 | |
| 94 use Bio::Root::Root; | |
| 95 use Bio::SearchIO; | |
| 96 use XML::Parser::PerlSAX; | |
| 97 use XML::Handler::Subs; | |
| 98 use HTML::Entities; | |
| 99 use IO::File; | |
| 100 | |
| 101 | |
BEGIN {
    # Elements that delimit a Bioperl event.  Key: NCBI BLAST XML element
    # name; value: the event type whose start_*/end_* handler is invoked.
    %MODEMAP = (
        'BlastOutput' => 'result',
        'Hit'         => 'hit',
        'Hsp'         => 'hsp',
    );

    # Leaf elements whose text content is recorded.  Key: NCBI BLAST XML
    # element name.  A plain string value is the key the text is stored
    # under; a one-entry hash ref { OUTER => INNER } means the text is
    # stored one level down, under OUTER then INNER.
    %MAPPING = (
        # HSP specific fields
        'Hsp_bit-score'   => 'HSP-bits',
        'Hsp_score'       => 'HSP-score',
        'Hsp_evalue'      => 'HSP-evalue',
        'Hsp_query-from'  => 'HSP-query_start',
        'Hsp_query-to'    => 'HSP-query_end',
        'Hsp_hit-from'    => 'HSP-hit_start',
        'Hsp_hit-to'      => 'HSP-hit_end',
        'Hsp_positive'    => 'HSP-conserved',
        'Hsp_identity'    => 'HSP-identical',
        'Hsp_gaps'        => 'HSP-gaps',
        'Hsp_hitgaps'     => 'HSP-hit_gaps',
        'Hsp_querygaps'   => 'HSP-query_gaps',
        'Hsp_qseq'        => 'HSP-query_seq',
        'Hsp_hseq'        => 'HSP-hit_seq',
        'Hsp_midline'     => 'HSP-homology_seq',
        'Hsp_align-len'   => 'HSP-hsp_length',
        'Hsp_query-frame' => 'HSP-query_frame',
        'Hsp_hit-frame'   => 'HSP-hit_frame',

        # these are ignored for now
        'Hsp_num'          => 'HSP-order',
        # "from" is the start of the pattern and "to" is its end; the two
        # targets were previously swapped.
        'Hsp_pattern-from' => 'patternstart',
        'Hsp_pattern-to'   => 'patternend',
        'Hsp_density'      => 'hspdensity',

        # Hit specific fields
        'Hit_id'             => 'HIT-name',
        'Hit_len'            => 'HIT-length',
        'Hit_accession'      => 'HIT-accession',
        'Hit_def'            => 'HIT-description',
        'Hit_num'            => 'HIT-order',
        'Iteration_iter-num' => 'HIT-iteration',
        'Iteration_stat'     => 'HIT-iteration_statistic',

        # Result (whole report) fields
        'BlastOutput_program'   => 'RESULT-algorithm_name',
        'BlastOutput_version'   => 'RESULT-algorithm_version',
        'BlastOutput_query-def' => 'RESULT-query_description',
        'BlastOutput_query-len' => 'RESULT-query_length',
        'BlastOutput_db'        => 'RESULT-database_name',
        'BlastOutput_reference' => 'RESULT-program_reference',
        'BlastOutput_query-ID'  => 'runid',

        # Search parameters and statistics
        'Parameters_matrix'      => { 'RESULT-parameters' => 'matrix' },
        'Parameters_expect'      => { 'RESULT-parameters' => 'expect' },
        'Parameters_include'     => { 'RESULT-parameters' => 'include' },
        'Parameters_sc-match'    => { 'RESULT-parameters' => 'match' },
        'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch' },
        'Parameters_gap-open'    => { 'RESULT-parameters' => 'gapopen' },
        'Parameters_gap-extend'  => { 'RESULT-parameters' => 'gapext' },
        'Parameters_filter'      => { 'RESULT-parameters' => 'filter' },
        'Statistics_db-num'      => 'RESULT-database_entries',
        'Statistics_db-len'      => 'RESULT-database_letters',
        'Statistics_hsp-len'     => { 'RESULT-statistics' => 'hsplength' },
        'Statistics_eff-space'   => { 'RESULT-statistics' => 'effectivespace' },
        'Statistics_kappa'       => { 'RESULT-statistics' => 'kappa' },
        'Statistics_lambda'      => { 'RESULT-statistics' => 'lambda' },
        'Statistics_entropy'     => { 'RESULT-statistics' => 'entropy' },
    );

    # Parse timing in debug mode needs Time::HiRes.  If it cannot be loaded,
    # pin $DEBUG to 0 so the timing calls are never reached.
    eval { require Time::HiRes };
    if ($@) { $DEBUG = 0; }
}
| 172 | |
| 173 | |
| 174 @ISA = qw(Bio::SearchIO ); | |
| 175 | |
| 176 =head2 new | |
| 177 | |
| 178 Title : new | |
| 179 Usage : my $searchio = new Bio::SearchIO(-format => 'blastxml', | |
| 180 -file => 'filename', | |
| 181 -tempfile => 1); | |
| 182 Function: Initializes the object - this is chained through new in SearchIO | |
| 183 Returns : Bio::SearchIO::blastxml object | |
| 184 Args : One additional argument from the format and file/fh parameters. | |
| 185 -tempfile => boolean. Defaults to false. Write out XML data | |
| 186 to a temporary filehandle to send to | |
| 187 PerlSAX parser. | |
| 188 =cut | |
| 189 | |
| 190 =head2 _initialize | |
| 191 | |
| 192 Title : _initialize | |
| 193 Usage : private | |
| 194 Function: Initializes the object - this is chained through new in SearchIO | |
| 195 | |
| 196 =cut | |
| 197 | |
# Private initializer.  Runs the parent class's _initialize, honours the
# optional -tempfile argument, and creates the PerlSAX parser that
# next_result() drives.
sub _initialize {
    my ( $self, @args ) = @_;
    $self->SUPER::_initialize(@args);

    # -tempfile is the only argument specific to this format.
    my ($tempfile_flag) = $self->_rearrange( [qw(TEMPFILE)], @args );
    $self->use_tempfile($tempfile_flag) if defined $tempfile_flag;

    $self->{'_xmlparser'} = XML::Parser::PerlSAX->new();

    # Only switch debugging on when nothing has set $DEBUG yet.
    $DEBUG = 1 if ( !defined $DEBUG && $self->verbose > 0 );
}
| 206 | |
| 207 =head2 next_result | |
| 208 | |
| 209 Title : next_result | |
| 210 Usage : my $result = $searchio->next_result; | |
| 211 Function: Returns the next Result from a search | |
| 212 Returns : Bio::Search::Result::ResultI object | |
| 213 Args : none | |
| 214 | |
| 215 =cut | |
| 216 | |
# Read one XML report from the input stream and parse it.
#
# Lines are collected until the next '<?xml version' declaration (which is
# pushed back for the following call) or end of input.  The collected text
# is then handed to the PerlSAX parser with this object as the SAX handler.
#
# Returns : whatever the parser's parse() call returns for the report,
#           or undef when there is no more input or the parse failed
#           (a warning is issued in the latter case).
# Args    : none
sub next_result {
    my ($self) = @_;

    my $data      = '';
    my $firstline = 1;
    my $tfh;
    if ( $self->use_tempfile ) {
        $tfh = IO::File->new_tmpfile
            or $self->throw("Unable to open temp file: $!");
        $tfh->autoflush(1);
    }

    my $okaytoprocess;

    # A lexical line variable is used on purpose: assigning to the global
    # $_ here would clobber the caller's $_ (and, when the caller is inside
    # a for/map, the element $_ is aliased to).
    while ( defined( my $line = $self->_readline ) ) {
        if ( $line =~ /^RPS-BLAST/i ) {

            # Extra non-XML line at the top of a report; remember the
            # report type and drop the line.
            $self->{'_type'} = 'RPSBLAST';
            next;
        }
        if ( $line =~ /^<\?xml version/ && !$firstline ) {

            # Start of the next report: leave it for the next call.
            $self->_pushback($line);
            last;
        }

        # NOTE(review): entities are decoded before the XML is parsed, as
        # in the original code - confirm this is still wanted for input
        # containing escaped markup characters.
        $line = decode_entities($line);

        # s/\&apos;/\`/g;
        # s/\&gt;/\>/g;
        # s/\&lt;/\</g;
        $okaytoprocess = 1;
        if ( defined $tfh ) {
            print {$tfh} $line;
        }
        else {
            $data .= $line;
        }
        $firstline = 0;
    }

    # Nothing was read: end of input.  The explicit undef is kept so the
    # return value is unchanged for existing callers.
    return undef unless ($okaytoprocess);

    my %parser_args;
    if ( defined $tfh ) {
        seek( $tfh, 0, 0 );    # rewind so the parser reads from the start
        %parser_args = (
            'Source'  => { 'ByteStream' => $tfh },
            'Handler' => $self
        );
    }
    else {
        %parser_args = (
            'Source'  => { 'String' => $data },
            'Handler' => $self
        );
    }

    my $result;
    my $starttime;
    if ($DEBUG) { $starttime = [ Time::HiRes::gettimeofday() ]; }

    eval {
        $result = $self->{'_xmlparser'}->parse(%parser_args);
        $self->{'_result_count'}++;
    };
    if ($@) {
        $self->warn("error in parsing a report:\n $@");
        $result = undef;
    }
    if ($DEBUG) {
        $self->debug(
            sprintf(
                "parsing took %f seconds\n",
                Time::HiRes::tv_interval($starttime)
            )
        );
    }

    # parsing magic here - but we call event handlers rather than
    # instantiating things
    return $result;
}
| 280 | |
| 281 =head2 SAX methods | |
| 282 | |
| 283 =cut | |
| 284 | |
| 285 =head2 start_document | |
| 286 | |
| 287 Title : start_document | |
| 288 Usage : $parser->start_document; | |
| 289 Function: SAX method to indicate starting to parse a new document | |
| 290 Returns : none | |
| 291 Args : none | |
| 292 | |
| 293 | |
| 294 =cut | |
| 295 | |
# SAX callback: a new document is about to be parsed.  Resets all
# per-document parser state held on the object.
#
# Returns : none
# Args    : none
sub start_document {
    my ($self) = @_;
    $self->{'_lasttype'} = '';

    # Drop any element text left behind by a previous parse that stopped
    # part-way through an element, so it cannot leak into this document.
    $self->{'_last_data'} = '';
    $self->{'_values'}    = {};
    $self->{'_result'}    = undef;
}
| 302 | |
| 303 =head2 end_document | |
| 304 | |
| 305 Title : end_document | |
| 306 Usage : $parser->end_document; | |
| 307 Function: SAX method to indicate finishing parsing a new document | |
| 308 Returns : Bio::Search::Result::ResultI object | |
| 309 Args : none | |
| 310 | |
| 311 =cut | |
| 312 | |
# SAX callback: the document has been fully parsed.
#
# Returns : the value stored in the object's '_result' slot
# Args    : any arguments passed by the parser are ignored
sub end_document {
    my $self = shift;
    return $self->{'_result'};
}
| 317 | |
| 318 =head2 start_element | |
| 319 | |
| 320 Title : start_element | |
| 321 Usage : $parser->start_element($data) | |
| 322 Function: SAX method to indicate starting a new element | |
| 323 Returns : none | |
| 324 Args : hash ref for data | |
| 325 | |
| 326 =cut | |
| 327 | |
# SAX callback: an element has been opened.
#
# If the element marks the start of a result, hit or HSP (see %MODEMAP)
# and the event handler handles that type, its start_<type> method is
# called with the element's attributes.  Opening a BlastOutput element
# also clears the values collected so far and the stored result.
#
# Returns : none
# Args    : hash ref for data (the 'Name' and 'Attributes' keys are read)
sub start_element {
    my ( $self, $data ) = @_;

    # we currently don't care about attributes
    my $element = $data->{'Name'};
    my $mode    = $MODEMAP{$element};

    if ( $mode && $self->_eventHandler->will_handle($mode) ) {
        my $method = 'start_' . lc $mode;
        $self->_eventHandler->$method( $data->{'Attributes'} );
    }

    if ( $element eq 'BlastOutput' ) {
        $self->{'_values'} = {};
        $self->{'_result'} = undef;
    }
}
| 345 | |
| 346 =head2 end_element | |
| 347 | |
| 348 Title : end_element | |
| 349 Usage : $parser->end_element($data) | |
| 350 Function: Signals finishing an element | |
| 351 Returns : Bio::Search object depending on what type of element | |
| 352 Args : hash ref for data | |
| 353 | |
| 354 =cut | |
| 355 | |
# SAX callback: an element has been closed.
#
# - Closing BlastOutput_program sets '_type' from the program name when it
#   looks like a BLAST flavour.
# - Closing a result/hit/HSP element (see %MODEMAP) calls the event
#   handler's end_<type> method with the current type and collected values.
# - Closing a mapped leaf element (see %MAPPING) stores its text in
#   '_values'.
# - Known container elements are skipped silently; anything else is
#   reported through debug().
#
# Returns : the value returned by the event handler's end_<type> method,
#           or undef when no such method was called
# Args    : hash ref for data (the 'Name' key is read)
sub end_element {
    my ( $self, $data ) = @_;

    my $element = $data->{'Name'};
    my $text    = $self->{'_last_data'};
    my $rc;

    if ( $element eq 'BlastOutput_program' && $text =~ /(t?blast[npx])/i ) {
        $self->{'_type'} = uc $1;
    }

    my $mode    = $MODEMAP{$element};
    my $mapping = $MAPPING{$element};

    if ($mode) {
        if ( $self->_eventHandler->will_handle($mode) ) {
            my $method = 'end_' . lc $mode;
            $rc = $self->_eventHandler->$method( $self->{'_type'},
                $self->{'_values'} );
        }
    }
    elsif ($mapping) {
        if ( ref($mapping) =~ /hash/i ) {

            # { OUTER => INNER }: store the text one level down.
            my ($outer) = keys %{$mapping};
            $self->{'_values'}->{$outer}->{ $mapping->{$outer} } = $text;
        }
        else {
            $self->{'_values'}->{$mapping} = $text;
        }
    }
    elsif (
        !grep { $element eq $_ }
        qw(Iteration Hit_hsps Parameters BlastOutput_param
        Iteration_hits Statistics BlastOutput_iterations)
      )
    {
        $self->debug("ignoring unrecognized element type $element\n");
    }

    # remove read data if we are at end of an element
    $self->{'_last_data'} = '';
    $self->{'_result'} = $rc if $element eq 'BlastOutput';
    return $rc;
}
| 392 | |
| 393 =head2 characters | |
| 394 | |
| 395 Title : characters | |
| 396 Usage : $parser->characters($data) | |
| 397 Function: Signals new characters to be processed | |
| 398 Returns : characters read | |
| 399 Args : hash ref with the key 'Data' | |
| 400 | |
| 401 | |
| 402 =cut | |
| 403 | |
# SAX callback: character data inside the current element.
#
# A parser may deliver the text of one element in several calls, so each
# chunk is appended to '_last_data' instead of replacing it.  The buffer is
# emptied by end_element once the element closes.  Undefined and
# whitespace-only chunks are ignored.
#
# Returns : the characters read in this call (nothing for ignored chunks)
# Args    : hash ref with the key 'Data'
sub characters {
    my ( $self, $data ) = @_;
    return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );

    $self->{'_last_data'} .= $data->{'Data'};
    return $data->{'Data'};
}
| 410 | |
| 411 =head2 use_tempfile | |
| 412 | |
| 413 Title : use_tempfile | |
| 414 Usage : $obj->use_tempfile($newval) | |
| 415 Function: Get/Set boolean flag on whether or not use a tempfile | |
| 416 Example : | |
| 417 Returns : value of use_tempfile | |
| 418 Args : newvalue (optional) | |
| 419 | |
| 420 | |
| 421 =cut | |
| 422 | |
# Get/set the flag that makes next_result() spool each report to a
# temporary file instead of holding it in memory.
#
# Returns : current value of the flag
# Args    : new value (optional); an undefined value leaves the flag as is
sub use_tempfile {
    my $self  = shift;
    my $value = shift;
    $self->{'_use_tempfile'} = $value if defined $value;
    return $self->{'_use_tempfile'};
}
| 430 | |
# Number of reports handed to the parser without error so far; the
# counter is incremented in next_result() after each successful parse.
#
# Returns : value of the '_result_count' slot (undef before the first
#           successful parse)
# Args    : none
sub result_count {
    my ($self) = @_;
    return $self->{'_result_count'};
}
| 435 | |
| 436 1; |
