Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SearchIO/waba.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: waba.pm,v 1.8 2002/12/11 22:12:32 jason Exp $ | |
| 2 # | |
| 3 # BioPerl module for Bio::SearchIO::waba | |
| 4 # | |
| 5 # Cared for by Jason Stajich <jason@bioperl.org> | |
| 6 # | |
| 7 # Copyright Jason Stajich | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 # POD documentation - main docs before the code | |
| 12 | |
| 13 =head1 NAME | |
| 14 | |
| 15 Bio::SearchIO::waba - SearchIO parser for Jim Kent WABA program | |
| 16 alignment output | |
| 17 | |
| 18 =head1 SYNOPSIS | |
| 19 | |
| 20 # do not use this object directly, rather through Bio::SearchIO | |
| 21 | |
| 22 use Bio::SearchIO; | |
| 23 my $in = new Bio::SearchIO(-format => 'waba', | |
| 24 -file => 'output.wab'); | |
| 25 while( my $result = $in->next_result ) { | |
| 26 while( my $hit = $result->next_hit ) { | |
| 27 while( my $hsp = $result->next_hsp ) { | |
| 28 | |
| 29 } | |
| 30 } | |
| 31 } | |
| 32 | |
| 33 =head1 DESCRIPTION | |
| 34 | |
| 35 This parser will process the waba output (NOT the human readable format). | |
| 36 | |
| 37 =head1 FEEDBACK | |
| 38 | |
| 39 =head2 Mailing Lists | |
| 40 | |
| 41 User feedback is an integral part of the evolution of this and other | |
| 42 Bioperl modules. Send your comments and suggestions preferably to | |
| 43 the Bioperl mailing list. Your participation is much appreciated. | |
| 44 | |
| 45 bioperl-l@bioperl.org - General discussion | |
| 46 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 47 | |
| 48 =head2 Reporting Bugs | |
| 49 | |
| 50 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 51 of the bugs and their resolution. Bug reports can be submitted via | |
| 52 email or the web: | |
| 53 | |
| 54 bioperl-bugs@bioperl.org | |
| 55 http://bugzilla.bioperl.org/ | |
| 56 | |
| 57 =head1 AUTHOR - Jason Stajich | |
| 58 | |
| 59 Email jason@bioperl.org | |
| 60 | |
| 61 Describe contact details here | |
| 62 | |
| 63 =head1 CONTRIBUTORS | |
| 64 | |
| 65 Additional contributors names and emails here | |
| 66 | |
| 67 =head1 APPENDIX | |
| 68 | |
| 69 The rest of the documentation details each of the object methods. | |
| 70 Internal methods are usually preceded with a _ | |
| 71 | |
| 72 =cut | |
| 73 | |
| 74 | |
| 75 # Let the code begin... | |
| 76 | |
| 77 | |
| 78 package Bio::SearchIO::waba; | |
| 79 use vars qw(@ISA %MODEMAP %MAPPING @STATES); | |
| 80 use strict; | |
| 81 | |
| 82 # Object preamble - inherits from Bio::Root::Root | |
| 83 | |
| 84 use Bio::SearchIO; | |
| 85 | |
| 86 use POSIX; | |
| 87 | |
| 88 BEGIN { | |
| 89 # mapping of NCBI Blast terms to Bioperl hash keys | |
| 90 %MODEMAP = ('WABAOutput' => 'result', | |
| 91 'Hit' => 'hit', | |
| 92 'Hsp' => 'hsp' | |
| 93 ); | |
| 94 @STATES = qw(Hsp_qseq Hsp_hseq Hsp_stateseq); | |
| 95 %MAPPING = | |
| 96 ( | |
| 97 'Hsp_query-from'=> 'HSP-query_start', | |
| 98 'Hsp_query-to' => 'HSP-query_end', | |
| 99 'Hsp_hit-from' => 'HSP-hit_start', | |
| 100 'Hsp_hit-to' => 'HSP-hit_end', | |
| 101 'Hsp_qseq' => 'HSP-query_seq', | |
| 102 'Hsp_hseq' => 'HSP-hit_seq', | |
| 103 'Hsp_midline' => 'HSP-homology_seq', | |
| 104 'Hsp_stateseq' => 'HSP-hmmstate_seq', | |
| 105 'Hsp_align-len' => 'HSP-hsp_length', | |
| 106 | |
| 107 'Hit_id' => 'HIT-name', | |
| 108 'Hit_accession' => 'HIT-accession', | |
| 109 | |
| 110 'WABAOutput_program' => 'RESULT-algorithm_name', | |
| 111 'WABAOutput_version' => 'RESULT-algorithm_version', | |
| 112 'WABAOutput_query-def'=> 'RESULT-query_name', | |
| 113 'WABAOutput_query-db' => 'RESULT-query_database', | |
| 114 'WABAOutput_db' => 'RESULT-database_name', | |
| 115 ); | |
| 116 } | |
| 117 | |
| 118 | |
| 119 @ISA = qw(Bio::SearchIO ); | |
| 120 | |
| 121 =head2 new | |
| 122 | |
| 123 Title : new | |
| 124 Usage : my $obj = new Bio::SearchIO::waba(); | |
| 125 Function: Builds a new Bio::SearchIO::waba object | |
| 126 Returns : Bio::SearchIO::waba | |
| 127 Args : see Bio::SearchIO | |
| 128 | |
| 129 =cut | |
| 130 | |
| 131 sub _initialize { | |
| 132 my ($self,@args) = @_; | |
| 133 $self->SUPER::_initialize(@args); | |
| 134 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::WABAResult')); | |
| 135 | |
| 136 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::WABAHSP')); | |
| 137 } | |
| 138 | |
| 139 | |
| 140 =head2 next_result | |
| 141 | |
| 142 Title : next_result | |
| 143 Usage : my $hit = $searchio->next_result; | |
| 144 Function: Returns the next Result from a search | |
| 145 Returns : Bio::Search::Result::ResultI object | |
| 146 Args : none | |
| 147 | |
| 148 =cut | |
| 149 | |
| 150 sub next_result{ | |
| 151 my ($self) = @_; | |
| 152 | |
| 153 my ($curquery,$curhit); | |
| 154 my $state = -1; | |
| 155 $self->start_document(); | |
| 156 my @hit_signifs; | |
| 157 while( defined ($_ = $self->_readline )) { | |
| 158 | |
| 159 if( $state == -1 ) { | |
| 160 my ($qid, $qhspid,$qpercent, $junk, | |
| 161 $alnlen,$qdb,$qacc,$qstart,$qend,$qstrand, | |
| 162 $hitdb,$hacc,$hstart,$hend, | |
| 163 $hstrand) = | |
| 164 ( /^(\S+)\.(\S+)\s+align\s+ # get the queryid | |
| 165 (\d+(\.\d+)?)\%\s+ # get the percentage | |
| 166 of\s+(\d+)\s+ # get the length of the alignment | |
| 167 (\S+)\s+ # this is the query database | |
| 168 (\S+):(\d+)\-(\d+) # The accession:start-end for query | |
| 169 \s+([\-\+]) # query strand | |
| 170 \s+(\S+)\. # hit db | |
| 171 (\S+):(\d+)\-(\d+) # The accession:start-end for hit | |
| 172 \s+([\-\+])\s*$ # hit strand | |
| 173 /ox ); | |
| 174 | |
| 175 # Curses. Jim's code is 0 based, the following is to readjust | |
| 176 $hstart++; $hend++; $qstart++; $qend++; | |
| 177 | |
| 178 if( ! defined $alnlen ) { | |
| 179 $self->warn("Unable to parse the rest of the WABA alignment info for: $_"); | |
| 180 last; | |
| 181 } | |
| 182 $self->{'_reporttype'} = 'WABA'; # hardcoded - only | |
| 183 # one type of WABA AFAIK | |
| 184 if( defined $curquery && | |
| 185 $curquery ne $qid ) { | |
| 186 $self->end_element({'Name' => 'Hit'}); | |
| 187 $self->_pushback($_); | |
| 188 $self->end_element({'Name' => 'WABAOutput'}); | |
| 189 return $self->end_document(); | |
| 190 } | |
| 191 | |
| 192 if( defined $curhit && | |
| 193 $curhit ne $hacc) { | |
| 194 # slight duplication here -- keep these in SYNC | |
| 195 $self->end_element({'Name' => 'Hit'}); | |
| 196 $self->start_element({'Name' => 'Hit'}); | |
| 197 $self->element({'Name' => 'Hit_id', | |
| 198 'Data' => $hacc}); | |
| 199 $self->element({'Name' => 'Hit_accession', | |
| 200 'Data' => $hacc}); | |
| 201 | |
| 202 } elsif ( ! defined $curquery ) { | |
| 203 $self->start_element({'Name' => 'WABAOutput'}); | |
| 204 $self->{'_result_count'}++; | |
| 205 $self->element({'Name' => 'WABAOutput_query-def', | |
| 206 'Data' => $qid }); | |
| 207 $self->element({'Name' => 'WABAOutput_program', | |
| 208 'Data' => 'WABA'}); | |
| 209 $self->element({'Name' => 'WABAOutput_query-db', | |
| 210 'Data' => $qdb}); | |
| 211 $self->element({'Name' => 'WABAOutput_db', | |
| 212 'Data' => $hitdb}); | |
| 213 | |
| 214 # slight duplication here -- keep these N'SYNC ;-) | |
| 215 $self->start_element({'Name' => 'Hit'}); | |
| 216 $self->element({'Name' => 'Hit_id', | |
| 217 'Data' => $hacc}); | |
| 218 $self->element({'Name' => 'Hit_accession', | |
| 219 'Data' => $hacc}); | |
| 220 } | |
| 221 | |
| 222 | |
| 223 # strand is inferred by start,end values | |
| 224 # in the Result Builder | |
| 225 if( $qstrand eq '-' ) { | |
| 226 ($qstart,$qend) = ($qend,$qstart); | |
| 227 } | |
| 228 if( $hstrand eq '-' ) { | |
| 229 ($hstart,$hend) = ($hend,$hstart); | |
| 230 } | |
| 231 | |
| 232 $self->start_element({'Name' => 'Hsp'}); | |
| 233 $self->element({'Name' => 'Hsp_query-from', | |
| 234 'Data' => $qstart}); | |
| 235 $self->element({'Name' => 'Hsp_query-to', | |
| 236 'Data' => $qend}); | |
| 237 $self->element({'Name' => 'Hsp_hit-from', | |
| 238 'Data' => $hstart}); | |
| 239 $self->element({'Name' => 'Hsp_hit-to', | |
| 240 'Data' => $hend}); | |
| 241 $self->element({'Name' => 'Hsp_align-len', | |
| 242 'Data' => $alnlen}); | |
| 243 | |
| 244 $curquery = $qid; | |
| 245 $curhit = $hacc; | |
| 246 $state = 0; | |
| 247 } elsif( ! defined $curquery ) { | |
| 248 $self->warn("skipping because no Hit begin line was recognized\n$_") if( $_ !~ /^\s+$/ ); | |
| 249 next; | |
| 250 } else { | |
| 251 chomp; | |
| 252 $self->element({'Name' => $STATES[$state++], | |
| 253 'Data' => $_}); | |
| 254 if( $state >= scalar @STATES ) { | |
| 255 $state = -1; | |
| 256 $self->end_element({'Name' => 'Hsp'}); | |
| 257 } | |
| 258 } | |
| 259 } | |
| 260 if( defined $curquery ) { | |
| 261 $self->end_element({'Name' => 'Hit'}); | |
| 262 $self->end_element({'Name' => 'WABAOutput'}); | |
| 263 return $self->end_document(); | |
| 264 } | |
| 265 return undef; | |
| 266 } | |
| 267 | |
| 268 =head2 start_element | |
| 269 | |
| 270 Title : start_element | |
| 271 Usage : $eventgenerator->start_element | |
| 272 Function: Handles a start element event | |
| 273 Returns : none | |
| 274 Args : hashref with at least 2 keys 'Data' and 'Name' | |
| 275 | |
| 276 | |
| 277 =cut | |
| 278 | |
| 279 sub start_element{ | |
| 280 my ($self,$data) = @_; | |
| 281 # we currently don't care about attributes | |
| 282 my $nm = $data->{'Name'}; | |
| 283 if( my $type = $MODEMAP{$nm} ) { | |
| 284 $self->_mode($type); | |
| 285 if( $self->_eventHandler->will_handle($type) ) { | |
| 286 my $func = sprintf("start_%s",lc $type); | |
| 287 $self->_eventHandler->$func($data->{'Attributes'}); | |
| 288 } | |
| 289 unshift @{$self->{'_elements'}}, $type; | |
| 290 } | |
| 291 if($nm eq 'WABAOutput') { | |
| 292 $self->{'_values'} = {}; | |
| 293 $self->{'_result'}= undef; | |
| 294 $self->{'_mode'} = ''; | |
| 295 } | |
| 296 | |
| 297 } | |
| 298 | |
| 299 =head2 end_element | |
| 300 | |
| 301 Title : start_element | |
| 302 Usage : $eventgenerator->end_element | |
| 303 Function: Handles an end element event | |
| 304 Returns : none | |
| 305 Args : hashref with at least 2 keys 'Data' and 'Name' | |
| 306 | |
| 307 | |
| 308 =cut | |
| 309 | |
| 310 sub end_element { | |
| 311 my ($self,$data) = @_; | |
| 312 my $nm = $data->{'Name'}; | |
| 313 my $rc; | |
| 314 # Hsp are sort of weird, in that they end when another | |
| 315 # object begins so have to detect this in end_element for now | |
| 316 if( $nm eq 'Hsp' ) { | |
| 317 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { | |
| 318 $self->element({'Name' => $_, | |
| 319 'Data' => $self->{'_last_hspdata'}->{$_}}); | |
| 320 } | |
| 321 $self->{'_last_hspdata'} = {} | |
| 322 } | |
| 323 | |
| 324 if( my $type = $MODEMAP{$nm} ) { | |
| 325 if( $self->_eventHandler->will_handle($type) ) { | |
| 326 my $func = sprintf("end_%s",lc $type); | |
| 327 $rc = $self->_eventHandler->$func($self->{'_reporttype'}, | |
| 328 $self->{'_values'}); | |
| 329 } | |
| 330 shift @{$self->{'_elements'}}; | |
| 331 | |
| 332 } elsif( $MAPPING{$nm} ) { | |
| 333 if ( ref($MAPPING{$nm}) =~ /hash/i ) { | |
| 334 my $key = (keys %{$MAPPING{$nm}})[0]; | |
| 335 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; | |
| 336 } else { | |
| 337 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; | |
| 338 } | |
| 339 } else { | |
| 340 $self->warn( "unknown nm $nm ignoring\n"); | |
| 341 } | |
| 342 $self->{'_last_data'} = ''; # remove read data if we are at | |
| 343 # end of an element | |
| 344 $self->{'_result'} = $rc if( $nm eq 'WABAOutput' ); | |
| 345 return $rc; | |
| 346 | |
| 347 } | |
| 348 | |
| 349 =head2 element | |
| 350 | |
| 351 Title : element | |
| 352 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); | |
| 353 Function: Convience method that calls start_element, characters, end_element | |
| 354 Returns : none | |
| 355 Args : Hash ref with the keys 'Name' and 'Data' | |
| 356 | |
| 357 | |
| 358 =cut | |
| 359 | |
| 360 sub element{ | |
| 361 my ($self,$data) = @_; | |
| 362 $self->start_element($data); | |
| 363 $self->characters($data); | |
| 364 $self->end_element($data); | |
| 365 } | |
| 366 | |
| 367 | |
| 368 =head2 characters | |
| 369 | |
| 370 Title : characters | |
| 371 Usage : $eventgenerator->characters($str) | |
| 372 Function: Send a character events | |
| 373 Returns : none | |
| 374 Args : string | |
| 375 | |
| 376 | |
| 377 =cut | |
| 378 | |
| 379 sub characters{ | |
| 380 my ($self,$data) = @_; | |
| 381 | |
| 382 return unless ( defined $data->{'Data'} ); | |
| 383 if( $data->{'Data'} =~ /^\s+$/ ) { | |
| 384 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/; | |
| 385 } | |
| 386 | |
| 387 if( $self->in_element('hsp') && | |
| 388 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) { | |
| 389 | |
| 390 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; | |
| 391 } | |
| 392 | |
| 393 $self->{'_last_data'} = $data->{'Data'}; | |
| 394 } | |
| 395 | |
| 396 =head2 _mode | |
| 397 | |
| 398 Title : _mode | |
| 399 Usage : $obj->_mode($newval) | |
| 400 Function: | |
| 401 Example : | |
| 402 Returns : value of _mode | |
| 403 Args : newvalue (optional) | |
| 404 | |
| 405 | |
| 406 =cut | |
| 407 | |
| 408 sub _mode{ | |
| 409 my ($self,$value) = @_; | |
| 410 if( defined $value) { | |
| 411 $self->{'_mode'} = $value; | |
| 412 } | |
| 413 return $self->{'_mode'}; | |
| 414 } | |
| 415 | |
| 416 =head2 within_element | |
| 417 | |
| 418 Title : within_element | |
| 419 Usage : if( $eventgenerator->within_element($element) ) {} | |
| 420 Function: Test if we are within a particular element | |
| 421 This is different than 'in' because within can be tested | |
| 422 for a whole block. | |
| 423 Returns : boolean | |
| 424 Args : string element name | |
| 425 | |
| 426 | |
| 427 =cut | |
| 428 | |
| 429 sub within_element{ | |
| 430 my ($self,$name) = @_; | |
| 431 return 0 if ( ! defined $name && | |
| 432 ! defined $self->{'_elements'} || | |
| 433 scalar @{$self->{'_elements'}} == 0) ; | |
| 434 foreach ( @{$self->{'_elements'}} ) { | |
| 435 if( $_ eq $name ) { | |
| 436 return 1; | |
| 437 } | |
| 438 } | |
| 439 return 0; | |
| 440 } | |
| 441 | |
| 442 =head2 in_element | |
| 443 | |
| 444 Title : in_element | |
| 445 Usage : if( $eventgenerator->in_element($element) ) {} | |
| 446 Function: Test if we are in a particular element | |
| 447 This is different than 'in' because within can be tested | |
| 448 for a whole block. | |
| 449 Returns : boolean | |
| 450 Args : string element name | |
| 451 | |
| 452 | |
| 453 =cut | |
| 454 | |
| 455 sub in_element{ | |
| 456 my ($self,$name) = @_; | |
| 457 return 0 if ! defined $self->{'_elements'}->[0]; | |
| 458 return ( $self->{'_elements'}->[0] eq $name) | |
| 459 } | |
| 460 | |
| 461 | |
| 462 =head2 start_document | |
| 463 | |
| 464 Title : start_document | |
| 465 Usage : $eventgenerator->start_document | |
| 466 Function: Handles a start document event | |
| 467 Returns : none | |
| 468 Args : none | |
| 469 | |
| 470 | |
| 471 =cut | |
| 472 | |
| 473 sub start_document{ | |
| 474 my ($self) = @_; | |
| 475 $self->{'_lasttype'} = ''; | |
| 476 $self->{'_values'} = {}; | |
| 477 $self->{'_result'}= undef; | |
| 478 $self->{'_mode'} = ''; | |
| 479 $self->{'_elements'} = []; | |
| 480 } | |
| 481 | |
| 482 | |
| 483 =head2 end_document | |
| 484 | |
| 485 Title : end_document | |
| 486 Usage : $eventgenerator->end_document | |
| 487 Function: Handles an end document event | |
| 488 Returns : Bio::Search::Result::ResultI object | |
| 489 Args : none | |
| 490 | |
| 491 | |
| 492 =cut | |
| 493 | |
| 494 sub end_document{ | |
| 495 my ($self,@args) = @_; | |
| 496 return $self->{'_result'}; | |
| 497 } | |
| 498 | |
| 499 =head2 result_count | |
| 500 | |
| 501 Title : result_count | |
| 502 Usage : my $count = $searchio->result_count | |
| 503 Function: Returns the number of results we have processed | |
| 504 Returns : integer | |
| 505 Args : none | |
| 506 | |
| 507 | |
| 508 =cut | |
| 509 | |
| 510 sub result_count { | |
| 511 my $self = shift; | |
| 512 return $self->{'_result_count'}; | |
| 513 } | |
| 514 | |
| 515 sub report_count { shift->result_count } | |
| 516 | |
| 517 1; |
