Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SearchIO/waba.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: waba.pm,v 1.8 2002/12/11 22:12:32 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SearchIO::waba | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SearchIO::waba - SearchIO parser for Jim Kent WABA program | |
16 alignment output | |
17 | |
18 =head1 SYNOPSIS | |
19 | |
20 # do not use this object directly, rather through Bio::SearchIO | |
21 | |
22 use Bio::SearchIO; | |
23 my $in = new Bio::SearchIO(-format => 'waba', | |
24 -file => 'output.wab'); | |
25 while( my $result = $in->next_result ) { | |
26 while( my $hit = $result->next_hit ) { | |
27 while( my $hsp = $result->next_hsp ) { | |
28 | |
29 } | |
30 } | |
31 } | |
32 | |
33 =head1 DESCRIPTION | |
34 | |
35 This parser will process the waba output (NOT the human readable format). | |
36 | |
37 =head1 FEEDBACK | |
38 | |
39 =head2 Mailing Lists | |
40 | |
41 User feedback is an integral part of the evolution of this and other | |
42 Bioperl modules. Send your comments and suggestions preferably to | |
43 the Bioperl mailing list. Your participation is much appreciated. | |
44 | |
45 bioperl-l@bioperl.org - General discussion | |
46 http://bioperl.org/MailList.shtml - About the mailing lists | |
47 | |
48 =head2 Reporting Bugs | |
49 | |
50 Report bugs to the Bioperl bug tracking system to help us keep track | |
51 of the bugs and their resolution. Bug reports can be submitted via | |
52 email or the web: | |
53 | |
54 bioperl-bugs@bioperl.org | |
55 http://bugzilla.bioperl.org/ | |
56 | |
57 =head1 AUTHOR - Jason Stajich | |
58 | |
59 Email jason@bioperl.org | |
60 | |
61 Describe contact details here | |
62 | |
63 =head1 CONTRIBUTORS | |
64 | |
65 Additional contributors names and emails here | |
66 | |
67 =head1 APPENDIX | |
68 | |
69 The rest of the documentation details each of the object methods. | |
70 Internal methods are usually preceded with a _ | |
71 | |
72 =cut | |
73 | |
74 | |
75 # Let the code begin... | |
76 | |
77 | |
78 package Bio::SearchIO::waba; | |
79 use vars qw(@ISA %MODEMAP %MAPPING @STATES); | |
80 use strict; | |
81 | |
82 # Object preamble - inherits from Bio::Root::Root | |
83 | |
84 use Bio::SearchIO; | |
85 | |
86 use POSIX; | |
87 | |
88 BEGIN { | |
89 # mapping of NCBI Blast terms to Bioperl hash keys | |
90 %MODEMAP = ('WABAOutput' => 'result', | |
91 'Hit' => 'hit', | |
92 'Hsp' => 'hsp' | |
93 ); | |
94 @STATES = qw(Hsp_qseq Hsp_hseq Hsp_stateseq); | |
95 %MAPPING = | |
96 ( | |
97 'Hsp_query-from'=> 'HSP-query_start', | |
98 'Hsp_query-to' => 'HSP-query_end', | |
99 'Hsp_hit-from' => 'HSP-hit_start', | |
100 'Hsp_hit-to' => 'HSP-hit_end', | |
101 'Hsp_qseq' => 'HSP-query_seq', | |
102 'Hsp_hseq' => 'HSP-hit_seq', | |
103 'Hsp_midline' => 'HSP-homology_seq', | |
104 'Hsp_stateseq' => 'HSP-hmmstate_seq', | |
105 'Hsp_align-len' => 'HSP-hsp_length', | |
106 | |
107 'Hit_id' => 'HIT-name', | |
108 'Hit_accession' => 'HIT-accession', | |
109 | |
110 'WABAOutput_program' => 'RESULT-algorithm_name', | |
111 'WABAOutput_version' => 'RESULT-algorithm_version', | |
112 'WABAOutput_query-def'=> 'RESULT-query_name', | |
113 'WABAOutput_query-db' => 'RESULT-query_database', | |
114 'WABAOutput_db' => 'RESULT-database_name', | |
115 ); | |
116 } | |
117 | |
118 | |
119 @ISA = qw(Bio::SearchIO ); | |
120 | |
121 =head2 new | |
122 | |
123 Title : new | |
124 Usage : my $obj = new Bio::SearchIO::waba(); | |
125 Function: Builds a new Bio::SearchIO::waba object | |
126 Returns : Bio::SearchIO::waba | |
127 Args : see Bio::SearchIO | |
128 | |
129 =cut | |
130 | |
131 sub _initialize { | |
132 my ($self,@args) = @_; | |
133 $self->SUPER::_initialize(@args); | |
134 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::WABAResult')); | |
135 | |
136 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::WABAHSP')); | |
137 } | |
138 | |
139 | |
140 =head2 next_result | |
141 | |
142 Title : next_result | |
143 Usage : my $hit = $searchio->next_result; | |
144 Function: Returns the next Result from a search | |
145 Returns : Bio::Search::Result::ResultI object | |
146 Args : none | |
147 | |
148 =cut | |
149 | |
150 sub next_result{ | |
151 my ($self) = @_; | |
152 | |
153 my ($curquery,$curhit); | |
154 my $state = -1; | |
155 $self->start_document(); | |
156 my @hit_signifs; | |
157 while( defined ($_ = $self->_readline )) { | |
158 | |
159 if( $state == -1 ) { | |
160 my ($qid, $qhspid,$qpercent, $junk, | |
161 $alnlen,$qdb,$qacc,$qstart,$qend,$qstrand, | |
162 $hitdb,$hacc,$hstart,$hend, | |
163 $hstrand) = | |
164 ( /^(\S+)\.(\S+)\s+align\s+ # get the queryid | |
165 (\d+(\.\d+)?)\%\s+ # get the percentage | |
166 of\s+(\d+)\s+ # get the length of the alignment | |
167 (\S+)\s+ # this is the query database | |
168 (\S+):(\d+)\-(\d+) # The accession:start-end for query | |
169 \s+([\-\+]) # query strand | |
170 \s+(\S+)\. # hit db | |
171 (\S+):(\d+)\-(\d+) # The accession:start-end for hit | |
172 \s+([\-\+])\s*$ # hit strand | |
173 /ox ); | |
174 | |
175 # Curses. Jim's code is 0 based, the following is to readjust | |
176 $hstart++; $hend++; $qstart++; $qend++; | |
177 | |
178 if( ! defined $alnlen ) { | |
179 $self->warn("Unable to parse the rest of the WABA alignment info for: $_"); | |
180 last; | |
181 } | |
182 $self->{'_reporttype'} = 'WABA'; # hardcoded - only | |
183 # one type of WABA AFAIK | |
184 if( defined $curquery && | |
185 $curquery ne $qid ) { | |
186 $self->end_element({'Name' => 'Hit'}); | |
187 $self->_pushback($_); | |
188 $self->end_element({'Name' => 'WABAOutput'}); | |
189 return $self->end_document(); | |
190 } | |
191 | |
192 if( defined $curhit && | |
193 $curhit ne $hacc) { | |
194 # slight duplication here -- keep these in SYNC | |
195 $self->end_element({'Name' => 'Hit'}); | |
196 $self->start_element({'Name' => 'Hit'}); | |
197 $self->element({'Name' => 'Hit_id', | |
198 'Data' => $hacc}); | |
199 $self->element({'Name' => 'Hit_accession', | |
200 'Data' => $hacc}); | |
201 | |
202 } elsif ( ! defined $curquery ) { | |
203 $self->start_element({'Name' => 'WABAOutput'}); | |
204 $self->{'_result_count'}++; | |
205 $self->element({'Name' => 'WABAOutput_query-def', | |
206 'Data' => $qid }); | |
207 $self->element({'Name' => 'WABAOutput_program', | |
208 'Data' => 'WABA'}); | |
209 $self->element({'Name' => 'WABAOutput_query-db', | |
210 'Data' => $qdb}); | |
211 $self->element({'Name' => 'WABAOutput_db', | |
212 'Data' => $hitdb}); | |
213 | |
214 # slight duplication here -- keep these N'SYNC ;-) | |
215 $self->start_element({'Name' => 'Hit'}); | |
216 $self->element({'Name' => 'Hit_id', | |
217 'Data' => $hacc}); | |
218 $self->element({'Name' => 'Hit_accession', | |
219 'Data' => $hacc}); | |
220 } | |
221 | |
222 | |
223 # strand is inferred by start,end values | |
224 # in the Result Builder | |
225 if( $qstrand eq '-' ) { | |
226 ($qstart,$qend) = ($qend,$qstart); | |
227 } | |
228 if( $hstrand eq '-' ) { | |
229 ($hstart,$hend) = ($hend,$hstart); | |
230 } | |
231 | |
232 $self->start_element({'Name' => 'Hsp'}); | |
233 $self->element({'Name' => 'Hsp_query-from', | |
234 'Data' => $qstart}); | |
235 $self->element({'Name' => 'Hsp_query-to', | |
236 'Data' => $qend}); | |
237 $self->element({'Name' => 'Hsp_hit-from', | |
238 'Data' => $hstart}); | |
239 $self->element({'Name' => 'Hsp_hit-to', | |
240 'Data' => $hend}); | |
241 $self->element({'Name' => 'Hsp_align-len', | |
242 'Data' => $alnlen}); | |
243 | |
244 $curquery = $qid; | |
245 $curhit = $hacc; | |
246 $state = 0; | |
247 } elsif( ! defined $curquery ) { | |
248 $self->warn("skipping because no Hit begin line was recognized\n$_") if( $_ !~ /^\s+$/ ); | |
249 next; | |
250 } else { | |
251 chomp; | |
252 $self->element({'Name' => $STATES[$state++], | |
253 'Data' => $_}); | |
254 if( $state >= scalar @STATES ) { | |
255 $state = -1; | |
256 $self->end_element({'Name' => 'Hsp'}); | |
257 } | |
258 } | |
259 } | |
260 if( defined $curquery ) { | |
261 $self->end_element({'Name' => 'Hit'}); | |
262 $self->end_element({'Name' => 'WABAOutput'}); | |
263 return $self->end_document(); | |
264 } | |
265 return undef; | |
266 } | |
267 | |
268 =head2 start_element | |
269 | |
270 Title : start_element | |
271 Usage : $eventgenerator->start_element | |
272 Function: Handles a start element event | |
273 Returns : none | |
274 Args : hashref with at least 2 keys 'Data' and 'Name' | |
275 | |
276 | |
277 =cut | |
278 | |
279 sub start_element{ | |
280 my ($self,$data) = @_; | |
281 # we currently don't care about attributes | |
282 my $nm = $data->{'Name'}; | |
283 if( my $type = $MODEMAP{$nm} ) { | |
284 $self->_mode($type); | |
285 if( $self->_eventHandler->will_handle($type) ) { | |
286 my $func = sprintf("start_%s",lc $type); | |
287 $self->_eventHandler->$func($data->{'Attributes'}); | |
288 } | |
289 unshift @{$self->{'_elements'}}, $type; | |
290 } | |
291 if($nm eq 'WABAOutput') { | |
292 $self->{'_values'} = {}; | |
293 $self->{'_result'}= undef; | |
294 $self->{'_mode'} = ''; | |
295 } | |
296 | |
297 } | |
298 | |
299 =head2 end_element | |
300 | |
301 Title : start_element | |
302 Usage : $eventgenerator->end_element | |
303 Function: Handles an end element event | |
304 Returns : none | |
305 Args : hashref with at least 2 keys 'Data' and 'Name' | |
306 | |
307 | |
308 =cut | |
309 | |
310 sub end_element { | |
311 my ($self,$data) = @_; | |
312 my $nm = $data->{'Name'}; | |
313 my $rc; | |
314 # Hsp are sort of weird, in that they end when another | |
315 # object begins so have to detect this in end_element for now | |
316 if( $nm eq 'Hsp' ) { | |
317 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) { | |
318 $self->element({'Name' => $_, | |
319 'Data' => $self->{'_last_hspdata'}->{$_}}); | |
320 } | |
321 $self->{'_last_hspdata'} = {} | |
322 } | |
323 | |
324 if( my $type = $MODEMAP{$nm} ) { | |
325 if( $self->_eventHandler->will_handle($type) ) { | |
326 my $func = sprintf("end_%s",lc $type); | |
327 $rc = $self->_eventHandler->$func($self->{'_reporttype'}, | |
328 $self->{'_values'}); | |
329 } | |
330 shift @{$self->{'_elements'}}; | |
331 | |
332 } elsif( $MAPPING{$nm} ) { | |
333 if ( ref($MAPPING{$nm}) =~ /hash/i ) { | |
334 my $key = (keys %{$MAPPING{$nm}})[0]; | |
335 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; | |
336 } else { | |
337 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; | |
338 } | |
339 } else { | |
340 $self->warn( "unknown nm $nm ignoring\n"); | |
341 } | |
342 $self->{'_last_data'} = ''; # remove read data if we are at | |
343 # end of an element | |
344 $self->{'_result'} = $rc if( $nm eq 'WABAOutput' ); | |
345 return $rc; | |
346 | |
347 } | |
348 | |
349 =head2 element | |
350 | |
351 Title : element | |
352 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); | |
353 Function: Convience method that calls start_element, characters, end_element | |
354 Returns : none | |
355 Args : Hash ref with the keys 'Name' and 'Data' | |
356 | |
357 | |
358 =cut | |
359 | |
360 sub element{ | |
361 my ($self,$data) = @_; | |
362 $self->start_element($data); | |
363 $self->characters($data); | |
364 $self->end_element($data); | |
365 } | |
366 | |
367 | |
368 =head2 characters | |
369 | |
370 Title : characters | |
371 Usage : $eventgenerator->characters($str) | |
372 Function: Send a character events | |
373 Returns : none | |
374 Args : string | |
375 | |
376 | |
377 =cut | |
378 | |
379 sub characters{ | |
380 my ($self,$data) = @_; | |
381 | |
382 return unless ( defined $data->{'Data'} ); | |
383 if( $data->{'Data'} =~ /^\s+$/ ) { | |
384 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/; | |
385 } | |
386 | |
387 if( $self->in_element('hsp') && | |
388 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) { | |
389 | |
390 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'}; | |
391 } | |
392 | |
393 $self->{'_last_data'} = $data->{'Data'}; | |
394 } | |
395 | |
396 =head2 _mode | |
397 | |
398 Title : _mode | |
399 Usage : $obj->_mode($newval) | |
400 Function: | |
401 Example : | |
402 Returns : value of _mode | |
403 Args : newvalue (optional) | |
404 | |
405 | |
406 =cut | |
407 | |
408 sub _mode{ | |
409 my ($self,$value) = @_; | |
410 if( defined $value) { | |
411 $self->{'_mode'} = $value; | |
412 } | |
413 return $self->{'_mode'}; | |
414 } | |
415 | |
416 =head2 within_element | |
417 | |
418 Title : within_element | |
419 Usage : if( $eventgenerator->within_element($element) ) {} | |
420 Function: Test if we are within a particular element | |
421 This is different than 'in' because within can be tested | |
422 for a whole block. | |
423 Returns : boolean | |
424 Args : string element name | |
425 | |
426 | |
427 =cut | |
428 | |
429 sub within_element{ | |
430 my ($self,$name) = @_; | |
431 return 0 if ( ! defined $name && | |
432 ! defined $self->{'_elements'} || | |
433 scalar @{$self->{'_elements'}} == 0) ; | |
434 foreach ( @{$self->{'_elements'}} ) { | |
435 if( $_ eq $name ) { | |
436 return 1; | |
437 } | |
438 } | |
439 return 0; | |
440 } | |
441 | |
442 =head2 in_element | |
443 | |
444 Title : in_element | |
445 Usage : if( $eventgenerator->in_element($element) ) {} | |
446 Function: Test if we are in a particular element | |
447 This is different than 'in' because within can be tested | |
448 for a whole block. | |
449 Returns : boolean | |
450 Args : string element name | |
451 | |
452 | |
453 =cut | |
454 | |
455 sub in_element{ | |
456 my ($self,$name) = @_; | |
457 return 0 if ! defined $self->{'_elements'}->[0]; | |
458 return ( $self->{'_elements'}->[0] eq $name) | |
459 } | |
460 | |
461 | |
462 =head2 start_document | |
463 | |
464 Title : start_document | |
465 Usage : $eventgenerator->start_document | |
466 Function: Handles a start document event | |
467 Returns : none | |
468 Args : none | |
469 | |
470 | |
471 =cut | |
472 | |
473 sub start_document{ | |
474 my ($self) = @_; | |
475 $self->{'_lasttype'} = ''; | |
476 $self->{'_values'} = {}; | |
477 $self->{'_result'}= undef; | |
478 $self->{'_mode'} = ''; | |
479 $self->{'_elements'} = []; | |
480 } | |
481 | |
482 | |
483 =head2 end_document | |
484 | |
485 Title : end_document | |
486 Usage : $eventgenerator->end_document | |
487 Function: Handles an end document event | |
488 Returns : Bio::Search::Result::ResultI object | |
489 Args : none | |
490 | |
491 | |
492 =cut | |
493 | |
494 sub end_document{ | |
495 my ($self,@args) = @_; | |
496 return $self->{'_result'}; | |
497 } | |
498 | |
499 =head2 result_count | |
500 | |
501 Title : result_count | |
502 Usage : my $count = $searchio->result_count | |
503 Function: Returns the number of results we have processed | |
504 Returns : integer | |
505 Args : none | |
506 | |
507 | |
508 =cut | |
509 | |
510 sub result_count { | |
511 my $self = shift; | |
512 return $self->{'_result_count'}; | |
513 } | |
514 | |
515 sub report_count { shift->result_count } | |
516 | |
517 1; |