0
|
1 # $Id: waba.pm,v 1.8 2002/12/11 22:12:32 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SearchIO::waba
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SearchIO::waba - SearchIO parser for Jim Kent WABA program
|
|
16 alignment output
|
|
17
|
|
18 =head1 SYNOPSIS
|
|
19
|
|
20 # do not use this object directly, rather through Bio::SearchIO
|
|
21
|
|
22 use Bio::SearchIO;
|
|
23 my $in = new Bio::SearchIO(-format => 'waba',
|
|
24 -file => 'output.wab');
|
|
25 while( my $result = $in->next_result ) {
|
|
26 while( my $hit = $result->next_hit ) {
|
|
27 while( my $hsp = $result->next_hsp ) {
|
|
28
|
|
29 }
|
|
30 }
|
|
31 }
|
|
32
|
|
33 =head1 DESCRIPTION
|
|
34
|
|
35 This parser will process the waba output (NOT the human readable format).
|
|
36
|
|
37 =head1 FEEDBACK
|
|
38
|
|
39 =head2 Mailing Lists
|
|
40
|
|
41 User feedback is an integral part of the evolution of this and other
|
|
42 Bioperl modules. Send your comments and suggestions preferably to
|
|
43 the Bioperl mailing list. Your participation is much appreciated.
|
|
44
|
|
45 bioperl-l@bioperl.org - General discussion
|
|
46 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
47
|
|
48 =head2 Reporting Bugs
|
|
49
|
|
50 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
51 of the bugs and their resolution. Bug reports can be submitted via
|
|
52 email or the web:
|
|
53
|
|
54 bioperl-bugs@bioperl.org
|
|
55 http://bugzilla.bioperl.org/
|
|
56
|
|
57 =head1 AUTHOR - Jason Stajich
|
|
58
|
|
59 Email jason@bioperl.org
|
|
60
|
|
61 Describe contact details here
|
|
62
|
|
63 =head1 CONTRIBUTORS
|
|
64
|
|
65 Additional contributors names and emails here
|
|
66
|
|
67 =head1 APPENDIX
|
|
68
|
|
69 The rest of the documentation details each of the object methods.
|
|
70 Internal methods are usually preceded with a _
|
|
71
|
|
72 =cut
|
|
73
|
|
74
|
|
75 # Let the code begin...
|
|
76
|
|
77
|
|
78 package Bio::SearchIO::waba;
|
|
79 use vars qw(@ISA %MODEMAP %MAPPING @STATES);
|
|
80 use strict;
|
|
81
|
|
82 # Object preamble - inherits from Bio::Root::Root
|
|
83
|
|
84 use Bio::SearchIO;
|
|
85
|
|
86 use POSIX;
|
|
87
|
|
88 BEGIN {
|
|
89 # mapping of NCBI Blast terms to Bioperl hash keys
|
|
90 %MODEMAP = ('WABAOutput' => 'result',
|
|
91 'Hit' => 'hit',
|
|
92 'Hsp' => 'hsp'
|
|
93 );
|
|
94 @STATES = qw(Hsp_qseq Hsp_hseq Hsp_stateseq);
|
|
95 %MAPPING =
|
|
96 (
|
|
97 'Hsp_query-from'=> 'HSP-query_start',
|
|
98 'Hsp_query-to' => 'HSP-query_end',
|
|
99 'Hsp_hit-from' => 'HSP-hit_start',
|
|
100 'Hsp_hit-to' => 'HSP-hit_end',
|
|
101 'Hsp_qseq' => 'HSP-query_seq',
|
|
102 'Hsp_hseq' => 'HSP-hit_seq',
|
|
103 'Hsp_midline' => 'HSP-homology_seq',
|
|
104 'Hsp_stateseq' => 'HSP-hmmstate_seq',
|
|
105 'Hsp_align-len' => 'HSP-hsp_length',
|
|
106
|
|
107 'Hit_id' => 'HIT-name',
|
|
108 'Hit_accession' => 'HIT-accession',
|
|
109
|
|
110 'WABAOutput_program' => 'RESULT-algorithm_name',
|
|
111 'WABAOutput_version' => 'RESULT-algorithm_version',
|
|
112 'WABAOutput_query-def'=> 'RESULT-query_name',
|
|
113 'WABAOutput_query-db' => 'RESULT-query_database',
|
|
114 'WABAOutput_db' => 'RESULT-database_name',
|
|
115 );
|
|
116 }
|
|
117
|
|
118
|
|
119 @ISA = qw(Bio::SearchIO );
|
|
120
|
|
121 =head2 new
|
|
122
|
|
123 Title : new
|
|
124 Usage : my $obj = new Bio::SearchIO::waba();
|
|
125 Function: Builds a new Bio::SearchIO::waba object
|
|
126 Returns : Bio::SearchIO::waba
|
|
127 Args : see Bio::SearchIO
|
|
128
|
|
129 =cut
|
|
130
|
|
131 sub _initialize {
|
|
132 my ($self,@args) = @_;
|
|
133 $self->SUPER::_initialize(@args);
|
|
134 $self->_eventHandler->register_factory('result', Bio::Search::Result::ResultFactory->new(-type => 'Bio::Search::Result::WABAResult'));
|
|
135
|
|
136 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::WABAHSP'));
|
|
137 }
|
|
138
|
|
139
|
|
140 =head2 next_result
|
|
141
|
|
142 Title : next_result
|
|
143 Usage : my $hit = $searchio->next_result;
|
|
144 Function: Returns the next Result from a search
|
|
145 Returns : Bio::Search::Result::ResultI object
|
|
146 Args : none
|
|
147
|
|
148 =cut
|
|
149
|
|
150 sub next_result{
|
|
151 my ($self) = @_;
|
|
152
|
|
153 my ($curquery,$curhit);
|
|
154 my $state = -1;
|
|
155 $self->start_document();
|
|
156 my @hit_signifs;
|
|
157 while( defined ($_ = $self->_readline )) {
|
|
158
|
|
159 if( $state == -1 ) {
|
|
160 my ($qid, $qhspid,$qpercent, $junk,
|
|
161 $alnlen,$qdb,$qacc,$qstart,$qend,$qstrand,
|
|
162 $hitdb,$hacc,$hstart,$hend,
|
|
163 $hstrand) =
|
|
164 ( /^(\S+)\.(\S+)\s+align\s+ # get the queryid
|
|
165 (\d+(\.\d+)?)\%\s+ # get the percentage
|
|
166 of\s+(\d+)\s+ # get the length of the alignment
|
|
167 (\S+)\s+ # this is the query database
|
|
168 (\S+):(\d+)\-(\d+) # The accession:start-end for query
|
|
169 \s+([\-\+]) # query strand
|
|
170 \s+(\S+)\. # hit db
|
|
171 (\S+):(\d+)\-(\d+) # The accession:start-end for hit
|
|
172 \s+([\-\+])\s*$ # hit strand
|
|
173 /ox );
|
|
174
|
|
175 # Curses. Jim's code is 0 based, the following is to readjust
|
|
176 $hstart++; $hend++; $qstart++; $qend++;
|
|
177
|
|
178 if( ! defined $alnlen ) {
|
|
179 $self->warn("Unable to parse the rest of the WABA alignment info for: $_");
|
|
180 last;
|
|
181 }
|
|
182 $self->{'_reporttype'} = 'WABA'; # hardcoded - only
|
|
183 # one type of WABA AFAIK
|
|
184 if( defined $curquery &&
|
|
185 $curquery ne $qid ) {
|
|
186 $self->end_element({'Name' => 'Hit'});
|
|
187 $self->_pushback($_);
|
|
188 $self->end_element({'Name' => 'WABAOutput'});
|
|
189 return $self->end_document();
|
|
190 }
|
|
191
|
|
192 if( defined $curhit &&
|
|
193 $curhit ne $hacc) {
|
|
194 # slight duplication here -- keep these in SYNC
|
|
195 $self->end_element({'Name' => 'Hit'});
|
|
196 $self->start_element({'Name' => 'Hit'});
|
|
197 $self->element({'Name' => 'Hit_id',
|
|
198 'Data' => $hacc});
|
|
199 $self->element({'Name' => 'Hit_accession',
|
|
200 'Data' => $hacc});
|
|
201
|
|
202 } elsif ( ! defined $curquery ) {
|
|
203 $self->start_element({'Name' => 'WABAOutput'});
|
|
204 $self->{'_result_count'}++;
|
|
205 $self->element({'Name' => 'WABAOutput_query-def',
|
|
206 'Data' => $qid });
|
|
207 $self->element({'Name' => 'WABAOutput_program',
|
|
208 'Data' => 'WABA'});
|
|
209 $self->element({'Name' => 'WABAOutput_query-db',
|
|
210 'Data' => $qdb});
|
|
211 $self->element({'Name' => 'WABAOutput_db',
|
|
212 'Data' => $hitdb});
|
|
213
|
|
214 # slight duplication here -- keep these N'SYNC ;-)
|
|
215 $self->start_element({'Name' => 'Hit'});
|
|
216 $self->element({'Name' => 'Hit_id',
|
|
217 'Data' => $hacc});
|
|
218 $self->element({'Name' => 'Hit_accession',
|
|
219 'Data' => $hacc});
|
|
220 }
|
|
221
|
|
222
|
|
223 # strand is inferred by start,end values
|
|
224 # in the Result Builder
|
|
225 if( $qstrand eq '-' ) {
|
|
226 ($qstart,$qend) = ($qend,$qstart);
|
|
227 }
|
|
228 if( $hstrand eq '-' ) {
|
|
229 ($hstart,$hend) = ($hend,$hstart);
|
|
230 }
|
|
231
|
|
232 $self->start_element({'Name' => 'Hsp'});
|
|
233 $self->element({'Name' => 'Hsp_query-from',
|
|
234 'Data' => $qstart});
|
|
235 $self->element({'Name' => 'Hsp_query-to',
|
|
236 'Data' => $qend});
|
|
237 $self->element({'Name' => 'Hsp_hit-from',
|
|
238 'Data' => $hstart});
|
|
239 $self->element({'Name' => 'Hsp_hit-to',
|
|
240 'Data' => $hend});
|
|
241 $self->element({'Name' => 'Hsp_align-len',
|
|
242 'Data' => $alnlen});
|
|
243
|
|
244 $curquery = $qid;
|
|
245 $curhit = $hacc;
|
|
246 $state = 0;
|
|
247 } elsif( ! defined $curquery ) {
|
|
248 $self->warn("skipping because no Hit begin line was recognized\n$_") if( $_ !~ /^\s+$/ );
|
|
249 next;
|
|
250 } else {
|
|
251 chomp;
|
|
252 $self->element({'Name' => $STATES[$state++],
|
|
253 'Data' => $_});
|
|
254 if( $state >= scalar @STATES ) {
|
|
255 $state = -1;
|
|
256 $self->end_element({'Name' => 'Hsp'});
|
|
257 }
|
|
258 }
|
|
259 }
|
|
260 if( defined $curquery ) {
|
|
261 $self->end_element({'Name' => 'Hit'});
|
|
262 $self->end_element({'Name' => 'WABAOutput'});
|
|
263 return $self->end_document();
|
|
264 }
|
|
265 return undef;
|
|
266 }
|
|
267
|
|
268 =head2 start_element
|
|
269
|
|
270 Title : start_element
|
|
271 Usage : $eventgenerator->start_element
|
|
272 Function: Handles a start element event
|
|
273 Returns : none
|
|
274 Args : hashref with at least 2 keys 'Data' and 'Name'
|
|
275
|
|
276
|
|
277 =cut
|
|
278
|
|
279 sub start_element{
|
|
280 my ($self,$data) = @_;
|
|
281 # we currently don't care about attributes
|
|
282 my $nm = $data->{'Name'};
|
|
283 if( my $type = $MODEMAP{$nm} ) {
|
|
284 $self->_mode($type);
|
|
285 if( $self->_eventHandler->will_handle($type) ) {
|
|
286 my $func = sprintf("start_%s",lc $type);
|
|
287 $self->_eventHandler->$func($data->{'Attributes'});
|
|
288 }
|
|
289 unshift @{$self->{'_elements'}}, $type;
|
|
290 }
|
|
291 if($nm eq 'WABAOutput') {
|
|
292 $self->{'_values'} = {};
|
|
293 $self->{'_result'}= undef;
|
|
294 $self->{'_mode'} = '';
|
|
295 }
|
|
296
|
|
297 }
|
|
298
|
|
299 =head2 end_element
|
|
300
|
|
301 Title : start_element
|
|
302 Usage : $eventgenerator->end_element
|
|
303 Function: Handles an end element event
|
|
304 Returns : none
|
|
305 Args : hashref with at least 2 keys 'Data' and 'Name'
|
|
306
|
|
307
|
|
308 =cut
|
|
309
|
|
310 sub end_element {
|
|
311 my ($self,$data) = @_;
|
|
312 my $nm = $data->{'Name'};
|
|
313 my $rc;
|
|
314 # Hsp are sort of weird, in that they end when another
|
|
315 # object begins so have to detect this in end_element for now
|
|
316 if( $nm eq 'Hsp' ) {
|
|
317 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
|
|
318 $self->element({'Name' => $_,
|
|
319 'Data' => $self->{'_last_hspdata'}->{$_}});
|
|
320 }
|
|
321 $self->{'_last_hspdata'} = {}
|
|
322 }
|
|
323
|
|
324 if( my $type = $MODEMAP{$nm} ) {
|
|
325 if( $self->_eventHandler->will_handle($type) ) {
|
|
326 my $func = sprintf("end_%s",lc $type);
|
|
327 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
|
|
328 $self->{'_values'});
|
|
329 }
|
|
330 shift @{$self->{'_elements'}};
|
|
331
|
|
332 } elsif( $MAPPING{$nm} ) {
|
|
333 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
|
|
334 my $key = (keys %{$MAPPING{$nm}})[0];
|
|
335 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
|
|
336 } else {
|
|
337 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
|
|
338 }
|
|
339 } else {
|
|
340 $self->warn( "unknown nm $nm ignoring\n");
|
|
341 }
|
|
342 $self->{'_last_data'} = ''; # remove read data if we are at
|
|
343 # end of an element
|
|
344 $self->{'_result'} = $rc if( $nm eq 'WABAOutput' );
|
|
345 return $rc;
|
|
346
|
|
347 }
|
|
348
|
|
349 =head2 element
|
|
350
|
|
351 Title : element
|
|
352 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
|
|
353 Function: Convience method that calls start_element, characters, end_element
|
|
354 Returns : none
|
|
355 Args : Hash ref with the keys 'Name' and 'Data'
|
|
356
|
|
357
|
|
358 =cut
|
|
359
|
|
360 sub element{
|
|
361 my ($self,$data) = @_;
|
|
362 $self->start_element($data);
|
|
363 $self->characters($data);
|
|
364 $self->end_element($data);
|
|
365 }
|
|
366
|
|
367
|
|
368 =head2 characters
|
|
369
|
|
370 Title : characters
|
|
371 Usage : $eventgenerator->characters($str)
|
|
372 Function: Send a character events
|
|
373 Returns : none
|
|
374 Args : string
|
|
375
|
|
376
|
|
377 =cut
|
|
378
|
|
379 sub characters{
|
|
380 my ($self,$data) = @_;
|
|
381
|
|
382 return unless ( defined $data->{'Data'} );
|
|
383 if( $data->{'Data'} =~ /^\s+$/ ) {
|
|
384 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
|
|
385 }
|
|
386
|
|
387 if( $self->in_element('hsp') &&
|
|
388 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
|
|
389
|
|
390 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
|
|
391 }
|
|
392
|
|
393 $self->{'_last_data'} = $data->{'Data'};
|
|
394 }
|
|
395
|
|
396 =head2 _mode
|
|
397
|
|
398 Title : _mode
|
|
399 Usage : $obj->_mode($newval)
|
|
400 Function:
|
|
401 Example :
|
|
402 Returns : value of _mode
|
|
403 Args : newvalue (optional)
|
|
404
|
|
405
|
|
406 =cut
|
|
407
|
|
408 sub _mode{
|
|
409 my ($self,$value) = @_;
|
|
410 if( defined $value) {
|
|
411 $self->{'_mode'} = $value;
|
|
412 }
|
|
413 return $self->{'_mode'};
|
|
414 }
|
|
415
|
|
416 =head2 within_element
|
|
417
|
|
418 Title : within_element
|
|
419 Usage : if( $eventgenerator->within_element($element) ) {}
|
|
420 Function: Test if we are within a particular element
|
|
421 This is different than 'in' because within can be tested
|
|
422 for a whole block.
|
|
423 Returns : boolean
|
|
424 Args : string element name
|
|
425
|
|
426
|
|
427 =cut
|
|
428
|
|
429 sub within_element{
|
|
430 my ($self,$name) = @_;
|
|
431 return 0 if ( ! defined $name &&
|
|
432 ! defined $self->{'_elements'} ||
|
|
433 scalar @{$self->{'_elements'}} == 0) ;
|
|
434 foreach ( @{$self->{'_elements'}} ) {
|
|
435 if( $_ eq $name ) {
|
|
436 return 1;
|
|
437 }
|
|
438 }
|
|
439 return 0;
|
|
440 }
|
|
441
|
|
442 =head2 in_element
|
|
443
|
|
444 Title : in_element
|
|
445 Usage : if( $eventgenerator->in_element($element) ) {}
|
|
446 Function: Test if we are in a particular element
|
|
447 This is different than 'in' because within can be tested
|
|
448 for a whole block.
|
|
449 Returns : boolean
|
|
450 Args : string element name
|
|
451
|
|
452
|
|
453 =cut
|
|
454
|
|
455 sub in_element{
|
|
456 my ($self,$name) = @_;
|
|
457 return 0 if ! defined $self->{'_elements'}->[0];
|
|
458 return ( $self->{'_elements'}->[0] eq $name)
|
|
459 }
|
|
460
|
|
461
|
|
462 =head2 start_document
|
|
463
|
|
464 Title : start_document
|
|
465 Usage : $eventgenerator->start_document
|
|
466 Function: Handles a start document event
|
|
467 Returns : none
|
|
468 Args : none
|
|
469
|
|
470
|
|
471 =cut
|
|
472
|
|
473 sub start_document{
|
|
474 my ($self) = @_;
|
|
475 $self->{'_lasttype'} = '';
|
|
476 $self->{'_values'} = {};
|
|
477 $self->{'_result'}= undef;
|
|
478 $self->{'_mode'} = '';
|
|
479 $self->{'_elements'} = [];
|
|
480 }
|
|
481
|
|
482
|
|
483 =head2 end_document
|
|
484
|
|
485 Title : end_document
|
|
486 Usage : $eventgenerator->end_document
|
|
487 Function: Handles an end document event
|
|
488 Returns : Bio::Search::Result::ResultI object
|
|
489 Args : none
|
|
490
|
|
491
|
|
492 =cut
|
|
493
|
|
494 sub end_document{
|
|
495 my ($self,@args) = @_;
|
|
496 return $self->{'_result'};
|
|
497 }
|
|
498
|
|
499 =head2 result_count
|
|
500
|
|
501 Title : result_count
|
|
502 Usage : my $count = $searchio->result_count
|
|
503 Function: Returns the number of results we have processed
|
|
504 Returns : integer
|
|
505 Args : none
|
|
506
|
|
507
|
|
508 =cut
|
|
509
|
|
510 sub result_count {
|
|
511 my $self = shift;
|
|
512 return $self->{'_result_count'};
|
|
513 }
|
|
514
|
|
515 sub report_count { shift->result_count }
|
|
516
|
|
517 1;
|