0
|
1 # $Id: exonerate.pm,v 1.3.2.3 2003/03/29 20:30:54 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SearchIO::exonerate
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SearchIO::exonerate - parser for Exonerate
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # do not use this module directly, it is a driver for SearchIO
|
|
20
|
|
21 use Bio::SearchIO;
|
|
22 my $searchio = new Bio::SearchIO(-file => 'file.exonerate',
|
|
23 -format => 'exonerate');
|
|
24
|
|
25
|
|
26 while( my $r = $searchio->next_result ) {
|
|
27 print $r->query_name, "\n";
|
|
28 }
|
|
29
|
|
30 =head1 DESCRIPTION
|
|
31
|
|
32 This is a driver for the SearchIO system for parsing Exonerate (Guy
|
|
33 Slater) output. You can get Exonerate at
|
|
34 http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate/?cvsroot=Ensembl
|
|
35 [until Guy puts up a Web reference,publication for it.]).
|
|
36
|
|
37 An optional parameter -min_intron is supported by the L<new>
|
|
38 initialization method. This is if you run Exonerate with a different
|
|
39 minimum intron length (default is 30) the parser will be able to
|
|
40 detect the difference between standard deletions and an intron. Still
|
|
41 some room to play with there that might cause this to get
|
|
42 misinterpreted that has not been fully tested or explored.
|
|
43
|
|
44 =head1 FEEDBACK
|
|
45
|
|
46 =head2 Mailing Lists
|
|
47
|
|
48 User feedback is an integral part of the evolution of this and other
|
|
49 Bioperl modules. Send your comments and suggestions preferably to
|
|
50 the Bioperl mailing list. Your participation is much appreciated.
|
|
51
|
|
52 bioperl-l@bioperl.org - General discussion
|
|
53 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
54
|
|
55 =head2 Reporting Bugs
|
|
56
|
|
57 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
58 of the bugs and their resolution. Bug reports can be submitted via
|
|
59 email or the web:
|
|
60
|
|
61 bioperl-bugs@bioperl.org
|
|
62 http://bioperl.org/bioperl-bugs/
|
|
63
|
|
64 =head1 AUTHOR - Jason Stajich
|
|
65
|
|
66 Email jason@bioperl.org
|
|
67
|
|
68 Describe contact details here
|
|
69
|
|
70 =head1 CONTRIBUTORS
|
|
71
|
|
72 Additional contributors names and emails here
|
|
73
|
|
74 =head1 APPENDIX
|
|
75
|
|
76 The rest of the documentation details each of the object methods.
|
|
77 Internal methods are usually preceded with a _
|
|
78
|
|
79 =cut
|
|
80
|
|
81
|
|
82 # Let the code begin...
|
|
83
|
|
84
|
|
85 package Bio::SearchIO::exonerate;
|
|
86 use strict;
|
|
87 use vars qw(@ISA @STATES %MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $MIN_INTRON);
|
|
88 use Bio::SearchIO;
|
|
89
|
|
90 @ISA = qw(Bio::SearchIO );
|
|
91
|
|
92 use POSIX;
|
|
93
|
|
94
|
|
95 %MODEMAP = ('ExonerateOutput' => 'result',
|
|
96 'Hit' => 'hit',
|
|
97 'Hsp' => 'hsp'
|
|
98 );
|
|
99 %MAPPING =
|
|
100 (
|
|
101 'Hsp_query-from'=> 'HSP-query_start',
|
|
102 'Hsp_query-to' => 'HSP-query_end',
|
|
103 'Hsp_hit-from' => 'HSP-hit_start',
|
|
104 'Hsp_hit-to' => 'HSP-hit_end',
|
|
105 'Hsp_qseq' => 'HSP-query_seq',
|
|
106 'Hsp_hseq' => 'HSP-hit_seq',
|
|
107 'Hsp_midline' => 'HSP-homology_seq',
|
|
108 'Hsp_score' => 'HSP-score',
|
|
109 'Hsp_qlength' => 'HSP-query_length',
|
|
110 'Hsp_hlength' => 'HSP-hit_length',
|
|
111 'Hsp_align-len' => 'HSP-hsp_length',
|
|
112 'Hsp_identity' => 'HSP-identical',
|
|
113 'Hsp_gaps' => 'HSP-hsp_gaps',
|
|
114 'Hsp_hitgaps' => 'HSP-hit_gaps',
|
|
115 'Hsp_querygaps' => 'HSP-query_gaps',
|
|
116
|
|
117 'Hit_id' => 'HIT-name',
|
|
118 'Hit_desc' => 'HIT-description',
|
|
119 'Hit_len' => 'HIT-length',
|
|
120 'Hit_score' => 'HIT-score',
|
|
121
|
|
122 'ExonerateOutput_program' => 'RESULT-algorithm_name',
|
|
123 'ExonerateOutput_query-def' => 'RESULT-query_name',
|
|
124 'ExonerateOutput_query-desc'=> 'RESULT-query_description',
|
|
125 'ExonerateOutput_query-len' => 'RESULT-query_length',
|
|
126 );
|
|
127
|
|
128 $DEFAULT_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter';
|
|
129
|
|
130 $MIN_INTRON=30; # This is the minimum intron size
|
|
131
|
|
132 =head2 new
|
|
133
|
|
134 Title : new
|
|
135 Usage : my $obj = new Bio::SearchIO::exonerate();
|
|
136 Function: Builds a new Bio::SearchIO::exonerate object
|
|
137 Returns : an instance of Bio::SearchIO::exonerate
|
|
138 Args :
|
|
139
|
|
140
|
|
141 =cut
|
|
142
|
|
143 sub new {
|
|
144 my ($class) = shift;
|
|
145 my $self = $class->SUPER::new(@_);
|
|
146
|
|
147 my ($min_intron) = $self->_rearrange([qw(MIN_INTRON)], @_);
|
|
148 if( $min_intron ) {
|
|
149 $MIN_INTRON = $min_intron;
|
|
150 }
|
|
151 $self;
|
|
152 }
|
|
153
|
|
154 =head2 next_result
|
|
155
|
|
156 Title : next_result
|
|
157 Usage : my $hit = $searchio->next_result;
|
|
158 Function: Returns the next Result from a search
|
|
159 Returns : Bio::Search::Result::ResultI object
|
|
160 Args : none
|
|
161
|
|
162 =cut
|
|
163
|
|
164 sub next_result{
|
|
165 my ($self) = @_;
|
|
166 $self->{'_last_data'} = '';
|
|
167 my ($reporttype,$seenquery,$reportline);
|
|
168 $self->start_document();
|
|
169 my @hit_signifs;
|
|
170 my $seentop;
|
|
171 my (@q_ex, @m_ex, @h_ex); ## gc addition
|
|
172 while( defined($_ = $self->_readline) ) {
|
|
173 if( /^Query:\s+(\S+)(\s+(.+))?/ ) {
|
|
174 if( $seentop ) {
|
|
175 $self->end_element({'Name' => 'ExonerateOutput'});
|
|
176 $self->_pushback($_);
|
|
177 return $self->end_document();
|
|
178 }
|
|
179 $seentop = 1;
|
|
180 my ($nm,$desc) = ($1,$2);
|
|
181 chomp($desc) if defined $desc;
|
|
182 $self->{'_result_count'}++;
|
|
183 $self->start_element({'Name' => 'ExonerateOutput'});
|
|
184 $self->element({'Name' => 'ExonerateOutput_query-def',
|
|
185 'Data' => $nm });
|
|
186 $self->element({'Name' => 'ExonerateOutput_query-desc',
|
|
187 'Data' => $desc });
|
|
188 $self->element({'Name' => 'ExonerateOutput_program',
|
|
189 'Data' => 'Exonerate' });
|
|
190
|
|
191 } elsif ( /^Target:\s+(\S+)(\s+(.+))?/ ) {
|
|
192 my ($nm,$desc) = ($1,$2);
|
|
193 chomp($desc) if defined $desc;
|
|
194 $self->start_element({'Name' => 'Hit'});
|
|
195 $self->element({'Name' => 'Hit_id',
|
|
196 'Data' => $nm});
|
|
197 $self->element({'Name' => 'Hit_desc',
|
|
198 'Data' => $desc});
|
|
199 } elsif( s/^cigar:\s+(\S+)\s+ # query sequence id
|
|
200 (\d+)\s+(\d+)\s+([\-\+])\s+ # query start-end-strand
|
|
201 (\S+)\s+ # target sequence id
|
|
202 (\d+)\s+(\d+)\s+([\-\+])\s+ # target start-end-strand
|
|
203 (\d+)\s+ # score
|
|
204 //ox ) {
|
|
205
|
|
206 ## gc note:
|
|
207 ## $qe and $he are no longer used for calculating the ends,
|
|
208 ## just the $qs and $hs values and the alignment and insert lenghts
|
|
209 my ($qs,$qe,$qstrand) = ($2,$3,$4);
|
|
210 my ($hs,$he,$hstrand) = ($6,$7,$8);
|
|
211 my $score = $9;
|
|
212 # $self->element({'Name' => 'ExonerateOutput_query-len',
|
|
213 # 'Data' => $qe});
|
|
214 # $self->element({'Name' => 'Hit_len',
|
|
215 # 'Data' => $he});
|
|
216
|
|
217 my @rest = split;
|
|
218 if( $qstrand eq '-' ) {
|
|
219 $qstrand = -1;
|
|
220 ($qs,$qe) = ($qe,$qs); # flip-flop if we're on opp strand
|
|
221 $qs--; $qe++;
|
|
222 } else { $qstrand = 1; }
|
|
223 if( $hstrand eq '-' ) {
|
|
224 $hstrand = -1;
|
|
225 ($hs,$he) = ($he,$hs); # flip-flop if we're on opp strand
|
|
226 $hs--; $he++;
|
|
227 } else { $hstrand = 1; }
|
|
228 # okay let's do this right and generate a set of HSPs
|
|
229 # from the cigar line
|
|
230
|
|
231 ## gc note:
|
|
232 ## add one because these values are zero-based
|
|
233 ## this calculation was originally done lower in the code,
|
|
234 ## but it's clearer to do it just once at the start
|
|
235 $qs++;
|
|
236 $hs++;
|
|
237
|
|
238 my ($aln_len,$inserts,$deletes) = (0,0,0);
|
|
239 while( @rest >= 2 ) {
|
|
240 my ($state,$len) = (shift @rest, shift @rest);
|
|
241 if( $state eq 'I' ) {
|
|
242 $inserts+=$len;
|
|
243 } elsif( $state eq 'D' ) {
|
|
244 if( $len >= $MIN_INTRON ) {
|
|
245 $self->start_element({'Name' => 'Hsp'});
|
|
246
|
|
247 $self->element({'Name' => 'Hsp_score',
|
|
248 'Data' => $score});
|
|
249 $self->element({'Name' => 'Hsp_align-len',
|
|
250 'Data' => $aln_len});
|
|
251 $self->element({'Name' => 'Hsp_identity',
|
|
252 'Data' => $aln_len -
|
|
253 ($inserts + $deletes)});
|
|
254
|
|
255 # HSP ends where the other begins
|
|
256 $self->element({'Name' => 'Hsp_query-from',
|
|
257 'Data' => $qs});
|
|
258 ## gc note:
|
|
259 ## $qs is now the start of the next hsp
|
|
260 ## the end of this hsp is 1 before this position
|
|
261 ## (or 1 after in case of reverse strand)
|
|
262 $qs += $aln_len*$qstrand;
|
|
263 $self->element({'Name' => 'Hsp_query-to',
|
|
264 'Data' => $qs - ($qstrand*1)});
|
|
265
|
|
266 $hs += $deletes*$hstrand;
|
|
267 $self->element({'Name' => 'Hsp_hit-from',
|
|
268 'Data' => $hs});
|
|
269 $hs += $aln_len*$hstrand;
|
|
270 $self->element({'Name' => 'Hsp_hit-to',
|
|
271 'Data' => $hs-($hstrand*1)});
|
|
272
|
|
273 $self->element({'Name' => 'Hsp_align-len',
|
|
274 'Data' => $aln_len + $inserts
|
|
275 + $deletes});
|
|
276 $self->element({'Name' => 'Hsp_identity',
|
|
277 'Data' => $aln_len });
|
|
278
|
|
279 $self->element({'Name' => 'Hsp_gaps',
|
|
280 'Data' => $inserts + $deletes});
|
|
281 $self->element({'Name' => 'Hsp_querygaps',
|
|
282 'Data' => $inserts});
|
|
283 $self->element({'Name' => 'Hsp_hitgaps',
|
|
284 'Data' => $deletes});
|
|
285
|
|
286 ## gc addition start
|
|
287
|
|
288 $self->element({'Name' => 'Hsp_qseq',
|
|
289 'Data' => shift @q_ex,
|
|
290 });
|
|
291 $self->element({'Name' => 'Hsp_hseq',
|
|
292 'Data' => shift @h_ex,
|
|
293 });
|
|
294 $self->element({'Name' => 'Hsp_midline',
|
|
295 'Data' => shift @m_ex,
|
|
296 });
|
|
297 ## gc addition end
|
|
298 $self->end_element({'Name' => 'Hsp'});
|
|
299
|
|
300 $aln_len = $inserts = $deletes = 0;
|
|
301 }
|
|
302 $deletes+=$len;
|
|
303 } else {
|
|
304 $aln_len += $len;
|
|
305 }
|
|
306 }
|
|
307 $self->start_element({'Name' => 'Hsp'});
|
|
308
|
|
309 ## gc addition start
|
|
310
|
|
311 $self->element({'Name' => 'Hsp_qseq',
|
|
312 'Data' => shift @q_ex,
|
|
313 });
|
|
314 $self->element({'Name' => 'Hsp_hseq',
|
|
315 'Data' => shift @h_ex,
|
|
316 });
|
|
317 $self->element({'Name' => 'Hsp_midline',
|
|
318 'Data' => shift @m_ex,
|
|
319 });
|
|
320 ## gc addition end
|
|
321
|
|
322 $self->element({'Name' => 'Hsp_score',
|
|
323 'Data' => $score});
|
|
324
|
|
325 $self->element({'Name' => 'Hsp_query-from',
|
|
326 'Data' => $qs});
|
|
327
|
|
328 $qs += $aln_len*$qstrand;
|
|
329 $self->element({'Name' => 'Hsp_query-to',
|
|
330 'Data' => $qs - ($qstrand*1)});
|
|
331
|
|
332 $hs += $deletes*$hstrand;
|
|
333 $self->element({'Name' => 'Hsp_hit-from',
|
|
334 'Data' => $hs});
|
|
335 $hs += $aln_len*$hstrand;
|
|
336 $self->element({'Name' => 'Hsp_hit-to',
|
|
337 'Data' => $hs -($hstrand*1)});
|
|
338
|
|
339 $self->element({'Name' => 'Hsp_align-len',
|
|
340 'Data' => $aln_len});
|
|
341
|
|
342 $self->element({'Name' => 'Hsp_identity',
|
|
343 'Data' => $aln_len - ($inserts + $deletes)});
|
|
344
|
|
345 $self->element({'Name' => 'Hsp_gaps',
|
|
346 'Data' => $inserts + $deletes});
|
|
347
|
|
348 $self->element({'Name' => 'Hsp_querygaps',
|
|
349 'Data' => $inserts});
|
|
350 $self->element({'Name' => 'Hsp_hitgaps',
|
|
351 'Data' => $deletes});
|
|
352 $self->end_element({'Name' => 'Hsp'});
|
|
353 $self->element({'Name' => 'Hit_score',
|
|
354 'Data' => $score});
|
|
355 $self->end_element({'Name' => 'Hit'});
|
|
356 $self->end_element({'Name' => 'ExonerateOutput'});
|
|
357
|
|
358 return $self->end_document();
|
|
359 } else {
|
|
360 }
|
|
361 }
|
|
362 return $self->end_document() if( $seentop );
|
|
363 }
|
|
364
|
|
365 =head2 start_element
|
|
366
|
|
367 Title : start_element
|
|
368 Usage : $eventgenerator->start_element
|
|
369 Function: Handles a start element event
|
|
370 Returns : none
|
|
371 Args : hashref with at least 2 keys 'Data' and 'Name'
|
|
372
|
|
373
|
|
374 =cut
|
|
375
|
|
376 sub start_element{
|
|
377 my ($self,$data) = @_;
|
|
378 # we currently don't care about attributes
|
|
379 my $nm = $data->{'Name'};
|
|
380 my $type = $MODEMAP{$nm};
|
|
381
|
|
382 if( $type ) {
|
|
383 if( $self->_eventHandler->will_handle($type) ) {
|
|
384 my $func = sprintf("start_%s",lc $type);
|
|
385 $self->_eventHandler->$func($data->{'Attributes'});
|
|
386 }
|
|
387 unshift @{$self->{'_elements'}}, $type;
|
|
388
|
|
389 if($type eq 'result') {
|
|
390 $self->{'_values'} = {};
|
|
391 $self->{'_result'}= undef;
|
|
392 }
|
|
393 }
|
|
394
|
|
395 }
|
|
396
|
|
397 =head2 end_element
|
|
398
|
|
399 Title : start_element
|
|
400 Usage : $eventgenerator->end_element
|
|
401 Function: Handles an end element event
|
|
402 Returns : none
|
|
403 Args : hashref with at least 2 keys 'Data' and 'Name'
|
|
404
|
|
405
|
|
406 =cut
|
|
407
|
|
408 sub end_element {
|
|
409 my ($self,$data) = @_;
|
|
410 my $nm = $data->{'Name'};
|
|
411 my $type = $MODEMAP{$nm};
|
|
412 my $rc;
|
|
413
|
|
414 if( $type = $MODEMAP{$nm} ) {
|
|
415 if( $self->_eventHandler->will_handle($type) ) {
|
|
416 my $func = sprintf("end_%s",lc $type);
|
|
417 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
|
|
418 $self->{'_values'});
|
|
419 }
|
|
420 shift @{$self->{'_elements'}};
|
|
421
|
|
422 } elsif( $MAPPING{$nm} ) {
|
|
423
|
|
424 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
|
|
425 my $key = (keys %{$MAPPING{$nm}})[0];
|
|
426 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
|
|
427 } else {
|
|
428 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
|
|
429 }
|
|
430 } else {
|
|
431 $self->debug( "unknown nm $nm, ignoring\n");
|
|
432 }
|
|
433 $self->{'_last_data'} = ''; # remove read data if we are at
|
|
434 # end of an element
|
|
435 $self->{'_result'} = $rc if( defined $type && $type eq 'result' );
|
|
436 return $rc;
|
|
437 }
|
|
438
|
|
439 =head2 element
|
|
440
|
|
441 Title : element
|
|
442 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
|
|
443 Function: Convience method that calls start_element, characters, end_element
|
|
444 Returns : none
|
|
445 Args : Hash ref with the keys 'Name' and 'Data'
|
|
446
|
|
447
|
|
448 =cut
|
|
449
|
|
450 sub element{
|
|
451 my ($self,$data) = @_;
|
|
452 $self->start_element($data);
|
|
453 $self->characters($data);
|
|
454 $self->end_element($data);
|
|
455 }
|
|
456
|
|
457 =head2 characters
|
|
458
|
|
459 Title : characters
|
|
460 Usage : $eventgenerator->characters($str)
|
|
461 Function: Send a character events
|
|
462 Returns : none
|
|
463 Args : string
|
|
464
|
|
465
|
|
466 =cut
|
|
467
|
|
468 sub characters{
|
|
469 my ($self,$data) = @_;
|
|
470
|
|
471 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ );
|
|
472
|
|
473 $self->{'_last_data'} = $data->{'Data'};
|
|
474 }
|
|
475
|
|
476 =head2 within_element
|
|
477
|
|
478 Title : within_element
|
|
479 Usage : if( $eventgenerator->within_element($element) ) {}
|
|
480 Function: Test if we are within a particular element
|
|
481 This is different than 'in' because within can be tested
|
|
482 for a whole block.
|
|
483 Returns : boolean
|
|
484 Args : string element name
|
|
485
|
|
486
|
|
487 =cut
|
|
488
|
|
489 sub within_element{
|
|
490 my ($self,$name) = @_;
|
|
491 return 0 if ( ! defined $name &&
|
|
492 ! defined $self->{'_elements'} ||
|
|
493 scalar @{$self->{'_elements'}} == 0) ;
|
|
494 foreach ( @{$self->{'_elements'}} ) {
|
|
495 if( $_ eq $name ) {
|
|
496 return 1;
|
|
497 }
|
|
498 }
|
|
499 return 0;
|
|
500 }
|
|
501
|
|
502
|
|
503 =head2 in_element
|
|
504
|
|
505 Title : in_element
|
|
506 Usage : if( $eventgenerator->in_element($element) ) {}
|
|
507 Function: Test if we are in a particular element
|
|
508 This is different than 'in' because within can be tested
|
|
509 for a whole block.
|
|
510 Returns : boolean
|
|
511 Args : string element name
|
|
512
|
|
513
|
|
514 =cut
|
|
515
|
|
516 sub in_element{
|
|
517 my ($self,$name) = @_;
|
|
518 return 0 if ! defined $self->{'_elements'}->[0];
|
|
519 return ( $self->{'_elements'}->[0] eq $name)
|
|
520 }
|
|
521
|
|
522 =head2 start_document
|
|
523
|
|
524 Title : start_document
|
|
525 Usage : $eventgenerator->start_document
|
|
526 Function: Handle a start document event
|
|
527 Returns : none
|
|
528 Args : none
|
|
529
|
|
530
|
|
531 =cut
|
|
532
|
|
533 sub start_document{
|
|
534 my ($self) = @_;
|
|
535 $self->{'_lasttype'} = '';
|
|
536 $self->{'_values'} = {};
|
|
537 $self->{'_result'}= undef;
|
|
538 $self->{'_elements'} = [];
|
|
539 $self->{'_reporttype'} = 'exonerate';
|
|
540 }
|
|
541
|
|
542
|
|
543 =head2 end_document
|
|
544
|
|
545 Title : end_document
|
|
546 Usage : $eventgenerator->end_document
|
|
547 Function: Handles an end document event
|
|
548 Returns : Bio::Search::Result::ResultI object
|
|
549 Args : none
|
|
550
|
|
551
|
|
552 =cut
|
|
553
|
|
554 sub end_document{
|
|
555 my ($self,@args) = @_;
|
|
556 return $self->{'_result'};
|
|
557 }
|
|
558
|
|
559
|
|
560 sub write_result {
|
|
561 my ($self, $blast, @args) = @_;
|
|
562
|
|
563 if( not defined($self->writer) ) {
|
|
564 $self->warn("Writer not defined. Using a $DEFAULT_WRITER_CLASS");
|
|
565 $self->writer( $DEFAULT_WRITER_CLASS->new() );
|
|
566 }
|
|
567 $self->SUPER::write_result( $blast, @args );
|
|
568 }
|
|
569
|
|
570 sub result_count {
|
|
571 my $self = shift;
|
|
572 return $self->{'_result_count'};
|
|
573 }
|
|
574
|
|
575 sub report_count { shift->result_count }
|
|
576
|
|
577 1;
|
|
578
|