Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/SearchIO/exonerate.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: exonerate.pm,v 1.3.2.3 2003/03/29 20:30:54 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::SearchIO::exonerate | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::SearchIO::exonerate - parser for Exonerate | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # do not use this module directly, it is a driver for SearchIO | |
20 | |
21 use Bio::SearchIO; | |
22 my $searchio = new Bio::SearchIO(-file => 'file.exonerate', | |
23 -format => 'exonerate'); | |
24 | |
25 | |
26 while( my $r = $searchio->next_result ) { | |
27 print $r->query_name, "\n"; | |
28 } | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 This is a driver for the SearchIO system for parsing Exonerate (Guy | |
33 Slater) output. You can get Exonerate at | |
34 http://cvsweb.sanger.ac.uk/cgi-bin/cvsweb.cgi/exonerate/?cvsroot=Ensembl | |
35 [until Guy puts up a Web reference,publication for it.]). | |
36 | |
37 An optional parameter -min_intron is supported by the L<new> | |
38 initialization method. This is if you run Exonerate with a different | |
39 minimum intron length (default is 30) the parser will be able to | |
40 detect the difference between standard deletions and an intron. Still | |
41 some room to play with there that might cause this to get | |
42 misinterpreted that has not been fully tested or explored. | |
43 | |
44 =head1 FEEDBACK | |
45 | |
46 =head2 Mailing Lists | |
47 | |
48 User feedback is an integral part of the evolution of this and other | |
49 Bioperl modules. Send your comments and suggestions preferably to | |
50 the Bioperl mailing list. Your participation is much appreciated. | |
51 | |
52 bioperl-l@bioperl.org - General discussion | |
53 http://bioperl.org/MailList.shtml - About the mailing lists | |
54 | |
55 =head2 Reporting Bugs | |
56 | |
57 Report bugs to the Bioperl bug tracking system to help us keep track | |
58 of the bugs and their resolution. Bug reports can be submitted via | |
59 email or the web: | |
60 | |
61 bioperl-bugs@bioperl.org | |
62 http://bioperl.org/bioperl-bugs/ | |
63 | |
64 =head1 AUTHOR - Jason Stajich | |
65 | |
66 Email jason@bioperl.org | |
67 | |
68 Describe contact details here | |
69 | |
70 =head1 CONTRIBUTORS | |
71 | |
72 Additional contributors names and emails here | |
73 | |
74 =head1 APPENDIX | |
75 | |
76 The rest of the documentation details each of the object methods. | |
77 Internal methods are usually preceded with a _ | |
78 | |
79 =cut | |
80 | |
81 | |
82 # Let the code begin... | |
83 | |
84 | |
85 package Bio::SearchIO::exonerate; | |
86 use strict; | |
87 use vars qw(@ISA @STATES %MAPPING %MODEMAP $DEFAULT_WRITER_CLASS $MIN_INTRON); | |
88 use Bio::SearchIO; | |
89 | |
90 @ISA = qw(Bio::SearchIO ); | |
91 | |
92 use POSIX; | |
93 | |
94 | |
95 %MODEMAP = ('ExonerateOutput' => 'result', | |
96 'Hit' => 'hit', | |
97 'Hsp' => 'hsp' | |
98 ); | |
99 %MAPPING = | |
100 ( | |
101 'Hsp_query-from'=> 'HSP-query_start', | |
102 'Hsp_query-to' => 'HSP-query_end', | |
103 'Hsp_hit-from' => 'HSP-hit_start', | |
104 'Hsp_hit-to' => 'HSP-hit_end', | |
105 'Hsp_qseq' => 'HSP-query_seq', | |
106 'Hsp_hseq' => 'HSP-hit_seq', | |
107 'Hsp_midline' => 'HSP-homology_seq', | |
108 'Hsp_score' => 'HSP-score', | |
109 'Hsp_qlength' => 'HSP-query_length', | |
110 'Hsp_hlength' => 'HSP-hit_length', | |
111 'Hsp_align-len' => 'HSP-hsp_length', | |
112 'Hsp_identity' => 'HSP-identical', | |
113 'Hsp_gaps' => 'HSP-hsp_gaps', | |
114 'Hsp_hitgaps' => 'HSP-hit_gaps', | |
115 'Hsp_querygaps' => 'HSP-query_gaps', | |
116 | |
117 'Hit_id' => 'HIT-name', | |
118 'Hit_desc' => 'HIT-description', | |
119 'Hit_len' => 'HIT-length', | |
120 'Hit_score' => 'HIT-score', | |
121 | |
122 'ExonerateOutput_program' => 'RESULT-algorithm_name', | |
123 'ExonerateOutput_query-def' => 'RESULT-query_name', | |
124 'ExonerateOutput_query-desc'=> 'RESULT-query_description', | |
125 'ExonerateOutput_query-len' => 'RESULT-query_length', | |
126 ); | |
127 | |
128 $DEFAULT_WRITER_CLASS = 'Bio::Search::Writer::HitTableWriter'; | |
129 | |
130 $MIN_INTRON=30; # This is the minimum intron size | |
131 | |
132 =head2 new | |
133 | |
134 Title : new | |
135 Usage : my $obj = new Bio::SearchIO::exonerate(); | |
136 Function: Builds a new Bio::SearchIO::exonerate object | |
137 Returns : an instance of Bio::SearchIO::exonerate | |
138 Args : | |
139 | |
140 | |
141 =cut | |
142 | |
143 sub new { | |
144 my ($class) = shift; | |
145 my $self = $class->SUPER::new(@_); | |
146 | |
147 my ($min_intron) = $self->_rearrange([qw(MIN_INTRON)], @_); | |
148 if( $min_intron ) { | |
149 $MIN_INTRON = $min_intron; | |
150 } | |
151 $self; | |
152 } | |
153 | |
154 =head2 next_result | |
155 | |
156 Title : next_result | |
157 Usage : my $hit = $searchio->next_result; | |
158 Function: Returns the next Result from a search | |
159 Returns : Bio::Search::Result::ResultI object | |
160 Args : none | |
161 | |
162 =cut | |
163 | |
164 sub next_result{ | |
165 my ($self) = @_; | |
166 $self->{'_last_data'} = ''; | |
167 my ($reporttype,$seenquery,$reportline); | |
168 $self->start_document(); | |
169 my @hit_signifs; | |
170 my $seentop; | |
171 my (@q_ex, @m_ex, @h_ex); ## gc addition | |
172 while( defined($_ = $self->_readline) ) { | |
173 if( /^Query:\s+(\S+)(\s+(.+))?/ ) { | |
174 if( $seentop ) { | |
175 $self->end_element({'Name' => 'ExonerateOutput'}); | |
176 $self->_pushback($_); | |
177 return $self->end_document(); | |
178 } | |
179 $seentop = 1; | |
180 my ($nm,$desc) = ($1,$2); | |
181 chomp($desc) if defined $desc; | |
182 $self->{'_result_count'}++; | |
183 $self->start_element({'Name' => 'ExonerateOutput'}); | |
184 $self->element({'Name' => 'ExonerateOutput_query-def', | |
185 'Data' => $nm }); | |
186 $self->element({'Name' => 'ExonerateOutput_query-desc', | |
187 'Data' => $desc }); | |
188 $self->element({'Name' => 'ExonerateOutput_program', | |
189 'Data' => 'Exonerate' }); | |
190 | |
191 } elsif ( /^Target:\s+(\S+)(\s+(.+))?/ ) { | |
192 my ($nm,$desc) = ($1,$2); | |
193 chomp($desc) if defined $desc; | |
194 $self->start_element({'Name' => 'Hit'}); | |
195 $self->element({'Name' => 'Hit_id', | |
196 'Data' => $nm}); | |
197 $self->element({'Name' => 'Hit_desc', | |
198 'Data' => $desc}); | |
199 } elsif( s/^cigar:\s+(\S+)\s+ # query sequence id | |
200 (\d+)\s+(\d+)\s+([\-\+])\s+ # query start-end-strand | |
201 (\S+)\s+ # target sequence id | |
202 (\d+)\s+(\d+)\s+([\-\+])\s+ # target start-end-strand | |
203 (\d+)\s+ # score | |
204 //ox ) { | |
205 | |
206 ## gc note: | |
207 ## $qe and $he are no longer used for calculating the ends, | |
208 ## just the $qs and $hs values and the alignment and insert lenghts | |
209 my ($qs,$qe,$qstrand) = ($2,$3,$4); | |
210 my ($hs,$he,$hstrand) = ($6,$7,$8); | |
211 my $score = $9; | |
212 # $self->element({'Name' => 'ExonerateOutput_query-len', | |
213 # 'Data' => $qe}); | |
214 # $self->element({'Name' => 'Hit_len', | |
215 # 'Data' => $he}); | |
216 | |
217 my @rest = split; | |
218 if( $qstrand eq '-' ) { | |
219 $qstrand = -1; | |
220 ($qs,$qe) = ($qe,$qs); # flip-flop if we're on opp strand | |
221 $qs--; $qe++; | |
222 } else { $qstrand = 1; } | |
223 if( $hstrand eq '-' ) { | |
224 $hstrand = -1; | |
225 ($hs,$he) = ($he,$hs); # flip-flop if we're on opp strand | |
226 $hs--; $he++; | |
227 } else { $hstrand = 1; } | |
228 # okay let's do this right and generate a set of HSPs | |
229 # from the cigar line | |
230 | |
231 ## gc note: | |
232 ## add one because these values are zero-based | |
233 ## this calculation was originally done lower in the code, | |
234 ## but it's clearer to do it just once at the start | |
235 $qs++; | |
236 $hs++; | |
237 | |
238 my ($aln_len,$inserts,$deletes) = (0,0,0); | |
239 while( @rest >= 2 ) { | |
240 my ($state,$len) = (shift @rest, shift @rest); | |
241 if( $state eq 'I' ) { | |
242 $inserts+=$len; | |
243 } elsif( $state eq 'D' ) { | |
244 if( $len >= $MIN_INTRON ) { | |
245 $self->start_element({'Name' => 'Hsp'}); | |
246 | |
247 $self->element({'Name' => 'Hsp_score', | |
248 'Data' => $score}); | |
249 $self->element({'Name' => 'Hsp_align-len', | |
250 'Data' => $aln_len}); | |
251 $self->element({'Name' => 'Hsp_identity', | |
252 'Data' => $aln_len - | |
253 ($inserts + $deletes)}); | |
254 | |
255 # HSP ends where the other begins | |
256 $self->element({'Name' => 'Hsp_query-from', | |
257 'Data' => $qs}); | |
258 ## gc note: | |
259 ## $qs is now the start of the next hsp | |
260 ## the end of this hsp is 1 before this position | |
261 ## (or 1 after in case of reverse strand) | |
262 $qs += $aln_len*$qstrand; | |
263 $self->element({'Name' => 'Hsp_query-to', | |
264 'Data' => $qs - ($qstrand*1)}); | |
265 | |
266 $hs += $deletes*$hstrand; | |
267 $self->element({'Name' => 'Hsp_hit-from', | |
268 'Data' => $hs}); | |
269 $hs += $aln_len*$hstrand; | |
270 $self->element({'Name' => 'Hsp_hit-to', | |
271 'Data' => $hs-($hstrand*1)}); | |
272 | |
273 $self->element({'Name' => 'Hsp_align-len', | |
274 'Data' => $aln_len + $inserts | |
275 + $deletes}); | |
276 $self->element({'Name' => 'Hsp_identity', | |
277 'Data' => $aln_len }); | |
278 | |
279 $self->element({'Name' => 'Hsp_gaps', | |
280 'Data' => $inserts + $deletes}); | |
281 $self->element({'Name' => 'Hsp_querygaps', | |
282 'Data' => $inserts}); | |
283 $self->element({'Name' => 'Hsp_hitgaps', | |
284 'Data' => $deletes}); | |
285 | |
286 ## gc addition start | |
287 | |
288 $self->element({'Name' => 'Hsp_qseq', | |
289 'Data' => shift @q_ex, | |
290 }); | |
291 $self->element({'Name' => 'Hsp_hseq', | |
292 'Data' => shift @h_ex, | |
293 }); | |
294 $self->element({'Name' => 'Hsp_midline', | |
295 'Data' => shift @m_ex, | |
296 }); | |
297 ## gc addition end | |
298 $self->end_element({'Name' => 'Hsp'}); | |
299 | |
300 $aln_len = $inserts = $deletes = 0; | |
301 } | |
302 $deletes+=$len; | |
303 } else { | |
304 $aln_len += $len; | |
305 } | |
306 } | |
307 $self->start_element({'Name' => 'Hsp'}); | |
308 | |
309 ## gc addition start | |
310 | |
311 $self->element({'Name' => 'Hsp_qseq', | |
312 'Data' => shift @q_ex, | |
313 }); | |
314 $self->element({'Name' => 'Hsp_hseq', | |
315 'Data' => shift @h_ex, | |
316 }); | |
317 $self->element({'Name' => 'Hsp_midline', | |
318 'Data' => shift @m_ex, | |
319 }); | |
320 ## gc addition end | |
321 | |
322 $self->element({'Name' => 'Hsp_score', | |
323 'Data' => $score}); | |
324 | |
325 $self->element({'Name' => 'Hsp_query-from', | |
326 'Data' => $qs}); | |
327 | |
328 $qs += $aln_len*$qstrand; | |
329 $self->element({'Name' => 'Hsp_query-to', | |
330 'Data' => $qs - ($qstrand*1)}); | |
331 | |
332 $hs += $deletes*$hstrand; | |
333 $self->element({'Name' => 'Hsp_hit-from', | |
334 'Data' => $hs}); | |
335 $hs += $aln_len*$hstrand; | |
336 $self->element({'Name' => 'Hsp_hit-to', | |
337 'Data' => $hs -($hstrand*1)}); | |
338 | |
339 $self->element({'Name' => 'Hsp_align-len', | |
340 'Data' => $aln_len}); | |
341 | |
342 $self->element({'Name' => 'Hsp_identity', | |
343 'Data' => $aln_len - ($inserts + $deletes)}); | |
344 | |
345 $self->element({'Name' => 'Hsp_gaps', | |
346 'Data' => $inserts + $deletes}); | |
347 | |
348 $self->element({'Name' => 'Hsp_querygaps', | |
349 'Data' => $inserts}); | |
350 $self->element({'Name' => 'Hsp_hitgaps', | |
351 'Data' => $deletes}); | |
352 $self->end_element({'Name' => 'Hsp'}); | |
353 $self->element({'Name' => 'Hit_score', | |
354 'Data' => $score}); | |
355 $self->end_element({'Name' => 'Hit'}); | |
356 $self->end_element({'Name' => 'ExonerateOutput'}); | |
357 | |
358 return $self->end_document(); | |
359 } else { | |
360 } | |
361 } | |
362 return $self->end_document() if( $seentop ); | |
363 } | |
364 | |
365 =head2 start_element | |
366 | |
367 Title : start_element | |
368 Usage : $eventgenerator->start_element | |
369 Function: Handles a start element event | |
370 Returns : none | |
371 Args : hashref with at least 2 keys 'Data' and 'Name' | |
372 | |
373 | |
374 =cut | |
375 | |
376 sub start_element{ | |
377 my ($self,$data) = @_; | |
378 # we currently don't care about attributes | |
379 my $nm = $data->{'Name'}; | |
380 my $type = $MODEMAP{$nm}; | |
381 | |
382 if( $type ) { | |
383 if( $self->_eventHandler->will_handle($type) ) { | |
384 my $func = sprintf("start_%s",lc $type); | |
385 $self->_eventHandler->$func($data->{'Attributes'}); | |
386 } | |
387 unshift @{$self->{'_elements'}}, $type; | |
388 | |
389 if($type eq 'result') { | |
390 $self->{'_values'} = {}; | |
391 $self->{'_result'}= undef; | |
392 } | |
393 } | |
394 | |
395 } | |
396 | |
397 =head2 end_element | |
398 | |
399 Title : start_element | |
400 Usage : $eventgenerator->end_element | |
401 Function: Handles an end element event | |
402 Returns : none | |
403 Args : hashref with at least 2 keys 'Data' and 'Name' | |
404 | |
405 | |
406 =cut | |
407 | |
408 sub end_element { | |
409 my ($self,$data) = @_; | |
410 my $nm = $data->{'Name'}; | |
411 my $type = $MODEMAP{$nm}; | |
412 my $rc; | |
413 | |
414 if( $type = $MODEMAP{$nm} ) { | |
415 if( $self->_eventHandler->will_handle($type) ) { | |
416 my $func = sprintf("end_%s",lc $type); | |
417 $rc = $self->_eventHandler->$func($self->{'_reporttype'}, | |
418 $self->{'_values'}); | |
419 } | |
420 shift @{$self->{'_elements'}}; | |
421 | |
422 } elsif( $MAPPING{$nm} ) { | |
423 | |
424 if ( ref($MAPPING{$nm}) =~ /hash/i ) { | |
425 my $key = (keys %{$MAPPING{$nm}})[0]; | |
426 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'}; | |
427 } else { | |
428 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'}; | |
429 } | |
430 } else { | |
431 $self->debug( "unknown nm $nm, ignoring\n"); | |
432 } | |
433 $self->{'_last_data'} = ''; # remove read data if we are at | |
434 # end of an element | |
435 $self->{'_result'} = $rc if( defined $type && $type eq 'result' ); | |
436 return $rc; | |
437 } | |
438 | |
439 =head2 element | |
440 | |
441 Title : element | |
442 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str}); | |
443 Function: Convience method that calls start_element, characters, end_element | |
444 Returns : none | |
445 Args : Hash ref with the keys 'Name' and 'Data' | |
446 | |
447 | |
448 =cut | |
449 | |
450 sub element{ | |
451 my ($self,$data) = @_; | |
452 $self->start_element($data); | |
453 $self->characters($data); | |
454 $self->end_element($data); | |
455 } | |
456 | |
457 =head2 characters | |
458 | |
459 Title : characters | |
460 Usage : $eventgenerator->characters($str) | |
461 Function: Send a character events | |
462 Returns : none | |
463 Args : string | |
464 | |
465 | |
466 =cut | |
467 | |
468 sub characters{ | |
469 my ($self,$data) = @_; | |
470 | |
471 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/ ); | |
472 | |
473 $self->{'_last_data'} = $data->{'Data'}; | |
474 } | |
475 | |
476 =head2 within_element | |
477 | |
478 Title : within_element | |
479 Usage : if( $eventgenerator->within_element($element) ) {} | |
480 Function: Test if we are within a particular element | |
481 This is different than 'in' because within can be tested | |
482 for a whole block. | |
483 Returns : boolean | |
484 Args : string element name | |
485 | |
486 | |
487 =cut | |
488 | |
489 sub within_element{ | |
490 my ($self,$name) = @_; | |
491 return 0 if ( ! defined $name && | |
492 ! defined $self->{'_elements'} || | |
493 scalar @{$self->{'_elements'}} == 0) ; | |
494 foreach ( @{$self->{'_elements'}} ) { | |
495 if( $_ eq $name ) { | |
496 return 1; | |
497 } | |
498 } | |
499 return 0; | |
500 } | |
501 | |
502 | |
503 =head2 in_element | |
504 | |
505 Title : in_element | |
506 Usage : if( $eventgenerator->in_element($element) ) {} | |
507 Function: Test if we are in a particular element | |
508 This is different than 'in' because within can be tested | |
509 for a whole block. | |
510 Returns : boolean | |
511 Args : string element name | |
512 | |
513 | |
514 =cut | |
515 | |
516 sub in_element{ | |
517 my ($self,$name) = @_; | |
518 return 0 if ! defined $self->{'_elements'}->[0]; | |
519 return ( $self->{'_elements'}->[0] eq $name) | |
520 } | |
521 | |
522 =head2 start_document | |
523 | |
524 Title : start_document | |
525 Usage : $eventgenerator->start_document | |
526 Function: Handle a start document event | |
527 Returns : none | |
528 Args : none | |
529 | |
530 | |
531 =cut | |
532 | |
533 sub start_document{ | |
534 my ($self) = @_; | |
535 $self->{'_lasttype'} = ''; | |
536 $self->{'_values'} = {}; | |
537 $self->{'_result'}= undef; | |
538 $self->{'_elements'} = []; | |
539 $self->{'_reporttype'} = 'exonerate'; | |
540 } | |
541 | |
542 | |
543 =head2 end_document | |
544 | |
545 Title : end_document | |
546 Usage : $eventgenerator->end_document | |
547 Function: Handles an end document event | |
548 Returns : Bio::Search::Result::ResultI object | |
549 Args : none | |
550 | |
551 | |
552 =cut | |
553 | |
554 sub end_document{ | |
555 my ($self,@args) = @_; | |
556 return $self->{'_result'}; | |
557 } | |
558 | |
559 | |
560 sub write_result { | |
561 my ($self, $blast, @args) = @_; | |
562 | |
563 if( not defined($self->writer) ) { | |
564 $self->warn("Writer not defined. Using a $DEFAULT_WRITER_CLASS"); | |
565 $self->writer( $DEFAULT_WRITER_CLASS->new() ); | |
566 } | |
567 $self->SUPER::write_result( $blast, @args ); | |
568 } | |
569 | |
570 sub result_count { | |
571 my $self = shift; | |
572 return $self->{'_result_count'}; | |
573 } | |
574 | |
575 sub report_count { shift->result_count } | |
576 | |
577 1; | |
578 |