comparison variant_effect_predictor/Bio/SearchIO/fasta.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: fasta.pm,v 1.33.2.3 2003/08/28 16:01:03 jason Exp $
2 #
3 # BioPerl module for Bio::SearchIO::fasta
4 #
5 # Cared for by Jason Stajich <jason@bioperl.org>
6 #
7 # Copyright Jason Stajich
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::SearchIO::fasta - A SearchIO parser for FASTA results
16
17 =head1 SYNOPSIS
18
19 # Do not use this object directly, use it through the SearchIO system
20 use Bio::SearchIO;
21 my $searchio = new Bio::SearchIO(-format => 'fasta',
22 -file => 'report.FASTA');
23 while( my $result = $searchio->next_result ) {
24 # ....
25 }
26
27 =head1 DESCRIPTION
28
29 This object contains the event based parsing code for FASTA format reports.
30
31 =head1 FEEDBACK
32
33 =head2 Mailing Lists
34
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to
37 the Bioperl mailing list. Your participation is much appreciated.
38
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/MailList.shtml - About the mailing lists
41
42 =head2 Reporting Bugs
43
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 of the bugs and their resolution. Bug reports can be submitted via
46 email or the web:
47
48 bioperl-bugs@bioperl.org
49 http://bugzilla.bioperl.org/
50
51 =head1 AUTHOR - Jason Stajich
52
53 Email jason@bioperl.org
54
55 Describe contact details here
56
57 =head1 CONTRIBUTORS
58
59 Additional contributors names and emails here
60
61 =head1 APPENDIX
62
63 The rest of the documentation details each of the object methods.
64 Internal methods are usually preceded with a _
65
66 =cut
67
68
69 # Let the code begin...
70
71
72 package Bio::SearchIO::fasta;
73 use vars qw(@ISA %MODEMAP %MAPPING $IDLENGTH);
74 use strict;
75
76 # Object preamble - inherits from Bio::Root::RootI
77
78 use Bio::SearchIO;
79 use POSIX;
80
81 BEGIN {
82 # Set IDLENGTH to a new value if you have
83 # compile FASTA with a different ID length
84 # (actually newest FASTA allows the setting of this
85 # via -C parameter, default is 6)
86 $IDLENGTH = 6;
87
88 # mapping of NCBI Blast terms to Bioperl hash keys
89 %MODEMAP = ('FastaOutput' => 'result',
90 'Hit' => 'hit',
91 'Hsp' => 'hsp'
92 );
93
94 # This should really be done more intelligently, like with
95 # XSLT
96
97 %MAPPING =
98 (
99 'Hsp_bit-score' => 'HSP-bits',
100 'Hsp_score' => 'HSP-score',
101 'Hsp_sw-score' => 'HSP-swscore',
102 'Hsp_evalue' => 'HSP-evalue',
103 'Hsp_query-from'=> 'HSP-query_start',
104 'Hsp_query-to' => 'HSP-query_end',
105 'Hsp_hit-from' => 'HSP-hit_start',
106 'Hsp_hit-to' => 'HSP-hit_end',
107 'Hsp_positive' => 'HSP-conserved',
108 'Hsp_identity' => 'HSP-identical',
109 'Hsp_gaps' => 'HSP-hsp_gaps',
110 'Hsp_hitgaps' => 'HSP-hit_gaps',
111 'Hsp_querygaps' => 'HSP-query_gaps',
112 'Hsp_qseq' => 'HSP-query_seq',
113 'Hsp_hseq' => 'HSP-hit_seq',
114 'Hsp_midline' => 'HSP-homology_seq',
115 'Hsp_align-len' => 'HSP-hsp_length',
116 'Hsp_query-frame'=> 'HSP-query_frame',
117 'Hsp_hit-frame' => 'HSP-hit_frame',
118
119 'Hit_id' => 'HIT-name',
120 'Hit_len' => 'HIT-length',
121 'Hit_accession' => 'HIT-accession',
122 'Hit_def' => 'HIT-description',
123 'Hit_signif' => 'HIT-significance',
124 'Hit_score' => 'HIT-score',
125
126 'FastaOutput_program' => 'RESULT-algorithm_name',
127 'FastaOutput_version' => 'RESULT-algorithm_version',
128 'FastaOutput_query-def'=> 'RESULT-query_name',
129 'FastaOutput_querydesc'=> 'RESULT-query_description',
130 'FastaOutput_query-len'=> 'RESULT-query_length',
131 'FastaOutput_db' => 'RESULT-database_name',
132 'FastaOutput_db-len' => 'RESULT-database_entries',
133 'FastaOutput_db-let' => 'RESULT-database_letters',
134
135 'Parameters_matrix' => { 'RESULT-parameters' => 'matrix'},
136 'Parameters_expect' => { 'RESULT-parameters' => 'expect'},
137 'Parameters_include' => { 'RESULT-parameters' => 'include'},
138 'Parameters_sc-match' => { 'RESULT-parameters' => 'match'},
139 'Parameters_sc-mismatch' => { 'RESULT-parameters' => 'mismatch'},
140 'Parameters_gap-open' => { 'RESULT-parameters' => 'gapopen'},
141 'Parameters_gap-ext' => { 'RESULT-parameters' => 'gapext'},
142 'Parameters_word-size' => { 'RESULT-parameters' => 'wordsize'},
143 'Parameters_ktup' => { 'RESULT-parameters' => 'ktup'},
144 'Parameters_filter' => {'RESULT-parameters' => 'filter'},
145 'Statistics_db-num' => { 'RESULT-statistics' => 'dbentries'},
146 'Statistics_db-len' => { 'RESULT-statistics' => 'dbletters'},
147 'Statistics_hsp-len' => { 'RESULT-statistics' => 'hsplength'},
148 'Statistics_eff-space' => { 'RESULT-statistics' => 'effectivespace'},
149 'Statistics_kappa' => { 'RESULT-statistics' => 'kappa' },
150 'Statistics_lambda' => { 'RESULT-statistics' => 'lambda' },
151 'Statistics_entropy' => { 'RESULT-statistics' => 'entropy'},
152 );
153 }
154
155
156 @ISA = qw(Bio::SearchIO );
157
158 =head2 new
159
160 Title : new
161 Usage : my $obj = new Bio::SearchIO::fasta();
162 Function: Builds a new Bio::SearchIO::fasta object
163 Returns : Bio::SearchIO::fasta
164 Args : -idlength - set ID length to something other
165 than the default (7), this is only
166 necessary if you have compiled FASTA
167 with a new default id length to display
168 in the HSP alignment blocks
169
170 =cut
171
172 sub _initialize {
173 my($self,@args) = @_;
174 $self->SUPER::_initialize(@args);
175 return unless @args;
176 my ($idlength) = $self->_rearrange([qw(IDLENGTH)],@args);
177 $self->idlength($idlength || $IDLENGTH);
178 $self->_eventHandler->register_factory('hsp', Bio::Search::HSP::HSPFactory->new(-type => 'Bio::Search::HSP::FastaHSP'));
179
180 return 1;
181 }
182
183 =head2 next_result
184
185 Title : next_result
186 Usage : my $hit = $searchio->next_result;
187 Function: Returns the next Result from a search
188 Returns : Bio::Search::Result::ResultI object
189 Args : none
190
191 =cut
192
193 sub next_result{
194 my ($self) = @_;
195
196 my $data = '';
197 my $seentop = 0;
198 my $current_hsp;
199 $self->start_document();
200 my @hit_signifs;
201 while( defined ($_ = $self->_readline )) {
202 next if( ! $self->in_element('hsp') &&
203 /^\s+$/); # skip empty lines
204 if( /(\S+)\s+searches\s+a\s+((protein\s+or\s+DNA\s+sequence)|(sequence\s+database))/i || /(\S+) compares a/ ||
205 ( m/^# / && ($_ = $self->_readline) &&
206 /(\S+)\s+searches\s+a\s+((protein\s+or\s+DNA\s+sequence)|(sequence\s+database))/i || /(\S+) compares a/
207 )
208 ) {
209 if( $seentop ) {
210 $self->_pushback($_);
211 $self->end_element({ 'Name' => 'FastaOutput'});
212 return $self->end_document();
213 }
214 $self->{'_reporttype'} = $1;
215 $self->start_element({ 'Name' => 'FastaOutput' } );
216 $self->{'_result_count'}++;
217 $seentop = 1;
218
219 $self->element({ 'Name' => 'FastaOutput_program',
220 'Data' => $self->{'_reporttype'}});
221 $_ = $self->_readline();
222 my ($version) = (/version\s+(\S+)/);
223 $version = '' unless defined $version;
224 $self->{'_version'} = $version;
225 $self->element({ 'Name' => 'FastaOutput_version',
226 'Data' => $version});
227
228 my ($last, $leadin, $type, $querylen, $querytype, $querydef);
229
230 while( defined($_ = $self->_readline()) ) {
231 if( /^ (
232 (?:\s+>) | # fa33 lead-in
233 (?:\s*\d+\s*>>>) # fa34 mlib lead-in
234 )
235 (.*)
236 /x
237 ) {
238 ($leadin, $querydef) = ($1, $2);
239 if ($leadin =~ m/>>>/) {
240 if($querydef =~ /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt)\s*$/o ) {
241 ($querydef, $querylen, $querytype) = ($1, $2, $3);
242 last;
243 }
244 } else {
245 if( $last =~ /(\S+)[:,]\s*(\d+)\s+(aa|nt)/ ) {
246 ($querylen, $querytype) = ($2, $3);
247 $querydef ||= $1;
248 last;
249 }
250 }
251 } elsif ( m/^\s*vs\s+\S+/o ) {
252 if ( $last =~ /(\S+)[,:]\s+(\d+)\s+(aa|nt)/o) {
253 ($querydef, $querylen, $querytype) = ($1, $2, $3);
254 last;
255 }
256 }
257 $last = $_;
258 }
259
260 if( $self->{'_reporttype'} &&
261 $self->{'_reporttype'} eq 'FASTA'
262 ) {
263 if( $querytype eq 'nt') {
264 $self->{'_reporttype'} = 'FASTN' ;
265 } elsif( $querytype eq 'aa' ) {
266 $self->{'_reporttype'} = 'FASTP' ;
267 }
268 }
269 my ($name, $descr) = $querydef =~ m/^(\S+)\s*(.*?)\s*$/o;
270 $self->element({'Name' => 'FastaOutput_query-def',
271 'Data' => $name});
272 $self->element({'Name' => 'FastaOutput_querydesc',
273 'Data' => $descr});
274 if ($querylen) {
275 $self->element({'Name' => 'FastaOutput_query-len',
276 'Data' => $querylen});
277 } else {
278 $self->warn("unable to find and set query length");
279 }
280
281 if( $last =~ /^\s*vs\s+(\S+)/ ||
282 ($last =~ /^searching\s+(\S+)\s+library/) ||
283 (defined $_ && /^\s*vs\s+(\S+)/) ||
284 (defined ($_ = $self->_readline()) && /^\s*vs\s+(\S+)/)
285 ) {
286 $self->element({'Name' => 'FastaOutput_db',
287 'Data' => $1});
288 } elsif (m/^\s+opt(?:\s+E\(\))?$/o) {
289 # histogram ... read over it more rapidly than the larger outer loop:
290 while (defined($_ = $self->_readline)) {
291 last if m/^>\d+/;
292 }
293 }
294
295 } elsif( /(\d+) residues in\s+(\d+)\s+sequences/ ) {
296 $self->element({'Name' => 'FastaOutput_db-let',
297 'Data' => $1});
298 $self->element({'Name' => 'FastaOutput_db-len',
299 'Data' => $2});
300 $self->element({'Name' => 'Statistics_db-len',
301 'Data' => $1});
302 $self->element({'Name' => 'Statistics_db-num',
303 'Data' => $2});
304 } elsif( /Lambda=\s*(\S+)/ ) {
305 $self->element({'Name' => 'Statistics_lambda',
306 'Data' => $1});
307 } elsif (/K=\s*(\S+)/) {
308 $self->element({'Name' => 'Statistics_kappa',
309 'Data' => $1});
310 } elsif( /^\s*(Smith-Waterman).+(\S+)\s*matrix [^\]]*?(xS)?\]/ ) {
311 $self->element({'Name' => 'Parameters_matrix',
312 'Data' => $2});
313 $self->element({'Name' => 'Parameters_filter',
314 'Data' => defined $3 ? 1 : 0,
315 });
316 $self->{'_reporttype'} = $1;
317
318 $self->element({ 'Name' => 'FastaOutput_program',
319 'Data' => $self->{'_reporttype'}});
320
321 } elsif( /The best( related| unrelated)? scores are:/ ) {
322 my $rel = $1;
323 my @labels = split;
324 @labels = map {
325 if ($_ =~ m/^E\((\d+)\)$/o) {
326 $self->element({'Name' => 'Statistics_eff-space', 'Data' => $1});
327 "evalue";
328 } else {
329 $_;
330 }
331 } @labels[$rel ? 5 : 4 .. $#labels];
332
333 while( defined ($_ = $self->_readline() ) &&
334 ! /^\s+$/ ) {
335 my @line = split;
336
337 if ($line[-1] =~ m/\=/o && $labels[-1] eq 'fs') {
338 # unlabelled alignment hit;
339 push @labels, "aln_code";
340 }
341
342 my %data;
343 @data{@labels} = splice(@line, @line - @labels);
344 if ($line[-1] =~ m/\[([1-6rf])\]/o) {
345 my $fr = $1;
346 $data{lframe} = ($fr =~ /\d/o ?
347 ($fr <= 3 ? "+$fr" : "-@{[$fr-3]}") :
348 ($fr eq 'f' ? '+1' : '-1')
349 );
350 pop @line;
351 } else {
352 $data{lframe} = '0';
353 }
354
355 if ($line[-1] =~ m/^\(?(\d+)\)$/) {
356 $data{hit_len} = $1;
357 pop @line;
358 if ($line[-1] =~ m/^\($/) {
359 pop @line;
360 }
361 } else {
362 $data{hit_len} = 0;
363 }
364
365 # rebuild the first part of the line, preserving spaces:
366 ($_) = m/^(\S+(?:\s+\S+){$#line})/;
367
368 my ($id, $desc) = split(/\s+/,$_,2);
369 my @pieces = split(/\|/,$id);
370 my $acc = pop @pieces;
371 $acc =~ s/\.\d+$//;
372
373 @data{qw(id desc acc)} = ($id, $desc, $acc);
374
375 push @hit_signifs, \%data;
376 }
377 } elsif( /^\s*([T]?FAST[XYAF]).+,\s*(\S+)\s*matrix[^\]]+?(xS)?\]\s*ktup:\s*(\d+)/ ) {
378 $self->element({'Name' => 'Parameters_matrix',
379 'Data' => $2});
380 $self->element({'Name' => 'Parameters_filter',
381 'Data' => defined $3 ? 1 : 0,
382 });
383 $self->element({'Name' => 'Parameters_ktup',
384 'Data' => $4});
385 $self->{'_reporttype'} = $1 if( $self->{'_reporttype'} !~ /FAST[PN]/i ) ;
386
387 $self->element({ 'Name' => 'FastaOutput_program',
388 'Data' => $self->{'_reporttype'}});
389
390 } elsif( /(?:gap\-pen|open\/ext):\s+([\-\+]?\d+)\s*\/\s*([\-\+]?\d+).+width:\s+(\d+)/ ) {
391 $self->element({'Name' => 'Parameters_gap-open',
392 'Data' => $1});
393 $self->element({'Name' => 'Parameters_gap-ext',
394 'Data' => $2});
395 $self->element({'Name' => 'Parameters_word-size',
396 'Data' => $3});
397 } elsif( /^>>(.+?)\s+\((\d+)\s*(aa|nt)\)$/ ) {
398 if( $self->in_element('hsp') ) {
399 $self->end_element({ 'Name' => 'Hsp'});
400 }
401 if( $self->in_element('hit') ) {
402 $self->end_element({ 'Name' => 'Hit'});
403 }
404
405 $self->start_element({'Name' => 'Hit'});
406 $self->element({ 'Name' => 'Hit_len',
407 'Data' => $2});
408 my ($id,$desc) = split(/\s+/,$1,2);
409 $self->element({ 'Name' => 'Hit_id',
410 'Data' => $id});
411 my @pieces = split(/\|/,$id);
412 my $acc = pop @pieces;
413 $acc =~ s/\.\d+$//;
414 $self->element({ 'Name' => 'Hit_accession',
415 'Data' => $acc});
416 $self->element({ 'Name' => 'Hit_def',
417 'Data' => $desc});
418
419 $_ = $self->_readline();
420 my ($score,$bits,$e) = /Z-score: \s* (\S+) \s*
421 (?: bits: \s* (\S+) \s+ )?
422 (?: E|expect ) \s* \(\) :? \s*(\S+)/x;
423 $bits = $score unless defined $bits;
424
425 my $v = shift @hit_signifs;
426 if( defined $v ) {
427 @{$v}{qw(evalue bits z-sc)} = ($e, $bits, $score);
428 }
429
430 $self->element({'Name' => 'Hit_signif',
431 'Data' => $v ? $v->{evalue} : $e });
432 $self->element({'Name' => 'Hit_score',
433 'Data' => $v ? $v->{bits} : $bits });
434 $self->start_element({'Name' => 'Hsp'});
435
436 $self->element({'Name' => 'Hsp_score',
437 'Data' => $v ? $v->{'z-sc'} : $score });
438 $self->element({'Name' => 'Hsp_evalue',
439 'Data' => $v ? $v->{evalue} : $e });
440 $self->element({'Name' => 'Hsp_bit-score',
441 'Data' => $v ? $v->{bits} : $bits });
442 $_ = $self->_readline();
443 if( /Smith-Waterman score:\s*(\d+)/ ) {
444 $self->element({'Name' => 'Hsp_sw-score',
445 'Data' => $1});
446 }
447 if( / (\S+)\% \s* identity
448 (?:\s* \( \s* (\S+)\% \s* ungapped \) )?
449 \s* in \s* (\d+) \s+ (?:aa|nt) \s+ overlap \s*
450 \( (\d+) \- (\d+) : (\d+) \- (\d+) \)
451 /x ) {
452 my ($identper,$gapper,$len,$querystart,
453 $queryend,$hitstart,$hitend) = ($1,$2,$3,$4,$5,$6,$7);
454 my $ident = POSIX::ceil(($identper/100) * $len);
455 my $gaps = ( defined $gapper ) ? POSIX::ceil ( ($gapper/100) * $len) : undef;
456
457 $self->element({'Name' => 'Hsp_gaps',
458 'Data' => $gaps});
459 $self->element({'Name' => 'Hsp_identity',
460 'Data' => $ident});
461 $self->element({'Name' => 'Hsp_positive',
462 'Data' => $ident});
463 $self->element({'Name' => 'Hsp_align-len',
464 'Data' => $len});
465
466 $self->element({'Name' => 'Hsp_query-from',
467 'Data' => $querystart});
468 $self->element({'Name' => 'Hsp_query-to',
469 'Data' => $queryend});
470 $self->element({'Name' => 'Hsp_hit-from',
471 'Data' => $hitstart});
472 $self->element({'Name' => 'Hsp_hit-to',
473 'Data' => $hitend});
474
475 }
476
477 if ($v) {
478 $self->element({'Name' => 'Hsp_querygaps', 'Data' => $v->{qgaps} }) if exists $v->{qgaps};
479 $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $v->{lgaps} }) if exists $v->{lgaps};
480
481 if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) {
482 if( 8 == scalar grep { exists $v->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) {
483 if ($v->{ax0} < $v->{an0}) {
484 $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($v->{px0} - $v->{ax0}) % 3) + 1]}" });
485 } else {
486 $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($v->{an0} - $v->{pn0}) % 3) + 1]}" });
487 }
488 if ($v->{ax1} < $v->{an1}) {
489 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($v->{px1} - $v->{ax1}) % 3) + 1]}" });
490 } else {
491 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($v->{an1} - $v->{pn1}) % 3) + 1]}" });
492 }
493 } else {
494 $self->element({'Name' => 'Hsp_query-frame', 'Data' => $v->{lframe} });
495 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 });
496 }
497 } else {
498 $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 });
499 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $v->{lframe} });
500 }
501
502 } else {
503 $self->warn( "unable to parse FASTA score line: $_");
504 }
505 } elsif( /\d+\s*residues\s*in\s*\d+\s*query\s*sequences/ ) {
506 if( $self->in_element('hsp') ) {
507 $self->end_element({'Name' => 'Hsp'});
508 }
509 if( $self->in_element('hit') ) {
510 $self->end_element({'Name' => 'Hit'});
511 }
512
513 # $_ = $self->_readline();
514 # my ( $liblen,$libsize) = /(\d+)\s+residues\s*in(\d+)\s*library/;
515 # fast forward to the end of the file as there is
516 # nothing else left to do with this file and want to be sure and
517 # reset it
518 while(defined($_ = $self->_readline() ) ) {
519 last if( /^Function used was/);
520 if( /(\S+)\s+searches\s+a\s+(protein\s+or\s+DNA\s+sequence)|(sequence\s+database)/ ) {
521 $self->_pushback($_);
522 }
523 }
524
525 if (@hit_signifs) {
526 # process remaining best hits
527 for my $h (@hit_signifs) {
528 # Hsp_score Hsp_evalue Hsp_bit-score
529 # Hsp_sw-score Hsp_gaps Hsp_identity Hsp_positive
530 # Hsp_align-len Hsp_query-from Hsp_query-to
531 # Hsp_hit-from Hsp_hit-to Hsp_qseq Hsp_midline
532
533 $self->start_element({'Name' => 'Hit'});
534 $self->element({ 'Name' => 'Hit_len',
535 'Data' => $h->{hit_len}
536 }) if exists $h->{hit_len};
537 $self->element({ 'Name' => 'Hit_id',
538 'Data' => $h->{id}
539 }) if exists $h->{id};
540 $self->element({ 'Name' => 'Hit_accession',
541 'Data' => $h->{acc}
542 }) if exists $h->{acc};
543 $self->element({ 'Name' => 'Hit_def',
544 'Data' => $h->{desc}
545 }) if exists $h->{desc};
546 $self->element({'Name' => 'Hit_signif',
547 'Data' => $h->{evalue}
548 }) if exists $h->{evalue};
549 $self->element({'Name' => 'Hit_score',
550 'Data' => $h->{bits}
551 }) if exists $h->{bits};
552
553 $self->start_element({'Name' => 'Hsp'});
554 $self->element({'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} }) if exists $h->{'z-sc'};
555 $self->element({'Name' => 'Hsp_evalue', 'Data' => $h->{evalue} }) if exists $h->{evalue};
556 $self->element({'Name' => 'Hsp_bit-score', 'Data' => $h->{bits} }) if exists $h->{bits};
557 $self->element({'Name' => 'Hsp_sw-score', 'Data' => $h->{sw} }) if exists $h->{sw};
558 $self->element({'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} }) if exists $h->{'%_gid'};
559 $self->element({'Name' => 'Hsp_identity', 'Data' => POSIX::ceil($h->{'%_id'} * $h->{alen}) })
560 if (exists $h->{'%_id'} && exists $h->{alen});
561 $self->element({'Name' => 'Hsp_positive', 'Data' => 100 * $h->{'%_id'} }) if exists $h->{'%_id'};
562 $self->element({'Name' => 'Hsp_align-len', 'Data' => $h->{alen} }) if exists $h->{alen};
563 $self->element({'Name' => 'Hsp_query-from', 'Data' => $h->{an0} }) if exists $h->{an0};
564 $self->element({'Name' => 'Hsp_query-to', 'Data' => $h->{ax0} }) if exists $h->{ax0};
565 $self->element({'Name' => 'Hsp_hit-from', 'Data' => $h->{an1} }) if exists $h->{an1};
566 $self->element({'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1} }) if exists $h->{ax1};
567
568 $self->element({'Name' => 'Hsp_querygaps', 'Data' => $h->{qgaps} }) if exists $h->{qgaps};
569 $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps} }) if exists $h->{lgaps};
570
571 if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) {
572 if( 8 == scalar grep { exists $h->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) {
573 if ($h->{ax0} < $h->{an0}) {
574 $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}" });
575 } else {
576 $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}" });
577 }
578 if ($h->{ax1} < $h->{an1}) {
579 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}" });
580 } else {
581 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}" });
582 }
583 } else {
584 $self->element({'Name' => 'Hsp_query-frame', 'Data' => $h->{lframe} });
585 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 });
586 }
587 } else {
588 $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 });
589 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $h->{lframe} });
590 }
591
592 $self->end_element({'Name' => 'Hsp'});
593 $self->end_element({'Name' => 'Hit'});
594 }
595 }
596
597 $self->end_element({ 'Name' => 'FastaOutput'});
598 return $self->end_document();
599 } elsif( /^\s*\d+\s*>>>/) {
600 if ($self->within_element('FastaOutput')) {
601 if( $self->in_element('hsp') ) {
602 $self->end_element({'Name' => 'Hsp'});
603 }
604 if( $self->in_element('hit') ) {
605 $self->end_element({'Name' => 'Hit'});
606 }
607
608 if (@hit_signifs) {
609 # process remaining best hits
610 for my $h (@hit_signifs) {
611 $self->start_element({'Name' => 'Hit'});
612 $self->element({ 'Name' => 'Hit_len',
613 'Data' => $h->{hit_len}
614 }) if exists $h->{hit_len};
615 $self->element({ 'Name' => 'Hit_id',
616 'Data' => $h->{id}
617 }) if exists $h->{id};
618 $self->element({ 'Name' => 'Hit_accession',
619 'Data' => $h->{acc}
620 }) if exists $h->{acc};
621 $self->element({ 'Name' => 'Hit_def',
622 'Data' => $h->{desc}
623 }) if exists $h->{desc};
624 $self->element({'Name' => 'Hit_signif',
625 'Data' => $h->{evalue}
626 }) if exists $h->{evalue};
627 $self->element({'Name' => 'Hit_score',
628 'Data' => $h->{bits}
629 }) if exists $h->{bits};
630
631 $self->start_element({'Name' => 'Hsp'});
632 $self->element({'Name' => 'Hsp_score', 'Data' => $h->{'z-sc'} }) if exists $h->{'z-sc'};
633 $self->element({'Name' => 'Hsp_evalue', 'Data' => $h->{evalue} }) if exists $h->{evalue};
634 $self->element({'Name' => 'Hsp_bit-score', 'Data' => $h->{bits} }) if exists $h->{bits};
635 $self->element({'Name' => 'Hsp_sw-score', 'Data' => $h->{sw} }) if exists $h->{sw};
636 $self->element({'Name' => 'Hsp_gaps', 'Data' => $h->{'%_gid'} }) if exists $h->{'%_gid'};
637 $self->element({'Name' => 'Hsp_identity', 'Data' => POSIX::ceil($h->{'%_id'} * $h->{alen}) })
638 if (exists $h->{'%_id'} && exists $h->{alen});
639 $self->element({'Name' => 'Hsp_positive', 'Data' => $h->{'%_id'} }) if exists $h->{'%_id'};
640 $self->element({'Name' => 'Hsp_align-len', 'Data' => $h->{alen} }) if exists $h->{alen};
641 $self->element({'Name' => 'Hsp_query-from', 'Data' => $h->{an0} }) if exists $h->{an0};
642 $self->element({'Name' => 'Hsp_query-to', 'Data' => $h->{ax0} }) if exists $h->{ax0};
643 $self->element({'Name' => 'Hsp_hit-from', 'Data' => $h->{an1} }) if exists $h->{an1};
644 $self->element({'Name' => 'Hsp_hit-to', 'Data' => $h->{ax1} }) if exists $h->{ax1};
645
646 $self->element({'Name' => 'Hsp_querygaps', 'Data' => $h->{qgaps} }) if exists $h->{qgaps};
647 $self->element({'Name' => 'Hsp_hitgaps', 'Data' => $h->{lgaps} }) if exists $h->{lgaps};
648
649 if ($self->{'_reporttype'} =~ m/^FAST[NXY]$/o) {
650 if( 8 == scalar grep { exists $h->{$_} } qw(an0 ax0 pn0 px0 an1 ax1 pn1 px1) ) {
651 if ($h->{ax0} < $h->{an0}) {
652 $self->element({'Name' => 'Hsp_query-frame', 'Data' => "-@{[(($h->{px0} - $h->{ax0}) % 3) + 1]}" });
653 } else {
654 $self->element({'Name' => 'Hsp_query-frame', 'Data' => "+@{[(($h->{an0} - $h->{pn0}) % 3) + 1]}" });
655 }
656 if ($h->{ax1} < $h->{an1}) {
657 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "-@{[(($h->{px1} - $h->{ax1}) % 3) + 1]}" });
658 } else {
659 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => "+@{[(($h->{an1} - $h->{pn1}) % 3) + 1]}" });
660 }
661 } else {
662 $self->element({'Name' => 'Hsp_query-frame', 'Data' => $h->{lframe} });
663 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => 0 });
664 }
665 } else {
666 $self->element({'Name' => 'Hsp_query-frame', 'Data' => 0 });
667 $self->element({'Name' => 'Hsp_hit-frame', 'Data' => $h->{lframe} });
668 }
669
670 $self->end_element({'Name' => 'Hsp'});
671 $self->end_element({'Name' => 'Hit'});
672 }
673 }
674 $self->end_element({ 'Name' => 'FastaOutput' });
675 $self->_pushback($_);
676 return $self->end_document();
677 } else {
678 $self->start_element({ 'Name' => 'FastaOutput' });
679 $self->{'_result_count'}++;
680 $seentop = 1;
681 $self->element({ 'Name' => 'FastaOutput_program',
682 'Data' => $self->{'_reporttype'} });
683 $self->element({ 'Name' => 'FastaOutput_version',
684 'Data' => $self->{'_version'} });
685
686 my ($type, $querylen, $querytype, $querydef);
687
688 if( /^\s*\d+\s*>>>(.*)/ ) {
689 $querydef = $1;
690 if($querydef =~ /^(.*?)\s+(?:\-\s+)?(\d+)\s+(aa|nt)\s*$/o ) {
691 ($querydef, $querylen, $querytype) = ($1, $2, $3);
692 }
693 }
694
695 if( $self->{'_reporttype'} &&
696 $self->{'_reporttype'} eq 'FASTA'
697 ) {
698 if( $querytype eq 'nt') {
699 $self->{'_reporttype'} = 'FASTN' ;
700 } elsif( $querytype eq 'aa' ) {
701 $self->{'_reporttype'} = 'FASTP' ;
702 }
703 }
704 my ($name,$descr) = ($querydef =~ m/^(\S+)(?:\s+(.*))?\s*$/o);
705 $self->element({'Name' => 'FastaOutput_query-def',
706 'Data' => $name});
707 $self->element({'Name' => 'FastaOutput_querydesc',
708 'Data' => $descr});
709 if ($querylen) {
710 $self->element({'Name' => 'FastaOutput_query-len',
711 'Data' => $querylen});
712 } else {
713 $self->warn("unable to find and set query length");
714 }
715
716
717 if( defined ($_ = $self->_readline()) && /^\s*vs\s+(\S+)/ ) {
718 $self->element({'Name' => 'FastaOutput_db',
719 'Data' => $1});
720 }
721 }
722 } elsif( $self->in_element('hsp' ) ) {
723
724 my @data = ( '','','');
725 my $count = 0;
726 my $len = $self->idlength + 1;
727 my ($seq1_id);
728 while( defined($_ ) ) {
729 chomp;
730 $self->debug( "$count $_\n");
731
732 if( /residues in \d+\s+query\s+sequences/) {
733 $self->_pushback($_);
734 last;
735 } elsif( /^>>/ ) {
736 $self->_pushback($_);
737 last;
738 } elsif (/^\s*\d+\s*>>>/) {
739 $self->_pushback($_);
740 last;
741 }
742 if( $count == 0 ) {
743 unless( /^\s+\d+/ || /^\s+$/) {
744 $self->_pushback($_);
745 $count = 2;
746 }
747 } elsif( $count == 1 || $count == 3 ) {
748 if( /^(\S+)\s+/ ) {
749 $len = CORE::length($1) if $len < CORE::length($1);
750 s/\s+$//; # trim trailing spaces,we don't want them
751 $data[$count-1] = substr($_,$len);
752 } elsif( /^\s+(\d+)/ ) {
753 $count = -1;
754 $self->_pushback($_);
755 } elsif( /^\s+$/ || length($_) == 0) {
756 $count = 5;
757 # going to skip these
758 } else {
759 $self->warn("Unrecognized alignment line ($count) '$_'");
760 }
761 } elsif( $count == 2 ) {
762 if( /^\s+\d+\s+/ ) {
763 $self->warn("$_\n");
764 $count = 4;
765 } else {
766 # toss the first IDLENGTH characters of the line
767 if( length($_) >= $len ) {
768 $data[$count-1] = substr($_,$len);
769 }
770 }
771 }
772 last if( $count++ >= 5);
773 $_ = $self->_readline();
774 }
775 if( length($data[0]) > 0 || length($data[2]) > 0 ) {
776 $self->characters({'Name' => 'Hsp_qseq',
777 'Data' => $data[0] });
778 $self->characters({'Name' => 'Hsp_midline',
779 'Data' => $data[1]});
780 $self->characters({'Name' => 'Hsp_hseq',
781 'Data' => $data[2]});
782 }
783 } else {
784 if( ! $seentop ) {
785 $self->debug($_);
786 $self->warn("unrecognized FASTA Family report file!");
787 return undef;
788 }
789 }
790 }
791 }
792
793
794 =head2 start_element
795
796 Title : start_element
797 Usage : $eventgenerator->start_element
798 Function: Handles a start element event
799 Returns : none
800 Args : hashref with at least 2 keys 'Data' and 'Name'
801
802
803 =cut
804
805 sub start_element{
806 my ($self,$data) = @_;
807 # we currently don't care about attributes
808 my $nm = $data->{'Name'};
809 if( my $type = $MODEMAP{$nm} ) {
810 $self->_mode($type);
811 if( $self->_eventHandler->will_handle($type) ) {
812 my $func = sprintf("start_%s",lc $type);
813 $self->_eventHandler->$func($data->{'Attributes'});
814 }
815 unshift @{$self->{'_elements'}}, $type;
816 }
817 if($nm eq 'FastaOutput') {
818 $self->{'_values'} = {};
819 $self->{'_result'}= undef;
820 $self->{'_mode'} = '';
821 }
822
823 }
824
825 =head2 end_element
826
827 Title : start_element
828 Usage : $eventgenerator->end_element
829 Function: Handles an end element event
830 Returns : none
831 Args : hashref with at least 2 keys 'Data' and 'Name'
832
833
834 =cut
835
836 sub end_element {
837 my ($self,$data) = @_;
838 my $nm = $data->{'Name'};
839 my $rc;
840 # Hsp are sort of weird, in that they end when another
841 # object begins so have to detect this in end_element for now
842 if( $nm eq 'Hsp' ) {
843 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
844 $self->element({'Name' => $_,
845 'Data' => $self->{'_last_hspdata'}->{$_}});
846 }
847 $self->{'_last_hspdata'} = {}
848 }
849
850 if( my $type = $MODEMAP{$nm} ) {
851 if( $self->_eventHandler->will_handle($type) ) {
852 my $func = sprintf("end_%s",lc $type);
853 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
854 $self->{'_values'});
855 }
856 shift @{$self->{'_elements'}};
857
858 } elsif( $MAPPING{$nm} ) {
859 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
860 my $key = (keys %{$MAPPING{$nm}})[0];
861 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
862 } else {
863 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
864 }
865 } else {
866 $self->warn( "unknown nm $nm, ignoring\n");
867 }
868 $self->{'_last_data'} = ''; # remove read data if we are at
869 # end of an element
870 $self->{'_result'} = $rc if( $nm eq 'FastaOutput' );
871 return $rc;
872
873 }
874
875 =head2 element
876
877 Title : element
878 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
879 Function: Convience method that calls start_element, characters, end_element
880 Returns : none
881 Args : Hash ref with the keys 'Name' and 'Data'
882
883
884 =cut
885
886 sub element{
887 my ($self,$data) = @_;
888 $self->start_element($data);
889 $self->characters($data);
890 $self->end_element($data);
891 }
892
893
894 =head2 characters
895
896 Title : characters
897 Usage : $eventgenerator->characters($str)
898 Function: Send a character events
899 Returns : none
900 Args : string
901
902
903 =cut
904
905 sub characters{
906 my ($self,$data) = @_;
907
908 return unless ( defined $data->{'Data'} );
909 if( $data->{'Data'} =~ /^\s+$/ ) {
910 return unless $data->{'Name'} =~ /Hsp\_(midline|qseq|hseq)/;
911 }
912
913 if( $self->in_element('hsp') &&
914 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/ ) {
915
916 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
917 }
918
919 $self->{'_last_data'} = $data->{'Data'};
920 }
921
922 =head2 _mode
923
924 Title : _mode
925 Usage : $obj->_mode($newval)
926 Function:
927 Example :
928 Returns : value of _mode
929 Args : newvalue (optional)
930
931
932 =cut
933
934 sub _mode{
935 my ($self,$value) = @_;
936 if( defined $value) {
937 $self->{'_mode'} = $value;
938 }
939 return $self->{'_mode'};
940 }
941
942 =head2 within_element
943
944 Title : within_element
945 Usage : if( $eventgenerator->within_element($element) ) {}
946 Function: Test if we are within a particular element
947 This is different than 'in' because within can be tested
948 for a whole block.
949 Returns : boolean
950 Args : string element name
951
952
953 =cut
954
955 sub within_element{
956 my ($self,$name) = @_;
957 return 0 if ( ! defined $name &&
958 ! defined $self->{'_elements'} ||
959 scalar @{$self->{'_elements'}} == 0) ;
960 foreach ( @{$self->{'_elements'}} ) {
961 if( $_ eq $name || $_ eq $MODEMAP{$name} ) {
962 return 1;
963 }
964 }
965 return 0;
966 }
967
968 =head2 in_element
969
970 Title : in_element
971 Usage : if( $eventgenerator->in_element($element) ) {}
972 Function: Test if we are in a particular element
973 This is different than 'in' because within can be tested
974 for a whole block.
975 Returns : boolean
976 Args : string element name
977
978
979 =cut
980
981 sub in_element{
982 my ($self,$name) = @_;
983 return 0 if ! defined $self->{'_elements'}->[0];
984 return ( $self->{'_elements'}->[0] eq $name ||
985 (exists $MODEMAP{$name} && $self->{'_elements'}->[0] eq $MODEMAP{$name})
986 );
987 }
988
989
990 =head2 start_document
991
992 Title : start_document
993 Usage : $eventgenerator->start_document
994 Function: Handles a start document event
995 Returns : none
996 Args : none
997
998
999 =cut
1000
1001 sub start_document{
1002 my ($self) = @_;
1003 $self->{'_lasttype'} = '';
1004 $self->{'_values'} = {};
1005 $self->{'_result'}= undef;
1006 $self->{'_mode'} = '';
1007 $self->{'_elements'} = [];
1008 }
1009
1010
1011 =head2 end_document
1012
1013 Title : end_document
1014 Usage : $eventgenerator->end_document
1015 Function: Handles an end document event
1016 Returns : Bio::Search::Result::ResultI object
1017 Args : none
1018
1019
1020 =cut
1021
1022 sub end_document{
1023 my ($self,@args) = @_;
1024 return $self->{'_result'};
1025 }
1026
1027 =head2 idlength
1028
1029 Title : idlength
1030 Usage : $obj->idlength($newval)
1031 Function: Internal storage of the length of the ID desc
1032 in the HSP alignment blocks. Defaults to
1033 $IDLENGTH class variable value
1034 Returns : value of idlength
1035 Args : newvalue (optional)
1036
1037
1038 =cut
1039
1040 sub idlength{
1041 my ($self,$value) = @_;
1042 if( defined $value) {
1043 $self->{'_idlength'} = $value;
1044 }
1045 return $self->{'_idlength'} || $IDLENGTH;
1046 }
1047
1048
1049 =head2 result_count
1050
1051 Title : result_count
1052 Usage : my $count = $searchio->result_count
1053 Function: Returns the number of results we have processed
1054 Returns : integer
1055 Args : none
1056
1057
1058 =cut
1059
1060 sub result_count {
1061 my $self = shift;
1062 return $self->{'_result_count'};
1063 }
1064
1065 1;
1066