0
|
1 # $Id: hmmer.pm,v 1.13.2.3 2003/08/07 15:40:36 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SearchIO::hmmer
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::SearchIO::hmmer - A parser for HMMER output (hmmpfam, hmmsearch)
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # do not use this class directly it is available through Bio::SearchIO
|
|
20 use Bio::SearchIO;
|
|
21 my $in = new Bio::SearchIO(-format => 'hmmer',
|
|
22 -file => 't/data/L77119.hmmer');
|
|
23 while( my $result = $in->next_result ) {
|
|
24 # this is a Bio::Search::Result::HMMERResult object
|
|
25 print $result->query_name(), " for HMM ", $result->hmm_name(), "\n";
|
|
26 while( my $hit = $result->next_hit ) {
|
|
27 print $hit->name(), "\n";
|
|
28 while( my $hsp = $hit->next_hsp ) {
|
|
29 print "length is ", $hsp->length(), "\n";
|
|
30 }
|
|
31 }
|
|
32 }
|
|
33
|
|
34 =head1 DESCRIPTION
|
|
35
|
|
36 This object implements a parser for HMMER output.
|
|
37
|
|
38 =head1 FEEDBACK
|
|
39
|
|
40 =head2 Mailing Lists
|
|
41
|
|
42 User feedback is an integral part of the evolution of this and other
|
|
43 Bioperl modules. Send your comments and suggestions preferably to
|
|
44 the Bioperl mailing list. Your participation is much appreciated.
|
|
45
|
|
46 bioperl-l@bioperl.org - General discussion
|
|
47 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
48
|
|
49 =head2 Reporting Bugs
|
|
50
|
|
51 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
52 of the bugs and their resolution. Bug reports can be submitted via
|
|
53 email or the web:
|
|
54
|
|
55 bioperl-bugs@bioperl.org
|
|
56 http://bugzilla.bioperl.org/
|
|
57
|
|
58 =head1 AUTHOR - Jason Stajich
|
|
59
|
|
60 Email jason@bioperl.org
|
|
61
|
|
62 Describe contact details here
|
|
63
|
|
64 =head1 CONTRIBUTORS
|
|
65
|
|
66 Additional contributors names and emails here
|
|
67
|
|
68 =head1 APPENDIX
|
|
69
|
|
70 The rest of the documentation details each of the object methods.
|
|
71 Internal methods are usually preceded with a _
|
|
72
|
|
73 =cut
|
|
74
|
|
75 package Bio::SearchIO::hmmer;
|
|
76 use vars qw(@ISA);
|
|
77 use strict;
|
|
78 use vars qw(@ISA %MAPPING %MODEMAP
|
|
79 $DEFAULT_HSP_FACTORY_CLASS
|
|
80 $DEFAULT_HIT_FACTORY_CLASS
|
|
81 $DEFAULT_RESULT_FACTORY_CLASS
|
|
82 );
|
|
83 use Bio::SearchIO;
|
|
84 use Bio::Factory::ObjectFactory;
|
|
85
|
|
86 @ISA = qw(Bio::SearchIO );
|
|
87
|
|
88 BEGIN {
|
|
89 # mapping of HMMER items to Bioperl hash keys
|
|
90 %MODEMAP = ('HMMER_Output' => 'result',
|
|
91 'Hit' => 'hit',
|
|
92 'Hsp' => 'hsp'
|
|
93 );
|
|
94
|
|
95 %MAPPING = (
|
|
96 'Hsp_bit-score' => 'HSP-bits',
|
|
97 'Hsp_score' => 'HSP-score',
|
|
98 'Hsp_evalue' => 'HSP-evalue',
|
|
99 'Hsp_query-from' => 'HSP-query_start',
|
|
100 'Hsp_query-to' => 'HSP-query_end',
|
|
101 'Hsp_hit-from' => 'HSP-hit_start',
|
|
102 'Hsp_hit-to' => 'HSP-hit_end',
|
|
103 'Hsp_positive' => 'HSP-conserved',
|
|
104 'Hsp_identity' => 'HSP-identical',
|
|
105 'Hsp_gaps' => 'HSP-hsp_gaps',
|
|
106 'Hsp_hitgaps' => 'HSP-hit_gaps',
|
|
107 'Hsp_querygaps' => 'HSP-query_gaps',
|
|
108 'Hsp_qseq' => 'HSP-query_seq',
|
|
109 'Hsp_hseq' => 'HSP-hit_seq',
|
|
110 'Hsp_midline' => 'HSP-homology_seq',
|
|
111 'Hsp_align-len' => 'HSP-hsp_length',
|
|
112 'Hsp_query-frame'=> 'HSP-query_frame',
|
|
113 'Hsp_hit-frame' => 'HSP-hit_frame',
|
|
114
|
|
115 'Hit_id' => 'HIT-name',
|
|
116 'Hit_len' => 'HIT-length',
|
|
117 'Hit_accession' => 'HIT-accession',
|
|
118 'Hit_desc' => 'HIT-description',
|
|
119 'Hit_signif' => 'HIT-significance',
|
|
120 'Hit_score' => 'HIT-score',
|
|
121
|
|
122 'HMMER_program' => 'RESULT-algorithm_name',
|
|
123 'HMMER_version' => 'RESULT-algorithm_version',
|
|
124 'HMMER_query-def' => 'RESULT-query_name',
|
|
125 'HMMER_query-len' => 'RESULT-query_length',
|
|
126 'HMMER_query-acc' => 'RESULT-query_accession',
|
|
127 'HMMER_querydesc' => 'RESULT-query_description',
|
|
128 'HMMER_hmm' => 'RESULT-hmm_name',
|
|
129 'HMMER_seqfile' => 'RESULT-sequence_file',
|
|
130 );
|
|
131 $DEFAULT_HIT_FACTORY_CLASS = 'Bio::Factory::HMMERHitFactory';
|
|
132 $DEFAULT_HSP_FACTORY_CLASS = 'Bio::Factory::HMMERHSPFactory';
|
|
133 $DEFAULT_RESULT_FACTORY_CLASS = 'Bio::Factory::HMMERResultFactory';
|
|
134 }
|
|
135
|
|
136
|
|
137 =head2 new
|
|
138
|
|
139 Title : new
|
|
140 Usage : my $obj = new Bio::SearchIO::hmmer();
|
|
141 Function: Builds a new Bio::SearchIO::hmmer object
|
|
142 Returns : Bio::SearchIO::hmmer
|
|
143 Args : -fh/-file => HMMER filename
|
|
144 -format => 'hmmer'
|
|
145
|
|
146 =cut
|
|
147
|
|
148 sub _initialize {
|
|
149 my ($self,@args) = @_;
|
|
150 $self->SUPER::_initialize(@args);
|
|
151 my $handler = $self->_eventHandler;
|
|
152 # new object initialization code
|
|
153 $handler->register_factory('result',
|
|
154 Bio::Factory::ObjectFactory->new(
|
|
155 -type => 'Bio::Search::Result::HMMERResult',
|
|
156 -interface => 'Bio::Search::Result::ResultI'));
|
|
157
|
|
158 $handler->register_factory('hit',
|
|
159 Bio::Factory::ObjectFactory->new(
|
|
160 -type => 'Bio::Search::Hit::HMMERHit',
|
|
161 -interface => 'Bio::Search::Hit::HitI'));
|
|
162
|
|
163 $handler->register_factory('hsp',
|
|
164 Bio::Factory::ObjectFactory->new(
|
|
165 -type => 'Bio::Search::HSP::HMMERHSP',
|
|
166 -interface => 'Bio::Search::HSP::HSPI'));
|
|
167 $self->{'_hmmidline'} = 'HMMER 2.2g (August 2001)';
|
|
168 }
|
|
169 =head2 next_result
|
|
170
|
|
171 Title : next_result
|
|
172 Usage : my $hit = $searchio->next_result;
|
|
173 Function: Returns the next Result from a search
|
|
174 Returns : Bio::Search::Result::ResultI object
|
|
175 Args : none
|
|
176
|
|
177 =cut
|
|
178
|
|
179 sub next_result{
|
|
180 my ($self) = @_;
|
|
181 my $seentop = 0;
|
|
182 my $reporttype;
|
|
183 my ($last,@hitinfo,@hspinfo,%hspinfo,%hitinfo);
|
|
184
|
|
185 my @alignemnt_lines;
|
|
186
|
|
187 $self->start_document();
|
|
188 while( defined ($_ = $self->_readline )) {
|
|
189 my $lineorig = $_;
|
|
190 chomp;
|
|
191 if( /^HMMER\s+(\S+)\s+\((.+)\)/o ) {
|
|
192 my ($prog,$version) = split;
|
|
193 if( $seentop ) {
|
|
194 $self->_pushback($_);
|
|
195 $self->end_element({'Name' => 'HMMER_Output'});
|
|
196 return $self->end_document();
|
|
197 }
|
|
198 $self->{'_hmmidline'} = $_;
|
|
199 $self->start_element({'Name' => 'HMMER_Output'});
|
|
200 $self->{'_result_count'}++;
|
|
201 $seentop = 1;
|
|
202 if( defined $last ) {
|
|
203 ($reporttype) = split(/\s+/,$last);
|
|
204 $self->element({'Name' => 'HMMER_program',
|
|
205 'Data' => uc ($reporttype)});
|
|
206 }
|
|
207 $self->element({'Name' => 'HMMER_version',
|
|
208 'Data' => $version});
|
|
209 } elsif( s/^HMM file:\s+//o ) {
|
|
210 $self->{'_hmmfileline'} = $lineorig;
|
|
211 $self->element({'Name' => 'HMMER_hmm',
|
|
212 'Data' => $_});
|
|
213 } elsif( s/^Sequence\s+(file|database):\s+//o ) {
|
|
214 $self->{'_hmmseqline'} = $lineorig;
|
|
215 $self->element({'Name' => 'HMMER_seqfile',
|
|
216 'Data' => $_});
|
|
217 } elsif( s/^Query(\s+(sequence|HMM))?:\s+//o) {
|
|
218 if( ! $seentop ) {
|
|
219 # we're in a multi-query report
|
|
220 $self->_pushback($self->{'_hmmidline'});
|
|
221 $self->_pushback($self->{'_hmmfileline'});
|
|
222 $self->_pushback($self->{'_hmmseqline'});
|
|
223 $self->_pushback($lineorig);
|
|
224 next;
|
|
225 }
|
|
226 s/\s+$//;
|
|
227 $self->element({'Name' => 'HMMER_query-def',
|
|
228 'Data' => $_});
|
|
229 } elsif( s/^Accession:\s+//o ) {
|
|
230 s/\s+$//;
|
|
231 $self->element({'Name' => 'HMMER_query-acc',
|
|
232 'Data' => $_});
|
|
233 } elsif( s/^Description:\s+//o ) {
|
|
234 s/\s+$//;
|
|
235 $self->element({'Name' => 'HMMER_querydesc',
|
|
236 'Data' => $_});
|
|
237 } elsif( defined $self->{'_reporttype'} &&
|
|
238 $self->{'_reporttype'} eq 'HMMSEARCH' ) {
|
|
239 # PROCESS HMMSEARCH RESULTS HERE
|
|
240 if( /^Scores for complete sequences/o ) {
|
|
241 while( defined($_ = $self->_readline) ) {
|
|
242 last if( /^\s+$/);
|
|
243 next if( /^Sequence\s+Description/o || /^\-\-\-/o );
|
|
244 my @line = split;
|
|
245 my ($name, $n,$evalue,$score)= (shift @line,
|
|
246 pop @line,
|
|
247 pop @line,
|
|
248 pop @line);
|
|
249 my $desc = join(' ', @line);
|
|
250 push @hitinfo, [ $name, $desc,$evalue,$score];
|
|
251 $hitinfo{$name} = $#hitinfo;
|
|
252 }
|
|
253 } elsif( /^Parsed for domains:/o ) {
|
|
254 @hspinfo = ();
|
|
255
|
|
256 while( defined($_ = $self->_readline) ) {
|
|
257 last if( /^\s+$/);
|
|
258 next if( /^(Model|Sequence)\s+Domain/o || /^\-\-\-/o );
|
|
259 if( my ($n,$domainnum,$domainct, @vals) =
|
|
260 (m!^(\S+)\s+ # host name
|
|
261 (\d+)/(\d+)\s+ # num/num (ie 1 of 2)
|
|
262 (\d+)\s+(\d+).+? # sequence start and end
|
|
263 (\d+)\s+(\d+)\s+ # hmm start and end
|
|
264 \S+\s+ # []
|
|
265 (\S+)\s+ # score
|
|
266 (\S+) # evalue
|
|
267 \s*$!ox ) ) {
|
|
268 # array lookup so that we can get rid of things
|
|
269 # when they've been processed
|
|
270 my $info = $hitinfo[$hitinfo{$n}];
|
|
271 if( !defined $info ) {
|
|
272 $self->warn("Incomplete Sequence information, can't find $n hitinfo says $hitinfo{$n}");
|
|
273 next;
|
|
274 }
|
|
275 push @hspinfo, [ $n, @vals];
|
|
276 }
|
|
277 }
|
|
278 } elsif( /^Alignments of top/o ) {
|
|
279 my ($prelength,$lastdomain,$count,$width);
|
|
280 $count = 0;
|
|
281 my %domaincounter;
|
|
282 my $second_tier=0;
|
|
283 while( defined($_ = $self->_readline) ) {
|
|
284 next if( /^Align/o ||
|
|
285 /^\s+RF\s+[x\s]+$/o);
|
|
286 if( /^Histogram/o || m!^//!o ) {
|
|
287 if( $self->in_element('hsp')) {
|
|
288 $self->end_element({'Name' => 'Hsp'});
|
|
289 }
|
|
290 if( $self->within_element('hit')) {
|
|
291 $self->end_element({'Name' => 'Hit'});
|
|
292 }
|
|
293 last;
|
|
294 }
|
|
295 chomp;
|
|
296
|
|
297 if( /^\s*(.+):\s+domain\s+(\d+)\s+of\s+(\d+)\,\s+from\s+(\d+)\s+to\s+(\d+)/o ) {
|
|
298 my ($name,$domainct,$domaintotal,
|
|
299 $from,$to) = ($1,$2,$3,$4,$5);
|
|
300 $domaincounter{$name}++;
|
|
301 if( ! defined $lastdomain || $lastdomain ne $name ) {
|
|
302 if( $self->within_element('hit') ) {
|
|
303 if( $self->within_element('hsp') ) {
|
|
304 $self->end_element({'Name' => 'Hsp'});
|
|
305 }
|
|
306 $self->end_element({'Name' => 'Hit'});
|
|
307 }
|
|
308 $self->start_element({'Name' => 'Hit'});
|
|
309 my $info = [@{$hitinfo[$hitinfo{$name}] || $self->throw("Could not find hit info for $name: Insure that your database contains only unique sequence names")}];
|
|
310 if( $info->[0] ne $name ) {
|
|
311 $self->throw("Somehow the Model table order does not match the order in the domains (got ".$info->[0].", expected $name)");
|
|
312 }
|
|
313 $self->element({'Name' => 'Hit_id',
|
|
314 'Data' => shift @{$info}});
|
|
315 $self->element({'Name' => 'Hit_desc',
|
|
316 'Data' => shift @{$info}});
|
|
317 $self->element({'Name' => 'Hit_signif',
|
|
318 'Data' => shift @{$info}});
|
|
319 $self->element({'Name' => 'Hit_score',
|
|
320 'Data' => shift @{$info}});
|
|
321 }
|
|
322 $self->end_element({'Name' => 'Hsp'})
|
|
323 if $self->in_element('hsp');
|
|
324
|
|
325 $self->start_element({'Name' => 'Hsp'});
|
|
326 $self->element({'Name' => 'Hsp_identity',
|
|
327 'Data' => 0});
|
|
328 $self->element({'Name' => 'Hsp_positive',
|
|
329 'Data' => 0});
|
|
330 my $HSPinfo = shift @hspinfo;
|
|
331 my $id = shift @$HSPinfo;
|
|
332 if( $id ne $name ) {
|
|
333 $self->throw("Somehow the domain list details do not match the table (got $id, expected $name)");
|
|
334 }
|
|
335 if( $domaincounter{$name} == $domaintotal) {
|
|
336 $hitinfo[$hitinfo{$name}] = undef;
|
|
337 }
|
|
338
|
|
339 $self->element({'Name' => 'Hsp_hit-from',
|
|
340 'Data' => shift @$HSPinfo});
|
|
341 $self->element({'Name' => 'Hsp_hit-to',
|
|
342 'Data' => shift @$HSPinfo});
|
|
343 $self->element({'Name' => 'Hsp_query-from',
|
|
344 'Data' => shift @$HSPinfo});
|
|
345 $self->element({'Name' => 'Hsp_query-to',
|
|
346 'Data' => shift @$HSPinfo});
|
|
347 $self->element({'Name' => 'Hsp_score',
|
|
348 'Data' => shift @$HSPinfo});
|
|
349 $self->element({'Name' => 'Hsp_evalue',
|
|
350 'Data' => shift @$HSPinfo});
|
|
351 $lastdomain = $name;
|
|
352 } else {
|
|
353 # Might want to change this so that it
|
|
354 # accumulates all the of the alignment lines into
|
|
355 # three array slots and then tests for the
|
|
356 # end of the line
|
|
357 if( /^(\s+\*\-\>)(\S+)/o ) { # start of domain
|
|
358 $prelength = CORE::length($1);
|
|
359 $width = 0;
|
|
360 # $width = CORE::length($2);
|
|
361 $self->element({'Name' =>'Hsp_qseq',
|
|
362 'Data' => $2});
|
|
363 $count = 0;
|
|
364 $second_tier = 0;
|
|
365 } elsif ( /^(\s+)(\S+)\<\-\*\s*$/o ) { #end of domain
|
|
366 $self->element({'Name' =>'Hsp_qseq',
|
|
367 'Data' => $2});
|
|
368 $width = CORE::length($2);
|
|
369 $count = 0;
|
|
370 } elsif( ($count != 1 && /^\s+$/o) ||
|
|
371 CORE::length($_) == 0 ||
|
|
372 /^\s+\-?\*\s*$/ ) {
|
|
373 next;
|
|
374 } elsif( $count == 0 ) {
|
|
375 $prelength -= 3 unless ($second_tier++);
|
|
376 unless( defined $prelength) {
|
|
377 # $self->warn("prelength not set");
|
|
378 next;
|
|
379 }
|
|
380 $self->element({'Name' => 'Hsp_qseq',
|
|
381 'Data' => substr($_,$prelength)});
|
|
382 } elsif( $count == 1) {
|
|
383 if( ! defined $prelength ) {
|
|
384 $self->warn("prelength not set");
|
|
385 }
|
|
386 if( $width ) {
|
|
387 $self->element({'Name' => 'Hsp_midline',
|
|
388 'Data' => substr($_,
|
|
389 $prelength,
|
|
390 $width)});
|
|
391 } else {
|
|
392 $self->debug( "midline is $_\n") if( CORE::length($_) <= $prelength && $self->verbose > 0);
|
|
393 $self->element({'Name' => 'Hsp_midline',
|
|
394 'Data' => substr($_,
|
|
395 $prelength)});
|
|
396 }
|
|
397 } elsif( $count == 2) {
|
|
398 if( /^\s+(\S+)\s+(\d+|\-)\s+(\S*)\s+(\d+|\-)/o ) {
|
|
399 $self->element({'Name' => 'Hsp_hseq',
|
|
400 'Data' => $3});
|
|
401 } else {
|
|
402 $self->warn("unrecognized line: $_\n");
|
|
403 }
|
|
404 }
|
|
405 $count = 0 if $count++ >= 2;
|
|
406 }
|
|
407 }
|
|
408 } elsif( /^Histogram/o || m!^//!o ) {
|
|
409
|
|
410 while( my $HSPinfo = shift @hspinfo ) {
|
|
411 my $id = shift @$HSPinfo;
|
|
412 my $info = [@{$hitinfo[$hitinfo{$id}]}];
|
|
413 next unless defined $info;
|
|
414 $self->start_element({'Name' => 'Hit'});
|
|
415 $self->element({'Name' => 'Hit_id',
|
|
416 'Data' => shift @{$info}});
|
|
417 $self->element({'Name' => 'Hit_desc',
|
|
418 'Data' => shift @{$info}});
|
|
419 $self->element({'Name' => 'Hit_signif',
|
|
420 'Data' => shift @{$info}});
|
|
421 $self->element({'Name' => 'Hit_score',
|
|
422 'Data' => shift @{$info}});
|
|
423 $self->start_element({'Name' => 'Hsp'});
|
|
424 $self->element({'Name' => 'Hsp_query-from',
|
|
425 'Data' => shift @$HSPinfo});
|
|
426 $self->element({'Name' => 'Hsp_query-to',
|
|
427 'Data' => shift @$HSPinfo});
|
|
428 $self->element({'Name' => 'Hsp_hit-from',
|
|
429 'Data' => shift @$HSPinfo});
|
|
430 $self->element({'Name' => 'Hsp_hit-to',
|
|
431 'Data' => shift @$HSPinfo});
|
|
432 $self->element({'Name' => 'Hsp_score',
|
|
433 'Data' => shift @$HSPinfo});
|
|
434 $self->element({'Name' => 'Hsp_evalue',
|
|
435 'Data' => shift @$HSPinfo});
|
|
436 $self->element({'Name' => 'Hsp_identity',
|
|
437 'Data' => 0});
|
|
438 $self->element({'Name' => 'Hsp_positive',
|
|
439 'Data' => 0});
|
|
440 $self->element({'Name' => 'Hsp_positive',
|
|
441 'Data' => 0});
|
|
442 $self->end_element({'Name' => 'Hsp'});
|
|
443 $self->end_element({'Name' => 'Hit'});
|
|
444 }
|
|
445 @hitinfo = ();
|
|
446 %hitinfo = ();
|
|
447 last;
|
|
448 }
|
|
449 } elsif( defined $self->{'_reporttype'} &&
|
|
450 $self->{'_reporttype'} eq 'HMMPFAM' ) {
|
|
451 if( /^Scores for sequence family/o ) {
|
|
452 while( defined($_ = $self->_readline) ) {
|
|
453 last if( /^\s+$/);
|
|
454 next if( /^Model\s+Description/o || /^\-\-\-/o );
|
|
455 chomp;
|
|
456 my @line = split;
|
|
457 my ($model,$n,$evalue,$score) = (shift @line,
|
|
458 pop @line,
|
|
459 pop @line,
|
|
460 pop @line);
|
|
461 my $desc = join(' ', @line);
|
|
462 push @hitinfo, [ $model, $desc,$score,$evalue,$n];
|
|
463 $hitinfo{$model} = $#hitinfo;
|
|
464 }
|
|
465 } elsif( /^Parsed for domains:/o ) {
|
|
466 @hspinfo = ();
|
|
467 while( defined($_ = $self->_readline) ) {
|
|
468 last if( /^\s+$/);
|
|
469 next if( /^Model\s+Domain/o || /^\-\-\-/o );
|
|
470 chomp;
|
|
471 if( my ($n,$domainnum,$domainct,@vals) =
|
|
472 (m!^(\S+)\s+ # domain name
|
|
473 \s+(\d+)/(\d+)\s+ # domain num out of num
|
|
474 (\d+)\s+(\d+).+? # seq start, end
|
|
475 (\d+)\s+(\d+)\s+ # hmm start, end
|
|
476 \S+\s+ # []
|
|
477 (\S+)\s+ # score
|
|
478 (\S+) # evalue
|
|
479 \s*$!ox ) ) {
|
|
480 my $hindex = $hitinfo{$n};
|
|
481 if( ! defined $hindex ) {
|
|
482 push @hitinfo, [ $n, '',$vals[5],$vals[6],
|
|
483 $domainct];
|
|
484 $hitinfo{$n} = $#hitinfo;
|
|
485 $hindex = $#hitinfo;
|
|
486 }
|
|
487 my $info = $hitinfo[$hindex];
|
|
488 if( ! defined $info ) {
|
|
489 $self->warn("incomplete Domain information, can't find $n hitinfo says $hitinfo{$n}");
|
|
490 next;
|
|
491 }
|
|
492 push @hspinfo, [ $n, @vals];
|
|
493 }
|
|
494 }
|
|
495 } elsif( /^Alignments of top/o ) {
|
|
496 my ($prelength,$lastdomain,$count,$width);
|
|
497 $count = 0;
|
|
498 my $second_tier=0;
|
|
499 while( defined($_ = $self->_readline) ) {
|
|
500 next if( /^Align/o || /^\s+RF\s+[x\s]+$/o);
|
|
501
|
|
502 if( /^Histogram/o || m!^//!o || /^Query sequence/o ) {
|
|
503 if( $self->in_element('hsp')) {
|
|
504 $self->end_element({'Name' => 'Hsp'});
|
|
505 }
|
|
506 if( $self->in_element('hit') ) {
|
|
507 $self->end_element({'Name' => 'Hit'});
|
|
508 }
|
|
509 $self->end_element({'Name' => 'HMMER_Output'});
|
|
510 if( /^Query sequence/o ) { $self->_pushback($_); }
|
|
511 return $self->end_document();
|
|
512 last;
|
|
513 }
|
|
514 chomp;
|
|
515 if( m/(\S+):.*from\s+(\d+)\s+to\s+(\d+)/o ) {
|
|
516 my ($name,$from,$to) = ($1,$2,$3);
|
|
517 if( ! defined $lastdomain || $lastdomain ne $name ) {
|
|
518
|
|
519 if( $self->within_element('hit') ) {
|
|
520 if( $self->in_element('hsp') ) {
|
|
521 $self->end_element({'Name' => 'Hsp'});
|
|
522 }
|
|
523 $self->end_element({'Name' => 'Hit'});
|
|
524 }
|
|
525 $self->start_element({'Name' => 'Hit'});
|
|
526 my $info = [@{$hitinfo[$hitinfo{$name}]}];
|
|
527 if( ! defined $info ||
|
|
528 $info->[0] ne $name ) {
|
|
529 $self->warn("Somehow the Model table order does not match the order in the domains (got ".$info->[0].", expected $name). We're
|
|
530 back loading this from the alignment information instead");
|
|
531 $info = [$name, '',
|
|
532 /score\s+([^,\s]+),\s+E\s+=\s+(\S+)/ox];
|
|
533 push @hitinfo, $info;
|
|
534 $hitinfo{$name} = $#hitinfo;
|
|
535 }
|
|
536
|
|
537 $self->element({'Name' => 'Hit_id',
|
|
538 'Data' => shift @{$info}});
|
|
539 $self->element({'Name' => 'Hit_desc',
|
|
540 'Data' => shift @{$info}});
|
|
541 $self->element({'Name' => 'Hit_score',
|
|
542 'Data' => shift @{$info}});
|
|
543 $self->element({'Name' => 'Hit_signif',
|
|
544 'Data' => shift @{$info}});
|
|
545 }
|
|
546 if( $self->within_element('hsp') ) {
|
|
547 #if( defined $lastdomain ) {
|
|
548 $self->end_element({'Name' => 'Hsp'});
|
|
549 }
|
|
550 $self->start_element({'Name' => 'Hsp'});
|
|
551 $self->element({'Name' => 'Hsp_identity',
|
|
552 'Data' => 0});
|
|
553 $self->element({'Name' => 'Hsp_positive',
|
|
554 'Data' => 0});
|
|
555 my $HSPinfo = shift @hspinfo;
|
|
556 my $id = shift @$HSPinfo;
|
|
557
|
|
558 if( $id ne $name ) {
|
|
559 $self->throw("Somehow the domain list details do not match the table (got $id, expected $name)");
|
|
560 }
|
|
561 $self->element({'Name' => 'Hsp_query-from',
|
|
562 'Data' => shift @$HSPinfo});
|
|
563 $self->element({'Name' => 'Hsp_query-to',
|
|
564 'Data' => shift @$HSPinfo});
|
|
565 $self->element({'Name' => 'Hsp_hit-from',
|
|
566 'Data' => shift @$HSPinfo});
|
|
567 $self->element({'Name' => 'Hsp_hit-to',
|
|
568 'Data' => shift @$HSPinfo});
|
|
569 $self->element({'Name' => 'Hsp_score',
|
|
570 'Data' => shift @$HSPinfo});
|
|
571 $self->element({'Name' => 'Hsp_evalue',
|
|
572 'Data' => shift @$HSPinfo});
|
|
573
|
|
574 $lastdomain = $name;
|
|
575 } else {
|
|
576 if( /^(\s+\*\-\>)(\S+)/o ) { # start of domain
|
|
577 $prelength = CORE::length($1);
|
|
578 $width = 0;
|
|
579 # $width = CORE::length($2);
|
|
580 $self->element({'Name' =>'Hsp_hseq',
|
|
581 'Data' => $2});
|
|
582 $count = 0;
|
|
583 $second_tier = 0;
|
|
584 } elsif ( /^(\s+)(\S+)\<\-?\*?\s*$/o ) { #end of domain
|
|
585 $prelength -= 3 unless ($second_tier++);
|
|
586 $self->element({'Name' =>'Hsp_hseq',
|
|
587 'Data' => $2});
|
|
588 $width = CORE::length($2);
|
|
589 $count = 0;
|
|
590 } elsif( CORE::length($_) == 0 ||
|
|
591 ($count != 1 && /^\s+$/o) ||
|
|
592 /^\s+\-?\*\s*$/ ) {
|
|
593 next;
|
|
594 } elsif( $count == 0 ) {
|
|
595 $prelength -= 3 unless ($second_tier++);
|
|
596 unless( defined $prelength) {
|
|
597 # $self->warn("prelength not set");
|
|
598 next;
|
|
599 }
|
|
600 $self->element({'Name' => 'Hsp_hseq',
|
|
601 'Data' => substr($_,$prelength)});
|
|
602 } elsif( $count == 1 ) {
|
|
603 if( ! defined $prelength ) {
|
|
604 $self->warn("prelength not set");
|
|
605 }
|
|
606 if( $width ) {
|
|
607 $self->element({'Name' => 'Hsp_midline',
|
|
608 'Data' => substr($_,$prelength,$width)});
|
|
609 } else {
|
|
610 $self->element({'Name' => 'Hsp_midline',
|
|
611 'Data' => substr($_,$prelength)});
|
|
612 }
|
|
613 } elsif( $count == 2 ) {
|
|
614 if( /^\s+(\S+)\s+(\d+)\s+(\S+)\s+(\d+)/o ||
|
|
615 /^\s+(\S+)\s+(\-)\s+(\S*)\s+(\-)/o
|
|
616 ) {
|
|
617 $self->element({'Name' => 'Hsp_qseq',
|
|
618 'Data' => $3});
|
|
619 } else {
|
|
620 $self->warn("unrecognized line ($count): $_\n");
|
|
621 }
|
|
622 }
|
|
623 $count = 0 if $count++ >= 2;
|
|
624 }
|
|
625 }
|
|
626 } else {
|
|
627 $self->debug($_);
|
|
628 }
|
|
629 }
|
|
630 $last = $_;
|
|
631 }
|
|
632
|
|
633 $self->end_element({'Name' => 'HMMER_Output'}) unless ! $seentop;
|
|
634 return $self->end_document();
|
|
635 }
|
|
636
|
|
637 =head2 start_element
|
|
638
|
|
639 Title : start_element
|
|
640 Usage : $eventgenerator->start_element
|
|
641 Function: Handles a start element event
|
|
642 Returns : none
|
|
643 Args : hashref with at least 2 keys 'Data' and 'Name'
|
|
644
|
|
645
|
|
646 =cut
|
|
647
|
|
648 sub start_element{
|
|
649 my ($self,$data) = @_;
|
|
650 # we currently don't care about attributes
|
|
651 my $nm = $data->{'Name'};
|
|
652 my $type = $MODEMAP{$nm};
|
|
653 if( $type ) {
|
|
654 if( $self->_eventHandler->will_handle($type) ) {
|
|
655 my $func = sprintf("start_%s",lc $type);
|
|
656 $self->_eventHandler->$func($data->{'Attributes'});
|
|
657 }
|
|
658 unshift @{$self->{'_elements'}}, $type;
|
|
659 }
|
|
660 if(defined $type &&
|
|
661 $type eq 'result') {
|
|
662 $self->{'_values'} = {};
|
|
663 $self->{'_result'}= undef;
|
|
664 }
|
|
665 }
|
|
666
|
|
667 =head2 end_element
|
|
668
|
|
669 Title : start_element
|
|
670 Usage : $eventgenerator->end_element
|
|
671 Function: Handles an end element event
|
|
672 Returns : none
|
|
673 Args : hashref with at least 2 keys 'Data' and 'Name'
|
|
674
|
|
675
|
|
676 =cut
|
|
677
|
|
678 sub end_element {
|
|
679 my ($self,$data) = @_;
|
|
680 my $nm = $data->{'Name'};
|
|
681 my $type = $MODEMAP{$nm};
|
|
682 my $rc;
|
|
683
|
|
684 if($nm eq 'HMMER_program' ) {
|
|
685 if( $self->{'_last_data'} =~ /(HMM\S+)/i ) {
|
|
686 $self->{'_reporttype'} = uc $1;
|
|
687 }
|
|
688 }
|
|
689 # Hsp are sort of weird, in that they end when another
|
|
690 # object begins so have to detect this in end_element for now
|
|
691 if( $nm eq 'Hsp' ) {
|
|
692 foreach ( qw(Hsp_qseq Hsp_midline Hsp_hseq) ) {
|
|
693 $self->element({'Name' => $_,
|
|
694 'Data' => $self->{'_last_hspdata'}->{$_}});
|
|
695 }
|
|
696 $self->{'_last_hspdata'} = {};
|
|
697 }
|
|
698 if( $type ) {
|
|
699 if( $self->_eventHandler->will_handle($type) ) {
|
|
700 my $func = sprintf("end_%s",lc $type);
|
|
701 $rc = $self->_eventHandler->$func($self->{'_reporttype'},
|
|
702 $self->{'_values'});
|
|
703 }
|
|
704 my $lastelem = shift @{$self->{'_elements'}};
|
|
705 } elsif( $MAPPING{$nm} ) {
|
|
706 if ( ref($MAPPING{$nm}) =~ /hash/i ) {
|
|
707 my $key = (keys %{$MAPPING{$nm}})[0];
|
|
708 $self->{'_values'}->{$key}->{$MAPPING{$nm}->{$key}} = $self->{'_last_data'};
|
|
709 } else {
|
|
710 $self->{'_values'}->{$MAPPING{$nm}} = $self->{'_last_data'};
|
|
711 }
|
|
712 } else {
|
|
713 $self->debug( "unknown nm $nm, ignoring\n");
|
|
714 }
|
|
715 $self->{'_last_data'} = ''; # remove read data if we are at
|
|
716 # end of an element
|
|
717 $self->{'_result'} = $rc if( defined $type && $type eq 'result' );
|
|
718 return $rc;
|
|
719 }
|
|
720
|
|
721 =head2 element
|
|
722
|
|
723 Title : element
|
|
724 Usage : $eventhandler->element({'Name' => $name, 'Data' => $str});
|
|
725 Function: Convience method that calls start_element, characters, end_element
|
|
726 Returns : none
|
|
727 Args : Hash ref with the keys 'Name' and 'Data'
|
|
728
|
|
729
|
|
730 =cut
|
|
731
|
|
732 sub element{
|
|
733 my ($self,$data) = @_;
|
|
734 $self->start_element($data);
|
|
735 $self->characters($data);
|
|
736 $self->end_element($data);
|
|
737 }
|
|
738
|
|
739 =head2 characters
|
|
740
|
|
741 Title : characters
|
|
742 Usage : $eventgenerator->characters($str)
|
|
743 Function: Send a character events
|
|
744 Returns : none
|
|
745 Args : string
|
|
746
|
|
747
|
|
748 =cut
|
|
749
|
|
750 sub characters{
|
|
751 my ($self,$data) = @_;
|
|
752
|
|
753 if( $self->in_element('hsp') &&
|
|
754 $data->{'Name'} =~ /Hsp\_(qseq|hseq|midline)/o &&
|
|
755 defined $data->{'Data'} ) {
|
|
756 $self->{'_last_hspdata'}->{$data->{'Name'}} .= $data->{'Data'};
|
|
757 }
|
|
758 return unless ( defined $data->{'Data'} && $data->{'Data'} !~ /^\s+$/o );
|
|
759
|
|
760 $self->{'_last_data'} = $data->{'Data'};
|
|
761 }
|
|
762
|
|
763 =head2 within_element
|
|
764
|
|
765 Title : within_element
|
|
766 Usage : if( $eventgenerator->within_element($element) ) {}
|
|
767 Function: Test if we are within a particular element
|
|
768 This is different than 'in' because within can be tested
|
|
769 for a whole block.
|
|
770 Returns : boolean
|
|
771 Args : string element name
|
|
772
|
|
773
|
|
774 =cut
|
|
775
|
|
776 sub within_element{
|
|
777 my ($self,$name) = @_;
|
|
778 return 0 if ( ! defined $name ||
|
|
779 ! defined $self->{'_elements'} ||
|
|
780 scalar @{$self->{'_elements'}} == 0) ;
|
|
781 foreach ( @{$self->{'_elements'}} ) {
|
|
782 return 1 if( $_ eq $name );
|
|
783 }
|
|
784 return 0;
|
|
785 }
|
|
786
|
|
787
|
|
788 =head2 in_element
|
|
789
|
|
790 Title : in_element
|
|
791 Usage : if( $eventgenerator->in_element($element) ) {}
|
|
792 Function: Test if we are in a particular element
|
|
793 This is different than 'within' because 'in' only
|
|
794 tests its immediete parent.
|
|
795 Returns : boolean
|
|
796 Args : string element name
|
|
797
|
|
798
|
|
799 =cut
|
|
800
|
|
801 sub in_element{
|
|
802 my ($self,$name) = @_;
|
|
803 return 0 if ! defined $self->{'_elements'}->[0];
|
|
804 return ( $self->{'_elements'}->[0] eq $name)
|
|
805 }
|
|
806
|
|
807 =head2 start_document
|
|
808
|
|
809 Title : start_document
|
|
810 Usage : $eventgenerator->start_document
|
|
811 Function: Handle a start document event
|
|
812 Returns : none
|
|
813 Args : none
|
|
814
|
|
815
|
|
816 =cut
|
|
817
|
|
818 sub start_document{
|
|
819 my ($self) = @_;
|
|
820 $self->{'_lasttype'} = '';
|
|
821 $self->{'_values'} = {};
|
|
822 $self->{'_result'}= undef;
|
|
823 $self->{'_elements'} = [];
|
|
824 }
|
|
825
|
|
826 =head2 end_document
|
|
827
|
|
828 Title : end_document
|
|
829 Usage : $eventgenerator->end_document
|
|
830 Function: Handles an end document event
|
|
831 Returns : Bio::Search::Result::ResultI object
|
|
832 Args : none
|
|
833
|
|
834
|
|
835 =cut
|
|
836
|
|
837 sub end_document{
|
|
838 my ($self) = @_;
|
|
839 return $self->{'_result'};
|
|
840 }
|
|
841
|
|
842 =head2 result_count
|
|
843
|
|
844 Title : result_count
|
|
845 Usage : my $count = $searchio->result_count
|
|
846 Function: Returns the number of results we have processed
|
|
847 Returns : integer
|
|
848 Args : none
|
|
849
|
|
850
|
|
851 =cut
|
|
852
|
|
853 sub result_count {
|
|
854 my $self = shift;
|
|
855 return $self->{'_result_count'};
|
|
856 }
|
|
857
|
|
858 1;
|