Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/HMMER/Results.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: Results.pm,v 1.22.2.1 2003/01/07 13:58:01 jason Exp $ | |
2 # | |
3 # Perl Module for HMMResults | |
4 # | |
5 # Cared for by Ewan Birney <birney@sanger.ac.uk> | |
6 # | |
7 #Copyright Genome Research Limited (1997). | |
8 | |
9 =head1 NAME | |
10 | |
11 Bio::Tools::HMMER::Results - Object representing HMMER output results | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 # parse a hmmsearch file (can also parse a hmmpfam file) | |
16 $res = new Bio::Tools::HMMER::Results( -file => 'output.hmm' , -type => 'hmmsearch'); | |
17 | |
18 # print out the results for each sequence | |
19 foreach $seq ( $res->each_Set ) { | |
20 print "Sequence bit score is",$seq->bits,"\n"; | |
21 foreach $domain ( $seq->each_Domain ) { | |
22 print " Domain start ",$domain->start," end ",$domain->end, | |
23 " score ",$domain->bits,"\n"; | |
24 } | |
25 } | |
26 | |
27 # new result object on a sequence/domain cutoff of 25 bits sequence, 15 bits domain | |
28 $newresult = $res->filter_on_cutoff(25,15); | |
29 | |
30 # alternative way of getting out all domains directly | |
31 foreach $domain ( $res->each_Domain ) { | |
32 print "Domain on ",$domain->seq_id," with score ", | |
33 $domain->bits," evalue ",$domain->evalue,"\n"; | |
34 } | |
35 | |
36 =head1 DESCRIPTION | |
37 | |
38 This object represents HMMER output, either from hmmsearch or | |
39 hmmpfam. For hmmsearch, a series of HMMER::Set objects are made, one | |
40 for each sequence, which have the the bits score for the object. For | |
41 hmmpfam searches, only one Set object is made. | |
42 | |
43 | |
44 These objects come from the original HMMResults modules used | |
45 internally in Pfam, written by Ewan. Ewan then converted them to | |
46 bioperl objects in 1999. That conversion is meant to be backwardly | |
47 compatible, but may not be (caveat emptor). | |
48 | |
49 =head1 FEEDBACK | |
50 | |
51 =head2 Mailing Lists | |
52 | |
53 User feedback is an integral part of the evolution of this and other | |
54 Bioperl modules. Send your comments and suggestions preferably to one | |
55 of the Bioperl mailing lists. Your participation is much appreciated. | |
56 | |
57 bioperl-l@bioperl.org - General discussion | |
58 http://www.bioperl.org/MailList.html - About the mailing lists | |
59 | |
60 =head2 Reporting Bugs | |
61 | |
62 Report bugs to the Bioperl bug tracking system to help us keep track | |
63 the bugs and their resolution. Bug reports can be submitted via email | |
64 or the web: | |
65 | |
66 bioperl-bugs@bio.perl.org | |
67 http://www.bugzilla.bioperl.org/ | |
68 | |
69 =head1 AUTHOR - Ewan Birney | |
70 | |
71 Email birney@ebi.ac.uk | |
72 | |
73 =head1 CONTRIBUTORS | |
74 | |
75 Jason Stajich, jason@bioperl.org | |
76 | |
77 =head1 APPENDIX | |
78 | |
79 The rest of the documentation details each of the object | |
80 methods. Internal methods are usually preceded with a _ | |
81 | |
82 =cut | |
83 | |
84 package Bio::Tools::HMMER::Results; | |
85 | |
86 use vars qw(@ISA); | |
87 use Carp; | |
88 use strict; | |
89 | |
90 use Bio::Root::Root; | |
91 use Bio::Root::IO; | |
92 use Bio::Tools::HMMER::Domain; | |
93 use Bio::Tools::HMMER::Set; | |
94 use Bio::SeqAnalysisParserI; | |
95 use Symbol; | |
96 | |
97 @ISA = qw(Bio::Root::Root Bio::Root::IO Bio::SeqAnalysisParserI); | |
98 | |
99 sub new { | |
100 my($class,@args) = @_; | |
101 | |
102 my $self = $class->SUPER::new(@args); | |
103 | |
104 $self->{'domain'} = []; # array of HMMUnits | |
105 $self->{'seq'} = {}; | |
106 | |
107 my ($parsetype) = $self->_rearrange([qw(TYPE)],@args); | |
108 $self->_initialize_io(@args); | |
109 if( !defined $parsetype ) { | |
110 $self->throw("No parse type provided. should be hmmsearch or hmmpfam"); | |
111 } | |
112 $self->parsetype($parsetype); | |
113 if( defined $self->_fh() ) { | |
114 if( $parsetype eq 'hmmsearch' ) { | |
115 $self->_parse_hmmsearch($self->_fh()); | |
116 } elsif ( $parsetype eq 'hmmpfam' ) { | |
117 $self->_parse_hmmpfam($self->_fh()); | |
118 } else { | |
119 $self->throw("Did not recoginise type $parsetype"); | |
120 } | |
121 } | |
122 | |
123 return $self; # success - we hope! | |
124 } | |
125 | |
126 | |
127 =head2 next_feature | |
128 | |
129 Title : next_feature | |
130 Usage : while( my $feat = $res->next_feature ) { # do something } | |
131 Function: SeqAnalysisParserI implementing function | |
132 Example : | |
133 Returns : A Bio::SeqFeatureI compliant object, in this case, | |
134 each DomainUnit object, ie, flattening the Sequence | |
135 aspect of this. | |
136 Args : None | |
137 | |
138 | |
139 =cut | |
140 | |
141 sub next_feature{ | |
142 my ($self) = @_; | |
143 | |
144 if( $self->{'_started_next_feature'} == 1 ) { | |
145 return shift @{$self->{'_next_feature_array'}}; | |
146 } else { | |
147 $self->{'_started_next_feature'} = 1; | |
148 my @array; | |
149 foreach my $seq ( $self->each_Set() ) { | |
150 foreach my $unit ( $seq->each_Domain() ) { | |
151 push(@array,$unit); | |
152 } | |
153 } | |
154 my $res = shift @array; | |
155 $self->{'_next_feature_array'} = \@array; | |
156 return $res; | |
157 } | |
158 | |
159 $self->throw("Should not reach here! Error!"); | |
160 } | |
161 | |
162 | |
163 =head2 number | |
164 | |
165 Title : number | |
166 Usage : print "There are ",$res->number," domains hit\n"; | |
167 Function: provides the number of domains in the HMMER report | |
168 | |
169 =cut | |
170 | |
171 sub number { | |
172 my $self = shift; | |
173 my @val; | |
174 my $ref; | |
175 $ref = $self->{'domain'}; | |
176 | |
177 | |
178 @val = @{$self->{'domain'}}; | |
179 return scalar @val; | |
180 } | |
181 | |
182 =head2 seqfile | |
183 | |
184 Title : seqfile | |
185 Usage : $obj->seqfile($newval) | |
186 Function: | |
187 Example : | |
188 Returns : value of seqfile | |
189 Args : newvalue (optional) | |
190 | |
191 | |
192 =cut | |
193 | |
194 sub seqfile{ | |
195 my ($self,$value) = @_; | |
196 if( defined $value) { | |
197 $self->{'seqfile'} = $value; | |
198 } | |
199 return $self->{'seqfile'}; | |
200 | |
201 } | |
202 | |
203 =head2 hmmfile | |
204 | |
205 Title : hmmfile | |
206 Usage : $obj->hmmfile($newval) | |
207 Function: | |
208 Example : | |
209 Returns : value of hmmfile | |
210 Args : newvalue (optional) | |
211 | |
212 | |
213 =cut | |
214 | |
215 sub hmmfile{ | |
216 my ($self,$value) = @_; | |
217 if( defined $value) { | |
218 $self->{'hmmfile'} = $value; | |
219 } | |
220 return $self->{'hmmfile'}; | |
221 | |
222 } | |
223 | |
224 =head2 add_Domain | |
225 | |
226 Title : add_Domain | |
227 Usage : $res->add_Domain($unit) | |
228 Function: adds a domain to the results array. Mainly used internally. | |
229 Args : A Bio::Tools::HMMER::Domain | |
230 | |
231 | |
232 =cut | |
233 | |
234 sub add_Domain { | |
235 my $self = shift; | |
236 my $unit = shift; | |
237 my $name; | |
238 | |
239 $name = $unit->seq_id(); | |
240 | |
241 if( ! exists $self->{'seq'}->{$name} ) { | |
242 $self->warn("Adding a domain of $name but with no HMMSequence. Will be kept in domain array but not added to a HMMSequence"); | |
243 } else { | |
244 $self->{'seq'}->{$name}->add_Domain($unit); | |
245 } | |
246 push(@{$self->{'domain'}},$unit); | |
247 } | |
248 | |
249 | |
250 =head2 each_Domain | |
251 | |
252 Title : each_Domain | |
253 Usage : foreach $domain ( $res->each_Domain() ) | |
254 Function: array of Domain units which are held in this report | |
255 Returns : array | |
256 Args : none | |
257 | |
258 | |
259 =cut | |
260 | |
261 sub each_Domain { | |
262 my $self = shift; | |
263 my (@arr,$u); | |
264 | |
265 foreach $u ( @{$self->{'domain'}} ) { | |
266 push(@arr,$u); | |
267 } | |
268 | |
269 return @arr; | |
270 } | |
271 | |
272 | |
273 =head2 domain_bits_cutoff_from_evalue | |
274 | |
275 Title : domain_bits_cutoff_from_evalue | |
276 Usage : $cutoff = domain_bits_cutoff_from_evalue(0.01); | |
277 Function: return a bits cutoff from an evalue using the | |
278 scores here. Somewhat interesting logic: | |
279 Find the two bit score which straddle the evalue | |
280 if( 25 is between these two points) return 25 | |
281 else return the midpoint. | |
282 | |
283 This logic tries to ensure that with large signal to | |
284 noise separation one still has sensible 25 bit cutoff | |
285 Returns : | |
286 Args : | |
287 | |
288 =cut | |
289 | |
290 sub domain_bits_cutoff_from_evalue { | |
291 my $self = shift; | |
292 my $eval = shift; | |
293 my ($dom,$prev,@doms,$cutoff,$sep,$seen); | |
294 | |
295 @doms = $self->each_Domain; | |
296 | |
297 | |
298 @doms = map { $_->[0] } | |
299 sort { $b->[1] <=> $a->[1] } | |
300 map { [ $_, $_->bits] } @doms; | |
301 $seen = 0; | |
302 foreach $_ ( @doms ) { | |
303 if( $_->evalue > $eval ) { | |
304 $seen = 1; | |
305 $dom = $_; | |
306 last; | |
307 } | |
308 $prev = $_; | |
309 } | |
310 | |
311 if( ! defined $prev || $seen == 0) { | |
312 $self->throw("Evalue is either above or below the list..."); | |
313 return undef; | |
314 } | |
315 | |
316 $sep = $prev->bits - $dom->bits ; | |
317 | |
318 if( $sep < 1 ) { | |
319 return $prev->bits(); | |
320 } | |
321 if( $dom->bits < 25 && $prev->bits > 25 ) { | |
322 return 25; | |
323 } | |
324 | |
325 return int( $dom->bits + $sep/2 ) ; | |
326 | |
327 } | |
328 | |
329 | |
330 sub dictate_hmm_acc { | |
331 my $self = shift; | |
332 my $acc = shift; | |
333 my ($unit); | |
334 | |
335 | |
336 foreach $unit ( $self->eachHMMUnit() ) { | |
337 $unit->hmmacc($acc); | |
338 } | |
339 } | |
340 | |
341 =head2 write_FT_output | |
342 | |
343 Title : write_FT_output | |
344 Usage : $res->write_FT_output(\*STDOUT,'DOMAIN') | |
345 Function: writes feature table output ala swissprot | |
346 Returns : | |
347 Args : | |
348 | |
349 | |
350 =cut | |
351 | |
352 sub write_FT_output { | |
353 my $self = shift; | |
354 my $file = shift; | |
355 my $idt = shift; | |
356 my ($seq,$unit); | |
357 | |
358 if( !defined $idt ) { | |
359 $idt = "DOMAIN"; | |
360 } | |
361 | |
362 foreach $seq ( $self->each_Set() ) { | |
363 print $file sprintf("ID %s\n",$seq->name()); | |
364 foreach $unit ( $seq->each_Domain() ) { | |
365 print $file sprintf("FT %s %d %d %s\n",$idt, | |
366 $unit->start,$unit->end,$unit->hmmname); | |
367 } | |
368 print $file "//\n"; | |
369 } | |
370 } | |
371 | |
372 =head2 filter_on_cutoff | |
373 | |
374 Title : filter_on_cutoff | |
375 Usage : $newresults = $results->filter_on_cutoff(25,15); | |
376 Function: Produces a new HMMER::Results module which has | |
377 been trimmed at the cutoff. | |
378 Returns : a Bio::Tools::HMMER::Results module | |
379 Args : sequence cutoff and domain cutoff. in bits score | |
380 if you want one cutoff, simply use same number both places | |
381 | |
382 =cut | |
383 | |
384 sub filter_on_cutoff { | |
385 my $self = shift; | |
386 my $seqthr = shift; | |
387 my $domthr = shift; | |
388 my ($new,$seq,$unit,@array,@narray); | |
389 | |
390 if( !defined $domthr ) { | |
391 $self->throw("hmmresults filter on cutoff needs two arguments"); | |
392 } | |
393 | |
394 $new = Bio::Tools::HMMER::Results->new(-type => $self->parsetype); | |
395 | |
396 foreach $seq ( $self->each_Set()) { | |
397 next if( $seq->bits() < $seqthr ); | |
398 $new->add_Set($seq); | |
399 foreach $unit ( $seq->each_Domain() ) { | |
400 next if( $unit->bits() < $domthr ); | |
401 $new->add_Domain($unit); | |
402 } | |
403 } | |
404 $new; | |
405 } | |
406 | |
407 =head2 write_ascii_out | |
408 | |
409 Title : write_ascii_out | |
410 Usage : $res->write_ascii_out(\*STDOUT) | |
411 Function: writes as | |
412 seq seq_start seq_end model-acc model_start model_end model_name | |
413 Returns : | |
414 Args : | |
415 | |
416 FIXME: Now that we have no modelacc, this is probably a bad thing. | |
417 | |
418 =cut | |
419 | |
420 # writes as seq sstart send modelacc hstart hend modelname | |
421 | |
422 sub write_ascii_out { | |
423 my $self = shift; | |
424 my $fh = shift; | |
425 my ($unit,$seq); | |
426 | |
427 if( !defined $fh) { | |
428 $fh = \*STDOUT; | |
429 } | |
430 | |
431 | |
432 foreach $seq ( $self->each_Set()) { | |
433 foreach $unit ( $seq->each_Domain()) { | |
434 print $fh sprintf("%s %4d %4d %s %4d %4d %4.2f %4.2g %s\n", | |
435 $unit->seq_id(),$unit->start(),$unit->end(), | |
436 $unit->hmmacc,$unit->hstart,$unit->hend, | |
437 $unit->bits,$unit->evalue,$unit->hmmname); | |
438 } | |
439 } | |
440 | |
441 } | |
442 | |
443 =head2 write_GDF_bits | |
444 | |
445 Title : write_GDF_bits | |
446 Usage : $res->write_GDF_bits(25,15,\*STDOUT) | |
447 Function: writes GDF format with a sequence,domain threshold | |
448 Returns : | |
449 Args : | |
450 | |
451 =cut | |
452 | |
453 sub write_GDF_bits { | |
454 my $self = shift; | |
455 my $seqt = shift; | |
456 my $domt = shift; | |
457 my $file = shift; | |
458 my $seq; | |
459 my $unit; | |
460 my (@array,@narray); | |
461 | |
462 if( !defined $file ) { | |
463 $self->throw("Attempting to use write_GDF_bits without passing in correct arguments!"); | |
464 return; | |
465 } | |
466 | |
467 foreach $seq ( $self->each_Set()) { | |
468 | |
469 if( $seq->bits() < $seqt ) { | |
470 next; | |
471 } | |
472 | |
473 foreach $unit ( $seq->each_Domain() ) { | |
474 if( $unit->bits() < $domt ) { | |
475 next; | |
476 } | |
477 push(@array,$unit); | |
478 } | |
479 | |
480 } | |
481 | |
482 @narray = sort { my ($aa,$bb,$st_a,$st_b); | |
483 $aa = $a->seq_id(); | |
484 $bb = $b->seq_id(); | |
485 if ( $aa eq $bb) { | |
486 $st_a = $a->start(); | |
487 $st_b = $b->start(); | |
488 return $st_a <=> $st_b; | |
489 } | |
490 else { | |
491 return $aa cmp $bb; | |
492 } } @array; | |
493 | |
494 foreach $unit ( @narray ) { | |
495 print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue); | |
496 } | |
497 | |
498 } | |
499 | |
500 sub write_scores_bits { | |
501 my $self = shift; | |
502 my $seqt = shift; | |
503 my $domt = shift; | |
504 my $file = shift; | |
505 my $seq; | |
506 my $unit; | |
507 my (@array,@narray); | |
508 | |
509 if( !defined $file ) { | |
510 carp("Attempting to use write_scores_bits without passing in correct arguments!"); | |
511 return; | |
512 } | |
513 | |
514 foreach $seq ( $self->eachHMMSequence()) { | |
515 | |
516 if( $seq->bits() < $seqt ) { | |
517 next; | |
518 } | |
519 | |
520 foreach $unit ( $seq->eachHMMUnit() ) { | |
521 if( $unit->bits() < $domt ) { | |
522 next; | |
523 } | |
524 push(@array,$unit); | |
525 } | |
526 | |
527 } | |
528 | |
529 @narray = sort { my ($aa,$bb,$st_a,$st_b); | |
530 $aa = $a->bits(); | |
531 $bb = $b->bits(); | |
532 return $aa <=> $bb; | |
533 } @array; | |
534 | |
535 foreach $unit ( @narray ) { | |
536 print $file sprintf("%4.2f %s\n",$unit->bits(),$unit->get_nse()); | |
537 } | |
538 | |
539 } | |
540 | |
541 sub write_GDF { | |
542 my $self = shift; | |
543 my $file = shift; | |
544 my $unit; | |
545 | |
546 if( !defined $file ) { | |
547 $file = \*STDOUT; | |
548 } | |
549 | |
550 | |
551 foreach $unit ( $self->eachHMMUnit() ) { | |
552 print $file sprintf("%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",$unit->get_nse(),$unit->start(),$unit->end(),$unit->seq_id(),$unit->bits(),$unit->evalue); | |
553 } | |
554 | |
555 } | |
556 | |
557 sub highest_noise { | |
558 my $self = shift; | |
559 my $seqt = shift; | |
560 my $domt = shift; | |
561 my ($seq,$unit,$hseq,$hdom,$noiseseq,$noisedom); | |
562 | |
563 $hseq = $hdom = -100000; | |
564 | |
565 foreach $seq ( $self->eachHMMSequence()) { | |
566 if( $seq->bits() < $seqt && $seq->bits() > $hseq ) { | |
567 $hseq = $seq->bits(); | |
568 $noiseseq = $seq; | |
569 } | |
570 foreach $unit ( $seq->eachHMMUnit() ) { | |
571 if( (($seq->bits() < $seqt) || ($seq->bits() > $seqt && $unit->bits < $domt)) && $unit->bits() > $hdom ) { | |
572 $hdom = $unit->bits(); | |
573 $noisedom = $unit; | |
574 } | |
575 } | |
576 } | |
577 | |
578 | |
579 return ($noiseseq,$noisedom); | |
580 | |
581 } | |
582 | |
583 | |
584 sub lowest_true { | |
585 my $self = shift; | |
586 my $seqt = shift; | |
587 my $domt = shift; | |
588 my ($seq,$unit,$lowseq,$lowdom,$trueseq,$truedom); | |
589 | |
590 if( ! defined $domt ) { | |
591 carp "lowest true needs at least a domain threshold cut-off"; | |
592 return (0,0); | |
593 } | |
594 | |
595 $lowseq = $lowdom = 100000; | |
596 | |
597 foreach $seq ( $self->eachHMMSequence()) { | |
598 | |
599 if( $seq->bits() >= $seqt && $seq->bits() < $lowseq ) { | |
600 $lowseq = $seq->bits(); | |
601 $trueseq = $seq; | |
602 } | |
603 if( $seq->bits() < $seqt ) { | |
604 next; | |
605 } | |
606 | |
607 foreach $unit ( $seq->eachHMMUnit() ) { | |
608 if( $unit->bits() >= $domt && $unit->bits() < $lowdom ) { | |
609 $lowdom = $unit->bits(); | |
610 $truedom = $unit; | |
611 } | |
612 } | |
613 } | |
614 | |
615 | |
616 return ($trueseq,$truedom); | |
617 | |
618 } | |
619 | |
620 | |
621 | |
622 =head2 add_Set | |
623 | |
624 Title : add_Set | |
625 Usage : Mainly internal function | |
626 Function: | |
627 Returns : | |
628 Args : | |
629 | |
630 | |
631 =cut | |
632 | |
633 sub add_Set { | |
634 my $self = shift; | |
635 my $seq = shift; | |
636 my $name; | |
637 | |
638 $name = $seq->name(); | |
639 | |
640 if( exists $self->{'seq'}->{$name} ) { | |
641 $self->throw("You alredy have $name in HMMResults!"); | |
642 } | |
643 $self->{'seq'}->{$name} = $seq; | |
644 } | |
645 | |
646 | |
647 =head2 each_Set | |
648 | |
649 Title : each_Set | |
650 Usage : | |
651 Function: | |
652 Returns : | |
653 Args : | |
654 | |
655 | |
656 =cut | |
657 | |
658 sub each_Set { | |
659 my $self = shift; | |
660 my (@array,$name); | |
661 | |
662 | |
663 foreach $name ( keys %{$self->{'seq'}} ) { | |
664 push(@array,$self->{'seq'}->{$name}); | |
665 } | |
666 return @array; | |
667 } | |
668 | |
669 | |
670 =head2 get_Set | |
671 | |
672 Title : get_Set | |
673 Usage : $set = $res->get_Set('sequence-name'); | |
674 Function: returns the Set for a particular sequence | |
675 Returns : a HMMER::Set object | |
676 Args : name of the sequence | |
677 | |
678 | |
679 =cut | |
680 | |
681 sub get_Set { | |
682 my $self = shift; | |
683 my $name = shift; | |
684 | |
685 return $self->{'seq'}->{$name}; | |
686 } | |
687 | |
688 | |
689 =head2 _parse_hmmpfam | |
690 | |
691 Title : _parse_hmmpfam | |
692 Usage : $res->_parse_hmmpfam($filehandle) | |
693 Function: | |
694 Returns : | |
695 Args : | |
696 | |
697 | |
698 =cut | |
699 | |
700 sub _parse_hmmpfam { | |
701 my $self = shift; | |
702 my $file = shift; | |
703 | |
704 my ($id,$sqfrom,$sqto,$hmmf,$hmmt,$sc,$ev, | |
705 $unit,$nd,$seq,$name,$seqname,$from, | |
706 $to,%hash,%acc,$acc); | |
707 my $count = 0; | |
708 | |
709 while(<$file>) { | |
710 if( /^HMM file:\s+(\S+)/ ) { $self->hmmfile($1); next; } | |
711 elsif( /^Sequence file:\s+(\S+)/ ) { $self->seqfile($1); next } | |
712 elsif( /^Query(\s+sequence)?:\s+(\S+)/ ) { | |
713 | |
714 $seqname = $2; | |
715 | |
716 $seq = Bio::Tools::HMMER::Set->new(); | |
717 | |
718 $seq ->name($seqname); | |
719 $self->add_Set($seq); | |
720 %hash = (); | |
721 | |
722 while(<$file>){ | |
723 | |
724 if( /Accession:\s+(\S+)/ ) { $seq->accession($1); next } | |
725 elsif( s/^Description:\s+// ) { chomp; $seq->desc($_); next } | |
726 /^Parsed for domains/ && last; | |
727 | |
728 # This is to parse out the accession numbers in old Pfam format. | |
729 # now not support due to changes in HMMER. | |
730 | |
731 if( (($id,$acc, $sc, $ev, $nd) = /^\s*(\S+)\s+(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) { | |
732 $hash{$id} = $sc; # we need this for the sequence | |
733 # core of the domains below! | |
734 $acc {$id} = $acc; | |
735 | |
736 # this is the more common parsing routine | |
737 } elsif ( (($id,$sc, $ev, $nd) = | |
738 /^\s*(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/) ) { | |
739 | |
740 $hash{$id} = $sc; # we need this for the | |
741 # sequence score of hte domains below! | |
742 | |
743 } | |
744 } | |
745 | |
746 while(<$file>) { | |
747 /^Align/ && last; | |
748 /^\/\// && last; | |
749 # this is meant to match | |
750 | |
751 #Sequence Domain seq-f seq-t hmm-f hmm-t score E-value | |
752 #-------- ------- ----- ----- ----- ----- ----- ------- | |
753 #PF00621 1/1 198 372 .. 1 207 [] 281.6 1e-80 | |
754 | |
755 if( (($id, $sqfrom, $sqto, $hmmf,$hmmt,$sc, $ev) = | |
756 /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) { | |
757 $unit = Bio::Tools::HMMER::Domain->new(); | |
758 $unit->seq_id ($seqname); | |
759 $unit->hmmname ($id); | |
760 $unit->start ($sqfrom); | |
761 $unit->end ($sqto); | |
762 $unit->hstart($hmmf); | |
763 $unit->hend ($hmmt); | |
764 $unit->bits ($sc); | |
765 $unit->evalue ($ev); | |
766 | |
767 if( !exists($hash{$id}) ) { | |
768 $self->throw("HMMResults parsing error in hmmpfam for $id - can't find sequecne score"); | |
769 } | |
770 | |
771 $unit->seqbits($hash{$id}); | |
772 | |
773 if( defined $acc{$id} ) { | |
774 $unit->hmmacc($acc{$id}); | |
775 } | |
776 | |
777 # this should find it's own sequence! | |
778 $self->add_Domain($unit); | |
779 } | |
780 } | |
781 if( /^\/\// ) { next; } | |
782 | |
783 $_ = <$file>; | |
784 # parses alignment lines. Icky as we have to break on the same line | |
785 # that we need to read to place the alignment lines with the unit. | |
786 | |
787 while(1) { | |
788 (!defined $_ || /^\/\//) && last; | |
789 | |
790 # matches: | |
791 # PF00621: domain 1 of 1, from 198 to 372 | |
792 if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) { | |
793 | |
794 $name = $1; | |
795 $from = $2; | |
796 $to = $3; | |
797 | |
798 # find the HMMUnit which this alignment is from | |
799 | |
800 $unit = $self->get_unit_nse($seqname,$name,$from,$to); | |
801 if( !defined $unit ) { | |
802 $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!"); | |
803 $_ = <$file>; | |
804 next; | |
805 } | |
806 while(<$file>) { | |
807 /^\/\// && last; | |
808 /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last; | |
809 $unit->add_alignment_line($_); | |
810 } | |
811 } else { | |
812 $_ = <$file>; | |
813 } | |
814 } | |
815 | |
816 # back to main 'Query:' loop | |
817 } | |
818 } | |
819 } | |
820 | |
821 # mainly internal function | |
822 | |
823 sub get_unit_nse { | |
824 my $self = shift; | |
825 my $seqname = shift; | |
826 my $domname = shift; | |
827 my $start = shift; | |
828 my $end = shift; | |
829 | |
830 my($seq,$unit); | |
831 | |
832 $seq = $self->get_Set($seqname); | |
833 | |
834 if( !defined $seq ) { | |
835 $self->throw("Could not get sequence name $seqname - so can't get its unit"); | |
836 } | |
837 | |
838 foreach $unit ( $seq->each_Domain() ) { | |
839 if( $unit->hmmname() eq $domname && $unit->start() == $start && $unit->end() == $end ) { | |
840 return $unit; | |
841 } | |
842 } | |
843 | |
844 return undef; | |
845 } | |
846 | |
847 | |
848 =head2 _parse_hmmsearch | |
849 | |
850 Title : _parse_hmmsearch | |
851 Usage : $res->_parse_hmmsearch($filehandle) | |
852 Function: | |
853 Returns : | |
854 Args : | |
855 | |
856 | |
857 =cut | |
858 | |
859 sub _parse_hmmsearch { | |
860 my $self = shift; | |
861 my $file = shift; | |
862 my ($id,$sqfrom,$sqto,$sc,$ev,$unit,$nd,$seq,$hmmf,$hmmt, | |
863 $hmmfname,$hmmacc, $hmmid, %seqh); | |
864 my $count = 0; | |
865 | |
866 while(<$file>) { | |
867 /^HMM file:\s+(\S+)/ and do { $self->hmmfile($1); $hmmfname = $1 }; | |
868 /^Accession:\s+(\S+)/ and do { $hmmacc = $1 }; | |
869 /^Query HMM:\s+(\S+)/ and do { $hmmid = $1 }; | |
870 /^Sequence database:\s+(\S+)/ and do { $self->seqfile($1) }; | |
871 /^Scores for complete sequences/ && last; | |
872 } | |
873 | |
874 $hmmfname = "given" if not $hmmfname; | |
875 | |
876 while(<$file>) { | |
877 /^Parsed for domains/ && last; | |
878 if( (($id, $sc, $ev, $nd) = /(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/)) { | |
879 $seq = Bio::Tools::HMMER::Set->new(); | |
880 $seq->name($id); | |
881 $seq->bits($sc); | |
882 $seqh{$id} = $sc; | |
883 $seq->evalue($ev); | |
884 $self->add_Set($seq); | |
885 $seq->accession($hmmacc); | |
886 } | |
887 } | |
888 | |
889 while(<$file>) { | |
890 /^Alignments of top-scoring domains/ && last; | |
891 if( (($id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev) = /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/)) { | |
892 $unit = Bio::Tools::HMMER::Domain->new(); | |
893 | |
894 $unit->seq_id($id); | |
895 $unit->hmmname($hmmfname); | |
896 $unit->start($sqfrom); | |
897 $unit->end($sqto); | |
898 $unit->bits($sc); | |
899 $unit->hstart($hmmf); | |
900 $unit->hend($hmmt); | |
901 $unit->evalue($ev); | |
902 $unit->seqbits($seqh{$id}); | |
903 $self->add_Domain($unit); | |
904 $count++; | |
905 } | |
906 } | |
907 | |
908 $_ = <$file>; | |
909 | |
910 ## Recognize and store domain alignments | |
911 | |
912 while(1) { | |
913 if( !defined $_ ) { | |
914 last; | |
915 } | |
916 /^Histogram of all scores/ && last; | |
917 | |
918 # matches: | |
919 # PF00621: domain 1 of 1, from 198 to 372 | |
920 if( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) { | |
921 my $name = $1; | |
922 my $from = $2; | |
923 my $to = $3; | |
924 | |
925 # find the HMMUnit which this alignment is from | |
926 $unit = $self->get_unit_nse($name,$hmmfname,$from,$to); | |
927 | |
928 if( !defined $unit ) { | |
929 $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!"); | |
930 next; | |
931 } | |
932 while(<$file>) { | |
933 /^Histogram of all scores/ && last; | |
934 /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last; | |
935 $unit->add_alignment_line($_); | |
936 } | |
937 } | |
938 else { | |
939 $_ = <$file>; | |
940 } | |
941 } | |
942 | |
943 return $count; | |
944 } | |
945 | |
946 =head2 parsetype | |
947 | |
948 Title : parsetype | |
949 Usage : $obj->parsetype($newval) | |
950 Function: | |
951 Returns : value of parsetype | |
952 Args : newvalue (optional) | |
953 | |
954 | |
955 =cut | |
956 | |
957 sub parsetype{ | |
958 my ($self,$value) = @_; | |
959 if( defined $value) { | |
960 $self->{'_parsetype'} = $value; | |
961 } | |
962 return $self->{'_parsetype'}; | |
963 } | |
964 | |
965 1; # says use was ok | |
966 __END__ | |
967 | |
968 |