0
|
1 # $Id: Results.pm,v 1.22.2.1 2003/01/07 13:58:01 jason Exp $
|
|
2 #
|
|
3 # Perl Module for HMMResults
|
|
4 #
|
|
5 # Cared for by Ewan Birney <birney@sanger.ac.uk>
|
|
6 #
|
|
7 #Copyright Genome Research Limited (1997).
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 Bio::Tools::HMMER::Results - Object representing HMMER output results
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 # parse a hmmsearch file (can also parse a hmmpfam file)
|
|
16 $res = new Bio::Tools::HMMER::Results( -file => 'output.hmm' , -type => 'hmmsearch');
|
|
17
|
|
18 # print out the results for each sequence
|
|
19 foreach $seq ( $res->each_Set ) {
|
|
20 print "Sequence bit score is",$seq->bits,"\n";
|
|
21 foreach $domain ( $seq->each_Domain ) {
|
|
22 print " Domain start ",$domain->start," end ",$domain->end,
|
|
23 " score ",$domain->bits,"\n";
|
|
24 }
|
|
25 }
|
|
26
|
|
27 # new result object on a sequence/domain cutoff of 25 bits sequence, 15 bits domain
|
|
28 $newresult = $res->filter_on_cutoff(25,15);
|
|
29
|
|
30 # alternative way of getting out all domains directly
|
|
31 foreach $domain ( $res->each_Domain ) {
|
|
32 print "Domain on ",$domain->seq_id," with score ",
|
|
33 $domain->bits," evalue ",$domain->evalue,"\n";
|
|
34 }
|
|
35
|
|
36 =head1 DESCRIPTION
|
|
37
|
|
38 This object represents HMMER output, either from hmmsearch or
|
|
39 hmmpfam. For hmmsearch, a series of HMMER::Set objects are made, one
|
|
40 for each sequence, which have the bits score for the object. For
|
|
41 hmmpfam searches, only one Set object is made.
|
|
42
|
|
43
|
|
44 These objects come from the original HMMResults modules used
|
|
45 internally in Pfam, written by Ewan. Ewan then converted them to
|
|
46 bioperl objects in 1999. That conversion is meant to be backwardly
|
|
47 compatible, but may not be (caveat emptor).
|
|
48
|
|
49 =head1 FEEDBACK
|
|
50
|
|
51 =head2 Mailing Lists
|
|
52
|
|
53 User feedback is an integral part of the evolution of this and other
|
|
54 Bioperl modules. Send your comments and suggestions preferably to one
|
|
55 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
56
|
|
57 bioperl-l@bioperl.org - General discussion
|
|
58 http://www.bioperl.org/MailList.html - About the mailing lists
|
|
59
|
|
60 =head2 Reporting Bugs
|
|
61
|
|
62 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
63 of the bugs and their resolution. Bug reports can be submitted via email
|
|
64 or the web:
|
|
65
|
|
66 bioperl-bugs@bio.perl.org
|
|
67 http://www.bugzilla.bioperl.org/
|
|
68
|
|
69 =head1 AUTHOR - Ewan Birney
|
|
70
|
|
71 Email birney@ebi.ac.uk
|
|
72
|
|
73 =head1 CONTRIBUTORS
|
|
74
|
|
75 Jason Stajich, jason@bioperl.org
|
|
76
|
|
77 =head1 APPENDIX
|
|
78
|
|
79 The rest of the documentation details each of the object
|
|
80 methods. Internal methods are usually preceded with a _
|
|
81
|
|
82 =cut
|
|
83
|
|
84 package Bio::Tools::HMMER::Results;
|
|
85
|
|
86 use vars qw(@ISA);
|
|
87 use Carp;
|
|
88 use strict;
|
|
89
|
|
90 use Bio::Root::Root;
|
|
91 use Bio::Root::IO;
|
|
92 use Bio::Tools::HMMER::Domain;
|
|
93 use Bio::Tools::HMMER::Set;
|
|
94 use Bio::SeqAnalysisParserI;
|
|
95 use Symbol;
|
|
96
|
|
97 @ISA = qw(Bio::Root::Root Bio::Root::IO Bio::SeqAnalysisParserI);
|
|
98
|
|
# Constructor.
#
# Builds the object through the parent constructor, sets up empty
# containers for domains and sequence Sets, and, when an input filehandle
# is available, parses it immediately according to -type.
#
# Args   : named arguments. -type is mandatory ('hmmsearch' or 'hmmpfam');
#          the full argument list is also passed to the parent constructor
#          and to _initialize_io().
# Returns: the new object.
# Throws : when -type is missing, or when there is input to parse and
#          -type is neither 'hmmsearch' nor 'hmmpfam'.
sub new {
    my ($class, @args) = @_;

    my $self = $class->SUPER::new(@args);

    $self->{'domain'} = [];    # flat list of every domain added
    $self->{'seq'}    = {};    # sequence name => Set object

    my ($parsetype) = $self->_rearrange([qw(TYPE)], @args);
    $self->_initialize_io(@args);

    $self->throw("No parse type provided. should be hmmsearch or hmmpfam")
        unless defined $parsetype;
    $self->parsetype($parsetype);

    # Without a filehandle there is nothing to parse; the type is not
    # validated in that case.
    return $self unless defined $self->_fh();

    if ($parsetype eq 'hmmsearch') {
        $self->_parse_hmmsearch($self->_fh());
    }
    elsif ($parsetype eq 'hmmpfam') {
        $self->_parse_hmmpfam($self->_fh());
    }
    else {
        $self->throw("Did not recoginise type $parsetype");
    }

    return $self;
}
|
|
125
|
|
126
|
|
127 =head2 next_feature
|
|
128
|
|
129 Title : next_feature
|
|
130 Usage : while( my $feat = $res->next_feature ) { # do something }
|
|
131 Function: SeqAnalysisParserI implementing function
|
|
132 Example :
|
|
133 Returns : A Bio::SeqFeatureI compliant object, in this case,
|
|
134 each DomainUnit object, ie, flattening the Sequence
|
|
135 aspect of this.
|
|
136 Args : None
|
|
137
|
|
138
|
|
139 =cut
|
|
140
|
|
# Iterator over every domain of every Set (SeqAnalysisParserI interface).
#
# On the first call the domains of all Sets are flattened into a queue
# stored on the object; each call (including the first) hands out the next
# queued domain.
#
# Returns: the next domain object, or undef once the queue is exhausted.
sub next_feature {
    my ($self) = @_;

    # Plain truth test rather than "== 1": the flag does not exist before
    # the first call, and a numeric comparison against undef warns.
    if ( !$self->{'_started_next_feature'} ) {
        $self->{'_started_next_feature'} = 1;
        my @queue = map { $_->each_Domain() } $self->each_Set();
        $self->{'_next_feature_array'} = \@queue;
    }

    return shift @{ $self->{'_next_feature_array'} };
}
|
|
161
|
|
162
|
|
163 =head2 number
|
|
164
|
|
165 Title : number
|
|
166 Usage : print "There are ",$res->number," domains hit\n";
|
|
167 Function: provides the number of domains in the HMMER report
|
|
168
|
|
169 =cut
|
|
170
|
|
# Number of domains currently held in this result object.
#
# Returns: the element count of the internal 'domain' array.
sub number {
    my $self = shift;
    return scalar @{ $self->{'domain'} };
}
|
|
181
|
|
182 =head2 seqfile
|
|
183
|
|
184 Title : seqfile
|
|
185 Usage : $obj->seqfile($newval)
|
|
186 Function:
|
|
187 Example :
|
|
188 Returns : value of seqfile
|
|
189 Args : newvalue (optional)
|
|
190
|
|
191
|
|
192 =cut
|
|
193
|
|
# Get/set accessor for the 'seqfile' slot.
#
# Args   : optional new value; an undefined argument leaves the slot alone.
# Returns: the current value of the slot.
sub seqfile {
    my $self = shift;
    my ($value) = @_;

    $self->{'seqfile'} = $value if defined $value;
    return $self->{'seqfile'};
}
|
|
202
|
|
203 =head2 hmmfile
|
|
204
|
|
205 Title : hmmfile
|
|
206 Usage : $obj->hmmfile($newval)
|
|
207 Function:
|
|
208 Example :
|
|
209 Returns : value of hmmfile
|
|
210 Args : newvalue (optional)
|
|
211
|
|
212
|
|
213 =cut
|
|
214
|
|
# Get/set accessor for the 'hmmfile' slot.
#
# Args   : optional new value; an undefined argument leaves the slot alone.
# Returns: the current value of the slot.
sub hmmfile {
    my $self = shift;
    my ($value) = @_;

    $self->{'hmmfile'} = $value if defined $value;
    return $self->{'hmmfile'};
}
|
|
223
|
|
224 =head2 add_Domain
|
|
225
|
|
226 Title : add_Domain
|
|
227 Usage : $res->add_Domain($unit)
|
|
228 Function: adds a domain to the results array. Mainly used internally.
|
|
229 Args : A Bio::Tools::HMMER::Domain
|
|
230
|
|
231
|
|
232 =cut
|
|
233
|
|
# Register a domain with this result object.
#
# The domain is always appended to the flat 'domain' list. If a Set is
# already registered under the domain's seq_id, the domain is also handed
# to that Set; otherwise a warning is issued.
#
# Args   : a domain object (must provide seq_id()).
# Returns: the value of the final push (new length of the domain list).
sub add_Domain {
    my ($self, $unit) = @_;

    my $name = $unit->seq_id();

    if ( exists $self->{'seq'}->{$name} ) {
        $self->{'seq'}->{$name}->add_Domain($unit);
    }
    else {
        $self->warn("Adding a domain of $name but with no HMMSequence. Will be kept in domain array but not added to a HMMSequence");
    }

    return push @{ $self->{'domain'} }, $unit;
}
|
|
248
|
|
249
|
|
250 =head2 each_Domain
|
|
251
|
|
252 Title : each_Domain
|
|
253 Usage : foreach $domain ( $res->each_Domain() )
|
|
254 Function: array of Domain units which are held in this report
|
|
255 Returns : array
|
|
256 Args : none
|
|
257
|
|
258
|
|
259 =cut
|
|
260
|
|
# All domains held by this result object, in the order they were added.
#
# Returns: a list of domain objects (a copy of the internal list).
sub each_Domain {
    my $self = shift;
    return @{ $self->{'domain'} };
}
|
|
271
|
|
272
|
|
273 =head2 domain_bits_cutoff_from_evalue
|
|
274
|
|
275 Title : domain_bits_cutoff_from_evalue
|
|
276 Usage : $cutoff = domain_bits_cutoff_from_evalue(0.01);
|
|
277 Function: return a bits cutoff from an evalue using the
|
|
278 scores here. Somewhat interesting logic:
|
|
279 Find the two bit scores which straddle the evalue
|
|
280 if( 25 is between these two points) return 25
|
|
281 else return the midpoint.
|
|
282
|
|
283 This logic tries to ensure that with large signal to
|
|
284 noise separation one still has sensible 25 bit cutoff
|
|
285 Returns :
|
|
286 Args :
|
|
287
|
|
288 =cut
|
|
289
|
|
# Derive a bit-score cutoff from an E-value threshold.
#
# Domains are ranked by descending bit score and walked until the first
# one whose E-value exceeds the threshold. With $below the last domain
# before that point and $above the first one past it:
#   - if their bit scores differ by less than 1, return $below's score;
#   - if 25 lies strictly between the two scores, return 25;
#   - otherwise return the integer part of the midpoint.
#
# Args   : E-value threshold.
# Returns: a bit-score cutoff.
# Throws : when no domain lies on one side of the threshold.
sub domain_bits_cutoff_from_evalue {
    my ($self, $eval) = @_;

    # Highest-scoring domains first; bits() is fetched once per domain.
    my @ranked = map  { $_->[0] }
                 sort { $b->[1] <=> $a->[1] }
                 map  { [ $_, $_->bits ] } $self->each_Domain;

    my ($below, $above);
    my $crossed = 0;
    for my $candidate (@ranked) {
        if ( $candidate->evalue > $eval ) {
            $crossed = 1;
            $above   = $candidate;
            last;
        }
        $below = $candidate;
    }

    if ( $crossed == 0 || !defined $below ) {
        $self->throw("Evalue is either above or below the list...");
        return undef;
    }

    my $gap = $below->bits - $above->bits;

    return $below->bits() if $gap < 1;
    return 25 if $above->bits < 25 && $below->bits > 25;
    return int( $above->bits + $gap / 2 );
}
|
|
328
|
|
329
|
|
# Stamp the same HMM accession onto every domain held by this object.
#
# Args   : the accession to assign.
# Returns: nothing.
#
# The domains are fetched through each_Domain(), the accessor this class
# defines; the previous call to eachHMMUnit() named a method that does not
# exist in this file.
sub dictate_hmm_acc {
    my ($self, $acc) = @_;

    for my $unit ( $self->each_Domain() ) {
        $unit->hmmacc($acc);
    }
    return;
}
|
|
340
|
|
341 =head2 write_FT_output
|
|
342
|
|
343 Title : write_FT_output
|
|
344 Usage : $res->write_FT_output(\*STDOUT,'DOMAIN')
|
|
345 Function: writes feature table output ala swissprot
|
|
346 Returns :
|
|
347 Args :
|
|
348
|
|
349
|
|
350 =cut
|
|
351
|
|
# Write a feature-table style listing to a filehandle.
#
# For every Set an "ID" line is printed, followed by one "FT" line per
# domain (label, start, end, HMM name) and a terminating "//" line.
#
# Args   : filehandle to print to;
#          optional feature label (defaults to "DOMAIN").
sub write_FT_output {
    my ($self, $file, $idt) = @_;

    $idt = "DOMAIN" if !defined $idt;

    for my $set ( $self->each_Set() ) {
        print {$file} sprintf( "ID %s\n", $set->name() );
        for my $dom ( $set->each_Domain() ) {
            print {$file} sprintf( "FT %s %d %d %s\n",
                $idt, $dom->start, $dom->end, $dom->hmmname );
        }
        print {$file} "//\n";
    }
}
|
|
371
|
|
372 =head2 filter_on_cutoff
|
|
373
|
|
374 Title : filter_on_cutoff
|
|
375 Usage : $newresults = $results->filter_on_cutoff(25,15);
|
|
376 Function: Produces a new HMMER::Results module which has
|
|
377 been trimmed at the cutoff.
|
|
378 Returns : a Bio::Tools::HMMER::Results module
|
|
379 Args : sequence cutoff and domain cutoff. in bits score
|
|
380 if you want one cutoff, simply use same number both places
|
|
381
|
|
382 =cut
|
|
383
|
|
# Build a new Results object containing only what passes the cutoffs.
#
# A Set is carried over when its bit score is not below $seqthr; within
# such a Set, each domain whose bit score is not below $domthr is added to
# the new object.
#
# Args   : sequence bit-score cutoff, domain bit-score cutoff.
# Returns: a new Bio::Tools::HMMER::Results with the same parse type.
# Throws : when the domain cutoff is not supplied.
#
# NOTE(review): the Set objects are passed to the new result as-is (not
# copied), and add_Domain() on the new result forwards each kept domain to
# that same Set object again - confirm that sharing/re-adding is intended.
sub filter_on_cutoff {
    my ($self, $seqthr, $domthr) = @_;

    $self->throw("hmmresults filter on cutoff needs two arguments")
        unless defined $domthr;

    my $filtered = Bio::Tools::HMMER::Results->new( -type => $self->parsetype );

    SET:
    for my $set ( $self->each_Set() ) {
        next SET if $set->bits() < $seqthr;
        $filtered->add_Set($set);

        for my $dom ( $set->each_Domain() ) {
            $filtered->add_Domain($dom) unless $dom->bits() < $domthr;
        }
    }

    return $filtered;
}
|
|
406
|
|
407 =head2 write_ascii_out
|
|
408
|
|
409 Title : write_ascii_out
|
|
410 Usage : $res->write_ascii_out(\*STDOUT)
|
|
411 Function: writes as
|
|
412 seq seq_start seq_end model-acc model_start model_end model_name
|
|
413 Returns :
|
|
414 Args :
|
|
415
|
|
416 FIXME: Now that we have no modelacc, this is probably a bad thing.
|
|
417
|
|
418 =cut
|
|
419
|
|
420 # writes as seq sstart send modelacc hstart hend modelname
|
|
421
|
|
# Print one line per domain, Set by Set:
#   seq_id start end hmmacc hstart hend bits evalue hmmname
#
# Args   : optional filehandle; STDOUT is used when none is given.
sub write_ascii_out {
    my ($self, $fh) = @_;

    $fh = \*STDOUT unless defined $fh;

    for my $set ( $self->each_Set() ) {
        for my $dom ( $set->each_Domain() ) {
            my @fields = (
                $dom->seq_id(), $dom->start(), $dom->end(),
                $dom->hmmacc,   $dom->hstart,  $dom->hend,
                $dom->bits,     $dom->evalue,  $dom->hmmname,
            );
            print {$fh} sprintf( "%s %4d %4d %s %4d %4d %4.2f %4.2g %s\n", @fields );
        }
    }

}
|
|
442
|
|
443 =head2 write_GDF_bits
|
|
444
|
|
445 Title : write_GDF_bits
|
|
446 Usage : $res->write_GDF_bits(25,15,\*STDOUT)
|
|
447 Function: writes GDF format with a sequence,domain threshold
|
|
448 Returns :
|
|
449 Args :
|
|
450
|
|
451 =cut
|
|
452
|
|
# Write GDF-style lines for the domains that pass both thresholds.
#
# Sets scoring below $seqt are skipped entirely; within the remaining Sets,
# domains scoring below $domt are skipped. Surviving domains are ordered by
# seq_id (string order) and then by start (numeric order) before printing.
#
# Args   : sequence threshold, domain threshold, filehandle.
# Throws : when no filehandle is supplied.
sub write_GDF_bits {
    my ($self, $seqt, $domt, $file) = @_;

    if ( !defined $file ) {
        $self->throw("Attempting to use write_GDF_bits without passing in correct arguments!");
        return;
    }

    my @kept;
    for my $set ( $self->each_Set() ) {
        next if $set->bits() < $seqt;
        for my $dom ( $set->each_Domain() ) {
            push @kept, $dom unless $dom->bits() < $domt;
        }
    }

    my @ordered = sort {
        my ( $id_a, $id_b ) = ( $a->seq_id(), $b->seq_id() );
        $id_a eq $id_b ? $a->start() <=> $b->start() : $id_a cmp $id_b;
    } @kept;

    for my $dom (@ordered) {
        print {$file} sprintf( "%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",
            $dom->get_nse(), $dom->start(), $dom->end(),
            $dom->seq_id(),  $dom->bits(),  $dom->evalue );
    }

}
|
|
499
|
|
# Write "<bits> <name/start-end>" lines for the domains passing both
# thresholds, ordered by ascending bit score.
#
# Sets scoring below $seqt are skipped entirely; within the remaining Sets,
# domains scoring below $domt are skipped.
#
# Args   : sequence threshold, domain threshold, filehandle.
# Returns: nothing; carps and returns early when no filehandle is given.
#
# Sets and domains are fetched with each_Set()/each_Domain(), the accessors
# used by the rest of this file; the eachHMMSequence()/eachHMMUnit() names
# called previously are not defined here.
sub write_scores_bits {
    my ($self, $seqt, $domt, $file) = @_;

    if ( !defined $file ) {
        carp("Attempting to use write_scores_bits without passing in correct arguments!");
        return;
    }

    my @kept;
    for my $set ( $self->each_Set() ) {
        next if $set->bits() < $seqt;
        for my $dom ( $set->each_Domain() ) {
            push @kept, $dom unless $dom->bits() < $domt;
        }
    }

    for my $dom ( sort { $a->bits() <=> $b->bits() } @kept ) {
        print {$file} sprintf( "%4.2f %s\n", $dom->bits(), $dom->get_nse() );
    }

    return;
}
|
|
540
|
|
# Write one GDF-style line per domain held by this object, unfiltered and
# in stored order.
#
# Args   : optional filehandle; STDOUT is used when none is given.
# Returns: nothing.
#
# Domains come from each_Domain(), the accessor this class defines; the
# eachHMMUnit() name called previously is not defined in this file.
sub write_GDF {
    my ($self, $file) = @_;

    $file = \*STDOUT unless defined $file;

    for my $unit ( $self->each_Domain() ) {
        print {$file} sprintf( "%-24s\t%6d\t%6d\t%15s\t%.1f\t%g\n",
            $unit->get_nse(), $unit->start(), $unit->end(),
            $unit->seq_id(),  $unit->bits(),  $unit->evalue );
    }

    return;
}
|
|
556
|
|
# Find the best-scoring "noise" Set and "noise" domain for a pair of
# thresholds.
#
# A Set is noise when its bit score is below $seqt. A domain is noise when
# its Set is noise, or when its own bit score is below $domt.
#
# Args   : sequence threshold, domain threshold.
# Returns: (noise Set, noise domain) - the highest-scoring of each; either
#          element is undef when nothing qualifies.
#
# Sets and domains are fetched with each_Set()/each_Domain(); the
# eachHMMSequence()/eachHMMUnit() names called previously are not defined
# in this file. The domain test is "Set below threshold OR domain below
# threshold", so a sub-threshold domain in a Set scoring exactly $seqt is
# counted as noise (the earlier "> $seqt" comparison skipped it, although
# lowest_true() treats such a Set as passing).
sub highest_noise {
    my ($self, $seqt, $domt) = @_;

    my ($noiseseq, $noisedom);
    my $hseq = -100000;    # best noise Set score seen so far
    my $hdom = -100000;    # best noise domain score seen so far

    for my $seq ( $self->each_Set() ) {
        my $seq_is_noise = $seq->bits() < $seqt;

        if ( $seq_is_noise && $seq->bits() > $hseq ) {
            $hseq     = $seq->bits();
            $noiseseq = $seq;
        }

        for my $unit ( $seq->each_Domain() ) {
            next unless $seq_is_noise || $unit->bits() < $domt;
            if ( $unit->bits() > $hdom ) {
                $hdom     = $unit->bits();
                $noisedom = $unit;
            }
        }
    }

    return ( $noiseseq, $noisedom );
}
|
|
582
|
|
583
|
|
# Find the lowest-scoring Set and domain that still pass the thresholds.
#
# A Set passes when its bit score is at least $seqt; a domain passes when
# its Set passes and its own bit score is at least $domt.
#
# Args   : sequence threshold, domain threshold.
# Returns: (lowest passing Set, lowest passing domain); either element is
#          undef when nothing qualifies. Carps and returns (0,0) when the
#          domain threshold is missing.
#
# Sets and domains are fetched with each_Set()/each_Domain(); the
# eachHMMSequence()/eachHMMUnit() names called previously are not defined
# in this file.
sub lowest_true {
    my ($self, $seqt, $domt) = @_;

    if ( !defined $domt ) {
        carp "lowest true needs at least a domain threshold cut-off";
        return ( 0, 0 );
    }

    my ($trueseq, $truedom);
    my $lowseq = 100000;    # lowest passing Set score seen so far
    my $lowdom = 100000;    # lowest passing domain score seen so far

    for my $seq ( $self->each_Set() ) {

        # Domains of a failing Set are never "true".
        next if $seq->bits() < $seqt;

        if ( $seq->bits() < $lowseq ) {
            $lowseq  = $seq->bits();
            $trueseq = $seq;
        }

        for my $unit ( $seq->each_Domain() ) {
            if ( $unit->bits() >= $domt && $unit->bits() < $lowdom ) {
                $lowdom  = $unit->bits();
                $truedom = $unit;
            }
        }
    }

    return ( $trueseq, $truedom );
}
|
|
619
|
|
620
|
|
621
|
|
622 =head2 add_Set
|
|
623
|
|
624 Title : add_Set
|
|
625 Usage : Mainly internal function
|
|
626 Function:
|
|
627 Returns :
|
|
628 Args :
|
|
629
|
|
630
|
|
631 =cut
|
|
632
|
|
# Register a Set under its name.
#
# Args   : a Set object (must provide name()).
# Returns: the Set that was stored.
# Throws : when a Set with the same name is already registered.
sub add_Set {
    my ($self, $seq) = @_;

    my $name = $seq->name();

    $self->throw("You alredy have $name in HMMResults!")
        if exists $self->{'seq'}->{$name};

    return $self->{'seq'}->{$name} = $seq;
}
|
|
645
|
|
646
|
|
647 =head2 each_Set
|
|
648
|
|
649 Title : each_Set
|
|
650 Usage :
|
|
651 Function:
|
|
652 Returns :
|
|
653 Args :
|
|
654
|
|
655
|
|
656 =cut
|
|
657
|
|
# All Set objects registered with this result object.
#
# Returns: a list of Sets, in the hash's own (unspecified) order.
sub each_Set {
    my $self = shift;

    my @sets = values %{ $self->{'seq'} };
    return @sets;
}
|
|
668
|
|
669
|
|
670 =head2 get_Set
|
|
671
|
|
672 Title : get_Set
|
|
673 Usage : $set = $res->get_Set('sequence-name');
|
|
674 Function: returns the Set for a particular sequence
|
|
675 Returns : a HMMER::Set object
|
|
676 Args : name of the sequence
|
|
677
|
|
678
|
|
679 =cut
|
|
680
|
|
# Look up the Set registered under a sequence name.
#
# Args   : sequence name.
# Returns: the Set stored under that name, or undef when there is none.
sub get_Set {
    my ($self, $name) = @_;
    return $self->{'seq'}->{$name};
}
|
|
687
|
|
688
|
|
689 =head2 _parse_hmmpfam
|
|
690
|
|
691 Title : _parse_hmmpfam
|
|
692 Usage : $res->_parse_hmmpfam($filehandle)
|
|
693 Function:
|
|
694 Returns :
|
|
695 Args :
|
|
696
|
|
697
|
|
698 =cut
|
|
699
|
|
# Parse hmmpfam output from a filehandle into Sets and Domains.
#
# For every "Query:" / "Query sequence:" record the parser
#   1. creates a Set named after the query and registers it,
#   2. reads the score table up to "Parsed for domains", remembering the
#      score (and, in the old Pfam layout, the second column as accession)
#      for each model id,
#   3. reads the domain table up to "Align..." or "//", creating one Domain
#      per row and registering it with add_Domain(),
#   4. reads the alignment section up to "//", attaching each alignment
#      line to the Domain matching the "<model>: ... from N to M" header.
#
# Args   : readable filehandle.
# Returns: nothing.
# Throws : when a domain row names a model with no score in the table.
sub _parse_hmmpfam {
    my $self = shift;
    my $file = shift;

    my ($id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev,
        $unit, $nd, $seq, $name, $seqname, $from, $to,
        %hash, %acc, $acc);

    # while(<$file>) assigns to the global $_; localise it so the caller's
    # $_ (possibly a foreach alias onto its own data) is left untouched.
    local $_;

    QUERY:
    while (<$file>) {
        if    ( /^HMM file:\s+(\S+)/ )      { $self->hmmfile($1); next QUERY; }
        elsif ( /^Sequence file:\s+(\S+)/ ) { $self->seqfile($1); next QUERY; }
        elsif ( /^Query(\s+sequence)?:\s+(\S+)/ ) {

            $seqname = $2;

            $seq = Bio::Tools::HMMER::Set->new();
            $seq->name($seqname);
            $self->add_Set($seq);
            %hash = ();    # model id => score, for this query only

            # --- score table ---------------------------------------------
            while (<$file>) {

                if    ( /Accession:\s+(\S+)/ ) { $seq->accession($1); next; }
                elsif ( s/^Description:\s+// ) { chomp; $seq->desc($_); next; }
                /^Parsed for domains/ && last;

                # Old Pfam layout: the second column is taken as accession.
                if ( ( ($id, $acc, $sc, $ev, $nd) =
                       /^\s*(\S+)\s+(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/ ) ) {
                    $hash{$id} = $sc;    # needed as the sequence score of
                                         # the domains below
                    $acc{$id}  = $acc;
                }
                # The more common layout, without an accession column.
                elsif ( ( ($id, $sc, $ev, $nd) =
                          /^\s*(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/ ) ) {
                    $hash{$id} = $sc;
                }
            }

            # --- domain table --------------------------------------------
            while (<$file>) {
                /^Align/ && last;
                /^\/\//  && last;

                # this is meant to match
                #Sequence Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                #PF00621    1/1     198   372 ..     1   207 []   281.6    1e-80
                if ( ( ($id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev) =
                       /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/ ) ) {
                    $unit = Bio::Tools::HMMER::Domain->new();
                    $unit->seq_id($seqname);
                    $unit->hmmname($id);
                    $unit->start($sqfrom);
                    $unit->end($sqto);
                    $unit->hstart($hmmf);
                    $unit->hend($hmmt);
                    $unit->bits($sc);
                    $unit->evalue($ev);

                    if ( !exists( $hash{$id} ) ) {
                        $self->throw("HMMResults parsing error in hmmpfam for $id - can't find sequecne score");
                    }

                    $unit->seqbits( $hash{$id} );

                    if ( defined $acc{$id} ) {
                        $unit->hmmacc( $acc{$id} );
                    }

                    # add_Domain() files the unit under its own sequence
                    $self->add_Domain($unit);
                }
            }

            # Input ended inside the domain table: there is nothing left to
            # read, and $_ is undef so it must not be pattern-matched.
            last QUERY unless defined $_;

            # Record closed without an alignment section.
            if ( /^\/\// ) { next QUERY; }

            # --- alignment section ---------------------------------------
            # The line that ends one alignment is also the header of the
            # next one, so the current line is carried in $_ across
            # iterations instead of being read at the top of the loop.
            $_ = <$file>;

            ALIGNMENT:
            while (1) {
                ( !defined $_ || /^\/\// ) && last ALIGNMENT;

                # matches:
                # PF00621: domain 1 of 1, from 198 to 372
                if ( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) {

                    ( $name, $from, $to ) = ( $1, $2, $3 );

                    # find the Domain which this alignment belongs to
                    $unit = $self->get_unit_nse( $seqname, $name, $from, $to );
                    if ( !defined $unit ) {
                        $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!");
                        $_ = <$file>;
                        next ALIGNMENT;
                    }
                    while (<$file>) {
                        /^\/\// && last;
                        /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last;
                        $unit->add_alignment_line($_);
                    }
                }
                else {
                    $_ = <$file>;
                }
            }

            # back to main 'Query:' loop
        }
    }

    return;
}
|
|
820
|
|
821 # mainly internal function
|
|
822
|
|
# Find a domain by sequence name, HMM name, start and end.
#
# Args   : sequence name, domain (HMM) name, start, end.
# Returns: the first domain of that sequence's Set whose hmmname, start and
#          end all match, or undef when there is none.
# Throws : when no Set is registered under the sequence name.
sub get_unit_nse {
    my ($self, $seqname, $domname, $start, $end) = @_;

    my $set = $self->get_Set($seqname);

    $self->throw("Could not get sequence name $seqname - so can't get its unit")
        unless defined $set;

    for my $candidate ( $set->each_Domain() ) {
        next unless $candidate->hmmname() eq $domname;
        next unless $candidate->start() == $start;
        next unless $candidate->end() == $end;
        return $candidate;
    }

    return undef;
}
|
|
846
|
|
847
|
|
848 =head2 _parse_hmmsearch
|
|
849
|
|
850 Title : _parse_hmmsearch
|
|
851 Usage : $res->_parse_hmmsearch($filehandle)
|
|
852 Function:
|
|
853 Returns :
|
|
854 Args :
|
|
855
|
|
856
|
|
857 =cut
|
|
858
|
|
# Parse hmmsearch output from a filehandle into Sets and Domains.
#
# The report is read in four phases:
#   1. header lines up to "Scores for complete sequences" (HMM file name,
#      accession, query HMM, sequence database),
#   2. the per-sequence score table up to "Parsed for domains": one Set per
#      row, registered with add_Set(),
#   3. the per-domain table up to "Alignments of top-scoring domains": one
#      Domain per row, registered with add_Domain(),
#   4. the alignment section up to "Histogram of all scores": alignment
#      lines are attached to the Domain named by each
#      "<seq>: ... from N to M" header.
#
# Args   : readable filehandle.
# Returns: the number of domain rows parsed in phase 3.
sub _parse_hmmsearch {
    my $self = shift;
    my $file = shift;
    my ($id, $sqfrom, $sqto, $sc, $ev, $unit, $nd, $seq, $hmmf, $hmmt,
        $hmmfname, $hmmacc, $hmmid, %seqh);
    my $count = 0;

    # while(<$file>) assigns to the global $_; localise it so the caller's
    # $_ (possibly a foreach alias onto its own data) is left untouched.
    local $_;

    # --- phase 1: header ---------------------------------------------------
    while (<$file>) {
        /^HMM file:\s+(\S+)/          and do { $hmmfname = $1; $self->hmmfile($hmmfname) };
        /^Accession:\s+(\S+)/         and do { $hmmacc = $1 };
        /^Query HMM:\s+(\S+)/         and do { $hmmid = $1 };    # captured, not used below
        /^Sequence database:\s+(\S+)/ and do { $self->seqfile($1) };
        /^Scores for complete sequences/ && last;
    }

    # Domains are later looked up by this name, so it must never be empty.
    $hmmfname = "given" if not $hmmfname;

    # --- phase 2: per-sequence scores ---------------------------------------
    while (<$file>) {
        /^Parsed for domains/ && last;
        if ( ( ($id, $sc, $ev, $nd) = /(\S+).+?\s(\S+)\s+(\S+)\s+(\d+)\s*$/ ) ) {
            $seq = Bio::Tools::HMMER::Set->new();
            $seq->name($id);
            $seq->bits($sc);
            $seqh{$id} = $sc;    # remembered as seqbits for the domains below
            $seq->evalue($ev);
            $self->add_Set($seq);
            $seq->accession($hmmacc);
        }
    }

    # --- phase 3: per-domain scores -----------------------------------------
    while (<$file>) {
        /^Alignments of top-scoring domains/ && last;
        if ( ( ($id, $sqfrom, $sqto, $hmmf, $hmmt, $sc, $ev) =
               /(\S+)\s+\S+\s+(\d+)\s+(\d+).+?(\d+)\s+(\d+)\s+\S+\s+(\S+)\s+(\S+)\s*$/ ) ) {
            $unit = Bio::Tools::HMMER::Domain->new();

            $unit->seq_id($id);
            $unit->hmmname($hmmfname);
            $unit->start($sqfrom);
            $unit->end($sqto);
            $unit->bits($sc);
            $unit->hstart($hmmf);
            $unit->hend($hmmt);
            $unit->evalue($ev);
            $unit->seqbits( $seqh{$id} );
            $self->add_Domain($unit);
            $count++;
        }
    }

    # --- phase 4: alignments -------------------------------------------------
    # The line that ends one alignment is also the header of the next one,
    # so the current line is carried in $_ across iterations instead of
    # being read at the top of the loop.
    $_ = <$file>;

    ALIGNMENT:
    while (1) {
        last ALIGNMENT if !defined $_;
        /^Histogram of all scores/ && last ALIGNMENT;

        # matches:
        # PF00621: domain 1 of 1, from 198 to 372
        if ( /^\s*(\S+):.*from\s+(\d+)\s+to\s+(\d+)/ ) {
            my $name = $1;
            my $from = $2;
            my $to   = $3;

            # find the Domain which this alignment belongs to
            $unit = $self->get_unit_nse( $name, $hmmfname, $from, $to );

            if ( !defined $unit ) {
                $self->warn("Could not find $name $from $to unit even though I am reading it in. ugh!");

                # Move on to the next line before retrying; re-entering the
                # loop with the same header line would never terminate.
                $_ = <$file>;
                next ALIGNMENT;
            }
            while (<$file>) {
                /^Histogram of all scores/ && last;
                /^\s*\S+:.*from\s+\d+\s+to\s+\d+/ && last;
                $unit->add_alignment_line($_);
            }
        }
        else {
            $_ = <$file>;
        }
    }

    return $count;
}
|
|
945
|
|
946 =head2 parsetype
|
|
947
|
|
948 Title : parsetype
|
|
949 Usage : $obj->parsetype($newval)
|
|
950 Function:
|
|
951 Returns : value of parsetype
|
|
952 Args : newvalue (optional)
|
|
953
|
|
954
|
|
955 =cut
|
|
956
|
|
# Get/set accessor for the parse type, stored in the '_parsetype' slot.
#
# Args   : optional new value; an undefined argument leaves the slot alone.
# Returns: the current value of the slot.
sub parsetype {
    my $self = shift;
    my ($value) = @_;

    $self->{'_parsetype'} = $value if defined $value;
    return $self->{'_parsetype'};
}
|
|
964
|
|
965 1; # says use was ok
|
|
966 __END__
|
|
967
|
|
968
|