0
|
1 # $Id: GenericHit.pm,v 1.20.2.1 2003/02/28 09:27:56 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Search::Hit::GenericHit
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Search::Hit::GenericHit - A generic implementation of the Bio::Search::Hit::HitI interface
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 use Bio::Search::Hit::GenericHit;
|
|
20 my $hit = new Bio::Search::Hit::GenericHit(-algorithm => 'blastp');
|
|
21
|
|
22 # more likely
|
|
23 use Bio::SearchIO;
|
|
24 my $parser = new Bio::SearchIO(-format => 'blast', -file => 'result.bls');
|
|
25
|
|
26 my $result = $parser->next_result;
|
|
27 my $hit = $result->next_hit;
|
|
28
|
|
29
|
|
30 =head1 DESCRIPTION
|
|
31
|
|
32 This object handles the hit data from a Database Sequence Search such
|
|
33 as FASTA or BLAST.
|
|
34
|
|
35 =head1 FEEDBACK
|
|
36
|
|
37 =head2 Mailing Lists
|
|
38
|
|
39 User feedback is an integral part of the evolution of this and other
|
|
40 Bioperl modules. Send your comments and suggestions preferably to
|
|
41 the Bioperl mailing list. Your participation is much appreciated.
|
|
42
|
|
43 bioperl-l@bioperl.org - General discussion
|
|
44 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
45
|
|
46 =head2 Reporting Bugs
|
|
47
|
|
48 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
49 of the bugs and their resolution. Bug reports can be submitted via
|
|
50 email or the web:
|
|
51
|
|
52 bioperl-bugs@bioperl.org
|
|
53 http://bugzilla.bioperl.org/
|
|
54
|
|
55 =head1 AUTHOR - Jason Stajich and Steve Chervitz
|
|
56
|
|
57 Email jason@bioperl.org
|
|
58 Email sac@bioperl.org
|
|
59
|
|
60 =head1 CONTRIBUTORS
|
|
61
|
|
62 Additional contributors names and emails here
|
|
63
|
|
64 =head1 APPENDIX
|
|
65
|
|
66 The rest of the documentation details each of the object methods.
|
|
67 Internal methods are usually preceded with a _
|
|
68
|
|
69 =cut
|
|
70
|
|
71
|
|
72 # Let the code begin...
|
|
73
|
|
74
|
|
75 package Bio::Search::Hit::GenericHit;
|
|
76 use vars qw(@ISA);
|
|
77 use strict;
|
|
78
|
|
79 use Bio::Root::Root;
|
|
80 use Bio::Search::Hit::HitI;
|
|
81 require Bio::Search::SearchUtils;
|
|
82
|
|
83 @ISA = qw(Bio::Root::Root Bio::Search::Hit::HitI );
|
|
84
|
|
85 =head2 new
|
|
86
|
|
87 Title : new
|
|
88 Usage : my $obj = new Bio::Search::Hit::GenericHit();
|
|
89 Function: Builds a new Bio::Search::Hit::GenericHit object
|
|
90 Returns : Bio::Search::Hit::GenericHit
|
|
91 Args : -name => Name of Hit (required)
|
|
92 -description => Description (optional)
|
|
93 -accession => Accession number (optional)
|
|
94 -length => Length of the Hit (optional)
|
|
95 -score => Raw Score for the Hit (optional)
|
|
96 -significance => Significance value for the Hit (optional)
|
|
97 -algorithm => Algorithm used (BLASTP, FASTX, etc...)
|
|
98 -hsps => Array ref of HSPs for this Hit.
|
|
99 -iteration => integer for the PSI-Blast iteration number
|
|
100
|
|
101 =cut
|
|
102
|
|
103 sub new {
|
|
104 my($class,@args) = @_;
|
|
105
|
|
106 my $self = $class->SUPER::new(@args);
|
|
107 my ($hsps, $name,$query_len,$desc, $acc, $locus, $length,
|
|
108 $score,$algo,$signif,$bits,
|
|
109 $iter,$rank) = $self->_rearrange([qw(HSPS
|
|
110 NAME
|
|
111 QUERY_LEN
|
|
112 DESCRIPTION
|
|
113 ACCESSION
|
|
114 LOCUS
|
|
115 LENGTH SCORE ALGORITHM
|
|
116 SIGNIFICANCE BITS ITERATION
|
|
117 RANK )], @args);
|
|
118
|
|
119 $self->{'_query_length'} = $query_len;
|
|
120
|
|
121 if( ! defined $name ) {
|
|
122 $self->throw("Must have defined a valid name for Hit");
|
|
123 } else {
|
|
124 $self->name($name);
|
|
125 }
|
|
126
|
|
127 defined $acc && $self->accession($acc);
|
|
128 defined $locus && $self->locus($locus);
|
|
129 defined $desc && $self->description($desc);
|
|
130 defined $length && $self->length($length);
|
|
131 defined $algo && $self->algorithm($algo);
|
|
132 defined $signif && $self->significance($signif);
|
|
133 defined $score && $self->raw_score($score);
|
|
134 defined $bits && $self->bits($bits);
|
|
135 defined $iter && $self->iteration($iter);
|
|
136 defined $rank && $self->rank($rank);
|
|
137
|
|
138 $self->{'_iterator'} = 0;
|
|
139 $self->{'_hsps'} = [];
|
|
140 if( defined $hsps ) {
|
|
141 if( ref($hsps) !~ /array/i ) {
|
|
142 $self->warn("Did not specify a valid array ref for the param HSPS ($hsps)");
|
|
143 } else {
|
|
144 while( @$hsps ) {
|
|
145 $self->add_hsp(shift @$hsps );
|
|
146 }
|
|
147 }
|
|
148 }
|
|
149 return $self;
|
|
150 }
|
|
151
|
|
152 =head2 add_hsp
|
|
153
|
|
154 Title : add_hsp
|
|
155 Usage : $hit->add_hsp($hsp)
|
|
156 Function: Add a HSP to the collection of HSPs for a Hit
|
|
157 Returns : number of HSPs in the Hit
|
|
158 Args : Bio::Search::HSP::HSPI object
|
|
159
|
|
160
|
|
161 =cut
|
|
162
|
|
163 sub add_hsp {
|
|
164 my ($self,$hsp) = @_;
|
|
165 if( !defined $hsp || ! $hsp->isa('Bio::Search::HSP::HSPI') ) {
|
|
166 $self->warn("Must provide a valid Bio::Search::HSP::HSPI object to object: $self method: add_hsp value: $hsp");
|
|
167 return undef;
|
|
168 }
|
|
169 push @{$self->{'_hsps'}}, $hsp;
|
|
170 return scalar @{$self->{'_hsps'}};
|
|
171 }
|
|
172
|
|
173
|
|
174
|
|
175 =head2 Bio::Search::Hit::HitI methods
|
|
176
|
|
177 Implementation of Bio::Search::Hit::HitI methods
|
|
178
|
|
179 =head2 name
|
|
180
|
|
181 Title : name
|
|
182 Usage : $hit_name = $hit->name();
|
|
183 Function: returns the name of the Hit sequence
|
|
184 Returns : a scalar string
|
|
185 Args : [optional] scalar string to set the name
|
|
186
|
|
187 =cut
|
|
188
|
|
189 sub name {
|
|
190 my ($self,$value) = @_;
|
|
191 my $previous = $self->{'_name'};
|
|
192 if( defined $value || ! defined $previous ) {
|
|
193 $value = $previous = '' unless defined $value;
|
|
194 $self->{'_name'} = $value;
|
|
195 }
|
|
196 return $previous;
|
|
197 }
|
|
198
|
|
199 =head2 accession
|
|
200
|
|
201 Title : accession
|
|
202 Usage : $acc = $hit->accession();
|
|
203 Function: Retrieve the accession (if available) for the hit
|
|
204 Returns : a scalar string (empty string if not set)
|
|
205 Args : none
|
|
206
|
|
207 =cut
|
|
208
|
|
209 sub accession {
|
|
210 my ($self,$value) = @_;
|
|
211 my $previous = $self->{'_accession'};
|
|
212 if( defined $value || ! defined $previous ) {
|
|
213 $value = $previous = '' unless defined $value;
|
|
214 $self->{'_accession'} = $value;
|
|
215 }
|
|
216 return $previous;
|
|
217 }
|
|
218
|
|
219 =head2 description
|
|
220
|
|
221 Title : description
|
|
222 Usage : $desc = $hit->description();
|
|
223 Function: Retrieve the description for the hit
|
|
224 Returns : a scalar string
|
|
225 Args : [optional] scalar string to set the descrition
|
|
226
|
|
227 =cut
|
|
228
|
|
229 sub description {
|
|
230 my ($self,$value) = @_;
|
|
231 my $previous = $self->{'_description'};
|
|
232 if( defined $value || ! defined $previous ) {
|
|
233 $value = $previous = '' unless defined $value;
|
|
234 $self->{'_description'} = $value;
|
|
235 }
|
|
236 return $previous;
|
|
237 }
|
|
238
|
|
239 =head2 length
|
|
240
|
|
241 Title : length
|
|
242 Usage : my $len = $hit->length
|
|
243 Function: Returns the length of the hit
|
|
244 Returns : integer
|
|
245 Args : [optional] integer to set the length
|
|
246
|
|
247 =cut
|
|
248
|
|
249 sub length {
|
|
250 my ($self,$value) = @_;
|
|
251 my $previous = $self->{'_length'};
|
|
252 if( defined $value || ! defined $previous ) {
|
|
253 $value = $previous = 0 unless defined $value;
|
|
254 $self->{'_length'} = $value;
|
|
255 }
|
|
256 return $previous;
|
|
257 }
|
|
258
|
|
259
|
|
260 =head2 algorithm
|
|
261
|
|
262 Title : algorithm
|
|
263 Usage : $alg = $hit->algorithm();
|
|
264 Function: Gets the algorithm specification that was used to obtain the hit
|
|
265 For BLAST, the algorithm denotes what type of sequence was aligned
|
|
266 against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated
|
|
267 dna-prt, TBLASTN prt-translated dna, TBLASTX translated
|
|
268 dna-translated dna).
|
|
269 Returns : a scalar string
|
|
270 Args : [optional] scalar string to set the algorithm
|
|
271
|
|
272 =cut
|
|
273
|
|
274 sub algorithm {
|
|
275 my ($self,$value) = @_;
|
|
276 my $previous = $self->{'_algorithm'};
|
|
277 if( defined $value || ! defined $previous ) {
|
|
278 $value = $previous = '' unless defined $value;
|
|
279 $self->{'_algorithm'} = $value;
|
|
280 }
|
|
281 return $previous;
|
|
282 }
|
|
283
|
|
284 =head2 raw_score
|
|
285
|
|
286 Title : raw_score
|
|
287 Usage : $score = $hit->raw_score();
|
|
288 Function: Gets the "raw score" generated by the algorithm. What
|
|
289 this score is exactly will vary from algorithm to algorithm,
|
|
290 returning undef if unavailable.
|
|
291 Returns : a scalar value
|
|
292 Args : [optional] scalar value to set the raw score
|
|
293
|
|
294 =cut
|
|
295
|
|
296 sub raw_score {
|
|
297 my ($self,$value) = @_;
|
|
298 my $previous = $self->{'_score'};
|
|
299 if( defined $value || ! defined $previous ) {
|
|
300 $value = $previous = '' unless defined $value;
|
|
301 $self->{'_score'} = $value;
|
|
302 }
|
|
303 return $previous;
|
|
304 }
|
|
305
|
|
306 =head2 significance
|
|
307
|
|
308 Title : significance
|
|
309 Usage : $significance = $hit->significance();
|
|
310 Function: Used to obtain the E or P value of a hit, i.e. the probability that
|
|
311 this particular hit was obtained purely by random chance. If
|
|
312 information is not available (nor calculatable from other
|
|
313 information sources), return undef.
|
|
314 Returns : a scalar value or undef if unavailable
|
|
315 Args : [optional] scalar value to set the significance
|
|
316
|
|
317 =cut
|
|
318
|
|
319 sub significance {
|
|
320 my ($self,$value) = @_;
|
|
321 my $previous = $self->{'_significance'};
|
|
322 if( defined $value || ! defined $previous ) {
|
|
323 $value = $previous = '' unless defined $value;
|
|
324 $self->{'_significance'} = $value;
|
|
325 }
|
|
326 return $previous;
|
|
327 }
|
|
328
|
|
329 =head2 bits
|
|
330
|
|
331 Usage : $hit_object->bits();
|
|
332 Purpose : Gets the bit score of the best HSP for the current hit.
|
|
333 Example : $bits = $hit_object->bits();
|
|
334 Returns : Integer or undef if bit score is not set
|
|
335 Argument : n/a
|
|
336 Comments : For BLAST1, the non-bit score is listed in the summary line.
|
|
337
|
|
338 See Also : L<score()|score>
|
|
339
|
|
340 =cut
|
|
341
|
|
342 #---------
|
|
343 sub bits {
|
|
344 #---------
|
|
345 my ($self) = @_;
|
|
346
|
|
347 my $bits = $self->{'_bits'};
|
|
348 if( ! defined $bits ) {
|
|
349 $bits = $self->{'_hsps'}->[0]->bits();
|
|
350 $self->{'_bits'} = $bits;
|
|
351 }
|
|
352 return $bits;
|
|
353 }
|
|
354
|
|
355 =head2 next_hsp
|
|
356
|
|
357 Title : next_hsp
|
|
358 Usage : while( $hsp = $obj->next_hsp()) { ... }
|
|
359 Function : Returns the next available High Scoring Pair
|
|
360 Example :
|
|
361 Returns : Bio::Search::HSP::HSPI object or null if finished
|
|
362 Args : none
|
|
363
|
|
364 =cut
|
|
365
|
|
366 sub next_hsp {
|
|
367 my ($self) = @_;
|
|
368 $self->{'_iterator'} = 0 unless defined $self->{'_iterator'};
|
|
369 return undef if $self->{'_iterator'} > scalar @{$self->{'_hsps'}};
|
|
370 return $self->{'_hsps'}->[$self->{'_iterator'}++];
|
|
371 }
|
|
372
|
|
373
|
|
374 =head2 hsps
|
|
375
|
|
376 Usage : $hit_object->hsps();
|
|
377 Purpose : Get a list containing all HSP objects.
|
|
378 : Get the numbers of HSPs for the current hit.
|
|
379 Example : @hsps = $hit_object->hsps();
|
|
380 : $num = $hit_object->hsps(); # alternatively, use num_hsps()
|
|
381 Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects.
|
|
382 : Scalar context: integer (number of HSPs).
|
|
383 : (Equivalent to num_hsps()).
|
|
384 Argument : n/a. Relies on wantarray
|
|
385 Throws : Exception if the HSPs have not been collected.
|
|
386
|
|
387 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps>
|
|
388
|
|
389 =cut
|
|
390
|
|
391 #---------
|
|
392 sub hsps {
|
|
393 #---------
|
|
394 my $self = shift;
|
|
395
|
|
396 if (not ref $self->{'_hsps'}) {
|
|
397 $self->throw("Can't get HSPs: data not collected.");
|
|
398 }
|
|
399
|
|
400 return wantarray
|
|
401 # returning list containing all HSPs.
|
|
402 ? @{$self->{'_hsps'}}
|
|
403 # returning number of HSPs.
|
|
404 : scalar(@{$self->{'_hsps'}});
|
|
405 }
|
|
406
|
|
407 =head2 num_hsps
|
|
408
|
|
409 Usage : $hit_object->num_hsps();
|
|
410 Purpose : Get the number of HSPs for the present Blast hit.
|
|
411 Example : $nhsps = $hit_object->num_hsps();
|
|
412 Returns : Integer
|
|
413 Argument : n/a
|
|
414 Throws : Exception if the HSPs have not been collected.
|
|
415
|
|
416 See Also : L<hsps()|hsps>
|
|
417
|
|
418 =cut
|
|
419
|
|
420 #-------------
|
|
421 sub num_hsps {
|
|
422 my $self = shift;
|
|
423
|
|
424 if (not defined $self->{'_hsps'}) {
|
|
425 $self->throw("Can't get HSPs: data not collected.");
|
|
426 }
|
|
427
|
|
428 return scalar(@{$self->{'_hsps'}});
|
|
429 }
|
|
430
|
|
431 =head2 rewind
|
|
432
|
|
433 Title : rewind
|
|
434 Usage : $hit->rewind;
|
|
435 Function: Allow one to reset the HSP iteration to the beginning
|
|
436 Since this is an in-memory implementation
|
|
437 Returns : none
|
|
438 Args : none
|
|
439
|
|
440 =cut
|
|
441
|
|
442 sub rewind{
|
|
443 my ($self) = @_;
|
|
444 $self->{'_iterator'} = 0;
|
|
445 }
|
|
446
|
|
447 =head2 iteration
|
|
448
|
|
449 Title : iteration
|
|
450 Usage : $obj->iteration($newval)
|
|
451 Function: PSI-BLAST iteration
|
|
452 Returns : value of iteration
|
|
453 Args : newvalue (optional)
|
|
454
|
|
455
|
|
456 =cut
|
|
457
|
|
458 sub iteration{
|
|
459 my ($self,$value) = @_;
|
|
460 if( defined $value) {
|
|
461 $self->{'_psiblast_iteration'} = $value;
|
|
462 }
|
|
463 return $self->{'_psiblast_iteration'};
|
|
464
|
|
465 }
|
|
466
|
|
467
|
|
468 =head2 ambiguous_aln
|
|
469
|
|
470 Usage : $ambig_code = $hit_object->ambiguous_aln();
|
|
471 Purpose : Sets/Gets ambiguity code data member.
|
|
472 Example : (see usage)
|
|
473 Returns : String = 'q', 's', 'qs', '-'
|
|
474 : 'q' = query sequence contains overlapping sub-sequences
|
|
475 : while sbjct does not.
|
|
476 : 's' = sbjct sequence contains overlapping sub-sequences
|
|
477 : while query does not.
|
|
478 : 'qs' = query and sbjct sequence contains overlapping sub-sequences
|
|
479 : relative to each other.
|
|
480 : '-' = query and sbjct sequence do not contains multiple domains
|
|
481 : relative to each other OR both contain the same distribution
|
|
482 : of similar domains.
|
|
483 Argument : n/a
|
|
484 Throws : n/a
|
|
485 Status : Experimental
|
|
486
|
|
487 =cut
|
|
488
|
|
489 #--------------------
|
|
490 sub ambiguous_aln {
|
|
491 #--------------------
|
|
492 my $self = shift;
|
|
493 if(@_) { $self->{'_ambiguous_aln'} = shift; }
|
|
494 $self->{'_ambiguous_aln'} || '-';
|
|
495 }
|
|
496
|
|
497 =head2 overlap
|
|
498
|
|
499 See documentation in L<Bio::Search::SearchUtils::overlap()|Bio::Search::SearchUtils>
|
|
500
|
|
501 =cut
|
|
502
|
|
503 #-------------
|
|
504 sub overlap {
|
|
505 #-------------
|
|
506 my $self = shift;
|
|
507 if(@_) { $self->{'_overlap'} = shift; }
|
|
508 defined $self->{'_overlap'} ? $self->{'_overlap'} : 0;
|
|
509 }
|
|
510
|
|
511
|
|
512 =head2 n
|
|
513
|
|
514 Usage : $hit_object->n();
|
|
515 Purpose : Gets the N number for the current hit.
|
|
516 : This is the number of HSPs in the set which was ascribed
|
|
517 : the lowest P-value (listed on the description line).
|
|
518 : This number is not the same as the total number of HSPs.
|
|
519 : To get the total number of HSPs, use num_hsps().
|
|
520 Example : $n = $hit_object->n();
|
|
521 Returns : Integer
|
|
522 Argument : n/a
|
|
523 Throws : Exception if HSPs have not been set (BLAST2 reports).
|
|
524 Comments : Note that the N parameter is not reported in gapped BLAST2.
|
|
525 : Calling n() on such reports will result in a call to num_hsps().
|
|
526 : The num_hsps() method will count the actual number of
|
|
527 : HSPs in the alignment listing, which may exceed N in
|
|
528 : some cases.
|
|
529
|
|
530 See Also : L<num_hsps()|num_hsps>
|
|
531
|
|
532 =cut
|
|
533
|
|
534 #-----
|
|
535 sub n {
|
|
536 #-----
|
|
537 my $self = shift;
|
|
538
|
|
539 # The check for $self->{'_n'} is a remnant from the 'query' mode days
|
|
540 # in which the sbjct object would collect data from the description
|
|
541 # line only.
|
|
542
|
|
543 my ($n);
|
|
544 if(not defined($self->{'_n'})) {
|
|
545 $n = $self->hsp->n;
|
|
546 } else {
|
|
547 $n = $self->{'_n'};
|
|
548 }
|
|
549 $n ||= $self->num_hsps;
|
|
550
|
|
551 return $n;
|
|
552 }
|
|
553
|
|
554 =head2 p
|
|
555
|
|
556 Usage : $hit_object->p( [format] );
|
|
557 Purpose : Get the P-value for the best HSP of the given BLAST hit.
|
|
558 : (Note that P-values are not provided with NCBI Blast2 reports).
|
|
559 Example : $p = $sbjct->p;
|
|
560 : $p = $sbjct->p('exp'); # get exponent only.
|
|
561 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts
|
|
562 Returns : Float or scientific notation number (the raw P-value, DEFAULT).
|
|
563 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
|
|
564 : 2-element list (float, int) if format == 'parts' and P-value
|
|
565 : is in scientific notation (See Comments).
|
|
566 Argument : format: string of 'raw' | 'exp' | 'parts'
|
|
567 : 'raw' returns value given in report. Default. (1.2e-34)
|
|
568 : 'exp' returns exponent value only (34)
|
|
569 : 'parts' returns the decimal and exponent as a
|
|
570 : 2-element list (1.2, -34) (See Comments).
|
|
571 Throws : Warns if no P-value is defined. Uses expect instead.
|
|
572 Comments : Using the 'parts' argument is not recommended since it will not
|
|
573 : work as expected if the P-value is not in scientific notation.
|
|
574 : That is, floats are not converted into sci notation before
|
|
575 : splitting into parts.
|
|
576
|
|
577 See Also : L<expect()|expect>, L<signif()|signif>, L<Bio::Search::SearchUtils::get_exponent()|Bio::Search::SearchUtils>
|
|
578
|
|
579 =cut
|
|
580
|
|
581 #--------
|
|
582 sub p {
|
|
583 #--------
|
|
584 # Some duplication of logic for p(), expect() and signif() for the sake of performance.
|
|
585 my ($self, $fmt) = @_;
|
|
586
|
|
587 my $val = $self->{'_p'};
|
|
588
|
|
589 # $val can be zero.
|
|
590 if(not defined $val) {
|
|
591 # P-value not defined, must be a NCBI Blast2 report.
|
|
592 # Use expect instead.
|
|
593 $self->warn( "P-value not defined. Using expect() instead.");
|
|
594 $val = $self->{'_expect'};
|
|
595 }
|
|
596
|
|
597 return $val if not $fmt or $fmt =~ /^raw/i;
|
|
598 ## Special formats: exponent-only or as list.
|
|
599 return &Bio::Search::SearchUtils::get_exponent($val) if $fmt =~ /^exp/i;
|
|
600 return (split (/eE/, $val)) if $fmt =~ /^parts/i;
|
|
601
|
|
602 ## Default: return the raw P-value.
|
|
603 return $val;
|
|
604 }
|
|
605
|
|
606 =head2 hsp
|
|
607
|
|
608 Usage : $hit_object->hsp( [string] );
|
|
609 Purpose : Get a single HSPI object for the present HitI object.
|
|
610 Example : $hspObj = $hit_object->hsp; # same as 'best'
|
|
611 : $hspObj = $hit_object->hsp('best');
|
|
612 : $hspObj = $hit_object->hsp('worst');
|
|
613 Returns : Object reference for a Bio::Search::HSP::BlastHSP.pm object.
|
|
614 Argument : String (or no argument).
|
|
615 : No argument (default) = highest scoring HSP (same as 'best').
|
|
616 : 'best' or 'first' = highest scoring HSP.
|
|
617 : 'worst' or 'last' = lowest scoring HSP.
|
|
618 Throws : Exception if the HSPs have not been collected.
|
|
619 : Exception if an unrecognized argument is used.
|
|
620
|
|
621 See Also : L<hsps()|hsps>, L<num_hsps>()
|
|
622
|
|
623 =cut
|
|
624
|
|
625 #----------
|
|
626 sub hsp {
|
|
627 #----------
|
|
628 my( $self, $option ) = @_;
|
|
629 $option ||= 'best';
|
|
630
|
|
631 if (not ref $self->{'_hsps'}) {
|
|
632 $self->throw("Can't get HSPs: data not collected.");
|
|
633 }
|
|
634
|
|
635 my @hsps = @{$self->{'_hsps'}};
|
|
636
|
|
637 return $hsps[0] if $option =~ /best|first|1/i;
|
|
638 return $hsps[$#hsps] if $option =~ /worst|last/i;
|
|
639
|
|
640 $self->throw("Can't get HSP for: $option\n" .
|
|
641 "Valid arguments: 'best', 'worst'");
|
|
642 }
|
|
643
|
|
644 =head2 logical_length
|
|
645
|
|
646 Usage : $hit_object->logical_length( [seq_type] );
|
|
647 : (mostly intended for internal use).
|
|
648 Purpose : Get the logical length of the hit sequence.
|
|
649 : If the Blast is a TBLASTN or TBLASTX, the returned length
|
|
650 : is the length of the would-be amino acid sequence (length/3).
|
|
651 : For all other BLAST flavors, this function is the same as length().
|
|
652 Example : $len = $hit_object->logical_length();
|
|
653 Returns : Integer
|
|
654 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
|
|
655 ('sbjct' is synonymous with 'hit')
|
|
656 Throws : n/a
|
|
657 Comments : This is important for functions like frac_aligned_query()
|
|
658 : which need to operate in amino acid coordinate space when dealing
|
|
659 : with [T]BLAST[NX] type reports.
|
|
660
|
|
661 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>
|
|
662
|
|
663 =cut
|
|
664
|
|
665 #--------------------
|
|
666 sub logical_length {
|
|
667 #--------------------
|
|
668 my $self = shift;
|
|
669 my $seqType = shift || 'query';
|
|
670 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
671
|
|
672 my $length;
|
|
673
|
|
674 # For the sbjct, return logical sbjct length
|
|
675 if( $seqType eq 'sbjct' ) {
|
|
676 $length = $self->{'_logical_length'} || $self->{'_length'};
|
|
677 }
|
|
678 else {
|
|
679 # Otherwise, return logical query length
|
|
680 $length = $self->{'_query_length'};
|
|
681 $self->throw("Must have defined query_len") unless ( $length );
|
|
682
|
|
683 # Adjust length based on BLAST flavor.
|
|
684 if($self->algorithm =~ /T?BLASTX/ ) {
|
|
685 $length /= 3;
|
|
686 }
|
|
687 }
|
|
688 return $length;
|
|
689 }
|
|
690
|
|
691 =head2 length_aln
|
|
692
|
|
693 Usage : $hit_object->length_aln( [seq_type] );
|
|
694 Purpose : Get the total length of the aligned region for query or sbjct seq.
|
|
695 : This number will include all HSPs
|
|
696 Example : $len = $hit_object->length_aln(); # default = query
|
|
697 : $lenAln = $hit_object->length_aln('query');
|
|
698 Returns : Integer
|
|
699 Argument : seq_Type = 'query' or 'hit' or 'sbjct' (Default = 'query')
|
|
700 ('sbjct' is synonymous with 'hit')
|
|
701 Throws : Exception if the argument is not recognized.
|
|
702 Comments : This method will report the logical length of the alignment,
|
|
703 : meaning that for TBLAST[NX] reports, the length is reported
|
|
704 : using amino acid coordinate space (i.e., nucleotides / 3).
|
|
705 :
|
|
706 : This method requires that all HSPs be tiled. If they have not
|
|
707 : already been tiled, they will be tiled first automatically..
|
|
708 : If you don't want the tiled data, iterate through each HSP
|
|
709 : calling length() on each (use hsps() to get all HSPs).
|
|
710
|
|
711 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>, L<gaps()|gaps>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>, L<Bio::Search::HSP::BlastHSP::length()|Bio::Search::HSP::BlastHSP>
|
|
712
|
|
713 =cut
|
|
714
|
|
715 #---------------'
|
|
716 sub length_aln {
|
|
717 #---------------
|
|
718 my( $self, $seqType, $num ) = @_;
|
|
719
|
|
720 $seqType ||= 'query';
|
|
721 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
722
|
|
723 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
724
|
|
725 if( defined $num) {
|
|
726 return $self->{'_length_aln_'.$seqType} = $num;
|
|
727 }
|
|
728
|
|
729 my $data = $self->{'_length_aln_'.$seqType};
|
|
730
|
|
731 ## If we don't have data, figure out what went wrong.
|
|
732 if(!$data) {
|
|
733 $self->throw("Can't get length aln for sequence type \"$seqType\". " .
|
|
734 "Valid types are 'query', 'hit', 'sbjct' ('sbjct' = 'hit')");
|
|
735 }
|
|
736 return $data;
|
|
737 }
|
|
738
|
|
739 =head2 gaps
|
|
740
|
|
741 Usage : $hit_object->gaps( [seq_type] );
|
|
742 Purpose : Get the number of gaps in the aligned query, hit, or both sequences.
|
|
743 : Data is summed across all HSPs.
|
|
744 Example : $qgaps = $hit_object->gaps('query');
|
|
745 : $hgaps = $hit_object->gaps('hit');
|
|
746 : $tgaps = $hit_object->gaps(); # default = total (query + hit)
|
|
747 Returns : scalar context: integer
|
|
748 : array context without args: two-element list of integers
|
|
749 : (queryGaps, hitGaps)
|
|
750 : Array context can be forced by providing an argument of 'list' or 'array'.
|
|
751 :
|
|
752 : CAUTION: Calling this method within printf or sprintf is arrray context.
|
|
753 : So this function may not give you what you expect. For example:
|
|
754 : printf "Total gaps: %d", $hit->gaps();
|
|
755 : Actually returns a two-element array, so what gets printed
|
|
756 : is the number of gaps in the query, not the total
|
|
757 :
|
|
758 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' | 'list' (default = 'total')
|
|
759 ('sbjct' is synonymous with 'hit')
|
|
760 Throws : n/a
|
|
761 Comments : If you need data for each HSP, use hsps() and then interate
|
|
762 : through each HSP object.
|
|
763 : This method requires that all HSPs be tiled. If they have not
|
|
764 : already been tiled, they will be tiled first automatically..
|
|
765 : Not relying on wantarray since that will fail in situations
|
|
766 : such as printf "%d", $hit->gaps() in which you might expect to
|
|
767 : be printing the total gaps, but evaluates to array context.
|
|
768
|
|
769 See Also : L<length_aln()|length_aln>
|
|
770
|
|
771 =cut
|
|
772
|
|
773 #----------
|
|
774 sub gaps {
|
|
775 #----------
|
|
776 my( $self, $seqType, $num ) = @_;
|
|
777
|
|
778 $seqType ||= (wantarray ? 'list' : 'total');
|
|
779 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
780
|
|
781 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
782
|
|
783 $seqType = lc($seqType);
|
|
784
|
|
785 if( defined $num ) {
|
|
786 $self->throw("Can't set gaps for seqType '$seqType'. Must be 'query' or 'hit'\n") unless ($seqType eq 'sbjct' or $seqType eq 'query');
|
|
787
|
|
788 return $self->{'_gaps_'.$seqType} = $num;
|
|
789 }
|
|
790 elsif($seqType =~ /list|array/i) {
|
|
791 return ($self->{'_gaps_query'}, $self->{'_gaps_sbjct'});
|
|
792 }
|
|
793 elsif($seqType eq 'total') {
|
|
794 return ($self->{'_gaps_query'} + $self->{'_gaps_sbjct'}) || 0;
|
|
795 } else {
|
|
796 return $self->{'_gaps_'.$seqType} || 0;
|
|
797 }
|
|
798 }
|
|
799
|
|
800
|
|
801 =head2 matches
|
|
802
|
|
803 See documentation in L<Bio::Search::Hit::HitI::matches()|Bio::Search::Hit::HitI>
|
|
804
|
|
805 =cut
|
|
806
|
|
807 #---------------
|
|
808 sub matches {
|
|
809 #---------------
|
|
810 my( $self, $arg1, $arg2) = @_;
|
|
811 my(@data,$data);
|
|
812
|
|
813 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
814
|
|
815 if(!$arg1) {
|
|
816 @data = ($self->{'_totalIdentical'}, $self->{'_totalConserved'});
|
|
817
|
|
818 return @data if @data;
|
|
819
|
|
820 } else {
|
|
821
|
|
822 if( defined $arg2 ) {
|
|
823 $self->{'_totalIdentical'} = $arg1;
|
|
824 $self->{'_totalConserved'} = $arg2;
|
|
825 return ( $arg1, $arg2 );
|
|
826 }
|
|
827 elsif($arg1 =~ /^id/i) {
|
|
828 $data = $self->{'_totalIdentical'};
|
|
829 } else {
|
|
830 $data = $self->{'_totalConserved'};
|
|
831 }
|
|
832 return $data if $data;
|
|
833 }
|
|
834
|
|
835 ## Something went wrong if we make it to here.
|
|
836 $self->throw("Can't get identical or conserved data: no data.");
|
|
837 }
|
|
838
|
|
839
|
|
840 =head2 start
|
|
841
|
|
842 Usage : $sbjct->start( [seq_type] );
|
|
843 Purpose : Gets the start coordinate for the query, sbjct, or both sequences
|
|
844 : in the BlastHit object. If there is more than one HSP, the lowest start
|
|
845 : value of all HSPs is returned.
|
|
846 Example : $qbeg = $sbjct->start('query');
|
|
847 : $sbeg = $sbjct->start('hit');
|
|
848 : ($qbeg, $sbeg) = $sbjct->start();
|
|
849 Returns : scalar context: integer
|
|
850 : array context without args: list of two integers (queryStart, sbjctStart)
|
|
851 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
852 Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
|
|
853 ('sbjct' is synonymous with 'hit')
|
|
854 Throws : n/a
|
|
855 Comments : This method requires that all HSPs be tiled. If there is more than one
|
|
856 : HSP and they have not already been tiled, they will be tiled first automatically..
|
|
857 : Remember that the start and end coordinates of all HSPs are
|
|
858 : normalized so that start < end. Strand information can be
|
|
859 : obtained by calling $hit->strand().
|
|
860
|
|
861 See Also : L<end()|end>, L<range()|range>, L<strand()|strand>,
|
|
862 L<Bio::Search::HSP::BlastHSP::start|Bio::Search::HSP::BlastHSP>
|
|
863
|
|
864 =cut
|
|
865
|
|
866 #----------
|
|
867 sub start {
|
|
868 #----------
|
|
869 my ($self, $seqType, $num) = @_;
|
|
870
|
|
871 $seqType ||= (wantarray ? 'list' : 'query');
|
|
872 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
873
|
|
874 if( defined $num ) {
|
|
875 $seqType = "_\L$seqType\E";
|
|
876 return $self->{$seqType.'Start'} = $num;
|
|
877 }
|
|
878
|
|
879 # If there is only one HSP, defer this call to the solitary HSP.
|
|
880 if($self->num_hsps == 1) {
|
|
881 return $self->hsp->start($seqType);
|
|
882 } else {
|
|
883 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
884 if($seqType =~ /list|array/i) {
|
|
885 return ($self->{'_queryStart'}, $self->{'_sbjctStart'});
|
|
886 } else {
|
|
887 ## Sensitive to member name changes.
|
|
888 $seqType = "_\L$seqType\E";
|
|
889 return $self->{$seqType.'Start'};
|
|
890 }
|
|
891 }
|
|
892 }
|
|
893
|
|
894
|
|
895 =head2 end
|
|
896
|
|
897 Usage : $sbjct->end( [seq_type] );
|
|
898 Purpose : Gets the end coordinate for the query, sbjct, or both sequences
|
|
899 : in the BlastHit object. If there is more than one HSP, the largest end
|
|
900 : value of all HSPs is returned.
|
|
901 Example : $qend = $sbjct->end('query');
|
|
902 : $send = $sbjct->end('hit');
|
|
903 : ($qend, $send) = $sbjct->end();
|
|
904 Returns : scalar context: integer
|
|
905 : array context without args: list of two integers (queryEnd, sbjctEnd)
|
|
906 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
907 Argument : In scalar context: seq_type = 'query' or 'sbjct'
|
|
908 : (case insensitive). If not supplied, 'query' is used.
|
|
909 Throws : n/a
|
|
910 Comments : This method requires that all HSPs be tiled. If there is more than one
|
|
911 : HSP and they have not already been tiled, they will be tiled first automatically..
|
|
912 : Remember that the start and end coordinates of all HSPs are
|
|
913 : normalized so that start < end. Strand information can be
|
|
914 : obtained by calling $hit->strand().
|
|
915
|
|
916 See Also : L<start()|start>, L<range()|range>, L<strand()|strand>
|
|
917
|
|
918 =cut
|
|
919
|
|
920 #----------
|
|
921 sub end {
|
|
922 #----------
|
|
923 my ($self, $seqType, $num) = @_;
|
|
924
|
|
925 $seqType ||= (wantarray ? 'list' : 'query');
|
|
926 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
927
|
|
928
|
|
929 if( defined $num ) {
|
|
930 $seqType = "_\L$seqType\E";
|
|
931 return $self->{$seqType.'Stop'} = $num;
|
|
932 }
|
|
933
|
|
934 # If there is only one HSP, defer this call to the solitary HSP.
|
|
935 if($self->num_hsps == 1) {
|
|
936 return $self->hsp->end($seqType);
|
|
937 } else {
|
|
938 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
939 if($seqType =~ /list|array/i) {
|
|
940 return ($self->{'_queryStop'}, $self->{'_sbjctStop'});
|
|
941 } else {
|
|
942 ## Sensitive to member name changes.
|
|
943 $seqType = "_\L$seqType\E";
|
|
944 return $self->{$seqType.'Stop'};
|
|
945 }
|
|
946 }
|
|
947 }
|
|
948
|
|
949 =head2 range
|
|
950
|
|
951 Usage : $sbjct->range( [seq_type] );
|
|
952 Purpose : Gets the (start, end) coordinates for the query or sbjct sequence
|
|
953 : in the HSP alignment.
|
|
954 Example : ($qbeg, $qend) = $sbjct->range('query');
|
|
955 : ($sbeg, $send) = $sbjct->range('hit');
|
|
956 Returns : Two-element array of integers
|
|
957 Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query')
|
|
958 ('sbjct' is synonymous with 'hit')
|
|
959 Throws : n/a
|
|
960
|
|
961 See Also : L<start()|start>, L<end()|end>
|
|
962
|
|
963 =cut
|
|
964
|
|
965 #----------
|
|
966 sub range {
|
|
967 #----------
|
|
968 my ($self, $seqType) = @_;
|
|
969 $seqType ||= 'query';
|
|
970 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
971 return ($self->start($seqType), $self->end($seqType));
|
|
972 }
|
|
973
|
|
974
|
|
975 =head2 frac_identical
|
|
976
|
|
977 Usage : $hit_object->frac_identical( [seq_type] );
|
|
978 Purpose : Get the overall fraction of identical positions across all HSPs.
|
|
979 : The number refers to only the aligned regions and does not
|
|
980 : account for unaligned regions in between the HSPs, if any.
|
|
981 Example : $frac_iden = $hit_object->frac_identical('query');
|
|
982 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
983 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total'
|
|
984 : default = 'query' (but see comments below).
|
|
985 : ('sbjct' is synonymous with 'hit')
|
|
986 Throws : n/a
|
|
987 Comments : Different versions of Blast report different values for the total
|
|
988 : length of the alignment. This is the number reported in the
|
|
989 : denominators in the stats section:
|
|
990 : "Identical = 34/120 Positives = 67/120".
|
|
991 : NCBI BLAST uses the total length of the alignment (with gaps)
|
|
992 : WU-BLAST uses the length of the query sequence (without gaps).
|
|
993 :
|
|
994 : Therefore, when called with an argument of 'total',
|
|
995 : this method will report different values depending on the
|
|
996 : version of BLAST used. Total does NOT take into account HSP
|
|
997 : tiling, so it should not be used.
|
|
998 :
|
|
999 : To get the fraction identical among only the aligned residues,
|
|
1000 : ignoring the gaps, call this method without an argument or
|
|
1001 : with an argument of 'query' or 'hit'.
|
|
1002 :
|
|
1003 : If you need data for each HSP, use hsps() and then iterate
|
|
1004 : through the HSP objects.
|
|
1005 : This method requires that all HSPs be tiled. If they have not
|
|
1006 : already been tiled, they will be tiled first automatically.
|
|
1007
|
|
1008 See Also : L<frac_conserved()|frac_conserved>, L<frac_aligned_query()|frac_aligned_query>, L<matches()|matches>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>
|
|
1009
|
|
1010 =cut
|
|
1011
|
|
1012 #------------------
|
|
1013 sub frac_identical {
|
|
1014 #------------------
|
|
1015 my ($self, $seqType) = @_;
|
|
1016 $seqType ||= 'query';
|
|
1017 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
1018
|
|
1019 ## Sensitive to member name format.
|
|
1020 $seqType = lc($seqType);
|
|
1021
|
|
1022 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1023
|
|
1024 my $ident = $self->{'_totalIdentical'};
|
|
1025 my $total = $self->{'_length_aln_'.$seqType};
|
|
1026 my $ratio = $ident / $total;
|
|
1027 my $ratio_rounded = sprintf( "%.3f", $ratio);
|
|
1028
|
|
1029 # Round down iff normal rounding yields 1 (just like blast)
|
|
1030 $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1));
|
|
1031 return $ratio_rounded;
|
|
1032 }
|
|
1033
|
|
1034
|
|
1035
|
|
1036 =head2 frac_conserved
|
|
1037
|
|
1038 Usage : $hit_object->frac_conserved( [seq_type] );
|
|
1039 Purpose : Get the overall fraction of conserved positions across all HSPs.
|
|
1040 : The number refers to only the aligned regions and does not
|
|
1041 : account for unaligned regions in between the HSPs, if any.
|
|
1042 Example : $frac_cons = $hit_object->frac_conserved('hit');
|
|
1043 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1044 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total'
|
|
1045 : default = 'query' (but see comments below).
|
|
1046 : ('sbjct' is synonymous with 'hit')
|
|
1047 Throws : n/a
|
|
1048 Comments : Different versions of Blast report different values for the total
|
|
1049 : length of the alignment. This is the number reported in the
|
|
1050 : denominators in the stats section:
|
|
1051 : "Positives = 34/120 Positives = 67/120".
|
|
1052 : NCBI BLAST uses the total length of the alignment (with gaps)
|
|
1053 : WU-BLAST uses the length of the query sequence (without gaps).
|
|
1054 :
|
|
1055 : Therefore, when called with an argument of 'total',
|
|
1056 : this method will report different values depending on the
|
|
1057 : version of BLAST used. Total does NOT take into account HSP
|
|
1058 : tiling, so it should not be used.
|
|
1059 :
|
|
1060 : To get the fraction conserved among only the aligned residues,
|
|
1061 : ignoring the gaps, call this method without an argument or
|
|
1062 : with an argument of 'query' or 'hit'.
|
|
1063 :
|
|
1064 : If you need data for each HSP, use hsps() and then interate
|
|
1065 : through the HSP objects.
|
|
1066 : This method requires that all HSPs be tiled. If they have not
|
|
1067 : already been tiled, they will be tiled first automatically.
|
|
1068
|
|
1069 See Also : L<frac_identical()|frac_identical>, L<matches()|matches>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>
|
|
1070
|
|
1071 =cut
|
|
1072
|
|
1073 #--------------------
|
|
1074 sub frac_conserved {
|
|
1075 #--------------------
|
|
1076 my ($self, $seqType) = @_;
|
|
1077 $seqType ||= 'query';
|
|
1078 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
1079
|
|
1080 ## Sensitive to member name format.
|
|
1081 $seqType = lc($seqType);
|
|
1082
|
|
1083 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1084
|
|
1085 my $consv = $self->{'_totalConserved'};
|
|
1086 my $total = $self->{'_length_aln_'.$seqType};
|
|
1087 my $ratio = $consv / $total;
|
|
1088 my $ratio_rounded = sprintf( "%.3f", $ratio);
|
|
1089
|
|
1090 # Round down iff normal rounding yields 1 (just like blast)
|
|
1091 $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1));
|
|
1092 return $ratio_rounded;
|
|
1093 }
|
|
1094
|
|
1095
|
|
1096
|
|
1097
|
|
1098 =head2 frac_aligned_query
|
|
1099
|
|
1100 Usage : $hit_object->frac_aligned_query();
|
|
1101 Purpose : Get the fraction of the query sequence which has been aligned
|
|
1102 : across all HSPs (not including intervals between non-overlapping
|
|
1103 : HSPs).
|
|
1104 Example : $frac_alnq = $hit_object->frac_aligned_query();
|
|
1105 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1106 Argument : n/a
|
|
1107 Throws : n/a
|
|
1108 Comments : If you need data for each HSP, use hsps() and then interate
|
|
1109 : through the HSP objects.
|
|
1110 : To compute the fraction aligned, the logical length of the query
|
|
1111 : sequence is used, meaning that for [T]BLASTX reports, the
|
|
1112 : full length of the query sequence is converted into amino acids
|
|
1113 : by dividing by 3. This is necessary because of the way
|
|
1114 : the lengths of aligned sequences are computed.
|
|
1115 : This method requires that all HSPs be tiled. If they have not
|
|
1116 : already been tiled, they will be tiled first automatically.
|
|
1117
|
|
1118 See Also : L<frac_aligned_hit()|frac_aligned_hit>, L<logical_length()|logical_length>, L<length_aln()|length_aln>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>
|
|
1119
|
|
1120 =cut
|
|
1121
|
|
1122 #----------------------
|
|
1123 sub frac_aligned_query {
|
|
1124 #----------------------
|
|
1125 my $self = shift;
|
|
1126
|
|
1127 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1128
|
|
1129 sprintf( "%.2f", $self->{'_length_aln_query'}/
|
|
1130 $self->logical_length('query'));
|
|
1131 }
|
|
1132
|
|
1133
|
|
1134
|
|
1135 =head2 frac_aligned_hit
|
|
1136
|
|
1137 Usage : $hit_object->frac_aligned_hit();
|
|
1138 Purpose : Get the fraction of the hit (sbjct) sequence which has been aligned
|
|
1139 : across all HSPs (not including intervals between non-overlapping
|
|
1140 : HSPs).
|
|
1141 Example : $frac_alnq = $hit_object->frac_aligned_hit();
|
|
1142 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1143 Argument : n/a
|
|
1144 Throws : n/a
|
|
1145 Comments : If you need data for each HSP, use hsps() and then interate
|
|
1146 : through the HSP objects.
|
|
1147 : To compute the fraction aligned, the logical length of the sbjct
|
|
1148 : sequence is used, meaning that for TBLAST[NX] reports, the
|
|
1149 : full length of the sbjct sequence is converted into amino acids
|
|
1150 : by dividing by 3. This is necessary because of the way
|
|
1151 : the lengths of aligned sequences are computed.
|
|
1152 : This method requires that all HSPs be tiled. If they have not
|
|
1153 : already been tiled, they will be tiled first automatically.
|
|
1154
|
|
1155 See Also : L<frac_aligned_query()|frac_aligned_query>, L<matches()|matches>, , L<logical_length()|logical_length>, L<length_aln()|length_aln>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>
|
|
1156
|
|
1157 =cut
|
|
1158
|
|
1159 #--------------------
|
|
1160 sub frac_aligned_hit {
|
|
1161 #--------------------
|
|
1162 my $self = shift;
|
|
1163
|
|
1164 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1165
|
|
1166 sprintf( "%.2f", $self->{'_length_aln_sbjct'}/$self->logical_length('sbjct'));
|
|
1167 }
|
|
1168
|
|
1169
|
|
1170 ## These methods are being maintained for backward compatibility.
|
|
1171
|
|
1172 =head2 frac_aligned_sbjct
|
|
1173
|
|
1174 Same as L<frac_aligned_hit()|frac_aligned_hit>
|
|
1175
|
|
1176 =cut
|
|
1177
|
|
1178 #----------------
|
|
1179 sub frac_aligned_sbjct { my $self=shift; $self->frac_aligned_hit(@_); }
|
|
1180 #----------------
|
|
1181
|
|
1182 =head2 num_unaligned_sbjct
|
|
1183
|
|
1184 Same as L<num_unaligned_hit()|num_unaligned_hit>
|
|
1185
|
|
1186 =cut
|
|
1187
|
|
1188 #----------------
|
|
1189 sub num_unaligned_sbjct { my $self=shift; $self->num_unaligned_hit(@_); }
|
|
1190 #----------------
|
|
1191
|
|
1192
|
|
1193
|
|
1194 =head2 num_unaligned_hit
|
|
1195
|
|
1196 Usage : $hit_object->num_unaligned_hit();
|
|
1197 Purpose : Get the number of the unaligned residues in the hit sequence.
|
|
1198 : Sums across all all HSPs.
|
|
1199 Example : $num_unaln = $hit_object->num_unaligned_hit();
|
|
1200 Returns : Integer
|
|
1201 Argument : n/a
|
|
1202 Throws : n/a
|
|
1203 Comments : See notes regarding logical lengths in the comments for frac_aligned_hit().
|
|
1204 : They apply here as well.
|
|
1205 : If you need data for each HSP, use hsps() and then interate
|
|
1206 : through the HSP objects.
|
|
1207 : This method requires that all HSPs be tiled. If they have not
|
|
1208 : already been tiled, they will be tiled first automatically..
|
|
1209
|
|
1210 See Also : L<num_unaligned_query()|num_unaligned_query>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>, L<frac_aligned_hit()|frac_aligned_hit>
|
|
1211
|
|
1212 =cut
|
|
1213
|
|
1214 #---------------------
|
|
1215 sub num_unaligned_hit {
|
|
1216 #---------------------
|
|
1217 my $self = shift;
|
|
1218
|
|
1219 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1220
|
|
1221 my $num = $self->logical_length('sbjct') - $self->{'_length_aln_sbjct'};
|
|
1222 ($num < 0 ? 0 : $num );
|
|
1223 }
|
|
1224
|
|
1225
|
|
1226 =head2 num_unaligned_query
|
|
1227
|
|
1228 Usage : $hit_object->num_unaligned_query();
|
|
1229 Purpose : Get the number of the unaligned residues in the query sequence.
|
|
1230 : Sums across all all HSPs.
|
|
1231 Example : $num_unaln = $hit_object->num_unaligned_query();
|
|
1232 Returns : Integer
|
|
1233 Argument : n/a
|
|
1234 Throws : n/a
|
|
1235 Comments : See notes regarding logical lengths in the comments for frac_aligned_query().
|
|
1236 : They apply here as well.
|
|
1237 : If you need data for each HSP, use hsps() and then interate
|
|
1238 : through the HSP objects.
|
|
1239 : This method requires that all HSPs be tiled. If they have not
|
|
1240 : already been tiled, they will be tiled first automatically..
|
|
1241
|
|
1242 See Also : L<num_unaligned_hit()|num_unaligned_hit>, L<frac_aligned_query()|frac_aligned_query>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>
|
|
1243
|
|
1244 =cut
|
|
1245
|
|
1246 #-----------------------
|
|
1247 sub num_unaligned_query {
|
|
1248 #-----------------------
|
|
1249 my $self = shift;
|
|
1250
|
|
1251 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1252
|
|
1253 my $num = $self->logical_length('query') - $self->{'_length_aln_query'};
|
|
1254 ($num < 0 ? 0 : $num );
|
|
1255 }
|
|
1256
|
|
1257
|
|
1258
|
|
1259 =head2 seq_inds
|
|
1260
|
|
1261 Usage : $hit->seq_inds( seq_type, class, collapse );
|
|
1262 Purpose : Get a list of residue positions (indices) across all HSPs
|
|
1263 : for identical or conserved residues in the query or sbjct sequence.
|
|
1264 Example : @s_ind = $hit->seq_inds('query', 'identical');
|
|
1265 : @h_ind = $hit->seq_inds('hit', 'conserved');
|
|
1266 : @h_ind = $hit->seq_inds('hit', 'conserved', 1);
|
|
1267 Returns : Array of integers
|
|
1268 : May include ranges if collapse is non-zero.
|
|
1269 Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query')
|
|
1270 : ('sbjct' is synonymous with 'hit')
|
|
1271 : [1] class = 'identical' or 'conserved' (default = 'identical')
|
|
1272 : (can be shortened to 'id' or 'cons')
|
|
1273 : (actually, anything not 'id' will evaluate to 'conserved').
|
|
1274 : [2] collapse = boolean, if non-zero, consecutive positions are merged
|
|
1275 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
|
|
1276 : collapses to "1-5 7 9-11". This is useful for
|
|
1277 : consolidating long lists. Default = no collapse.
|
|
1278 Throws : n/a.
|
|
1279
|
|
1280 See Also : L<Bio::Search::HSP::BlastHSP::seq_inds()|Bio::Search::HSP::BlastHSP>
|
|
1281
|
|
1282 =cut
|
|
1283
|
|
1284 #-------------
|
|
1285 sub seq_inds {
|
|
1286 #-------------
|
|
1287 my ($self, $seqType, $class, $collapse) = @_;
|
|
1288
|
|
1289 $seqType ||= 'query';
|
|
1290 $class ||= 'identical';
|
|
1291 $collapse ||= 0;
|
|
1292
|
|
1293 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
1294
|
|
1295 my (@inds, $hsp);
|
|
1296 foreach $hsp ($self->hsps) {
|
|
1297 # This will merge data for all HSPs together.
|
|
1298 push @inds, $hsp->seq_inds($seqType, $class);
|
|
1299 }
|
|
1300
|
|
1301 # Need to remove duplicates and sort the merged positions.
|
|
1302 if(@inds) {
|
|
1303 my %tmp = map { $_, 1 } @inds;
|
|
1304 @inds = sort {$a <=> $b} keys %tmp;
|
|
1305 }
|
|
1306
|
|
1307 $collapse ? &Bio::Search::SearchUtils::collapse_nums(@inds) : @inds;
|
|
1308 }
|
|
1309
|
|
1310
|
|
1311 =head2 strand
|
|
1312
|
|
1313 See documentation in L<Bio::Search::SearchUtils::strand()|Bio::Search::SearchUtils>
|
|
1314
|
|
1315 =cut
|
|
1316
|
|
1317 #----------'
|
|
1318 sub strand {
|
|
1319 #----------
|
|
1320 my ($self, $seqType, $strnd) = @_;
|
|
1321
|
|
1322 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1323
|
|
1324 $seqType ||= (wantarray ? 'list' : 'query');
|
|
1325 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
1326
|
|
1327 $seqType = lc($seqType);
|
|
1328
|
|
1329 if( defined $strnd ) {
|
|
1330 $self->throw("Can't set strand for seqType '$seqType'. Must be 'query' or 'hit'\n") unless ($seqType eq 'sbjct' or $seqType eq 'query');
|
|
1331
|
|
1332 return $self->{'_strand_'.$seqType} = $strnd;
|
|
1333 }
|
|
1334
|
|
1335 my ($qstr, $hstr);
|
|
1336 # If there is only one HSP, defer this call to the solitary HSP.
|
|
1337 if($self->num_hsps == 1) {
|
|
1338 return $self->hsp->strand($seqType);
|
|
1339 }
|
|
1340 elsif( defined $self->{'_strand_query'}) {
|
|
1341 # Get the data computed during hsp tiling.
|
|
1342 $qstr = $self->{'_strand_query'};
|
|
1343 $hstr = $self->{'_strand_sbjct'}
|
|
1344 }
|
|
1345 else {
|
|
1346 # otherwise, iterate through all HSPs collecting strand info.
|
|
1347 # This will return the string "-1/1" if there are HSPs on different strands.
|
|
1348 # NOTE: This was the pre-10/21/02 procedure which will no longer be used,
|
|
1349 # (unless the above elsif{} is commented out).
|
|
1350 my (%qstr, %hstr);
|
|
1351 foreach my $hsp( $self->hsps ) {
|
|
1352 my ( $q, $h ) = $hsp->strand();
|
|
1353 $qstr{ $q }++;
|
|
1354 $hstr{ $h }++;
|
|
1355 }
|
|
1356 $qstr = join( '/', sort keys %qstr);
|
|
1357 $hstr = join( '/', sort keys %hstr);
|
|
1358 }
|
|
1359
|
|
1360 if($seqType =~ /list|array/i) {
|
|
1361 return ($qstr, $hstr);
|
|
1362 } elsif( $seqType eq 'query' ) {
|
|
1363 return $qstr;
|
|
1364 } else {
|
|
1365 return $hstr;
|
|
1366 }
|
|
1367 }
|
|
1368
|
|
1369 =head2 frame
|
|
1370
|
|
1371 See documentation in L<Bio::Search::SearchUtils::frame()|Bio::Search::SearchUtils>
|
|
1372
|
|
1373 =cut
|
|
1374
|
|
1375 #----------'
|
|
1376 sub frame {
|
|
1377 #----------
|
|
1378 my( $self, $frm ) = @_;
|
|
1379
|
|
1380 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'};
|
|
1381
|
|
1382 if( defined $frm ) {
|
|
1383 return $self->{'_frame'} = $frm;
|
|
1384 }
|
|
1385
|
|
1386 # The check for $self->{'_frame'} is a remnant from the 'query' mode days
|
|
1387 # in which the sbjct object would collect data from the description line only.
|
|
1388
|
|
1389 my ($frame);
|
|
1390 if(not defined($self->{'_frame'})) {
|
|
1391 $frame = $self->hsp->frame;
|
|
1392 } else {
|
|
1393 $frame = $self->{'_frame'};
|
|
1394 }
|
|
1395 return $frame;
|
|
1396 }
|
|
1397
|
|
1398 =head2 rank
|
|
1399
|
|
1400 Title : rank
|
|
1401 Usage : $obj->rank($newval)
|
|
1402 Function: Get/Set the rank of this Hit in the Query search list
|
|
1403 i.e. this is the Nth hit for a specific query
|
|
1404 Returns : value of rank
|
|
1405 Args : newvalue (optional)
|
|
1406
|
|
1407
|
|
1408 =cut
|
|
1409
|
|
1410 sub rank{
|
|
1411 my $self = shift;
|
|
1412 return $self->{'_rank'} = shift if @_;
|
|
1413 return $self->{'_rank'} || 1;
|
|
1414 }
|
|
1415
|
|
1416 =head2 locus
|
|
1417
|
|
1418 Title : locus
|
|
1419 Usage : $locus = $hit->locus();
|
|
1420 Function: Retrieve the locus (if available) for the hit
|
|
1421 Returns : a scalar string (empty string if not set)
|
|
1422 Args : none
|
|
1423
|
|
1424 =cut
|
|
1425
|
|
1426 sub locus {
|
|
1427 my ($self,$value) = @_;
|
|
1428 my $previous = $self->{'_locus'};
|
|
1429 if( defined $value || ! defined $previous ) {
|
|
1430 unless (defined $value) {
|
|
1431 if ($self->{'_name'} =~/(gb|emb|dbj|ref)\|(.*)\|(.*)/) {
|
|
1432 $value = $previous = $3;
|
|
1433 } else {
|
|
1434 $value = $previous = '';
|
|
1435 }
|
|
1436 }
|
|
1437 $self->{'_locus'} = $value;
|
|
1438 }
|
|
1439 return $previous;
|
|
1440 }
|
|
1441
|
|
1442 =head2 each_accession_number
|
|
1443
|
|
1444 Title : each_accession_number
|
|
1445 Usage : @each_accession_number = $hit->each_accession_number();
|
|
1446 Function: Get each accession number listed in the description of the hit.
|
|
1447 If there are no alternatives, then only the primary accession will
|
|
1448 be given
|
|
1449 Returns : list of all accession numbers in the description
|
|
1450 Args : none
|
|
1451
|
|
1452 =cut
|
|
1453
|
|
1454 sub each_accession_number {
|
|
1455 my ($self,$value) = @_;
|
|
1456 my $desc = $self->{'_description'};
|
|
1457 #put primary accnum on the list
|
|
1458 my @accnums;
|
|
1459 push (@accnums,$self->{'_accession'});
|
|
1460 if( defined $desc ) {
|
|
1461 while ($desc =~ /(\b\S+\|\S*\|\S*\s?)/g) {
|
|
1462 my $id = $1;
|
|
1463 my ($acc, $version);
|
|
1464 if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) {
|
|
1465 ($acc, $version) = split /\./, $2;
|
|
1466 } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) {
|
|
1467 ($acc, $version) = split /\./, $3;
|
|
1468 } else {
|
|
1469 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README
|
|
1470 #Database Name Identifier Syntax
|
|
1471 #============================ ========================
|
|
1472 #GenBank gb|accession|locus
|
|
1473 #EMBL Data Library emb|accession|locus
|
|
1474 #DDBJ, DNA Database of Japan dbj|accession|locus
|
|
1475 #NBRF PIR pir||entry
|
|
1476 #Protein Research Foundation prf||name
|
|
1477 #SWISS-PROT sp|accession|entry name
|
|
1478 #Brookhaven Protein Data Bank pdb|entry|chain
|
|
1479 #Patents pat|country|number
|
|
1480 #GenInfo Backbone Id bbs|number
|
|
1481 #General database identifier gnl|database|identifier
|
|
1482 #NCBI Reference Sequence ref|accession|locus
|
|
1483 #Local Sequence identifier lcl|identifier
|
|
1484 $acc=$id;
|
|
1485 }
|
|
1486 push(@accnums, $acc);
|
|
1487 }
|
|
1488 }
|
|
1489 return @accnums;
|
|
1490 }
|
|
1491
|
|
1492 =head2 tiled_hsps
|
|
1493
|
|
1494 See documentation in L<Bio::Search::SearchUtils::tiled_hsps()|Bio::Search::SearchUtils>
|
|
1495
|
|
1496 =cut
|
|
1497
|
|
1498 sub tiled_hsps {
|
|
1499 my $self = shift;
|
|
1500 return $self->{'_tiled_hsps'} = shift if @_;
|
|
1501 return $self->{'_tiled_hsps'};
|
|
1502 }
|
|
1503
|
|
1504 1;
|