Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Search/Hit/GenericHit.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: GenericHit.pm,v 1.20.2.1 2003/02/28 09:27:56 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::Search::Hit::GenericHit | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Search::Hit::GenericHit - A generic implementation of the Bio::Search::Hit::HitI interface | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 use Bio::Search::Hit::GenericHit; | |
20 my $hit = new Bio::Search::Hit::GenericHit(-algorithm => 'blastp'); | |
21 | |
22 # more likely | |
23 use Bio::SearchIO; | |
24 my $parser = new Bio::SearchIO(-format => 'blast', -file => 'result.bls'); | |
25 | |
26 my $result = $parser->next_result; | |
27 my $hit = $result->next_hit; | |
28 | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 This object handles the hit data from a Database Sequence Search such | |
33 as FASTA or BLAST. | |
34 | |
35 =head1 FEEDBACK | |
36 | |
37 =head2 Mailing Lists | |
38 | |
39 User feedback is an integral part of the evolution of this and other | |
40 Bioperl modules. Send your comments and suggestions preferably to | |
41 the Bioperl mailing list. Your participation is much appreciated. | |
42 | |
43 bioperl-l@bioperl.org - General discussion | |
44 http://bioperl.org/MailList.shtml - About the mailing lists | |
45 | |
46 =head2 Reporting Bugs | |
47 | |
48 Report bugs to the Bioperl bug tracking system to help us keep track | |
49 of the bugs and their resolution. Bug reports can be submitted via | |
50 email or the web: | |
51 | |
52 bioperl-bugs@bioperl.org | |
53 http://bugzilla.bioperl.org/ | |
54 | |
55 =head1 AUTHOR - Jason Stajich and Steve Chervitz | |
56 | |
57 Email jason@bioperl.org | |
58 Email sac@bioperl.org | |
59 | |
60 =head1 CONTRIBUTORS | |
61 | |
62 Additional contributors names and emails here | |
63 | |
64 =head1 APPENDIX | |
65 | |
66 The rest of the documentation details each of the object methods. | |
67 Internal methods are usually preceded with a _ | |
68 | |
69 =cut | |
70 | |
71 | |
72 # Let the code begin... | |
73 | |
74 | |
75 package Bio::Search::Hit::GenericHit; | |
76 use vars qw(@ISA); | |
77 use strict; | |
78 | |
79 use Bio::Root::Root; | |
80 use Bio::Search::Hit::HitI; | |
81 require Bio::Search::SearchUtils; | |
82 | |
83 @ISA = qw(Bio::Root::Root Bio::Search::Hit::HitI ); | |
84 | |
85 =head2 new | |
86 | |
87 Title : new | |
88 Usage : my $obj = new Bio::Search::Hit::GenericHit(); | |
89 Function: Builds a new Bio::Search::Hit::GenericHit object | |
90 Returns : Bio::Search::Hit::GenericHit | |
91 Args : -name => Name of Hit (required) | |
92 -description => Description (optional) | |
93 -accession => Accession number (optional) | |
94 -length => Length of the Hit (optional) | |
95 -score => Raw Score for the Hit (optional) | |
96 -significance => Significance value for the Hit (optional) | |
97 -algorithm => Algorithm used (BLASTP, FASTX, etc...) | |
98 -hsps => Array ref of HSPs for this Hit. | |
99 -iteration => integer for the PSI-Blast iteration number | |
100 | |
101 =cut | |
102 | |
103 sub new { | |
104 my($class,@args) = @_; | |
105 | |
106 my $self = $class->SUPER::new(@args); | |
107 my ($hsps, $name,$query_len,$desc, $acc, $locus, $length, | |
108 $score,$algo,$signif,$bits, | |
109 $iter,$rank) = $self->_rearrange([qw(HSPS | |
110 NAME | |
111 QUERY_LEN | |
112 DESCRIPTION | |
113 ACCESSION | |
114 LOCUS | |
115 LENGTH SCORE ALGORITHM | |
116 SIGNIFICANCE BITS ITERATION | |
117 RANK )], @args); | |
118 | |
119 $self->{'_query_length'} = $query_len; | |
120 | |
121 if( ! defined $name ) { | |
122 $self->throw("Must have defined a valid name for Hit"); | |
123 } else { | |
124 $self->name($name); | |
125 } | |
126 | |
127 defined $acc && $self->accession($acc); | |
128 defined $locus && $self->locus($locus); | |
129 defined $desc && $self->description($desc); | |
130 defined $length && $self->length($length); | |
131 defined $algo && $self->algorithm($algo); | |
132 defined $signif && $self->significance($signif); | |
133 defined $score && $self->raw_score($score); | |
134 defined $bits && $self->bits($bits); | |
135 defined $iter && $self->iteration($iter); | |
136 defined $rank && $self->rank($rank); | |
137 | |
138 $self->{'_iterator'} = 0; | |
139 $self->{'_hsps'} = []; | |
140 if( defined $hsps ) { | |
141 if( ref($hsps) !~ /array/i ) { | |
142 $self->warn("Did not specify a valid array ref for the param HSPS ($hsps)"); | |
143 } else { | |
144 while( @$hsps ) { | |
145 $self->add_hsp(shift @$hsps ); | |
146 } | |
147 } | |
148 } | |
149 return $self; | |
150 } | |
151 | |
152 =head2 add_hsp | |
153 | |
154 Title : add_hsp | |
155 Usage : $hit->add_hsp($hsp) | |
156 Function: Add a HSP to the collection of HSPs for a Hit | |
157 Returns : number of HSPs in the Hit | |
158 Args : Bio::Search::HSP::HSPI object | |
159 | |
160 | |
161 =cut | |
162 | |
163 sub add_hsp { | |
164 my ($self,$hsp) = @_; | |
165 if( !defined $hsp || ! $hsp->isa('Bio::Search::HSP::HSPI') ) { | |
166 $self->warn("Must provide a valid Bio::Search::HSP::HSPI object to object: $self method: add_hsp value: $hsp"); | |
167 return undef; | |
168 } | |
169 push @{$self->{'_hsps'}}, $hsp; | |
170 return scalar @{$self->{'_hsps'}}; | |
171 } | |
172 | |
173 | |
174 | |
175 =head2 Bio::Search::Hit::HitI methods | |
176 | |
177 Implementation of Bio::Search::Hit::HitI methods | |
178 | |
179 =head2 name | |
180 | |
181 Title : name | |
182 Usage : $hit_name = $hit->name(); | |
183 Function: returns the name of the Hit sequence | |
184 Returns : a scalar string | |
185 Args : [optional] scalar string to set the name | |
186 | |
187 =cut | |
188 | |
189 sub name { | |
190 my ($self,$value) = @_; | |
191 my $previous = $self->{'_name'}; | |
192 if( defined $value || ! defined $previous ) { | |
193 $value = $previous = '' unless defined $value; | |
194 $self->{'_name'} = $value; | |
195 } | |
196 return $previous; | |
197 } | |
198 | |
199 =head2 accession | |
200 | |
201 Title : accession | |
202 Usage : $acc = $hit->accession(); | |
203 Function: Retrieve the accession (if available) for the hit | |
204 Returns : a scalar string (empty string if not set) | |
205 Args : none | |
206 | |
207 =cut | |
208 | |
209 sub accession { | |
210 my ($self,$value) = @_; | |
211 my $previous = $self->{'_accession'}; | |
212 if( defined $value || ! defined $previous ) { | |
213 $value = $previous = '' unless defined $value; | |
214 $self->{'_accession'} = $value; | |
215 } | |
216 return $previous; | |
217 } | |
218 | |
219 =head2 description | |
220 | |
221 Title : description | |
222 Usage : $desc = $hit->description(); | |
223 Function: Retrieve the description for the hit | |
224 Returns : a scalar string | |
225 Args : [optional] scalar string to set the descrition | |
226 | |
227 =cut | |
228 | |
229 sub description { | |
230 my ($self,$value) = @_; | |
231 my $previous = $self->{'_description'}; | |
232 if( defined $value || ! defined $previous ) { | |
233 $value = $previous = '' unless defined $value; | |
234 $self->{'_description'} = $value; | |
235 } | |
236 return $previous; | |
237 } | |
238 | |
239 =head2 length | |
240 | |
241 Title : length | |
242 Usage : my $len = $hit->length | |
243 Function: Returns the length of the hit | |
244 Returns : integer | |
245 Args : [optional] integer to set the length | |
246 | |
247 =cut | |
248 | |
249 sub length { | |
250 my ($self,$value) = @_; | |
251 my $previous = $self->{'_length'}; | |
252 if( defined $value || ! defined $previous ) { | |
253 $value = $previous = 0 unless defined $value; | |
254 $self->{'_length'} = $value; | |
255 } | |
256 return $previous; | |
257 } | |
258 | |
259 | |
260 =head2 algorithm | |
261 | |
262 Title : algorithm | |
263 Usage : $alg = $hit->algorithm(); | |
264 Function: Gets the algorithm specification that was used to obtain the hit | |
265 For BLAST, the algorithm denotes what type of sequence was aligned | |
266 against what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated | |
267 dna-prt, TBLASTN prt-translated dna, TBLASTX translated | |
268 dna-translated dna). | |
269 Returns : a scalar string | |
270 Args : [optional] scalar string to set the algorithm | |
271 | |
272 =cut | |
273 | |
274 sub algorithm { | |
275 my ($self,$value) = @_; | |
276 my $previous = $self->{'_algorithm'}; | |
277 if( defined $value || ! defined $previous ) { | |
278 $value = $previous = '' unless defined $value; | |
279 $self->{'_algorithm'} = $value; | |
280 } | |
281 return $previous; | |
282 } | |
283 | |
284 =head2 raw_score | |
285 | |
286 Title : raw_score | |
287 Usage : $score = $hit->raw_score(); | |
288 Function: Gets the "raw score" generated by the algorithm. What | |
289 this score is exactly will vary from algorithm to algorithm, | |
290 returning undef if unavailable. | |
291 Returns : a scalar value | |
292 Args : [optional] scalar value to set the raw score | |
293 | |
294 =cut | |
295 | |
296 sub raw_score { | |
297 my ($self,$value) = @_; | |
298 my $previous = $self->{'_score'}; | |
299 if( defined $value || ! defined $previous ) { | |
300 $value = $previous = '' unless defined $value; | |
301 $self->{'_score'} = $value; | |
302 } | |
303 return $previous; | |
304 } | |
305 | |
306 =head2 significance | |
307 | |
308 Title : significance | |
309 Usage : $significance = $hit->significance(); | |
310 Function: Used to obtain the E or P value of a hit, i.e. the probability that | |
311 this particular hit was obtained purely by random chance. If | |
312 information is not available (nor calculatable from other | |
313 information sources), return undef. | |
314 Returns : a scalar value or undef if unavailable | |
315 Args : [optional] scalar value to set the significance | |
316 | |
317 =cut | |
318 | |
319 sub significance { | |
320 my ($self,$value) = @_; | |
321 my $previous = $self->{'_significance'}; | |
322 if( defined $value || ! defined $previous ) { | |
323 $value = $previous = '' unless defined $value; | |
324 $self->{'_significance'} = $value; | |
325 } | |
326 return $previous; | |
327 } | |
328 | |
329 =head2 bits | |
330 | |
331 Usage : $hit_object->bits(); | |
332 Purpose : Gets the bit score of the best HSP for the current hit. | |
333 Example : $bits = $hit_object->bits(); | |
334 Returns : Integer or undef if bit score is not set | |
335 Argument : n/a | |
336 Comments : For BLAST1, the non-bit score is listed in the summary line. | |
337 | |
338 See Also : L<score()|score> | |
339 | |
340 =cut | |
341 | |
342 #--------- | |
343 sub bits { | |
344 #--------- | |
345 my ($self) = @_; | |
346 | |
347 my $bits = $self->{'_bits'}; | |
348 if( ! defined $bits ) { | |
349 $bits = $self->{'_hsps'}->[0]->bits(); | |
350 $self->{'_bits'} = $bits; | |
351 } | |
352 return $bits; | |
353 } | |
354 | |
355 =head2 next_hsp | |
356 | |
357 Title : next_hsp | |
358 Usage : while( $hsp = $obj->next_hsp()) { ... } | |
359 Function : Returns the next available High Scoring Pair | |
360 Example : | |
361 Returns : Bio::Search::HSP::HSPI object or null if finished | |
362 Args : none | |
363 | |
364 =cut | |
365 | |
366 sub next_hsp { | |
367 my ($self) = @_; | |
368 $self->{'_iterator'} = 0 unless defined $self->{'_iterator'}; | |
369 return undef if $self->{'_iterator'} > scalar @{$self->{'_hsps'}}; | |
370 return $self->{'_hsps'}->[$self->{'_iterator'}++]; | |
371 } | |
372 | |
373 | |
374 =head2 hsps | |
375 | |
376 Usage : $hit_object->hsps(); | |
377 Purpose : Get a list containing all HSP objects. | |
378 : Get the numbers of HSPs for the current hit. | |
379 Example : @hsps = $hit_object->hsps(); | |
380 : $num = $hit_object->hsps(); # alternatively, use num_hsps() | |
381 Returns : Array context : list of Bio::Search::HSP::BlastHSP.pm objects. | |
382 : Scalar context: integer (number of HSPs). | |
383 : (Equivalent to num_hsps()). | |
384 Argument : n/a. Relies on wantarray | |
385 Throws : Exception if the HSPs have not been collected. | |
386 | |
387 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps> | |
388 | |
389 =cut | |
390 | |
391 #--------- | |
392 sub hsps { | |
393 #--------- | |
394 my $self = shift; | |
395 | |
396 if (not ref $self->{'_hsps'}) { | |
397 $self->throw("Can't get HSPs: data not collected."); | |
398 } | |
399 | |
400 return wantarray | |
401 # returning list containing all HSPs. | |
402 ? @{$self->{'_hsps'}} | |
403 # returning number of HSPs. | |
404 : scalar(@{$self->{'_hsps'}}); | |
405 } | |
406 | |
407 =head2 num_hsps | |
408 | |
409 Usage : $hit_object->num_hsps(); | |
410 Purpose : Get the number of HSPs for the present Blast hit. | |
411 Example : $nhsps = $hit_object->num_hsps(); | |
412 Returns : Integer | |
413 Argument : n/a | |
414 Throws : Exception if the HSPs have not been collected. | |
415 | |
416 See Also : L<hsps()|hsps> | |
417 | |
418 =cut | |
419 | |
420 #------------- | |
421 sub num_hsps { | |
422 my $self = shift; | |
423 | |
424 if (not defined $self->{'_hsps'}) { | |
425 $self->throw("Can't get HSPs: data not collected."); | |
426 } | |
427 | |
428 return scalar(@{$self->{'_hsps'}}); | |
429 } | |
430 | |
431 =head2 rewind | |
432 | |
433 Title : rewind | |
434 Usage : $hit->rewind; | |
435 Function: Allow one to reset the HSP iteration to the beginning | |
436 Since this is an in-memory implementation | |
437 Returns : none | |
438 Args : none | |
439 | |
440 =cut | |
441 | |
442 sub rewind{ | |
443 my ($self) = @_; | |
444 $self->{'_iterator'} = 0; | |
445 } | |
446 | |
447 =head2 iteration | |
448 | |
449 Title : iteration | |
450 Usage : $obj->iteration($newval) | |
451 Function: PSI-BLAST iteration | |
452 Returns : value of iteration | |
453 Args : newvalue (optional) | |
454 | |
455 | |
456 =cut | |
457 | |
458 sub iteration{ | |
459 my ($self,$value) = @_; | |
460 if( defined $value) { | |
461 $self->{'_psiblast_iteration'} = $value; | |
462 } | |
463 return $self->{'_psiblast_iteration'}; | |
464 | |
465 } | |
466 | |
467 | |
468 =head2 ambiguous_aln | |
469 | |
470 Usage : $ambig_code = $hit_object->ambiguous_aln(); | |
471 Purpose : Sets/Gets ambiguity code data member. | |
472 Example : (see usage) | |
473 Returns : String = 'q', 's', 'qs', '-' | |
474 : 'q' = query sequence contains overlapping sub-sequences | |
475 : while sbjct does not. | |
476 : 's' = sbjct sequence contains overlapping sub-sequences | |
477 : while query does not. | |
478 : 'qs' = query and sbjct sequence contains overlapping sub-sequences | |
479 : relative to each other. | |
480 : '-' = query and sbjct sequence do not contains multiple domains | |
481 : relative to each other OR both contain the same distribution | |
482 : of similar domains. | |
483 Argument : n/a | |
484 Throws : n/a | |
485 Status : Experimental | |
486 | |
487 =cut | |
488 | |
489 #-------------------- | |
490 sub ambiguous_aln { | |
491 #-------------------- | |
492 my $self = shift; | |
493 if(@_) { $self->{'_ambiguous_aln'} = shift; } | |
494 $self->{'_ambiguous_aln'} || '-'; | |
495 } | |
496 | |
497 =head2 overlap | |
498 | |
499 See documentation in L<Bio::Search::SearchUtils::overlap()|Bio::Search::SearchUtils> | |
500 | |
501 =cut | |
502 | |
503 #------------- | |
504 sub overlap { | |
505 #------------- | |
506 my $self = shift; | |
507 if(@_) { $self->{'_overlap'} = shift; } | |
508 defined $self->{'_overlap'} ? $self->{'_overlap'} : 0; | |
509 } | |
510 | |
511 | |
512 =head2 n | |
513 | |
514 Usage : $hit_object->n(); | |
515 Purpose : Gets the N number for the current hit. | |
516 : This is the number of HSPs in the set which was ascribed | |
517 : the lowest P-value (listed on the description line). | |
518 : This number is not the same as the total number of HSPs. | |
519 : To get the total number of HSPs, use num_hsps(). | |
520 Example : $n = $hit_object->n(); | |
521 Returns : Integer | |
522 Argument : n/a | |
523 Throws : Exception if HSPs have not been set (BLAST2 reports). | |
524 Comments : Note that the N parameter is not reported in gapped BLAST2. | |
525 : Calling n() on such reports will result in a call to num_hsps(). | |
526 : The num_hsps() method will count the actual number of | |
527 : HSPs in the alignment listing, which may exceed N in | |
528 : some cases. | |
529 | |
530 See Also : L<num_hsps()|num_hsps> | |
531 | |
532 =cut | |
533 | |
534 #----- | |
535 sub n { | |
536 #----- | |
537 my $self = shift; | |
538 | |
539 # The check for $self->{'_n'} is a remnant from the 'query' mode days | |
540 # in which the sbjct object would collect data from the description | |
541 # line only. | |
542 | |
543 my ($n); | |
544 if(not defined($self->{'_n'})) { | |
545 $n = $self->hsp->n; | |
546 } else { | |
547 $n = $self->{'_n'}; | |
548 } | |
549 $n ||= $self->num_hsps; | |
550 | |
551 return $n; | |
552 } | |
553 | |
554 =head2 p | |
555 | |
556 Usage : $hit_object->p( [format] ); | |
557 Purpose : Get the P-value for the best HSP of the given BLAST hit. | |
558 : (Note that P-values are not provided with NCBI Blast2 reports). | |
559 Example : $p = $sbjct->p; | |
560 : $p = $sbjct->p('exp'); # get exponent only. | |
561 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts | |
562 Returns : Float or scientific notation number (the raw P-value, DEFAULT). | |
563 : Integer if format == 'exp' (the magnitude of the base 10 exponent). | |
564 : 2-element list (float, int) if format == 'parts' and P-value | |
565 : is in scientific notation (See Comments). | |
566 Argument : format: string of 'raw' | 'exp' | 'parts' | |
567 : 'raw' returns value given in report. Default. (1.2e-34) | |
568 : 'exp' returns exponent value only (34) | |
569 : 'parts' returns the decimal and exponent as a | |
570 : 2-element list (1.2, -34) (See Comments). | |
571 Throws : Warns if no P-value is defined. Uses expect instead. | |
572 Comments : Using the 'parts' argument is not recommended since it will not | |
573 : work as expected if the P-value is not in scientific notation. | |
574 : That is, floats are not converted into sci notation before | |
575 : splitting into parts. | |
576 | |
577 See Also : L<expect()|expect>, L<signif()|signif>, L<Bio::Search::SearchUtils::get_exponent()|Bio::Search::SearchUtils> | |
578 | |
579 =cut | |
580 | |
581 #-------- | |
582 sub p { | |
583 #-------- | |
584 # Some duplication of logic for p(), expect() and signif() for the sake of performance. | |
585 my ($self, $fmt) = @_; | |
586 | |
587 my $val = $self->{'_p'}; | |
588 | |
589 # $val can be zero. | |
590 if(not defined $val) { | |
591 # P-value not defined, must be a NCBI Blast2 report. | |
592 # Use expect instead. | |
593 $self->warn( "P-value not defined. Using expect() instead."); | |
594 $val = $self->{'_expect'}; | |
595 } | |
596 | |
597 return $val if not $fmt or $fmt =~ /^raw/i; | |
598 ## Special formats: exponent-only or as list. | |
599 return &Bio::Search::SearchUtils::get_exponent($val) if $fmt =~ /^exp/i; | |
600 return (split (/eE/, $val)) if $fmt =~ /^parts/i; | |
601 | |
602 ## Default: return the raw P-value. | |
603 return $val; | |
604 } | |
605 | |
606 =head2 hsp | |
607 | |
608 Usage : $hit_object->hsp( [string] ); | |
609 Purpose : Get a single HSPI object for the present HitI object. | |
610 Example : $hspObj = $hit_object->hsp; # same as 'best' | |
611 : $hspObj = $hit_object->hsp('best'); | |
612 : $hspObj = $hit_object->hsp('worst'); | |
613 Returns : Object reference for a Bio::Search::HSP::BlastHSP.pm object. | |
614 Argument : String (or no argument). | |
615 : No argument (default) = highest scoring HSP (same as 'best'). | |
616 : 'best' or 'first' = highest scoring HSP. | |
617 : 'worst' or 'last' = lowest scoring HSP. | |
618 Throws : Exception if the HSPs have not been collected. | |
619 : Exception if an unrecognized argument is used. | |
620 | |
621 See Also : L<hsps()|hsps>, L<num_hsps>() | |
622 | |
623 =cut | |
624 | |
625 #---------- | |
626 sub hsp { | |
627 #---------- | |
628 my( $self, $option ) = @_; | |
629 $option ||= 'best'; | |
630 | |
631 if (not ref $self->{'_hsps'}) { | |
632 $self->throw("Can't get HSPs: data not collected."); | |
633 } | |
634 | |
635 my @hsps = @{$self->{'_hsps'}}; | |
636 | |
637 return $hsps[0] if $option =~ /best|first|1/i; | |
638 return $hsps[$#hsps] if $option =~ /worst|last/i; | |
639 | |
640 $self->throw("Can't get HSP for: $option\n" . | |
641 "Valid arguments: 'best', 'worst'"); | |
642 } | |
643 | |
644 =head2 logical_length | |
645 | |
646 Usage : $hit_object->logical_length( [seq_type] ); | |
647 : (mostly intended for internal use). | |
648 Purpose : Get the logical length of the hit sequence. | |
649 : If the Blast is a TBLASTN or TBLASTX, the returned length | |
650 : is the length of the would-be amino acid sequence (length/3). | |
651 : For all other BLAST flavors, this function is the same as length(). | |
652 Example : $len = $hit_object->logical_length(); | |
653 Returns : Integer | |
654 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') | |
655 ('sbjct' is synonymous with 'hit') | |
656 Throws : n/a | |
657 Comments : This is important for functions like frac_aligned_query() | |
658 : which need to operate in amino acid coordinate space when dealing | |
659 : with [T]BLAST[NX] type reports. | |
660 | |
661 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit> | |
662 | |
663 =cut | |
664 | |
665 #-------------------- | |
666 sub logical_length { | |
667 #-------------------- | |
668 my $self = shift; | |
669 my $seqType = shift || 'query'; | |
670 $seqType = 'sbjct' if $seqType eq 'hit'; | |
671 | |
672 my $length; | |
673 | |
674 # For the sbjct, return logical sbjct length | |
675 if( $seqType eq 'sbjct' ) { | |
676 $length = $self->{'_logical_length'} || $self->{'_length'}; | |
677 } | |
678 else { | |
679 # Otherwise, return logical query length | |
680 $length = $self->{'_query_length'}; | |
681 $self->throw("Must have defined query_len") unless ( $length ); | |
682 | |
683 # Adjust length based on BLAST flavor. | |
684 if($self->algorithm =~ /T?BLASTX/ ) { | |
685 $length /= 3; | |
686 } | |
687 } | |
688 return $length; | |
689 } | |
690 | |
691 =head2 length_aln | |
692 | |
693 Usage : $hit_object->length_aln( [seq_type] ); | |
694 Purpose : Get the total length of the aligned region for query or sbjct seq. | |
695 : This number will include all HSPs | |
696 Example : $len = $hit_object->length_aln(); # default = query | |
697 : $lenAln = $hit_object->length_aln('query'); | |
698 Returns : Integer | |
699 Argument : seq_Type = 'query' or 'hit' or 'sbjct' (Default = 'query') | |
700 ('sbjct' is synonymous with 'hit') | |
701 Throws : Exception if the argument is not recognized. | |
702 Comments : This method will report the logical length of the alignment, | |
703 : meaning that for TBLAST[NX] reports, the length is reported | |
704 : using amino acid coordinate space (i.e., nucleotides / 3). | |
705 : | |
706 : This method requires that all HSPs be tiled. If they have not | |
707 : already been tiled, they will be tiled first automatically.. | |
708 : If you don't want the tiled data, iterate through each HSP | |
709 : calling length() on each (use hsps() to get all HSPs). | |
710 | |
711 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>, L<gaps()|gaps>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>, L<Bio::Search::HSP::BlastHSP::length()|Bio::Search::HSP::BlastHSP> | |
712 | |
713 =cut | |
714 | |
715 #---------------' | |
716 sub length_aln { | |
717 #--------------- | |
718 my( $self, $seqType, $num ) = @_; | |
719 | |
720 $seqType ||= 'query'; | |
721 $seqType = 'sbjct' if $seqType eq 'hit'; | |
722 | |
723 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
724 | |
725 if( defined $num) { | |
726 return $self->{'_length_aln_'.$seqType} = $num; | |
727 } | |
728 | |
729 my $data = $self->{'_length_aln_'.$seqType}; | |
730 | |
731 ## If we don't have data, figure out what went wrong. | |
732 if(!$data) { | |
733 $self->throw("Can't get length aln for sequence type \"$seqType\". " . | |
734 "Valid types are 'query', 'hit', 'sbjct' ('sbjct' = 'hit')"); | |
735 } | |
736 return $data; | |
737 } | |
738 | |
739 =head2 gaps | |
740 | |
741 Usage : $hit_object->gaps( [seq_type] ); | |
742 Purpose : Get the number of gaps in the aligned query, hit, or both sequences. | |
743 : Data is summed across all HSPs. | |
744 Example : $qgaps = $hit_object->gaps('query'); | |
745 : $hgaps = $hit_object->gaps('hit'); | |
746 : $tgaps = $hit_object->gaps(); # default = total (query + hit) | |
747 Returns : scalar context: integer | |
748 : array context without args: two-element list of integers | |
749 : (queryGaps, hitGaps) | |
750 : Array context can be forced by providing an argument of 'list' or 'array'. | |
751 : | |
752 : CAUTION: Calling this method within printf or sprintf is arrray context. | |
753 : So this function may not give you what you expect. For example: | |
754 : printf "Total gaps: %d", $hit->gaps(); | |
755 : Actually returns a two-element array, so what gets printed | |
756 : is the number of gaps in the query, not the total | |
757 : | |
758 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' | 'list' (default = 'total') | |
759 ('sbjct' is synonymous with 'hit') | |
760 Throws : n/a | |
761 Comments : If you need data for each HSP, use hsps() and then interate | |
762 : through each HSP object. | |
763 : This method requires that all HSPs be tiled. If they have not | |
764 : already been tiled, they will be tiled first automatically.. | |
765 : Not relying on wantarray since that will fail in situations | |
766 : such as printf "%d", $hit->gaps() in which you might expect to | |
767 : be printing the total gaps, but evaluates to array context. | |
768 | |
769 See Also : L<length_aln()|length_aln> | |
770 | |
771 =cut | |
772 | |
773 #---------- | |
774 sub gaps { | |
775 #---------- | |
776 my( $self, $seqType, $num ) = @_; | |
777 | |
778 $seqType ||= (wantarray ? 'list' : 'total'); | |
779 $seqType = 'sbjct' if $seqType eq 'hit'; | |
780 | |
781 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
782 | |
783 $seqType = lc($seqType); | |
784 | |
785 if( defined $num ) { | |
786 $self->throw("Can't set gaps for seqType '$seqType'. Must be 'query' or 'hit'\n") unless ($seqType eq 'sbjct' or $seqType eq 'query'); | |
787 | |
788 return $self->{'_gaps_'.$seqType} = $num; | |
789 } | |
790 elsif($seqType =~ /list|array/i) { | |
791 return ($self->{'_gaps_query'}, $self->{'_gaps_sbjct'}); | |
792 } | |
793 elsif($seqType eq 'total') { | |
794 return ($self->{'_gaps_query'} + $self->{'_gaps_sbjct'}) || 0; | |
795 } else { | |
796 return $self->{'_gaps_'.$seqType} || 0; | |
797 } | |
798 } | |
799 | |
800 | |
801 =head2 matches | |
802 | |
803 See documentation in L<Bio::Search::Hit::HitI::matches()|Bio::Search::Hit::HitI> | |
804 | |
805 =cut | |
806 | |
807 #--------------- | |
808 sub matches { | |
809 #--------------- | |
810 my( $self, $arg1, $arg2) = @_; | |
811 my(@data,$data); | |
812 | |
813 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
814 | |
815 if(!$arg1) { | |
816 @data = ($self->{'_totalIdentical'}, $self->{'_totalConserved'}); | |
817 | |
818 return @data if @data; | |
819 | |
820 } else { | |
821 | |
822 if( defined $arg2 ) { | |
823 $self->{'_totalIdentical'} = $arg1; | |
824 $self->{'_totalConserved'} = $arg2; | |
825 return ( $arg1, $arg2 ); | |
826 } | |
827 elsif($arg1 =~ /^id/i) { | |
828 $data = $self->{'_totalIdentical'}; | |
829 } else { | |
830 $data = $self->{'_totalConserved'}; | |
831 } | |
832 return $data if $data; | |
833 } | |
834 | |
835 ## Something went wrong if we make it to here. | |
836 $self->throw("Can't get identical or conserved data: no data."); | |
837 } | |
838 | |
839 | |
840 =head2 start | |
841 | |
842 Usage : $sbjct->start( [seq_type] ); | |
843 Purpose : Gets the start coordinate for the query, sbjct, or both sequences | |
844 : in the BlastHit object. If there is more than one HSP, the lowest start | |
845 : value of all HSPs is returned. | |
846 Example : $qbeg = $sbjct->start('query'); | |
847 : $sbeg = $sbjct->start('hit'); | |
848 : ($qbeg, $sbeg) = $sbjct->start(); | |
849 Returns : scalar context: integer | |
850 : array context without args: list of two integers (queryStart, sbjctStart) | |
851 : Array context can be "induced" by providing an argument of 'list' or 'array'. | |
852 Argument : In scalar context: seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') | |
853 ('sbjct' is synonymous with 'hit') | |
854 Throws : n/a | |
855 Comments : This method requires that all HSPs be tiled. If there is more than one | |
856 : HSP and they have not already been tiled, they will be tiled first automatically.. | |
857 : Remember that the start and end coordinates of all HSPs are | |
858 : normalized so that start < end. Strand information can be | |
859 : obtained by calling $hit->strand(). | |
860 | |
861 See Also : L<end()|end>, L<range()|range>, L<strand()|strand>, | |
862 L<Bio::Search::HSP::BlastHSP::start|Bio::Search::HSP::BlastHSP> | |
863 | |
864 =cut | |
865 | |
866 #---------- | |
867 sub start { | |
868 #---------- | |
869 my ($self, $seqType, $num) = @_; | |
870 | |
871 $seqType ||= (wantarray ? 'list' : 'query'); | |
872 $seqType = 'sbjct' if $seqType eq 'hit'; | |
873 | |
874 if( defined $num ) { | |
875 $seqType = "_\L$seqType\E"; | |
876 return $self->{$seqType.'Start'} = $num; | |
877 } | |
878 | |
879 # If there is only one HSP, defer this call to the solitary HSP. | |
880 if($self->num_hsps == 1) { | |
881 return $self->hsp->start($seqType); | |
882 } else { | |
883 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
884 if($seqType =~ /list|array/i) { | |
885 return ($self->{'_queryStart'}, $self->{'_sbjctStart'}); | |
886 } else { | |
887 ## Sensitive to member name changes. | |
888 $seqType = "_\L$seqType\E"; | |
889 return $self->{$seqType.'Start'}; | |
890 } | |
891 } | |
892 } | |
893 | |
894 | |
895 =head2 end | |
896 | |
897 Usage : $sbjct->end( [seq_type] ); | |
898 Purpose : Gets the end coordinate for the query, sbjct, or both sequences | |
899 : in the BlastHit object. If there is more than one HSP, the largest end | |
900 : value of all HSPs is returned. | |
901 Example : $qend = $sbjct->end('query'); | |
902 : $send = $sbjct->end('hit'); | |
903 : ($qend, $send) = $sbjct->end(); | |
904 Returns : scalar context: integer | |
905 : array context without args: list of two integers (queryEnd, sbjctEnd) | |
906 : Array context can be "induced" by providing an argument of 'list' or 'array'. | |
907 Argument : In scalar context: seq_type = 'query' or 'sbjct' | |
908 : (case insensitive). If not supplied, 'query' is used. | |
909 Throws : n/a | |
910 Comments : This method requires that all HSPs be tiled. If there is more than one | |
911 : HSP and they have not already been tiled, they will be tiled first automatically.. | |
912 : Remember that the start and end coordinates of all HSPs are | |
913 : normalized so that start < end. Strand information can be | |
914 : obtained by calling $hit->strand(). | |
915 | |
916 See Also : L<start()|start>, L<range()|range>, L<strand()|strand> | |
917 | |
918 =cut | |
919 | |
920 #---------- | |
921 sub end { | |
922 #---------- | |
923 my ($self, $seqType, $num) = @_; | |
924 | |
925 $seqType ||= (wantarray ? 'list' : 'query'); | |
926 $seqType = 'sbjct' if $seqType eq 'hit'; | |
927 | |
928 | |
929 if( defined $num ) { | |
930 $seqType = "_\L$seqType\E"; | |
931 return $self->{$seqType.'Stop'} = $num; | |
932 } | |
933 | |
934 # If there is only one HSP, defer this call to the solitary HSP. | |
935 if($self->num_hsps == 1) { | |
936 return $self->hsp->end($seqType); | |
937 } else { | |
938 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
939 if($seqType =~ /list|array/i) { | |
940 return ($self->{'_queryStop'}, $self->{'_sbjctStop'}); | |
941 } else { | |
942 ## Sensitive to member name changes. | |
943 $seqType = "_\L$seqType\E"; | |
944 return $self->{$seqType.'Stop'}; | |
945 } | |
946 } | |
947 } | |
948 | |
949 =head2 range | |
950 | |
951 Usage : $sbjct->range( [seq_type] ); | |
952 Purpose : Gets the (start, end) coordinates for the query or sbjct sequence | |
953 : in the HSP alignment. | |
954 Example : ($qbeg, $qend) = $sbjct->range('query'); | |
955 : ($sbeg, $send) = $sbjct->range('hit'); | |
956 Returns : Two-element array of integers | |
957 Argument : seq_type = string, 'query' or 'hit' or 'sbjct' (default = 'query') | |
958 ('sbjct' is synonymous with 'hit') | |
959 Throws : n/a | |
960 | |
961 See Also : L<start()|start>, L<end()|end> | |
962 | |
963 =cut | |
964 | |
965 #---------- | |
966 sub range { | |
967 #---------- | |
968 my ($self, $seqType) = @_; | |
969 $seqType ||= 'query'; | |
970 $seqType = 'sbjct' if $seqType eq 'hit'; | |
971 return ($self->start($seqType), $self->end($seqType)); | |
972 } | |
973 | |
974 | |
975 =head2 frac_identical | |
976 | |
977 Usage : $hit_object->frac_identical( [seq_type] ); | |
978 Purpose : Get the overall fraction of identical positions across all HSPs. | |
979 : The number refers to only the aligned regions and does not | |
980 : account for unaligned regions in between the HSPs, if any. | |
981 Example : $frac_iden = $hit_object->frac_identical('query'); | |
982 Returns : Float (2-decimal precision, e.g., 0.75). | |
983 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' | |
984 : default = 'query' (but see comments below). | |
985 : ('sbjct' is synonymous with 'hit') | |
986 Throws : n/a | |
987 Comments : Different versions of Blast report different values for the total | |
988 : length of the alignment. This is the number reported in the | |
989 : denominators in the stats section: | |
990 : "Identical = 34/120 Positives = 67/120". | |
991 : NCBI BLAST uses the total length of the alignment (with gaps) | |
992 : WU-BLAST uses the length of the query sequence (without gaps). | |
993 : | |
994 : Therefore, when called with an argument of 'total', | |
995 : this method will report different values depending on the | |
996 : version of BLAST used. Total does NOT take into account HSP | |
997 : tiling, so it should not be used. | |
998 : | |
999 : To get the fraction identical among only the aligned residues, | |
1000 : ignoring the gaps, call this method without an argument or | |
1001 : with an argument of 'query' or 'hit'. | |
1002 : | |
1003 : If you need data for each HSP, use hsps() and then iterate | |
1004 : through the HSP objects. | |
1005 : This method requires that all HSPs be tiled. If they have not | |
1006 : already been tiled, they will be tiled first automatically. | |
1007 | |
1008 See Also : L<frac_conserved()|frac_conserved>, L<frac_aligned_query()|frac_aligned_query>, L<matches()|matches>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils> | |
1009 | |
1010 =cut | |
1011 | |
1012 #------------------ | |
1013 sub frac_identical { | |
1014 #------------------ | |
1015 my ($self, $seqType) = @_; | |
1016 $seqType ||= 'query'; | |
1017 $seqType = 'sbjct' if $seqType eq 'hit'; | |
1018 | |
1019 ## Sensitive to member name format. | |
1020 $seqType = lc($seqType); | |
1021 | |
1022 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1023 | |
1024 my $ident = $self->{'_totalIdentical'}; | |
1025 my $total = $self->{'_length_aln_'.$seqType}; | |
1026 my $ratio = $ident / $total; | |
1027 my $ratio_rounded = sprintf( "%.3f", $ratio); | |
1028 | |
1029 # Round down iff normal rounding yields 1 (just like blast) | |
1030 $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1)); | |
1031 return $ratio_rounded; | |
1032 } | |
1033 | |
1034 | |
1035 | |
1036 =head2 frac_conserved | |
1037 | |
1038 Usage : $hit_object->frac_conserved( [seq_type] ); | |
1039 Purpose : Get the overall fraction of conserved positions across all HSPs. | |
1040 : The number refers to only the aligned regions and does not | |
1041 : account for unaligned regions in between the HSPs, if any. | |
1042 Example : $frac_cons = $hit_object->frac_conserved('hit'); | |
1043 Returns : Float (2-decimal precision, e.g., 0.75). | |
1044 Argument : seq_type: 'query' | 'hit' or 'sbjct' | 'total' | |
1045 : default = 'query' (but see comments below). | |
1046 : ('sbjct' is synonymous with 'hit') | |
1047 Throws : n/a | |
1048 Comments : Different versions of Blast report different values for the total | |
1049 : length of the alignment. This is the number reported in the | |
1050 : denominators in the stats section: | |
1051 : "Positives = 34/120 Positives = 67/120". | |
1052 : NCBI BLAST uses the total length of the alignment (with gaps) | |
1053 : WU-BLAST uses the length of the query sequence (without gaps). | |
1054 : | |
1055 : Therefore, when called with an argument of 'total', | |
1056 : this method will report different values depending on the | |
1057 : version of BLAST used. Total does NOT take into account HSP | |
1058 : tiling, so it should not be used. | |
1059 : | |
1060 : To get the fraction conserved among only the aligned residues, | |
1061 : ignoring the gaps, call this method without an argument or | |
1062 : with an argument of 'query' or 'hit'. | |
1063 : | |
1064 : If you need data for each HSP, use hsps() and then interate | |
1065 : through the HSP objects. | |
1066 : This method requires that all HSPs be tiled. If they have not | |
1067 : already been tiled, they will be tiled first automatically. | |
1068 | |
1069 See Also : L<frac_identical()|frac_identical>, L<matches()|matches>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils> | |
1070 | |
1071 =cut | |
1072 | |
1073 #-------------------- | |
1074 sub frac_conserved { | |
1075 #-------------------- | |
1076 my ($self, $seqType) = @_; | |
1077 $seqType ||= 'query'; | |
1078 $seqType = 'sbjct' if $seqType eq 'hit'; | |
1079 | |
1080 ## Sensitive to member name format. | |
1081 $seqType = lc($seqType); | |
1082 | |
1083 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1084 | |
1085 my $consv = $self->{'_totalConserved'}; | |
1086 my $total = $self->{'_length_aln_'.$seqType}; | |
1087 my $ratio = $consv / $total; | |
1088 my $ratio_rounded = sprintf( "%.3f", $ratio); | |
1089 | |
1090 # Round down iff normal rounding yields 1 (just like blast) | |
1091 $ratio_rounded = 0.999 if (($ratio_rounded == 1) && ($ratio < 1)); | |
1092 return $ratio_rounded; | |
1093 } | |
1094 | |
1095 | |
1096 | |
1097 | |
1098 =head2 frac_aligned_query | |
1099 | |
1100 Usage : $hit_object->frac_aligned_query(); | |
1101 Purpose : Get the fraction of the query sequence which has been aligned | |
1102 : across all HSPs (not including intervals between non-overlapping | |
1103 : HSPs). | |
1104 Example : $frac_alnq = $hit_object->frac_aligned_query(); | |
1105 Returns : Float (2-decimal precision, e.g., 0.75). | |
1106 Argument : n/a | |
1107 Throws : n/a | |
1108 Comments : If you need data for each HSP, use hsps() and then interate | |
1109 : through the HSP objects. | |
1110 : To compute the fraction aligned, the logical length of the query | |
1111 : sequence is used, meaning that for [T]BLASTX reports, the | |
1112 : full length of the query sequence is converted into amino acids | |
1113 : by dividing by 3. This is necessary because of the way | |
1114 : the lengths of aligned sequences are computed. | |
1115 : This method requires that all HSPs be tiled. If they have not | |
1116 : already been tiled, they will be tiled first automatically. | |
1117 | |
1118 See Also : L<frac_aligned_hit()|frac_aligned_hit>, L<logical_length()|logical_length>, L<length_aln()|length_aln>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils> | |
1119 | |
1120 =cut | |
1121 | |
1122 #---------------------- | |
1123 sub frac_aligned_query { | |
1124 #---------------------- | |
1125 my $self = shift; | |
1126 | |
1127 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1128 | |
1129 sprintf( "%.2f", $self->{'_length_aln_query'}/ | |
1130 $self->logical_length('query')); | |
1131 } | |
1132 | |
1133 | |
1134 | |
1135 =head2 frac_aligned_hit | |
1136 | |
1137 Usage : $hit_object->frac_aligned_hit(); | |
1138 Purpose : Get the fraction of the hit (sbjct) sequence which has been aligned | |
1139 : across all HSPs (not including intervals between non-overlapping | |
1140 : HSPs). | |
1141 Example : $frac_alnq = $hit_object->frac_aligned_hit(); | |
1142 Returns : Float (2-decimal precision, e.g., 0.75). | |
1143 Argument : n/a | |
1144 Throws : n/a | |
1145 Comments : If you need data for each HSP, use hsps() and then interate | |
1146 : through the HSP objects. | |
1147 : To compute the fraction aligned, the logical length of the sbjct | |
1148 : sequence is used, meaning that for TBLAST[NX] reports, the | |
1149 : full length of the sbjct sequence is converted into amino acids | |
1150 : by dividing by 3. This is necessary because of the way | |
1151 : the lengths of aligned sequences are computed. | |
1152 : This method requires that all HSPs be tiled. If they have not | |
1153 : already been tiled, they will be tiled first automatically. | |
1154 | |
1155 See Also : L<frac_aligned_query()|frac_aligned_query>, L<matches()|matches>, , L<logical_length()|logical_length>, L<length_aln()|length_aln>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils> | |
1156 | |
1157 =cut | |
1158 | |
1159 #-------------------- | |
1160 sub frac_aligned_hit { | |
1161 #-------------------- | |
1162 my $self = shift; | |
1163 | |
1164 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1165 | |
1166 sprintf( "%.2f", $self->{'_length_aln_sbjct'}/$self->logical_length('sbjct')); | |
1167 } | |
1168 | |
1169 | |
1170 ## These methods are being maintained for backward compatibility. | |
1171 | |
1172 =head2 frac_aligned_sbjct | |
1173 | |
1174 Same as L<frac_aligned_hit()|frac_aligned_hit> | |
1175 | |
1176 =cut | |
1177 | |
1178 #---------------- | |
1179 sub frac_aligned_sbjct { my $self=shift; $self->frac_aligned_hit(@_); } | |
1180 #---------------- | |
1181 | |
1182 =head2 num_unaligned_sbjct | |
1183 | |
1184 Same as L<num_unaligned_hit()|num_unaligned_hit> | |
1185 | |
1186 =cut | |
1187 | |
1188 #---------------- | |
1189 sub num_unaligned_sbjct { my $self=shift; $self->num_unaligned_hit(@_); } | |
1190 #---------------- | |
1191 | |
1192 | |
1193 | |
1194 =head2 num_unaligned_hit | |
1195 | |
1196 Usage : $hit_object->num_unaligned_hit(); | |
1197 Purpose : Get the number of the unaligned residues in the hit sequence. | |
1198 : Sums across all all HSPs. | |
1199 Example : $num_unaln = $hit_object->num_unaligned_hit(); | |
1200 Returns : Integer | |
1201 Argument : n/a | |
1202 Throws : n/a | |
1203 Comments : See notes regarding logical lengths in the comments for frac_aligned_hit(). | |
1204 : They apply here as well. | |
1205 : If you need data for each HSP, use hsps() and then interate | |
1206 : through the HSP objects. | |
1207 : This method requires that all HSPs be tiled. If they have not | |
1208 : already been tiled, they will be tiled first automatically.. | |
1209 | |
1210 See Also : L<num_unaligned_query()|num_unaligned_query>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils>, L<frac_aligned_hit()|frac_aligned_hit> | |
1211 | |
1212 =cut | |
1213 | |
1214 #--------------------- | |
1215 sub num_unaligned_hit { | |
1216 #--------------------- | |
1217 my $self = shift; | |
1218 | |
1219 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1220 | |
1221 my $num = $self->logical_length('sbjct') - $self->{'_length_aln_sbjct'}; | |
1222 ($num < 0 ? 0 : $num ); | |
1223 } | |
1224 | |
1225 | |
1226 =head2 num_unaligned_query | |
1227 | |
1228 Usage : $hit_object->num_unaligned_query(); | |
1229 Purpose : Get the number of the unaligned residues in the query sequence. | |
1230 : Sums across all all HSPs. | |
1231 Example : $num_unaln = $hit_object->num_unaligned_query(); | |
1232 Returns : Integer | |
1233 Argument : n/a | |
1234 Throws : n/a | |
1235 Comments : See notes regarding logical lengths in the comments for frac_aligned_query(). | |
1236 : They apply here as well. | |
1237 : If you need data for each HSP, use hsps() and then interate | |
1238 : through the HSP objects. | |
1239 : This method requires that all HSPs be tiled. If they have not | |
1240 : already been tiled, they will be tiled first automatically.. | |
1241 | |
1242 See Also : L<num_unaligned_hit()|num_unaligned_hit>, L<frac_aligned_query()|frac_aligned_query>, L<Bio::Search::SearchUtils::tile_hsps()|Bio::Search::SearchUtils> | |
1243 | |
1244 =cut | |
1245 | |
1246 #----------------------- | |
1247 sub num_unaligned_query { | |
1248 #----------------------- | |
1249 my $self = shift; | |
1250 | |
1251 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1252 | |
1253 my $num = $self->logical_length('query') - $self->{'_length_aln_query'}; | |
1254 ($num < 0 ? 0 : $num ); | |
1255 } | |
1256 | |
1257 | |
1258 | |
1259 =head2 seq_inds | |
1260 | |
1261 Usage : $hit->seq_inds( seq_type, class, collapse ); | |
1262 Purpose : Get a list of residue positions (indices) across all HSPs | |
1263 : for identical or conserved residues in the query or sbjct sequence. | |
1264 Example : @s_ind = $hit->seq_inds('query', 'identical'); | |
1265 : @h_ind = $hit->seq_inds('hit', 'conserved'); | |
1266 : @h_ind = $hit->seq_inds('hit', 'conserved', 1); | |
1267 Returns : Array of integers | |
1268 : May include ranges if collapse is non-zero. | |
1269 Argument : [0] seq_type = 'query' or 'hit' or 'sbjct' (default = 'query') | |
1270 : ('sbjct' is synonymous with 'hit') | |
1271 : [1] class = 'identical' or 'conserved' (default = 'identical') | |
1272 : (can be shortened to 'id' or 'cons') | |
1273 : (actually, anything not 'id' will evaluate to 'conserved'). | |
1274 : [2] collapse = boolean, if non-zero, consecutive positions are merged | |
1275 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" | |
1276 : collapses to "1-5 7 9-11". This is useful for | |
1277 : consolidating long lists. Default = no collapse. | |
1278 Throws : n/a. | |
1279 | |
1280 See Also : L<Bio::Search::HSP::BlastHSP::seq_inds()|Bio::Search::HSP::BlastHSP> | |
1281 | |
1282 =cut | |
1283 | |
1284 #------------- | |
1285 sub seq_inds { | |
1286 #------------- | |
1287 my ($self, $seqType, $class, $collapse) = @_; | |
1288 | |
1289 $seqType ||= 'query'; | |
1290 $class ||= 'identical'; | |
1291 $collapse ||= 0; | |
1292 | |
1293 $seqType = 'sbjct' if $seqType eq 'hit'; | |
1294 | |
1295 my (@inds, $hsp); | |
1296 foreach $hsp ($self->hsps) { | |
1297 # This will merge data for all HSPs together. | |
1298 push @inds, $hsp->seq_inds($seqType, $class); | |
1299 } | |
1300 | |
1301 # Need to remove duplicates and sort the merged positions. | |
1302 if(@inds) { | |
1303 my %tmp = map { $_, 1 } @inds; | |
1304 @inds = sort {$a <=> $b} keys %tmp; | |
1305 } | |
1306 | |
1307 $collapse ? &Bio::Search::SearchUtils::collapse_nums(@inds) : @inds; | |
1308 } | |
1309 | |
1310 | |
1311 =head2 strand | |
1312 | |
1313 See documentation in L<Bio::Search::SearchUtils::strand()|Bio::Search::SearchUtils> | |
1314 | |
1315 =cut | |
1316 | |
1317 #----------' | |
1318 sub strand { | |
1319 #---------- | |
1320 my ($self, $seqType, $strnd) = @_; | |
1321 | |
1322 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1323 | |
1324 $seqType ||= (wantarray ? 'list' : 'query'); | |
1325 $seqType = 'sbjct' if $seqType eq 'hit'; | |
1326 | |
1327 $seqType = lc($seqType); | |
1328 | |
1329 if( defined $strnd ) { | |
1330 $self->throw("Can't set strand for seqType '$seqType'. Must be 'query' or 'hit'\n") unless ($seqType eq 'sbjct' or $seqType eq 'query'); | |
1331 | |
1332 return $self->{'_strand_'.$seqType} = $strnd; | |
1333 } | |
1334 | |
1335 my ($qstr, $hstr); | |
1336 # If there is only one HSP, defer this call to the solitary HSP. | |
1337 if($self->num_hsps == 1) { | |
1338 return $self->hsp->strand($seqType); | |
1339 } | |
1340 elsif( defined $self->{'_strand_query'}) { | |
1341 # Get the data computed during hsp tiling. | |
1342 $qstr = $self->{'_strand_query'}; | |
1343 $hstr = $self->{'_strand_sbjct'} | |
1344 } | |
1345 else { | |
1346 # otherwise, iterate through all HSPs collecting strand info. | |
1347 # This will return the string "-1/1" if there are HSPs on different strands. | |
1348 # NOTE: This was the pre-10/21/02 procedure which will no longer be used, | |
1349 # (unless the above elsif{} is commented out). | |
1350 my (%qstr, %hstr); | |
1351 foreach my $hsp( $self->hsps ) { | |
1352 my ( $q, $h ) = $hsp->strand(); | |
1353 $qstr{ $q }++; | |
1354 $hstr{ $h }++; | |
1355 } | |
1356 $qstr = join( '/', sort keys %qstr); | |
1357 $hstr = join( '/', sort keys %hstr); | |
1358 } | |
1359 | |
1360 if($seqType =~ /list|array/i) { | |
1361 return ($qstr, $hstr); | |
1362 } elsif( $seqType eq 'query' ) { | |
1363 return $qstr; | |
1364 } else { | |
1365 return $hstr; | |
1366 } | |
1367 } | |
1368 | |
1369 =head2 frame | |
1370 | |
1371 See documentation in L<Bio::Search::SearchUtils::frame()|Bio::Search::SearchUtils> | |
1372 | |
1373 =cut | |
1374 | |
1375 #----------' | |
1376 sub frame { | |
1377 #---------- | |
1378 my( $self, $frm ) = @_; | |
1379 | |
1380 Bio::Search::SearchUtils::tile_hsps($self) if not $self->{'_tiled_hsps'}; | |
1381 | |
1382 if( defined $frm ) { | |
1383 return $self->{'_frame'} = $frm; | |
1384 } | |
1385 | |
1386 # The check for $self->{'_frame'} is a remnant from the 'query' mode days | |
1387 # in which the sbjct object would collect data from the description line only. | |
1388 | |
1389 my ($frame); | |
1390 if(not defined($self->{'_frame'})) { | |
1391 $frame = $self->hsp->frame; | |
1392 } else { | |
1393 $frame = $self->{'_frame'}; | |
1394 } | |
1395 return $frame; | |
1396 } | |
1397 | |
1398 =head2 rank | |
1399 | |
1400 Title : rank | |
1401 Usage : $obj->rank($newval) | |
1402 Function: Get/Set the rank of this Hit in the Query search list | |
1403 i.e. this is the Nth hit for a specific query | |
1404 Returns : value of rank | |
1405 Args : newvalue (optional) | |
1406 | |
1407 | |
1408 =cut | |
1409 | |
1410 sub rank{ | |
1411 my $self = shift; | |
1412 return $self->{'_rank'} = shift if @_; | |
1413 return $self->{'_rank'} || 1; | |
1414 } | |
1415 | |
1416 =head2 locus | |
1417 | |
1418 Title : locus | |
1419 Usage : $locus = $hit->locus(); | |
1420 Function: Retrieve the locus (if available) for the hit | |
1421 Returns : a scalar string (empty string if not set) | |
1422 Args : none | |
1423 | |
1424 =cut | |
1425 | |
1426 sub locus { | |
1427 my ($self,$value) = @_; | |
1428 my $previous = $self->{'_locus'}; | |
1429 if( defined $value || ! defined $previous ) { | |
1430 unless (defined $value) { | |
1431 if ($self->{'_name'} =~/(gb|emb|dbj|ref)\|(.*)\|(.*)/) { | |
1432 $value = $previous = $3; | |
1433 } else { | |
1434 $value = $previous = ''; | |
1435 } | |
1436 } | |
1437 $self->{'_locus'} = $value; | |
1438 } | |
1439 return $previous; | |
1440 } | |
1441 | |
1442 =head2 each_accession_number | |
1443 | |
1444 Title : each_accession_number | |
1445 Usage : @each_accession_number = $hit->each_accession_number(); | |
1446 Function: Get each accession number listed in the description of the hit. | |
1447 If there are no alternatives, then only the primary accession will | |
1448 be given | |
1449 Returns : list of all accession numbers in the description | |
1450 Args : none | |
1451 | |
1452 =cut | |
1453 | |
1454 sub each_accession_number { | |
1455 my ($self,$value) = @_; | |
1456 my $desc = $self->{'_description'}; | |
1457 #put primary accnum on the list | |
1458 my @accnums; | |
1459 push (@accnums,$self->{'_accession'}); | |
1460 if( defined $desc ) { | |
1461 while ($desc =~ /(\b\S+\|\S*\|\S*\s?)/g) { | |
1462 my $id = $1; | |
1463 my ($acc, $version); | |
1464 if ($id =~ /(gb|emb|dbj|sp|pdb|bbs|ref|lcl)\|(.*)\|(.*)/) { | |
1465 ($acc, $version) = split /\./, $2; | |
1466 } elsif ($id =~ /(pir|prf|pat|gnl)\|(.*)\|(.*)/) { | |
1467 ($acc, $version) = split /\./, $3; | |
1468 } else { | |
1469 #punt, not matching the db's at ftp://ftp.ncbi.nih.gov/blast/db/README | |
1470 #Database Name Identifier Syntax | |
1471 #============================ ======================== | |
1472 #GenBank gb|accession|locus | |
1473 #EMBL Data Library emb|accession|locus | |
1474 #DDBJ, DNA Database of Japan dbj|accession|locus | |
1475 #NBRF PIR pir||entry | |
1476 #Protein Research Foundation prf||name | |
1477 #SWISS-PROT sp|accession|entry name | |
1478 #Brookhaven Protein Data Bank pdb|entry|chain | |
1479 #Patents pat|country|number | |
1480 #GenInfo Backbone Id bbs|number | |
1481 #General database identifier gnl|database|identifier | |
1482 #NCBI Reference Sequence ref|accession|locus | |
1483 #Local Sequence identifier lcl|identifier | |
1484 $acc=$id; | |
1485 } | |
1486 push(@accnums, $acc); | |
1487 } | |
1488 } | |
1489 return @accnums; | |
1490 } | |
1491 | |
1492 =head2 tiled_hsps | |
1493 | |
1494 See documentation in L<Bio::Search::SearchUtils::tiled_hsps()|Bio::Search::SearchUtils> | |
1495 | |
1496 =cut | |
1497 | |
1498 sub tiled_hsps { | |
1499 my $self = shift; | |
1500 return $self->{'_tiled_hsps'} = shift if @_; | |
1501 return $self->{'_tiled_hsps'}; | |
1502 } | |
1503 | |
1504 1; |