Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Search/HSP/GenericHSP.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: GenericHSP.pm,v 1.40.2.3 2003/03/24 20:44:45 jason Exp $ | |
2 # | |
3 # BioPerl module for Bio::Search::HSP::GenericHSP | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 my $hsp = new Bio::Search::HSP::GenericHSP( -algorithm => 'blastp', | |
20 -evalue => '1e-30', | |
21 ); | |
22 | |
23 $r_type = $hsp->algorithm | |
24 | |
25 $pvalue = $hsp->p(); | |
26 | |
27 $evalue = $hsp->evalue(); | |
28 | |
29 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] ); | |
30 | |
31 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] ); | |
32 | |
33 $gaps = $hsp->gaps( ['query'|'hit'|'total'] ); | |
34 | |
35 $qseq = $hsp->query_string; | |
36 | |
37 $hseq = $hsp->hit_string; | |
38 | |
39 $homo_string = $hsp->homology_string; | |
40 | |
41 $len = $hsp->length( ['query'|'hit'|'total'] ); | |
42 | |
43 $len = $hsp->length( ['query'|'hit'|'total'] ); | |
44 | |
45 $rank = $hsp->rank; | |
46 | |
47 | |
48 =head1 DESCRIPTION | |
49 | |
50 This implementation is "Generic", meaning it is is suitable for | |
51 holding information about High Scoring pairs from most Search reports | |
52 such as BLAST and FastA. Specialized objects can be derived from | |
53 this. | |
54 | |
55 =head1 FEEDBACK | |
56 | |
57 =head2 Mailing Lists | |
58 | |
59 User feedback is an integral part of the evolution of this and other | |
60 Bioperl modules. Send your comments and suggestions preferably to | |
61 the Bioperl mailing list. Your participation is much appreciated. | |
62 | |
63 bioperl-l@bioperl.org - General discussion | |
64 http://bioperl.org/MailList.shtml - About the mailing lists | |
65 | |
66 =head2 Reporting Bugs | |
67 | |
68 Report bugs to the Bioperl bug tracking system to help us keep track | |
69 of the bugs and their resolution. Bug reports can be submitted via | |
70 email or the web: | |
71 | |
72 bioperl-bugs@bioperl.org | |
73 http://bugzilla.bioperl.org/ | |
74 | |
75 =head1 AUTHOR - Jason Stajich and Steve Chervitz | |
76 | |
77 Email jason@bioperl.org | |
78 Email sac@bioperl.org | |
79 | |
80 Describe contact details here | |
81 | |
82 =head1 CONTRIBUTORS | |
83 | |
84 Additional contributors names and emails here | |
85 | |
86 =head1 APPENDIX | |
87 | |
88 The rest of the documentation details each of the object methods. | |
89 Internal methods are usually preceded with a _ | |
90 | |
91 =cut | |
92 | |
93 | |
94 # Let the code begin... | |
95 | |
96 | |
97 package Bio::Search::HSP::GenericHSP; | |
98 use vars qw(@ISA $GAP_SYMBOL); | |
99 use strict; | |
100 | |
101 use Bio::Root::Root; | |
102 use Bio::SeqFeature::Similarity; | |
103 use Bio::Search::HSP::HSPI; | |
104 | |
105 @ISA = qw(Bio::Search::HSP::HSPI Bio::Root::Root ); | |
106 | |
107 BEGIN { | |
108 $GAP_SYMBOL = '-'; | |
109 } | |
110 =head2 new | |
111 | |
112 Title : new | |
113 Usage : my $obj = new Bio::Search::HSP::GenericHSP(); | |
114 Function: Builds a new Bio::Search::HSP::GenericHSP object | |
115 Returns : Bio::Search::HSP::GenericHSP | |
116 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc) | |
117 -evalue => evalue | |
118 -pvalue => pvalue | |
119 -bits => bit value for HSP | |
120 -score => score value for HSP (typically z-score but depends on | |
121 analysis) | |
122 -hsp_length=> Length of the HSP (including gaps) | |
123 -identical => # of residues that that matched identically | |
124 -conserved => # of residues that matched conservatively | |
125 (only protein comparisions; | |
126 conserved == identical in nucleotide comparisons) | |
127 -hsp_gaps => # of gaps in the HSP | |
128 -query_gaps => # of gaps in the query in the alignment | |
129 -hit_gaps => # of gaps in the subject in the alignment | |
130 -query_name => HSP Query sequence name (if available) | |
131 -query_start => HSP Query start (in original query sequence coords) | |
132 -query_end => HSP Query end (in original query sequence coords) | |
133 -hit_name => HSP Hit sequence name (if available) | |
134 -hit_start => HSP Hit start (in original hit sequence coords) | |
135 -hit_end => HSP Hit end (in original hit sequence coords) | |
136 -hit_length => total length of the hit sequence | |
137 -query_length=> total length of the query sequence | |
138 -query_seq => query sequence portion of the HSP | |
139 -hit_seq => hit sequence portion of the HSP | |
140 -homology_seq=> homology sequence for the HSP | |
141 -hit_frame => hit frame (only if hit is translated protein) | |
142 -query_frame => query frame (only if query is translated protein) | |
143 -rank => HSP rank | |
144 | |
145 | |
146 =cut | |
147 | |
148 sub new { | |
149 my($class,@args) = @_; | |
150 | |
151 my $self = $class->SUPER::new(@args); | |
152 my ($algo, $evalue, $pvalue, $identical, $conserved, | |
153 $gaps, $query_gaps, $hit_gaps, | |
154 $hit_seq, $query_seq, $homology_seq, | |
155 $hsp_len, $query_len,$hit_len, | |
156 $hit_name,$query_name,$bits,$score, | |
157 $hs,$he,$qs,$qe, | |
158 $qframe,$hframe, | |
159 $rank) = $self->_rearrange([qw(ALGORITHM | |
160 EVALUE | |
161 PVALUE | |
162 IDENTICAL | |
163 CONSERVED | |
164 HSP_GAPS | |
165 QUERY_GAPS | |
166 HIT_GAPS | |
167 HIT_SEQ | |
168 QUERY_SEQ | |
169 HOMOLOGY_SEQ | |
170 HSP_LENGTH | |
171 QUERY_LENGTH | |
172 HIT_LENGTH | |
173 HIT_NAME | |
174 QUERY_NAME | |
175 BITS | |
176 SCORE | |
177 HIT_START | |
178 HIT_END | |
179 QUERY_START | |
180 QUERY_END | |
181 QUERY_FRAME | |
182 HIT_FRAME | |
183 RANK | |
184 )], @args); | |
185 | |
186 $algo = 'GENERIC' unless defined $algo; | |
187 $self->algorithm($algo); | |
188 | |
189 # defined $evalue && $self->evalue($evalue) | |
190 # $hsp->significance is initialized by the | |
191 # the SimilarityPair object - let's only keep one | |
192 # value, don't need 2 slots. | |
193 | |
194 defined $pvalue && $self->pvalue($pvalue); | |
195 defined $bits && $self->bits($bits); | |
196 defined $score && $self->score($score); | |
197 my ($queryfactor, $hitfactor) = (0,0); | |
198 | |
199 if( $algo =~ /^(PSI)?T(BLAST|FAST)[NY]/oi ) { | |
200 $hitfactor = 1; | |
201 } elsif ($algo =~ /^(FAST|BLAST)(X|Y|XY)/oi ) { | |
202 $queryfactor = 1; | |
203 } elsif ($algo =~ /^T(BLAST|FAST)(X|Y|XY)/oi || | |
204 $algo =~ /^(BLAST|FAST)N/oi || | |
205 $algo eq 'WABA' || | |
206 $algo eq 'EXONERATE' || $algo eq 'MEGABLAST' || | |
207 $algo eq 'SMITH-WATERMAN' ){ | |
208 $hitfactor = 1; | |
209 $queryfactor = 1; | |
210 } elsif( $algo eq 'RPSBLAST' ) { | |
211 $queryfactor = $hitfactor = 0; | |
212 $qframe = $hframe = 0; | |
213 } | |
214 # Store the aligned query as sequence feature | |
215 my $strand; | |
216 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin @args ($qs,$qe)"); } | |
217 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); } | |
218 if ($qe > $qs) { # normal query: start < end | |
219 if ($queryfactor) { $strand = 1; } else { $strand = undef; } | |
220 } else { # reverse query (i dont know if this is possible, | |
221 # but feel free to correct) | |
222 if ($queryfactor) { $strand = -1; } else { $strand = undef; } | |
223 ($qs,$qe) = ($qe,$qs); | |
224 | |
225 } | |
226 $self->query( new Bio::SeqFeature::Similarity | |
227 ('-primary' => $self->primary_tag, | |
228 '-start' => $qs, | |
229 '-expect' => $evalue, | |
230 '-bits' => $bits, | |
231 '-end' => $qe, | |
232 '-strand' => $strand, | |
233 '-seq_id' => $query_name, | |
234 '-seqlength'=> $query_len, | |
235 '-source' => $algo, | |
236 ) ); | |
237 | |
238 # to determine frame from something like FASTXY which doesn't | |
239 # report the frame | |
240 if( defined $strand && ! defined $qframe && $queryfactor ) { | |
241 $qframe = ( $self->query->start % 3 ) * $strand; | |
242 } elsif( ! defined $strand ) { | |
243 $qframe = 0; | |
244 } | |
245 # store the aligned subject as sequence feature | |
246 if ($he > $hs) { # normal subject | |
247 if ($hitfactor) { $strand = 1; } else { $strand = undef; } | |
248 } else { | |
249 if ($hitfactor) { $strand = -1; } else { $strand = undef; } | |
250 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end | |
251 } | |
252 | |
253 $self->hit( Bio::SeqFeature::Similarity->new | |
254 ('-start' => $hs, | |
255 '-end' => $he, | |
256 '-strand' => $strand, | |
257 '-expect' => $evalue, | |
258 '-bits' => $bits, | |
259 '-source' => $algo, | |
260 '-seq_id' => $hit_name, | |
261 '-seqlength' => $hit_len, | |
262 '-primary' => $self->primary_tag )); | |
263 | |
264 if( defined $strand && ! defined $hframe && $hitfactor ) { | |
265 $hframe = ( $hs % 3 ) * $strand; | |
266 } elsif( ! defined $strand ) { | |
267 $hframe = 0; | |
268 } | |
269 | |
270 $self->frame($qframe,$hframe); | |
271 | |
272 if( ! defined $query_len || ! defined $hit_len ) { | |
273 $self->throw("Must defined hit and query length"); | |
274 } | |
275 | |
276 if( ! defined $identical ) { | |
277 $self->warn("Did not defined the number of identical matches in the HSP assuming 0"); | |
278 $identical = 0; | |
279 } | |
280 if( ! defined $conserved ) { | |
281 $self->warn("Did not defined the number of conserved matches in the HSP assuming conserved == identical ($identical)") | |
282 if( $algo !~ /^((FAST|BLAST)N)|Exonerate/oi); | |
283 $conserved = $identical; | |
284 } | |
285 # protect for divide by zero if user does not specify | |
286 # hsp_len, query_len, or hit_len | |
287 | |
288 $self->num_identical($identical); | |
289 $self->num_conserved($conserved); | |
290 | |
291 if( $hsp_len ) { | |
292 $self->length('total', $hsp_len); | |
293 $self->frac_identical( 'total', $identical / $self->length('total')); | |
294 $self->frac_conserved( 'total', $conserved / $self->length('total')); | |
295 } | |
296 if( $hit_len ) { | |
297 # $self->length('hit', $self->hit->length); | |
298 $self->frac_identical( 'hit', $identical / $self->length('hit')); | |
299 $self->frac_conserved( 'hit', $conserved / $self->length('hit')); | |
300 } | |
301 if( $query_len ) { | |
302 # $self->length('query', $self->query->length); | |
303 $self->frac_identical( 'query', $identical / $self->length('query')) ; | |
304 $self->frac_conserved( 'query', $conserved / $self->length('query')); | |
305 } | |
306 $self->query_string($query_seq); | |
307 $self->hit_string($hit_seq); | |
308 $self->homology_string($homology_seq); | |
309 | |
310 if( defined $query_gaps ) { | |
311 $self->gaps('query', $query_gaps); | |
312 } elsif( defined $query_seq ) { | |
313 $self->gaps('query', scalar ( $query_seq =~ tr/\-//)); | |
314 } | |
315 if( defined $hit_gaps ) { | |
316 $self->gaps('hit', $hit_gaps); | |
317 } elsif( defined $hit_seq ) { | |
318 $self->gaps('hit', scalar ( $hit_seq =~ tr/\-//)); | |
319 } | |
320 if( ! defined $gaps ) { | |
321 $gaps = $self->gaps("query") + $self->gaps("hit"); | |
322 } | |
323 $self->gaps('total', $gaps); | |
324 $self->percent_identity($identical / $hsp_len ) if( $hsp_len > 0 ); | |
325 | |
326 $rank && $self->rank($rank); | |
327 return $self; | |
328 } | |
329 | |
330 | |
331 | |
332 =head2 Bio::Search::HSP::HSPI methods | |
333 | |
334 Implementation of Bio::Search::HSP::HSPI methods follow | |
335 | |
336 =head2 algorithm | |
337 | |
338 Title : algorithm | |
339 Usage : my $r_type = $hsp->algorithm | |
340 Function: Obtain the name of the algorithm used to obtain the HSP | |
341 Returns : string (e.g., BLASTP) | |
342 Args : [optional] scalar string to set value | |
343 | |
344 =cut | |
345 | |
346 sub algorithm{ | |
347 my ($self,$value) = @_; | |
348 my $previous = $self->{'_algorithm'}; | |
349 if( defined $value || ! defined $previous ) { | |
350 $value = $previous = '' unless defined $value; | |
351 $self->{'_algorithm'} = $value; | |
352 } | |
353 | |
354 return $previous; | |
355 } | |
356 | |
357 =head2 pvalue | |
358 | |
359 Title : pvalue | |
360 Usage : my $pvalue = $hsp->pvalue(); | |
361 Function: Returns the P-value for this HSP or undef | |
362 Returns : float or exponential (2e-10) | |
363 P-value is not defined with NCBI Blast2 reports. | |
364 Args : [optional] numeric to set value | |
365 | |
366 =cut | |
367 | |
368 sub pvalue { | |
369 my ($self,$value) = @_; | |
370 my $previous = $self->{'_pvalue'}; | |
371 if( defined $value ) { | |
372 $self->{'_pvalue'} = $value; | |
373 } | |
374 return $previous; | |
375 } | |
376 | |
377 =head2 evalue | |
378 | |
379 Title : evalue | |
380 Usage : my $evalue = $hsp->evalue(); | |
381 Function: Returns the e-value for this HSP | |
382 Returns : float or exponential (2e-10) | |
383 Args : [optional] numeric to set value | |
384 | |
385 =cut | |
386 | |
387 sub evalue { shift->significance(@_) } | |
388 | |
389 =head2 frac_identical | |
390 | |
391 Title : frac_identical | |
392 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] ); | |
393 Function: Returns the fraction of identitical positions for this HSP | |
394 Returns : Float in range 0.0 -> 1.0 | |
395 Args : arg 1: 'query' = num identical / length of query seq (without gaps) | |
396 'hit' = num identical / length of hit seq (without gaps) | |
397 'total' = num identical / length of alignment (with gaps) | |
398 default = 'total' | |
399 arg 2: [optional] frac identical value to set for the type requested | |
400 | |
401 =cut | |
402 | |
403 sub frac_identical { | |
404 my ($self, $type,$value) = @_; | |
405 | |
406 $type = lc $type if defined $type; | |
407 $type = 'total' if( ! defined $type || | |
408 $type !~ /query|hit|total/); | |
409 my $previous = $self->{'_frac_identical'}->{$type}; | |
410 if( defined $value || ! defined $previous ) { | |
411 $value = $previous = '' unless defined $value; | |
412 if( $type eq 'hit' || $type eq 'query' ) { | |
413 $self->$type()->frac_identical( $value); | |
414 } | |
415 $self->{'_frac_identical'}->{$type} = $value; | |
416 } | |
417 return $previous; | |
418 | |
419 } | |
420 | |
421 =head2 frac_conserved | |
422 | |
423 Title : frac_conserved | |
424 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] ); | |
425 Function : Returns the fraction of conserved positions for this HSP. | |
426 This is the fraction of symbols in the alignment with a | |
427 positive score. | |
428 Returns : Float in range 0.0 -> 1.0 | |
429 Args : arg 1: 'query' = num conserved / length of query seq (without gaps) | |
430 'hit' = num conserved / length of hit seq (without gaps) | |
431 'total' = num conserved / length of alignment (with gaps) | |
432 default = 'total' | |
433 arg 2: [optional] frac conserved value to set for the type requested | |
434 | |
435 =cut | |
436 | |
437 sub frac_conserved { | |
438 my ($self, $type,$value) = @_; | |
439 $type = lc $type if defined $type; | |
440 $type = 'total' if( ! defined $type || | |
441 $type !~ /query|hit|total/); | |
442 my $previous = $self->{'_frac_conserved'}->{$type}; | |
443 if( defined $value || ! defined $previous ) { | |
444 $value = $previous = '' unless defined $value; | |
445 $self->{'_frac_conserved'}->{$type} = $value; | |
446 } | |
447 return $previous; | |
448 } | |
449 | |
450 =head2 gaps | |
451 | |
452 Title : gaps | |
453 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] ); | |
454 Function : Get the number of gaps in the query, hit, or total alignment. | |
455 Returns : Integer, number of gaps or 0 if none | |
456 Args : arg 1: 'query' = num gaps in query seq | |
457 'hit' = num gaps in hit seq | |
458 'total' = num gaps in whole alignment | |
459 default = 'total' | |
460 arg 2: [optional] integer gap value to set for the type requested | |
461 | |
462 =cut | |
463 | |
464 sub gaps { | |
465 my ($self, $type,$value) = @_; | |
466 $type = lc $type if defined $type; | |
467 $type = 'total' if( ! defined $type || | |
468 $type !~ /query|hit|subject|sbjct|total/); | |
469 $type = 'hit' if $type =~ /sbjct|subject/; | |
470 my $previous = $self->{'_gaps'}->{$type}; | |
471 if( defined $value || ! defined $previous ) { | |
472 $value = $previous = '' unless defined $value; | |
473 $self->{'_gaps'}->{$type} = $value; | |
474 } | |
475 return $previous || 0; | |
476 } | |
477 | |
478 =head2 query_string | |
479 | |
480 Title : query_string | |
481 Usage : my $qseq = $hsp->query_string; | |
482 Function: Retrieves the query sequence of this HSP as a string | |
483 Returns : string | |
484 Args : [optional] string to set for query sequence | |
485 | |
486 | |
487 =cut | |
488 | |
489 sub query_string{ | |
490 my ($self,$value) = @_; | |
491 my $previous = $self->{'_query_string'}; | |
492 if( defined $value || ! defined $previous ) { | |
493 $value = $previous = '' unless defined $value; | |
494 $self->{'_query_string'} = $value; | |
495 # do some housekeeping so we know when to | |
496 # re-run _calculate_seq_positions | |
497 $self->{'_sequenceschanged'} = 1; | |
498 } | |
499 return $previous; | |
500 } | |
501 | |
502 =head2 hit_string | |
503 | |
504 Title : hit_string | |
505 Usage : my $hseq = $hsp->hit_string; | |
506 Function: Retrieves the hit sequence of this HSP as a string | |
507 Returns : string | |
508 Args : [optional] string to set for hit sequence | |
509 | |
510 | |
511 =cut | |
512 | |
513 sub hit_string{ | |
514 my ($self,$value) = @_; | |
515 my $previous = $self->{'_hit_string'}; | |
516 if( defined $value || ! defined $previous ) { | |
517 $value = $previous = '' unless defined $value; | |
518 $self->{'_hit_string'} = $value; | |
519 # do some housekeeping so we know when to | |
520 # re-run _calculate_seq_positions | |
521 $self->{'_sequenceschanged'} = 1; | |
522 } | |
523 return $previous; | |
524 } | |
525 | |
526 =head2 homology_string | |
527 | |
528 Title : homology_string | |
529 Usage : my $homo_string = $hsp->homology_string; | |
530 Function: Retrieves the homology sequence for this HSP as a string. | |
531 : The homology sequence is the string of symbols in between the | |
532 : query and hit sequences in the alignment indicating the degree | |
533 : of conservation (e.g., identical, similar, not similar). | |
534 Returns : string | |
535 Args : [optional] string to set for homology sequence | |
536 | |
537 =cut | |
538 | |
539 sub homology_string{ | |
540 my ($self,$value) = @_; | |
541 my $previous = $self->{'_homology_string'}; | |
542 if( defined $value || ! defined $previous ) { | |
543 $value = $previous = '' unless defined $value; | |
544 $self->{'_homology_string'} = $value; | |
545 # do some housekeeping so we know when to | |
546 # re-run _calculate_seq_positions | |
547 $self->{'_sequenceschanged'} = 1; | |
548 } | |
549 return $previous; | |
550 } | |
551 | |
552 =head2 length | |
553 | |
554 Title : length | |
555 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] ); | |
556 Function : Returns the length of the query or hit in the alignment | |
557 (without gaps) | |
558 or the aggregate length of the HSP (including gaps; | |
559 this may be greater than either hit or query ) | |
560 Returns : integer | |
561 Args : arg 1: 'query' = length of query seq (without gaps) | |
562 'hit' = length of hit seq (without gaps) | |
563 'total' = length of alignment (with gaps) | |
564 default = 'total' | |
565 arg 2: [optional] integer length value to set for specific type | |
566 | |
567 =cut | |
568 | |
569 sub length { | |
570 | |
571 my $self = shift; | |
572 my $type = shift; | |
573 | |
574 $type = 'total' unless defined $type; | |
575 $type = lc $type; | |
576 | |
577 if( $type =~ /^q/i ) { | |
578 return $self->query()->length(shift); | |
579 } elsif( $type =~ /^(hit|subject|sbjct)/ ) { | |
580 return $self->hit()->length(shift); | |
581 } else { | |
582 my $v = shift; | |
583 if( defined $v ) { | |
584 $self->{'_hsplength'} = $v; | |
585 } | |
586 return $self->{'_hsplength'}; | |
587 } | |
588 return 0; # should never get here | |
589 } | |
590 | |
591 =head2 hsp_length | |
592 | |
593 Title : hsp_length | |
594 Usage : my $len = $hsp->hsp_length() | |
595 Function: shortcut length('hsp') | |
596 Returns : floating point between 0 and 100 | |
597 Args : none | |
598 | |
599 =cut | |
600 | |
601 sub hsp_length { return shift->length('hsp', shift); } | |
602 | |
603 =head2 percent_identity | |
604 | |
605 Title : percent_identity | |
606 Usage : my $percentid = $hsp->percent_identity() | |
607 Function: Returns the calculated percent identity for an HSP | |
608 Returns : floating point between 0 and 100 | |
609 Args : none | |
610 | |
611 | |
612 =cut | |
613 | |
614 | |
615 =head2 frame | |
616 | |
617 Title : frame | |
618 Usage : $hsp->frame($queryframe,$subjectframe) | |
619 Function: Set the Frame for both query and subject and insure that | |
620 they agree. | |
621 This overrides the frame() method implementation in | |
622 FeaturePair. | |
623 Returns : array of query and subjects if return type wants an array | |
624 or query frame if defined or subject frame | |
625 Args : none | |
626 Note : Frames are stored in the GFF way (0-2) not 1-3 | |
627 as they are in BLAST (negative frames are deduced by checking | |
628 the strand of the query or hit) | |
629 | |
630 =cut | |
631 | |
632 sub frame { | |
633 my ($self, $qframe, $sframe) = @_; | |
634 if( defined $qframe ) { | |
635 if( $qframe == 0 ) { | |
636 $qframe = 0; | |
637 } elsif( $qframe !~ /^([+-])?([1-3])/ ) { | |
638 $self->warn("Specifying an invalid query frame ($qframe)"); | |
639 $qframe = undef; | |
640 } else { | |
641 my $dir = $1; | |
642 $dir = '+' unless defined $dir; | |
643 if( ($dir eq '-' && $self->query->strand >= 0) || | |
644 ($dir eq '+' && $self->query->strand <= 0) ) { | |
645 $self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")"); | |
646 } | |
647 # Set frame to GFF [0-2] - | |
648 # what if someone tries to put in a GFF frame! | |
649 $qframe = $2 - 1; | |
650 } | |
651 $self->query->frame($qframe); | |
652 } | |
653 if( defined $sframe ) { | |
654 if( $sframe == 0 ) { | |
655 $sframe = 0; | |
656 } elsif( $sframe !~ /^([+-])?([1-3])/ ) { | |
657 $self->warn("Specifying an invalid subject frame ($sframe)"); | |
658 $sframe = undef; | |
659 } else { | |
660 my $dir = $1; | |
661 $dir = '+' unless defined $dir; | |
662 if( ($dir eq '-' && $self->hit->strand >= 0) || | |
663 ($dir eq '+' && $self->hit->strand <= 0) ) | |
664 { | |
665 $self->warn("Subject frame ($sframe) did not match strand of subject (". $self->hit->strand() . ")"); | |
666 } | |
667 | |
668 # Set frame to GFF [0-2] | |
669 $sframe = $2 - 1; | |
670 } | |
671 $self->hit->frame($sframe); | |
672 } | |
673 if (wantarray() && $self->algorithm =~ /^T(BLAST|FAST)(X|Y|XY)/oi) | |
674 { | |
675 return ($self->query->frame(), $self->hit->frame()); | |
676 } elsif (wantarray()) { | |
677 ($self->query->frame() && | |
678 return ($self->query->frame(), undef)) || | |
679 ($self->hit->frame() && | |
680 return (undef, $self->hit->frame())); | |
681 } else { | |
682 ($self->query->frame() && | |
683 return $self->query->frame()) || | |
684 ($self->hit->frame() && | |
685 return $self->hit->frame()); | |
686 } | |
687 } | |
688 | |
689 | |
690 =head2 get_aln | |
691 | |
692 Title : get_aln | |
693 Usage : my $aln = $hsp->gel_aln | |
694 Function: Returns a Bio::SimpleAlign representing the HSP alignment | |
695 Returns : Bio::SimpleAlign | |
696 Args : none | |
697 | |
698 =cut | |
699 | |
700 sub get_aln { | |
701 my ($self) = @_; | |
702 require Bio::LocatableSeq; | |
703 require Bio::SimpleAlign; | |
704 my $aln = new Bio::SimpleAlign; | |
705 my $hs = $self->hit_string(); | |
706 my $qs = $self->query_string(); | |
707 # FASTA specific stuff moved to the FastaHSP object | |
708 my $seqonly = $qs; | |
709 $seqonly =~ s/[\-\s]//g; | |
710 my ($q_nm,$s_nm) = ($self->query->seq_id(), | |
711 $self->hit->seq_id()); | |
712 unless( defined $q_nm && CORE::length ($q_nm) ) { | |
713 $q_nm = 'query'; | |
714 } | |
715 unless( defined $s_nm && CORE::length ($s_nm) ) { | |
716 $s_nm = 'hit'; | |
717 } | |
718 my $query = new Bio::LocatableSeq('-seq' => $qs, | |
719 '-id' => $q_nm, | |
720 '-start' => 1, | |
721 '-end' => CORE::length($seqonly), | |
722 ); | |
723 $seqonly = $hs; | |
724 $seqonly =~ s/[\-\s]//g; | |
725 my $hit = new Bio::LocatableSeq('-seq' => $hs, | |
726 '-id' => $s_nm, | |
727 '-start' => 1, | |
728 '-end' => CORE::length($seqonly), | |
729 ); | |
730 $aln->add_seq($query); | |
731 $aln->add_seq($hit); | |
732 return $aln; | |
733 } | |
734 | |
735 =head2 num_conserved | |
736 | |
737 Title : num_conserved | |
738 Usage : $obj->num_conserved($newval) | |
739 Function: returns the number of conserved residues in the alignment | |
740 Returns : inetger | |
741 Args : integer (optional) | |
742 | |
743 | |
744 =cut | |
745 | |
746 sub num_conserved{ | |
747 my ($self,$value) = @_; | |
748 if( defined $value) { | |
749 $self->{'num_conserved'} = $value; | |
750 } | |
751 return $self->{'num_conserved'}; | |
752 } | |
753 | |
754 =head2 num_identical | |
755 | |
756 Title : num_identical | |
757 Usage : $obj->num_identical($newval) | |
758 Function: returns the number of identical residues in the alignment | |
759 Returns : integer | |
760 Args : integer (optional) | |
761 | |
762 | |
763 =cut | |
764 | |
765 sub num_identical{ | |
766 my ($self,$value) = @_; | |
767 if( defined $value) { | |
768 $self->{'_num_identical'} = $value; | |
769 } | |
770 return $self->{'_num_identical'}; | |
771 } | |
772 | |
773 =head2 rank | |
774 | |
775 Usage : $hsp->rank( [string] ); | |
776 Purpose : Get the rank of the HSP within a given Blast hit. | |
777 Example : $rank = $hsp->rank; | |
778 Returns : Integer (1..n) corresponding to the order in which the HSP | |
779 appears in the BLAST report. | |
780 | |
781 =cut | |
782 | |
783 sub rank { | |
784 my ($self,$value) = @_; | |
785 if( defined $value) { | |
786 $self->{'_rank'} = $value; | |
787 } | |
788 return $self->{'_rank'}; | |
789 } | |
790 | |
791 | |
792 =head2 seq_inds | |
793 | |
794 Title : seq_inds | |
795 Purpose : Get a list of residue positions (indices) for all identical | |
796 : or conserved residues in the query or sbjct sequence. | |
797 Example : @s_ind = $hsp->seq_inds('query', 'identical'); | |
798 : @h_ind = $hsp->seq_inds('hit', 'conserved'); | |
799 @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical'); | |
800 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1); | |
801 Returns : List of integers | |
802 : May include ranges if collapse is true. | |
803 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query) | |
804 : ('sbjct' is synonymous with 'hit') | |
805 : class = 'identical' or 'conserved' or 'nomatch' or 'gap' | |
806 : (default = identical) | |
807 : (can be shortened to 'id' or 'cons') | |
808 : or 'conserved-not-identical' | |
809 : collapse = boolean, if true, consecutive positions are merged | |
810 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11" | |
811 : collapses to "1-5 7 9-11". This is useful for | |
812 : consolidating long lists. Default = no collapse. | |
813 Throws : n/a. | |
814 Comments : | |
815 | |
816 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>, | |
817 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI> | |
818 | |
819 =cut | |
820 | |
821 sub seq_inds{ | |
822 my ($self, $seqType, $class, $collapse) = @_; | |
823 | |
824 # prepare the internal structures - this is cached so | |
825 # if the strings have not changed we're okay | |
826 $self->_calculate_seq_positions(); | |
827 | |
828 $seqType ||= 'query'; | |
829 $class ||= 'identical'; | |
830 $collapse ||= 0; | |
831 $seqType = 'sbjct' if $seqType eq 'hit'; | |
832 my $t = lc(substr($seqType,0,1)); | |
833 if( $t eq 'q' ) { | |
834 $seqType = 'query'; | |
835 } elsif ( $t eq 's' || $t eq 'h' ) { | |
836 $seqType = 'sbjct'; | |
837 } else { | |
838 $self->warn("unknown seqtype $seqType using 'query'"); | |
839 $seqType = 'query'; | |
840 } | |
841 $t = lc(substr($class,0,1)); | |
842 | |
843 if( $t eq 'c' ) { | |
844 if( $class =~ /conserved\-not\-identical/ ) { | |
845 $class = 'conserved'; | |
846 } else { | |
847 $class = 'conservedall'; | |
848 } | |
849 } elsif( $t eq 'i' ) { | |
850 $class = 'identical'; | |
851 } elsif( $t eq 'n' ) { | |
852 $class = 'nomatch'; | |
853 } elsif( $t eq 'g' ) { | |
854 $class = 'gap'; | |
855 } else { | |
856 $self->warn("unknown sequence class $class using 'identical'"); | |
857 $class = 'identical'; | |
858 } | |
859 | |
860 ## Sensitive to member name changes. | |
861 $seqType = "_\L$seqType\E"; | |
862 $class = "_\L$class\E"; | |
863 my @ary; | |
864 | |
865 if( $class eq '_gap' ) { | |
866 # this means that we are remapping the gap length that is stored | |
867 # in the hash (for example $self->{'_gapRes_query'} ) | |
868 # so we'll return an array which has the values of the position of the | |
869 # of the gap (the key in the hash) + the gap length (value in the | |
870 # hash for this key - 1. | |
871 | |
872 @ary = map { $_ > 1 ? | |
873 $_..($_ + $self->{"${class}Res$seqType"}->{$_} - 1) : | |
874 $_ } | |
875 sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}}; | |
876 } elsif( $class eq '_conservedall' ) { | |
877 @ary = sort { $a <=> $b } | |
878 keys %{ $self->{"_conservedRes$seqType"}}, | |
879 keys %{ $self->{"_identicalRes$seqType"}}, | |
880 } else { | |
881 @ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}}; | |
882 } | |
883 require Bio::Search::BlastUtils if $collapse; | |
884 | |
885 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary; | |
886 } | |
887 | |
888 | |
889 =head2 Inherited from Bio::SeqFeature::SimilarityPair | |
890 | |
891 These methods come from Bio::SeqFeature::SimilarityPair | |
892 | |
893 =head2 query | |
894 | |
895 Title : query | |
896 Usage : my $query = $hsp->query | |
897 Function: Returns a SeqFeature representing the query in the HSP | |
898 Returns : Bio::SeqFeature::Similarity | |
899 Args : [optional] new value to set | |
900 | |
901 | |
902 =head2 hit | |
903 | |
904 Title : hit | |
905 Usage : my $hit = $hsp->hit | |
906 Function: Returns a SeqFeature representing the hit in the HSP | |
907 Returns : Bio::SeqFeature::Similarity | |
908 Args : [optional] new value to set | |
909 | |
910 | |
911 =head2 significance | |
912 | |
913 Title : significance | |
914 Usage : $evalue = $obj->significance(); | |
915 $obj->significance($evalue); | |
916 Function: Get/Set the significance value | |
917 Returns : numeric | |
918 Args : [optional] new value to set | |
919 | |
920 | |
921 =head2 score | |
922 | |
923 Title : score | |
924 Usage : my $score = $hsp->score(); | |
925 Function: Returns the score for this HSP or undef | |
926 Returns : numeric | |
927 Args : [optional] numeric to set value | |
928 | |
929 =cut | |
930 | |
931 # overriding | |
932 | |
933 sub score { | |
934 my ($self,$value) = @_; | |
935 my $previous = $self->{'_score'}; | |
936 if( defined $value ) { | |
937 $self->{'_score'} = $value; | |
938 } | |
939 return $previous; | |
940 } | |
941 | |
942 =head2 bits | |
943 | |
944 Title : bits | |
945 Usage : my $bits = $hsp->bits(); | |
946 Function: Returns the bit value for this HSP or undef | |
947 Returns : numeric | |
948 Args : none | |
949 | |
950 =cut | |
951 | |
952 # overriding | |
953 | |
954 sub bits { | |
955 my ($self,$value) = @_; | |
956 my $previous = $self->{'_bits'}; | |
957 if( defined $value ) { | |
958 $self->{'_bits'} = $value; | |
959 } | |
960 return $previous; | |
961 } | |
962 | |
963 | |
964 =head2 strand | |
965 | |
966 Title : strand | |
967 Usage : $hsp->strand('quer') | |
968 Function: Retrieves the strand for the HSP component requested | |
969 Returns : +1 or -1 (0 if unknown) | |
970 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject | |
971 'query' to retrieve the query strand (default) | |
972 | |
973 =cut | |
974 | |
975 =head1 Private methods | |
976 | |
977 =cut | |
978 | |
979 =head2 _calculate_seq_positions | |
980 | |
981 Title : _calculate_seq_positions | |
982 Usage : $self->_calculate_seq_positions | |
983 Function: Internal function | |
984 Returns : | |
985 Args : | |
986 | |
987 | |
988 =cut | |
989 | |
990 sub _calculate_seq_positions { | |
991 my ($self,@args) = @_; | |
992 return unless ( $self->{'_sequenceschanged'} ); | |
993 $self->{'_sequenceschanged'} = 0; | |
994 my ($mchar, $schar, $qchar); | |
995 my ($seqString, $qseq,$sseq) = ( $self->homology_string(), | |
996 $self->query_string(), | |
997 $self->hit_string() ); | |
998 | |
999 # Using hashes to avoid saving duplicate residue numbers. | |
1000 my %identicalList_query = (); | |
1001 my %identicalList_sbjct = (); | |
1002 my %conservedList_query = (); | |
1003 my %conservedList_sbjct = (); | |
1004 | |
1005 my %gapList_query = (); | |
1006 my %gapList_sbjct = (); | |
1007 my %nomatchList_query = (); | |
1008 my %nomatchList_sbjct = (); | |
1009 | |
1010 my $qdir = $self->query->strand || 1; | |
1011 my $sdir = $self->hit->strand || 1; | |
1012 my $resCount_query = ($qdir >=0) ? $self->query->end : $self->query->start; | |
1013 my $resCount_sbjct = ($sdir >=0) ? $self->hit->end : $self->hit->start; | |
1014 | |
1015 my $prog = $self->algorithm; | |
1016 if( $prog =~ /FAST/i ) { | |
1017 # fasta reports some extra 'regional' sequence information | |
1018 # we need to clear out first | |
1019 # this seemed a bit insane to me at first, but it appears to | |
1020 # work --jason | |
1021 | |
1022 # we infer the end of the regional sequence where the first | |
1023 # non space is in the homology string | |
1024 # then we use the HSP->length to tell us how far to read | |
1025 # to cut off the end of the sequence | |
1026 | |
1027 # one possible problem is the sequence which | |
1028 | |
1029 my ($start) = (0); | |
1030 if( $seqString =~ /^(\s+)/ ) { | |
1031 $start = CORE::length($1); | |
1032 } | |
1033 | |
1034 $seqString = substr($seqString, $start,$self->length('total')); | |
1035 $qseq = substr($qseq, $start,$self->length('total')); | |
1036 $sseq = substr($sseq, $start,$self->length('total')); | |
1037 | |
1038 $qseq =~ s![\\\/]!!g; | |
1039 $sseq =~ s![\\\/]!!g; | |
1040 } | |
1041 | |
1042 if($prog =~ /^(PSI)?T(BLAST|FAST)N/oi ) { | |
1043 $resCount_sbjct = int($resCount_sbjct / 3); | |
1044 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) { | |
1045 $resCount_query = int($resCount_query / 3); | |
1046 } elsif($prog =~ /^T(BLAST|FAST)(X|Y|XY)/oi ) { | |
1047 $resCount_query = int($resCount_query / 3); | |
1048 $resCount_sbjct = int($resCount_sbjct / 3); | |
1049 } | |
1050 while( $mchar = chop($seqString) ) { | |
1051 ($qchar, $schar) = (chop($qseq), chop($sseq)); | |
1052 if( $mchar eq '+' || $mchar eq '.' || $mchar eq ':' ) { | |
1053 $conservedList_query{ $resCount_query } = 1; | |
1054 $conservedList_sbjct{ $resCount_sbjct } = 1; | |
1055 } elsif( $mchar ne ' ' ) { | |
1056 $identicalList_query{ $resCount_query } = 1; | |
1057 $identicalList_sbjct{ $resCount_sbjct } = 1; | |
1058 } elsif( $mchar eq ' ') { | |
1059 $nomatchList_query{ $resCount_query } = 1; | |
1060 $nomatchList_sbjct{ $resCount_sbjct } = 1; | |
1061 } | |
1062 if( $qchar eq $GAP_SYMBOL ) { | |
1063 $gapList_query{ $resCount_query } ++; | |
1064 } else { | |
1065 $resCount_query -= $qdir; | |
1066 } | |
1067 if( $schar eq $GAP_SYMBOL ) { | |
1068 $gapList_sbjct{ $resCount_query } ++; | |
1069 } else { | |
1070 $resCount_sbjct -=$sdir; | |
1071 } | |
1072 } | |
1073 $self->{'_identicalRes_query'} = \%identicalList_query; | |
1074 $self->{'_conservedRes_query'} = \%conservedList_query; | |
1075 $self->{'_nomatchRes_query'} = \%nomatchList_query; | |
1076 $self->{'_gapRes_query'} = \%gapList_query; | |
1077 | |
1078 $self->{'_identicalRes_sbjct'} = \%identicalList_sbjct; | |
1079 $self->{'_conservedRes_sbjct'} = \%conservedList_sbjct; | |
1080 $self->{'_nomatchRes_sbjct'} = \%nomatchList_sbjct; | |
1081 $self->{'_gapRes_sbjct'} = \%gapList_sbjct; | |
1082 return 1; | |
1083 } | |
1084 | |
1085 =head2 n | |
1086 | |
1087 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI> | |
1088 | |
1089 =cut | |
1090 | |
1091 #----- | |
1092 sub n { | |
1093 my $self = shift; | |
1094 if(@_) { $self->{'_n'} = shift; } | |
1095 defined $self->{'_n'} ? $self->{'_n'} : ''; | |
1096 } | |
1097 | |
1098 =head2 range | |
1099 | |
1100 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI> | |
1101 | |
1102 =cut | |
1103 | |
1104 #---------- | |
1105 sub range { | |
1106 #---------- | |
1107 my ($self, $seqType) = @_; | |
1108 | |
1109 $seqType ||= 'query'; | |
1110 $seqType = 'sbjct' if $seqType eq 'hit'; | |
1111 | |
1112 my ($start, $end); | |
1113 if( $seqType eq 'query' ) { | |
1114 $start = $self->query->start; | |
1115 $end = $self->query->end; | |
1116 } | |
1117 else { | |
1118 $start = $self->hit->start; | |
1119 $end = $self->hit->end; | |
1120 } | |
1121 return ($start, $end); | |
1122 } | |
1123 | |
1124 | |
1125 1; |