0
|
1 # $Id: GenericHSP.pm,v 1.40.2.3 2003/03/24 20:44:45 jason Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::Search::HSP::GenericHSP
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::Search::HSP::GenericHSP - A "Generic" implementation of a High Scoring Pair
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 my $hsp = new Bio::Search::HSP::GenericHSP( -algorithm => 'blastp',
|
|
20 -evalue => '1e-30',
|
|
21 );
|
|
22
|
|
23 $r_type = $hsp->algorithm
|
|
24
|
|
25 $pvalue = $hsp->p();
|
|
26
|
|
27 $evalue = $hsp->evalue();
|
|
28
|
|
29 $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
|
|
30
|
|
31 $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
|
|
32
|
|
33 $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
|
|
34
|
|
35 $qseq = $hsp->query_string;
|
|
36
|
|
37 $hseq = $hsp->hit_string;
|
|
38
|
|
39 $homo_string = $hsp->homology_string;
|
|
40
|
|
41 $len = $hsp->length( ['query'|'hit'|'total'] );
|
|
42
|
|
43 $len = $hsp->length( ['query'|'hit'|'total'] );
|
|
44
|
|
45 $rank = $hsp->rank;
|
|
46
|
|
47
|
|
48 =head1 DESCRIPTION
|
|
49
|
|
50 This implementation is "Generic", meaning it is is suitable for
|
|
51 holding information about High Scoring pairs from most Search reports
|
|
52 such as BLAST and FastA. Specialized objects can be derived from
|
|
53 this.
|
|
54
|
|
55 =head1 FEEDBACK
|
|
56
|
|
57 =head2 Mailing Lists
|
|
58
|
|
59 User feedback is an integral part of the evolution of this and other
|
|
60 Bioperl modules. Send your comments and suggestions preferably to
|
|
61 the Bioperl mailing list. Your participation is much appreciated.
|
|
62
|
|
63 bioperl-l@bioperl.org - General discussion
|
|
64 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
65
|
|
66 =head2 Reporting Bugs
|
|
67
|
|
68 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
69 of the bugs and their resolution. Bug reports can be submitted via
|
|
70 email or the web:
|
|
71
|
|
72 bioperl-bugs@bioperl.org
|
|
73 http://bugzilla.bioperl.org/
|
|
74
|
|
75 =head1 AUTHOR - Jason Stajich and Steve Chervitz
|
|
76
|
|
77 Email jason@bioperl.org
|
|
78 Email sac@bioperl.org
|
|
79
|
|
80 Describe contact details here
|
|
81
|
|
82 =head1 CONTRIBUTORS
|
|
83
|
|
84 Additional contributors names and emails here
|
|
85
|
|
86 =head1 APPENDIX
|
|
87
|
|
88 The rest of the documentation details each of the object methods.
|
|
89 Internal methods are usually preceded with a _
|
|
90
|
|
91 =cut
|
|
92
|
|
93
|
|
94 # Let the code begin...
|
|
95
|
|
96
|
|
97 package Bio::Search::HSP::GenericHSP;
|
|
98 use vars qw(@ISA $GAP_SYMBOL);
|
|
99 use strict;
|
|
100
|
|
101 use Bio::Root::Root;
|
|
102 use Bio::SeqFeature::Similarity;
|
|
103 use Bio::Search::HSP::HSPI;
|
|
104
|
|
105 @ISA = qw(Bio::Search::HSP::HSPI Bio::Root::Root );
|
|
106
|
|
107 BEGIN {
|
|
108 $GAP_SYMBOL = '-';
|
|
109 }
|
|
110 =head2 new
|
|
111
|
|
112 Title : new
|
|
113 Usage : my $obj = new Bio::Search::HSP::GenericHSP();
|
|
114 Function: Builds a new Bio::Search::HSP::GenericHSP object
|
|
115 Returns : Bio::Search::HSP::GenericHSP
|
|
116 Args : -algorithm => algorithm used (BLASTP, TBLASTX, FASTX, etc)
|
|
117 -evalue => evalue
|
|
118 -pvalue => pvalue
|
|
119 -bits => bit value for HSP
|
|
120 -score => score value for HSP (typically z-score but depends on
|
|
121 analysis)
|
|
122 -hsp_length=> Length of the HSP (including gaps)
|
|
123 -identical => # of residues that that matched identically
|
|
124 -conserved => # of residues that matched conservatively
|
|
125 (only protein comparisions;
|
|
126 conserved == identical in nucleotide comparisons)
|
|
127 -hsp_gaps => # of gaps in the HSP
|
|
128 -query_gaps => # of gaps in the query in the alignment
|
|
129 -hit_gaps => # of gaps in the subject in the alignment
|
|
130 -query_name => HSP Query sequence name (if available)
|
|
131 -query_start => HSP Query start (in original query sequence coords)
|
|
132 -query_end => HSP Query end (in original query sequence coords)
|
|
133 -hit_name => HSP Hit sequence name (if available)
|
|
134 -hit_start => HSP Hit start (in original hit sequence coords)
|
|
135 -hit_end => HSP Hit end (in original hit sequence coords)
|
|
136 -hit_length => total length of the hit sequence
|
|
137 -query_length=> total length of the query sequence
|
|
138 -query_seq => query sequence portion of the HSP
|
|
139 -hit_seq => hit sequence portion of the HSP
|
|
140 -homology_seq=> homology sequence for the HSP
|
|
141 -hit_frame => hit frame (only if hit is translated protein)
|
|
142 -query_frame => query frame (only if query is translated protein)
|
|
143 -rank => HSP rank
|
|
144
|
|
145
|
|
146 =cut
|
|
147
|
|
148 sub new {
|
|
149 my($class,@args) = @_;
|
|
150
|
|
151 my $self = $class->SUPER::new(@args);
|
|
152 my ($algo, $evalue, $pvalue, $identical, $conserved,
|
|
153 $gaps, $query_gaps, $hit_gaps,
|
|
154 $hit_seq, $query_seq, $homology_seq,
|
|
155 $hsp_len, $query_len,$hit_len,
|
|
156 $hit_name,$query_name,$bits,$score,
|
|
157 $hs,$he,$qs,$qe,
|
|
158 $qframe,$hframe,
|
|
159 $rank) = $self->_rearrange([qw(ALGORITHM
|
|
160 EVALUE
|
|
161 PVALUE
|
|
162 IDENTICAL
|
|
163 CONSERVED
|
|
164 HSP_GAPS
|
|
165 QUERY_GAPS
|
|
166 HIT_GAPS
|
|
167 HIT_SEQ
|
|
168 QUERY_SEQ
|
|
169 HOMOLOGY_SEQ
|
|
170 HSP_LENGTH
|
|
171 QUERY_LENGTH
|
|
172 HIT_LENGTH
|
|
173 HIT_NAME
|
|
174 QUERY_NAME
|
|
175 BITS
|
|
176 SCORE
|
|
177 HIT_START
|
|
178 HIT_END
|
|
179 QUERY_START
|
|
180 QUERY_END
|
|
181 QUERY_FRAME
|
|
182 HIT_FRAME
|
|
183 RANK
|
|
184 )], @args);
|
|
185
|
|
186 $algo = 'GENERIC' unless defined $algo;
|
|
187 $self->algorithm($algo);
|
|
188
|
|
189 # defined $evalue && $self->evalue($evalue)
|
|
190 # $hsp->significance is initialized by the
|
|
191 # the SimilarityPair object - let's only keep one
|
|
192 # value, don't need 2 slots.
|
|
193
|
|
194 defined $pvalue && $self->pvalue($pvalue);
|
|
195 defined $bits && $self->bits($bits);
|
|
196 defined $score && $self->score($score);
|
|
197 my ($queryfactor, $hitfactor) = (0,0);
|
|
198
|
|
199 if( $algo =~ /^(PSI)?T(BLAST|FAST)[NY]/oi ) {
|
|
200 $hitfactor = 1;
|
|
201 } elsif ($algo =~ /^(FAST|BLAST)(X|Y|XY)/oi ) {
|
|
202 $queryfactor = 1;
|
|
203 } elsif ($algo =~ /^T(BLAST|FAST)(X|Y|XY)/oi ||
|
|
204 $algo =~ /^(BLAST|FAST)N/oi ||
|
|
205 $algo eq 'WABA' ||
|
|
206 $algo eq 'EXONERATE' || $algo eq 'MEGABLAST' ||
|
|
207 $algo eq 'SMITH-WATERMAN' ){
|
|
208 $hitfactor = 1;
|
|
209 $queryfactor = 1;
|
|
210 } elsif( $algo eq 'RPSBLAST' ) {
|
|
211 $queryfactor = $hitfactor = 0;
|
|
212 $qframe = $hframe = 0;
|
|
213 }
|
|
214 # Store the aligned query as sequence feature
|
|
215 my $strand;
|
|
216 unless( defined $qe && defined $qs ) { $self->throw("Did not specify a Query End or Query Begin @args ($qs,$qe)"); }
|
|
217 unless( defined $he && defined $hs ) { $self->throw("Did not specify a Hit End or Hit Begin"); }
|
|
218 if ($qe > $qs) { # normal query: start < end
|
|
219 if ($queryfactor) { $strand = 1; } else { $strand = undef; }
|
|
220 } else { # reverse query (i dont know if this is possible,
|
|
221 # but feel free to correct)
|
|
222 if ($queryfactor) { $strand = -1; } else { $strand = undef; }
|
|
223 ($qs,$qe) = ($qe,$qs);
|
|
224
|
|
225 }
|
|
226 $self->query( new Bio::SeqFeature::Similarity
|
|
227 ('-primary' => $self->primary_tag,
|
|
228 '-start' => $qs,
|
|
229 '-expect' => $evalue,
|
|
230 '-bits' => $bits,
|
|
231 '-end' => $qe,
|
|
232 '-strand' => $strand,
|
|
233 '-seq_id' => $query_name,
|
|
234 '-seqlength'=> $query_len,
|
|
235 '-source' => $algo,
|
|
236 ) );
|
|
237
|
|
238 # to determine frame from something like FASTXY which doesn't
|
|
239 # report the frame
|
|
240 if( defined $strand && ! defined $qframe && $queryfactor ) {
|
|
241 $qframe = ( $self->query->start % 3 ) * $strand;
|
|
242 } elsif( ! defined $strand ) {
|
|
243 $qframe = 0;
|
|
244 }
|
|
245 # store the aligned subject as sequence feature
|
|
246 if ($he > $hs) { # normal subject
|
|
247 if ($hitfactor) { $strand = 1; } else { $strand = undef; }
|
|
248 } else {
|
|
249 if ($hitfactor) { $strand = -1; } else { $strand = undef; }
|
|
250 ($hs,$he) = ( $he,$hs); # reverse subject: start bigger than end
|
|
251 }
|
|
252
|
|
253 $self->hit( Bio::SeqFeature::Similarity->new
|
|
254 ('-start' => $hs,
|
|
255 '-end' => $he,
|
|
256 '-strand' => $strand,
|
|
257 '-expect' => $evalue,
|
|
258 '-bits' => $bits,
|
|
259 '-source' => $algo,
|
|
260 '-seq_id' => $hit_name,
|
|
261 '-seqlength' => $hit_len,
|
|
262 '-primary' => $self->primary_tag ));
|
|
263
|
|
264 if( defined $strand && ! defined $hframe && $hitfactor ) {
|
|
265 $hframe = ( $hs % 3 ) * $strand;
|
|
266 } elsif( ! defined $strand ) {
|
|
267 $hframe = 0;
|
|
268 }
|
|
269
|
|
270 $self->frame($qframe,$hframe);
|
|
271
|
|
272 if( ! defined $query_len || ! defined $hit_len ) {
|
|
273 $self->throw("Must defined hit and query length");
|
|
274 }
|
|
275
|
|
276 if( ! defined $identical ) {
|
|
277 $self->warn("Did not defined the number of identical matches in the HSP assuming 0");
|
|
278 $identical = 0;
|
|
279 }
|
|
280 if( ! defined $conserved ) {
|
|
281 $self->warn("Did not defined the number of conserved matches in the HSP assuming conserved == identical ($identical)")
|
|
282 if( $algo !~ /^((FAST|BLAST)N)|Exonerate/oi);
|
|
283 $conserved = $identical;
|
|
284 }
|
|
285 # protect for divide by zero if user does not specify
|
|
286 # hsp_len, query_len, or hit_len
|
|
287
|
|
288 $self->num_identical($identical);
|
|
289 $self->num_conserved($conserved);
|
|
290
|
|
291 if( $hsp_len ) {
|
|
292 $self->length('total', $hsp_len);
|
|
293 $self->frac_identical( 'total', $identical / $self->length('total'));
|
|
294 $self->frac_conserved( 'total', $conserved / $self->length('total'));
|
|
295 }
|
|
296 if( $hit_len ) {
|
|
297 # $self->length('hit', $self->hit->length);
|
|
298 $self->frac_identical( 'hit', $identical / $self->length('hit'));
|
|
299 $self->frac_conserved( 'hit', $conserved / $self->length('hit'));
|
|
300 }
|
|
301 if( $query_len ) {
|
|
302 # $self->length('query', $self->query->length);
|
|
303 $self->frac_identical( 'query', $identical / $self->length('query')) ;
|
|
304 $self->frac_conserved( 'query', $conserved / $self->length('query'));
|
|
305 }
|
|
306 $self->query_string($query_seq);
|
|
307 $self->hit_string($hit_seq);
|
|
308 $self->homology_string($homology_seq);
|
|
309
|
|
310 if( defined $query_gaps ) {
|
|
311 $self->gaps('query', $query_gaps);
|
|
312 } elsif( defined $query_seq ) {
|
|
313 $self->gaps('query', scalar ( $query_seq =~ tr/\-//));
|
|
314 }
|
|
315 if( defined $hit_gaps ) {
|
|
316 $self->gaps('hit', $hit_gaps);
|
|
317 } elsif( defined $hit_seq ) {
|
|
318 $self->gaps('hit', scalar ( $hit_seq =~ tr/\-//));
|
|
319 }
|
|
320 if( ! defined $gaps ) {
|
|
321 $gaps = $self->gaps("query") + $self->gaps("hit");
|
|
322 }
|
|
323 $self->gaps('total', $gaps);
|
|
324 $self->percent_identity($identical / $hsp_len ) if( $hsp_len > 0 );
|
|
325
|
|
326 $rank && $self->rank($rank);
|
|
327 return $self;
|
|
328 }
|
|
329
|
|
330
|
|
331
|
|
332 =head2 Bio::Search::HSP::HSPI methods
|
|
333
|
|
334 Implementation of Bio::Search::HSP::HSPI methods follow
|
|
335
|
|
336 =head2 algorithm
|
|
337
|
|
338 Title : algorithm
|
|
339 Usage : my $r_type = $hsp->algorithm
|
|
340 Function: Obtain the name of the algorithm used to obtain the HSP
|
|
341 Returns : string (e.g., BLASTP)
|
|
342 Args : [optional] scalar string to set value
|
|
343
|
|
344 =cut
|
|
345
|
|
346 sub algorithm{
|
|
347 my ($self,$value) = @_;
|
|
348 my $previous = $self->{'_algorithm'};
|
|
349 if( defined $value || ! defined $previous ) {
|
|
350 $value = $previous = '' unless defined $value;
|
|
351 $self->{'_algorithm'} = $value;
|
|
352 }
|
|
353
|
|
354 return $previous;
|
|
355 }
|
|
356
|
|
357 =head2 pvalue
|
|
358
|
|
359 Title : pvalue
|
|
360 Usage : my $pvalue = $hsp->pvalue();
|
|
361 Function: Returns the P-value for this HSP or undef
|
|
362 Returns : float or exponential (2e-10)
|
|
363 P-value is not defined with NCBI Blast2 reports.
|
|
364 Args : [optional] numeric to set value
|
|
365
|
|
366 =cut
|
|
367
|
|
368 sub pvalue {
|
|
369 my ($self,$value) = @_;
|
|
370 my $previous = $self->{'_pvalue'};
|
|
371 if( defined $value ) {
|
|
372 $self->{'_pvalue'} = $value;
|
|
373 }
|
|
374 return $previous;
|
|
375 }
|
|
376
|
|
377 =head2 evalue
|
|
378
|
|
379 Title : evalue
|
|
380 Usage : my $evalue = $hsp->evalue();
|
|
381 Function: Returns the e-value for this HSP
|
|
382 Returns : float or exponential (2e-10)
|
|
383 Args : [optional] numeric to set value
|
|
384
|
|
385 =cut
|
|
386
|
|
387 sub evalue { shift->significance(@_) }
|
|
388
|
|
389 =head2 frac_identical
|
|
390
|
|
391 Title : frac_identical
|
|
392 Usage : my $frac_id = $hsp->frac_identical( ['query'|'hit'|'total'] );
|
|
393 Function: Returns the fraction of identitical positions for this HSP
|
|
394 Returns : Float in range 0.0 -> 1.0
|
|
395 Args : arg 1: 'query' = num identical / length of query seq (without gaps)
|
|
396 'hit' = num identical / length of hit seq (without gaps)
|
|
397 'total' = num identical / length of alignment (with gaps)
|
|
398 default = 'total'
|
|
399 arg 2: [optional] frac identical value to set for the type requested
|
|
400
|
|
401 =cut
|
|
402
|
|
403 sub frac_identical {
|
|
404 my ($self, $type,$value) = @_;
|
|
405
|
|
406 $type = lc $type if defined $type;
|
|
407 $type = 'total' if( ! defined $type ||
|
|
408 $type !~ /query|hit|total/);
|
|
409 my $previous = $self->{'_frac_identical'}->{$type};
|
|
410 if( defined $value || ! defined $previous ) {
|
|
411 $value = $previous = '' unless defined $value;
|
|
412 if( $type eq 'hit' || $type eq 'query' ) {
|
|
413 $self->$type()->frac_identical( $value);
|
|
414 }
|
|
415 $self->{'_frac_identical'}->{$type} = $value;
|
|
416 }
|
|
417 return $previous;
|
|
418
|
|
419 }
|
|
420
|
|
421 =head2 frac_conserved
|
|
422
|
|
423 Title : frac_conserved
|
|
424 Usage : my $frac_cons = $hsp->frac_conserved( ['query'|'hit'|'total'] );
|
|
425 Function : Returns the fraction of conserved positions for this HSP.
|
|
426 This is the fraction of symbols in the alignment with a
|
|
427 positive score.
|
|
428 Returns : Float in range 0.0 -> 1.0
|
|
429 Args : arg 1: 'query' = num conserved / length of query seq (without gaps)
|
|
430 'hit' = num conserved / length of hit seq (without gaps)
|
|
431 'total' = num conserved / length of alignment (with gaps)
|
|
432 default = 'total'
|
|
433 arg 2: [optional] frac conserved value to set for the type requested
|
|
434
|
|
435 =cut
|
|
436
|
|
437 sub frac_conserved {
|
|
438 my ($self, $type,$value) = @_;
|
|
439 $type = lc $type if defined $type;
|
|
440 $type = 'total' if( ! defined $type ||
|
|
441 $type !~ /query|hit|total/);
|
|
442 my $previous = $self->{'_frac_conserved'}->{$type};
|
|
443 if( defined $value || ! defined $previous ) {
|
|
444 $value = $previous = '' unless defined $value;
|
|
445 $self->{'_frac_conserved'}->{$type} = $value;
|
|
446 }
|
|
447 return $previous;
|
|
448 }
|
|
449
|
|
450 =head2 gaps
|
|
451
|
|
452 Title : gaps
|
|
453 Usage : my $gaps = $hsp->gaps( ['query'|'hit'|'total'] );
|
|
454 Function : Get the number of gaps in the query, hit, or total alignment.
|
|
455 Returns : Integer, number of gaps or 0 if none
|
|
456 Args : arg 1: 'query' = num gaps in query seq
|
|
457 'hit' = num gaps in hit seq
|
|
458 'total' = num gaps in whole alignment
|
|
459 default = 'total'
|
|
460 arg 2: [optional] integer gap value to set for the type requested
|
|
461
|
|
462 =cut
|
|
463
|
|
464 sub gaps {
|
|
465 my ($self, $type,$value) = @_;
|
|
466 $type = lc $type if defined $type;
|
|
467 $type = 'total' if( ! defined $type ||
|
|
468 $type !~ /query|hit|subject|sbjct|total/);
|
|
469 $type = 'hit' if $type =~ /sbjct|subject/;
|
|
470 my $previous = $self->{'_gaps'}->{$type};
|
|
471 if( defined $value || ! defined $previous ) {
|
|
472 $value = $previous = '' unless defined $value;
|
|
473 $self->{'_gaps'}->{$type} = $value;
|
|
474 }
|
|
475 return $previous || 0;
|
|
476 }
|
|
477
|
|
478 =head2 query_string
|
|
479
|
|
480 Title : query_string
|
|
481 Usage : my $qseq = $hsp->query_string;
|
|
482 Function: Retrieves the query sequence of this HSP as a string
|
|
483 Returns : string
|
|
484 Args : [optional] string to set for query sequence
|
|
485
|
|
486
|
|
487 =cut
|
|
488
|
|
489 sub query_string{
|
|
490 my ($self,$value) = @_;
|
|
491 my $previous = $self->{'_query_string'};
|
|
492 if( defined $value || ! defined $previous ) {
|
|
493 $value = $previous = '' unless defined $value;
|
|
494 $self->{'_query_string'} = $value;
|
|
495 # do some housekeeping so we know when to
|
|
496 # re-run _calculate_seq_positions
|
|
497 $self->{'_sequenceschanged'} = 1;
|
|
498 }
|
|
499 return $previous;
|
|
500 }
|
|
501
|
|
502 =head2 hit_string
|
|
503
|
|
504 Title : hit_string
|
|
505 Usage : my $hseq = $hsp->hit_string;
|
|
506 Function: Retrieves the hit sequence of this HSP as a string
|
|
507 Returns : string
|
|
508 Args : [optional] string to set for hit sequence
|
|
509
|
|
510
|
|
511 =cut
|
|
512
|
|
513 sub hit_string{
|
|
514 my ($self,$value) = @_;
|
|
515 my $previous = $self->{'_hit_string'};
|
|
516 if( defined $value || ! defined $previous ) {
|
|
517 $value = $previous = '' unless defined $value;
|
|
518 $self->{'_hit_string'} = $value;
|
|
519 # do some housekeeping so we know when to
|
|
520 # re-run _calculate_seq_positions
|
|
521 $self->{'_sequenceschanged'} = 1;
|
|
522 }
|
|
523 return $previous;
|
|
524 }
|
|
525
|
|
526 =head2 homology_string
|
|
527
|
|
528 Title : homology_string
|
|
529 Usage : my $homo_string = $hsp->homology_string;
|
|
530 Function: Retrieves the homology sequence for this HSP as a string.
|
|
531 : The homology sequence is the string of symbols in between the
|
|
532 : query and hit sequences in the alignment indicating the degree
|
|
533 : of conservation (e.g., identical, similar, not similar).
|
|
534 Returns : string
|
|
535 Args : [optional] string to set for homology sequence
|
|
536
|
|
537 =cut
|
|
538
|
|
539 sub homology_string{
|
|
540 my ($self,$value) = @_;
|
|
541 my $previous = $self->{'_homology_string'};
|
|
542 if( defined $value || ! defined $previous ) {
|
|
543 $value = $previous = '' unless defined $value;
|
|
544 $self->{'_homology_string'} = $value;
|
|
545 # do some housekeeping so we know when to
|
|
546 # re-run _calculate_seq_positions
|
|
547 $self->{'_sequenceschanged'} = 1;
|
|
548 }
|
|
549 return $previous;
|
|
550 }
|
|
551
|
|
552 =head2 length
|
|
553
|
|
554 Title : length
|
|
555 Usage : my $len = $hsp->length( ['query'|'hit'|'total'] );
|
|
556 Function : Returns the length of the query or hit in the alignment
|
|
557 (without gaps)
|
|
558 or the aggregate length of the HSP (including gaps;
|
|
559 this may be greater than either hit or query )
|
|
560 Returns : integer
|
|
561 Args : arg 1: 'query' = length of query seq (without gaps)
|
|
562 'hit' = length of hit seq (without gaps)
|
|
563 'total' = length of alignment (with gaps)
|
|
564 default = 'total'
|
|
565 arg 2: [optional] integer length value to set for specific type
|
|
566
|
|
567 =cut
|
|
568
|
|
569 sub length {
|
|
570
|
|
571 my $self = shift;
|
|
572 my $type = shift;
|
|
573
|
|
574 $type = 'total' unless defined $type;
|
|
575 $type = lc $type;
|
|
576
|
|
577 if( $type =~ /^q/i ) {
|
|
578 return $self->query()->length(shift);
|
|
579 } elsif( $type =~ /^(hit|subject|sbjct)/ ) {
|
|
580 return $self->hit()->length(shift);
|
|
581 } else {
|
|
582 my $v = shift;
|
|
583 if( defined $v ) {
|
|
584 $self->{'_hsplength'} = $v;
|
|
585 }
|
|
586 return $self->{'_hsplength'};
|
|
587 }
|
|
588 return 0; # should never get here
|
|
589 }
|
|
590
|
|
591 =head2 hsp_length
|
|
592
|
|
593 Title : hsp_length
|
|
594 Usage : my $len = $hsp->hsp_length()
|
|
595 Function: shortcut length('hsp')
|
|
596 Returns : floating point between 0 and 100
|
|
597 Args : none
|
|
598
|
|
599 =cut
|
|
600
|
|
601 sub hsp_length { return shift->length('hsp', shift); }
|
|
602
|
|
603 =head2 percent_identity
|
|
604
|
|
605 Title : percent_identity
|
|
606 Usage : my $percentid = $hsp->percent_identity()
|
|
607 Function: Returns the calculated percent identity for an HSP
|
|
608 Returns : floating point between 0 and 100
|
|
609 Args : none
|
|
610
|
|
611
|
|
612 =cut
|
|
613
|
|
614
|
|
615 =head2 frame
|
|
616
|
|
617 Title : frame
|
|
618 Usage : $hsp->frame($queryframe,$subjectframe)
|
|
619 Function: Set the Frame for both query and subject and insure that
|
|
620 they agree.
|
|
621 This overrides the frame() method implementation in
|
|
622 FeaturePair.
|
|
623 Returns : array of query and subjects if return type wants an array
|
|
624 or query frame if defined or subject frame
|
|
625 Args : none
|
|
626 Note : Frames are stored in the GFF way (0-2) not 1-3
|
|
627 as they are in BLAST (negative frames are deduced by checking
|
|
628 the strand of the query or hit)
|
|
629
|
|
630 =cut
|
|
631
|
|
632 sub frame {
|
|
633 my ($self, $qframe, $sframe) = @_;
|
|
634 if( defined $qframe ) {
|
|
635 if( $qframe == 0 ) {
|
|
636 $qframe = 0;
|
|
637 } elsif( $qframe !~ /^([+-])?([1-3])/ ) {
|
|
638 $self->warn("Specifying an invalid query frame ($qframe)");
|
|
639 $qframe = undef;
|
|
640 } else {
|
|
641 my $dir = $1;
|
|
642 $dir = '+' unless defined $dir;
|
|
643 if( ($dir eq '-' && $self->query->strand >= 0) ||
|
|
644 ($dir eq '+' && $self->query->strand <= 0) ) {
|
|
645 $self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")");
|
|
646 }
|
|
647 # Set frame to GFF [0-2] -
|
|
648 # what if someone tries to put in a GFF frame!
|
|
649 $qframe = $2 - 1;
|
|
650 }
|
|
651 $self->query->frame($qframe);
|
|
652 }
|
|
653 if( defined $sframe ) {
|
|
654 if( $sframe == 0 ) {
|
|
655 $sframe = 0;
|
|
656 } elsif( $sframe !~ /^([+-])?([1-3])/ ) {
|
|
657 $self->warn("Specifying an invalid subject frame ($sframe)");
|
|
658 $sframe = undef;
|
|
659 } else {
|
|
660 my $dir = $1;
|
|
661 $dir = '+' unless defined $dir;
|
|
662 if( ($dir eq '-' && $self->hit->strand >= 0) ||
|
|
663 ($dir eq '+' && $self->hit->strand <= 0) )
|
|
664 {
|
|
665 $self->warn("Subject frame ($sframe) did not match strand of subject (". $self->hit->strand() . ")");
|
|
666 }
|
|
667
|
|
668 # Set frame to GFF [0-2]
|
|
669 $sframe = $2 - 1;
|
|
670 }
|
|
671 $self->hit->frame($sframe);
|
|
672 }
|
|
673 if (wantarray() && $self->algorithm =~ /^T(BLAST|FAST)(X|Y|XY)/oi)
|
|
674 {
|
|
675 return ($self->query->frame(), $self->hit->frame());
|
|
676 } elsif (wantarray()) {
|
|
677 ($self->query->frame() &&
|
|
678 return ($self->query->frame(), undef)) ||
|
|
679 ($self->hit->frame() &&
|
|
680 return (undef, $self->hit->frame()));
|
|
681 } else {
|
|
682 ($self->query->frame() &&
|
|
683 return $self->query->frame()) ||
|
|
684 ($self->hit->frame() &&
|
|
685 return $self->hit->frame());
|
|
686 }
|
|
687 }
|
|
688
|
|
689
|
|
690 =head2 get_aln
|
|
691
|
|
692 Title : get_aln
|
|
693 Usage : my $aln = $hsp->gel_aln
|
|
694 Function: Returns a Bio::SimpleAlign representing the HSP alignment
|
|
695 Returns : Bio::SimpleAlign
|
|
696 Args : none
|
|
697
|
|
698 =cut
|
|
699
|
|
700 sub get_aln {
|
|
701 my ($self) = @_;
|
|
702 require Bio::LocatableSeq;
|
|
703 require Bio::SimpleAlign;
|
|
704 my $aln = new Bio::SimpleAlign;
|
|
705 my $hs = $self->hit_string();
|
|
706 my $qs = $self->query_string();
|
|
707 # FASTA specific stuff moved to the FastaHSP object
|
|
708 my $seqonly = $qs;
|
|
709 $seqonly =~ s/[\-\s]//g;
|
|
710 my ($q_nm,$s_nm) = ($self->query->seq_id(),
|
|
711 $self->hit->seq_id());
|
|
712 unless( defined $q_nm && CORE::length ($q_nm) ) {
|
|
713 $q_nm = 'query';
|
|
714 }
|
|
715 unless( defined $s_nm && CORE::length ($s_nm) ) {
|
|
716 $s_nm = 'hit';
|
|
717 }
|
|
718 my $query = new Bio::LocatableSeq('-seq' => $qs,
|
|
719 '-id' => $q_nm,
|
|
720 '-start' => 1,
|
|
721 '-end' => CORE::length($seqonly),
|
|
722 );
|
|
723 $seqonly = $hs;
|
|
724 $seqonly =~ s/[\-\s]//g;
|
|
725 my $hit = new Bio::LocatableSeq('-seq' => $hs,
|
|
726 '-id' => $s_nm,
|
|
727 '-start' => 1,
|
|
728 '-end' => CORE::length($seqonly),
|
|
729 );
|
|
730 $aln->add_seq($query);
|
|
731 $aln->add_seq($hit);
|
|
732 return $aln;
|
|
733 }
|
|
734
|
|
735 =head2 num_conserved
|
|
736
|
|
737 Title : num_conserved
|
|
738 Usage : $obj->num_conserved($newval)
|
|
739 Function: returns the number of conserved residues in the alignment
|
|
740 Returns : inetger
|
|
741 Args : integer (optional)
|
|
742
|
|
743
|
|
744 =cut
|
|
745
|
|
746 sub num_conserved{
|
|
747 my ($self,$value) = @_;
|
|
748 if( defined $value) {
|
|
749 $self->{'num_conserved'} = $value;
|
|
750 }
|
|
751 return $self->{'num_conserved'};
|
|
752 }
|
|
753
|
|
754 =head2 num_identical
|
|
755
|
|
756 Title : num_identical
|
|
757 Usage : $obj->num_identical($newval)
|
|
758 Function: returns the number of identical residues in the alignment
|
|
759 Returns : integer
|
|
760 Args : integer (optional)
|
|
761
|
|
762
|
|
763 =cut
|
|
764
|
|
765 sub num_identical{
|
|
766 my ($self,$value) = @_;
|
|
767 if( defined $value) {
|
|
768 $self->{'_num_identical'} = $value;
|
|
769 }
|
|
770 return $self->{'_num_identical'};
|
|
771 }
|
|
772
|
|
773 =head2 rank
|
|
774
|
|
775 Usage : $hsp->rank( [string] );
|
|
776 Purpose : Get the rank of the HSP within a given Blast hit.
|
|
777 Example : $rank = $hsp->rank;
|
|
778 Returns : Integer (1..n) corresponding to the order in which the HSP
|
|
779 appears in the BLAST report.
|
|
780
|
|
781 =cut
|
|
782
|
|
783 sub rank {
|
|
784 my ($self,$value) = @_;
|
|
785 if( defined $value) {
|
|
786 $self->{'_rank'} = $value;
|
|
787 }
|
|
788 return $self->{'_rank'};
|
|
789 }
|
|
790
|
|
791
|
|
792 =head2 seq_inds
|
|
793
|
|
794 Title : seq_inds
|
|
795 Purpose : Get a list of residue positions (indices) for all identical
|
|
796 : or conserved residues in the query or sbjct sequence.
|
|
797 Example : @s_ind = $hsp->seq_inds('query', 'identical');
|
|
798 : @h_ind = $hsp->seq_inds('hit', 'conserved');
|
|
799 @h_ind = $hsp->seq_inds('hit', 'conserved-not-identical');
|
|
800 : @h_ind = $hsp->seq_inds('hit', 'conserved', 1);
|
|
801 Returns : List of integers
|
|
802 : May include ranges if collapse is true.
|
|
803 Argument : seq_type = 'query' or 'hit' or 'sbjct' (default = query)
|
|
804 : ('sbjct' is synonymous with 'hit')
|
|
805 : class = 'identical' or 'conserved' or 'nomatch' or 'gap'
|
|
806 : (default = identical)
|
|
807 : (can be shortened to 'id' or 'cons')
|
|
808 : or 'conserved-not-identical'
|
|
809 : collapse = boolean, if true, consecutive positions are merged
|
|
810 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
|
|
811 : collapses to "1-5 7 9-11". This is useful for
|
|
812 : consolidating long lists. Default = no collapse.
|
|
813 Throws : n/a.
|
|
814 Comments :
|
|
815
|
|
816 See Also : L<Bio::Search::SearchUtils::collapse_nums()|Bio::Search::SearchUtils>,
|
|
817 L<Bio::Search::Hit::HitI::seq_inds()|Bio::Search::Hit::HitI>
|
|
818
|
|
819 =cut
|
|
820
|
|
821 sub seq_inds{
|
|
822 my ($self, $seqType, $class, $collapse) = @_;
|
|
823
|
|
824 # prepare the internal structures - this is cached so
|
|
825 # if the strings have not changed we're okay
|
|
826 $self->_calculate_seq_positions();
|
|
827
|
|
828 $seqType ||= 'query';
|
|
829 $class ||= 'identical';
|
|
830 $collapse ||= 0;
|
|
831 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
832 my $t = lc(substr($seqType,0,1));
|
|
833 if( $t eq 'q' ) {
|
|
834 $seqType = 'query';
|
|
835 } elsif ( $t eq 's' || $t eq 'h' ) {
|
|
836 $seqType = 'sbjct';
|
|
837 } else {
|
|
838 $self->warn("unknown seqtype $seqType using 'query'");
|
|
839 $seqType = 'query';
|
|
840 }
|
|
841 $t = lc(substr($class,0,1));
|
|
842
|
|
843 if( $t eq 'c' ) {
|
|
844 if( $class =~ /conserved\-not\-identical/ ) {
|
|
845 $class = 'conserved';
|
|
846 } else {
|
|
847 $class = 'conservedall';
|
|
848 }
|
|
849 } elsif( $t eq 'i' ) {
|
|
850 $class = 'identical';
|
|
851 } elsif( $t eq 'n' ) {
|
|
852 $class = 'nomatch';
|
|
853 } elsif( $t eq 'g' ) {
|
|
854 $class = 'gap';
|
|
855 } else {
|
|
856 $self->warn("unknown sequence class $class using 'identical'");
|
|
857 $class = 'identical';
|
|
858 }
|
|
859
|
|
860 ## Sensitive to member name changes.
|
|
861 $seqType = "_\L$seqType\E";
|
|
862 $class = "_\L$class\E";
|
|
863 my @ary;
|
|
864
|
|
865 if( $class eq '_gap' ) {
|
|
866 # this means that we are remapping the gap length that is stored
|
|
867 # in the hash (for example $self->{'_gapRes_query'} )
|
|
868 # so we'll return an array which has the values of the position of the
|
|
869 # of the gap (the key in the hash) + the gap length (value in the
|
|
870 # hash for this key - 1.
|
|
871
|
|
872 @ary = map { $_ > 1 ?
|
|
873 $_..($_ + $self->{"${class}Res$seqType"}->{$_} - 1) :
|
|
874 $_ }
|
|
875 sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}};
|
|
876 } elsif( $class eq '_conservedall' ) {
|
|
877 @ary = sort { $a <=> $b }
|
|
878 keys %{ $self->{"_conservedRes$seqType"}},
|
|
879 keys %{ $self->{"_identicalRes$seqType"}},
|
|
880 } else {
|
|
881 @ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seqType"}};
|
|
882 }
|
|
883 require Bio::Search::BlastUtils if $collapse;
|
|
884
|
|
885 return $collapse ? &Bio::Search::SearchUtils::collapse_nums(@ary) : @ary;
|
|
886 }
|
|
887
|
|
888
|
|
889 =head2 Inherited from Bio::SeqFeature::SimilarityPair
|
|
890
|
|
891 These methods come from Bio::SeqFeature::SimilarityPair
|
|
892
|
|
893 =head2 query
|
|
894
|
|
895 Title : query
|
|
896 Usage : my $query = $hsp->query
|
|
897 Function: Returns a SeqFeature representing the query in the HSP
|
|
898 Returns : Bio::SeqFeature::Similarity
|
|
899 Args : [optional] new value to set
|
|
900
|
|
901
|
|
902 =head2 hit
|
|
903
|
|
904 Title : hit
|
|
905 Usage : my $hit = $hsp->hit
|
|
906 Function: Returns a SeqFeature representing the hit in the HSP
|
|
907 Returns : Bio::SeqFeature::Similarity
|
|
908 Args : [optional] new value to set
|
|
909
|
|
910
|
|
911 =head2 significance
|
|
912
|
|
913 Title : significance
|
|
914 Usage : $evalue = $obj->significance();
|
|
915 $obj->significance($evalue);
|
|
916 Function: Get/Set the significance value
|
|
917 Returns : numeric
|
|
918 Args : [optional] new value to set
|
|
919
|
|
920
|
|
921 =head2 score
|
|
922
|
|
923 Title : score
|
|
924 Usage : my $score = $hsp->score();
|
|
925 Function: Returns the score for this HSP or undef
|
|
926 Returns : numeric
|
|
927 Args : [optional] numeric to set value
|
|
928
|
|
929 =cut
|
|
930
|
|
931 # overriding
|
|
932
|
|
933 sub score {
|
|
934 my ($self,$value) = @_;
|
|
935 my $previous = $self->{'_score'};
|
|
936 if( defined $value ) {
|
|
937 $self->{'_score'} = $value;
|
|
938 }
|
|
939 return $previous;
|
|
940 }
|
|
941
|
|
942 =head2 bits
|
|
943
|
|
944 Title : bits
|
|
945 Usage : my $bits = $hsp->bits();
|
|
946 Function: Returns the bit value for this HSP or undef
|
|
947 Returns : numeric
|
|
948 Args : none
|
|
949
|
|
950 =cut
|
|
951
|
|
952 # overriding
|
|
953
|
|
954 sub bits {
|
|
955 my ($self,$value) = @_;
|
|
956 my $previous = $self->{'_bits'};
|
|
957 if( defined $value ) {
|
|
958 $self->{'_bits'} = $value;
|
|
959 }
|
|
960 return $previous;
|
|
961 }
|
|
962
|
|
963
|
|
964 =head2 strand
|
|
965
|
|
966 Title : strand
|
|
967 Usage : $hsp->strand('quer')
|
|
968 Function: Retrieves the strand for the HSP component requested
|
|
969 Returns : +1 or -1 (0 if unknown)
|
|
970 Args : 'hit' or 'subject' or 'sbjct' to retrieve the strand of the subject
|
|
971 'query' to retrieve the query strand (default)
|
|
972
|
|
973 =cut
|
|
974
|
|
975 =head1 Private methods
|
|
976
|
|
977 =cut
|
|
978
|
|
979 =head2 _calculate_seq_positions
|
|
980
|
|
981 Title : _calculate_seq_positions
|
|
982 Usage : $self->_calculate_seq_positions
|
|
983 Function: Internal function
|
|
984 Returns :
|
|
985 Args :
|
|
986
|
|
987
|
|
988 =cut
|
|
989
|
|
990 sub _calculate_seq_positions {
|
|
991 my ($self,@args) = @_;
|
|
992 return unless ( $self->{'_sequenceschanged'} );
|
|
993 $self->{'_sequenceschanged'} = 0;
|
|
994 my ($mchar, $schar, $qchar);
|
|
995 my ($seqString, $qseq,$sseq) = ( $self->homology_string(),
|
|
996 $self->query_string(),
|
|
997 $self->hit_string() );
|
|
998
|
|
999 # Using hashes to avoid saving duplicate residue numbers.
|
|
1000 my %identicalList_query = ();
|
|
1001 my %identicalList_sbjct = ();
|
|
1002 my %conservedList_query = ();
|
|
1003 my %conservedList_sbjct = ();
|
|
1004
|
|
1005 my %gapList_query = ();
|
|
1006 my %gapList_sbjct = ();
|
|
1007 my %nomatchList_query = ();
|
|
1008 my %nomatchList_sbjct = ();
|
|
1009
|
|
1010 my $qdir = $self->query->strand || 1;
|
|
1011 my $sdir = $self->hit->strand || 1;
|
|
1012 my $resCount_query = ($qdir >=0) ? $self->query->end : $self->query->start;
|
|
1013 my $resCount_sbjct = ($sdir >=0) ? $self->hit->end : $self->hit->start;
|
|
1014
|
|
1015 my $prog = $self->algorithm;
|
|
1016 if( $prog =~ /FAST/i ) {
|
|
1017 # fasta reports some extra 'regional' sequence information
|
|
1018 # we need to clear out first
|
|
1019 # this seemed a bit insane to me at first, but it appears to
|
|
1020 # work --jason
|
|
1021
|
|
1022 # we infer the end of the regional sequence where the first
|
|
1023 # non space is in the homology string
|
|
1024 # then we use the HSP->length to tell us how far to read
|
|
1025 # to cut off the end of the sequence
|
|
1026
|
|
1027 # one possible problem is the sequence which
|
|
1028
|
|
1029 my ($start) = (0);
|
|
1030 if( $seqString =~ /^(\s+)/ ) {
|
|
1031 $start = CORE::length($1);
|
|
1032 }
|
|
1033
|
|
1034 $seqString = substr($seqString, $start,$self->length('total'));
|
|
1035 $qseq = substr($qseq, $start,$self->length('total'));
|
|
1036 $sseq = substr($sseq, $start,$self->length('total'));
|
|
1037
|
|
1038 $qseq =~ s![\\\/]!!g;
|
|
1039 $sseq =~ s![\\\/]!!g;
|
|
1040 }
|
|
1041
|
|
1042 if($prog =~ /^(PSI)?T(BLAST|FAST)N/oi ) {
|
|
1043 $resCount_sbjct = int($resCount_sbjct / 3);
|
|
1044 } elsif($prog =~ /^(BLAST|FAST)(X|Y|XY)/oi ) {
|
|
1045 $resCount_query = int($resCount_query / 3);
|
|
1046 } elsif($prog =~ /^T(BLAST|FAST)(X|Y|XY)/oi ) {
|
|
1047 $resCount_query = int($resCount_query / 3);
|
|
1048 $resCount_sbjct = int($resCount_sbjct / 3);
|
|
1049 }
|
|
1050 while( $mchar = chop($seqString) ) {
|
|
1051 ($qchar, $schar) = (chop($qseq), chop($sseq));
|
|
1052 if( $mchar eq '+' || $mchar eq '.' || $mchar eq ':' ) {
|
|
1053 $conservedList_query{ $resCount_query } = 1;
|
|
1054 $conservedList_sbjct{ $resCount_sbjct } = 1;
|
|
1055 } elsif( $mchar ne ' ' ) {
|
|
1056 $identicalList_query{ $resCount_query } = 1;
|
|
1057 $identicalList_sbjct{ $resCount_sbjct } = 1;
|
|
1058 } elsif( $mchar eq ' ') {
|
|
1059 $nomatchList_query{ $resCount_query } = 1;
|
|
1060 $nomatchList_sbjct{ $resCount_sbjct } = 1;
|
|
1061 }
|
|
1062 if( $qchar eq $GAP_SYMBOL ) {
|
|
1063 $gapList_query{ $resCount_query } ++;
|
|
1064 } else {
|
|
1065 $resCount_query -= $qdir;
|
|
1066 }
|
|
1067 if( $schar eq $GAP_SYMBOL ) {
|
|
1068 $gapList_sbjct{ $resCount_query } ++;
|
|
1069 } else {
|
|
1070 $resCount_sbjct -=$sdir;
|
|
1071 }
|
|
1072 }
|
|
1073 $self->{'_identicalRes_query'} = \%identicalList_query;
|
|
1074 $self->{'_conservedRes_query'} = \%conservedList_query;
|
|
1075 $self->{'_nomatchRes_query'} = \%nomatchList_query;
|
|
1076 $self->{'_gapRes_query'} = \%gapList_query;
|
|
1077
|
|
1078 $self->{'_identicalRes_sbjct'} = \%identicalList_sbjct;
|
|
1079 $self->{'_conservedRes_sbjct'} = \%conservedList_sbjct;
|
|
1080 $self->{'_nomatchRes_sbjct'} = \%nomatchList_sbjct;
|
|
1081 $self->{'_gapRes_sbjct'} = \%gapList_sbjct;
|
|
1082 return 1;
|
|
1083 }
|
|
1084
|
|
1085 =head2 n
|
|
1086
|
|
1087 See documentation in L<Bio::Search::HSP::HSPI::n()|Bio::Search::HSP::HSPI>
|
|
1088
|
|
1089 =cut
|
|
1090
|
|
1091 #-----
|
|
1092 sub n {
|
|
1093 my $self = shift;
|
|
1094 if(@_) { $self->{'_n'} = shift; }
|
|
1095 defined $self->{'_n'} ? $self->{'_n'} : '';
|
|
1096 }
|
|
1097
|
|
1098 =head2 range
|
|
1099
|
|
1100 See documentation in L<Bio::Search::HSP::HSPI::range()|Bio::Search::HSP::HSPI>
|
|
1101
|
|
1102 =cut
|
|
1103
|
|
1104 #----------
|
|
1105 sub range {
|
|
1106 #----------
|
|
1107 my ($self, $seqType) = @_;
|
|
1108
|
|
1109 $seqType ||= 'query';
|
|
1110 $seqType = 'sbjct' if $seqType eq 'hit';
|
|
1111
|
|
1112 my ($start, $end);
|
|
1113 if( $seqType eq 'query' ) {
|
|
1114 $start = $self->query->start;
|
|
1115 $end = $self->query->end;
|
|
1116 }
|
|
1117 else {
|
|
1118 $start = $self->hit->start;
|
|
1119 $end = $self->hit->end;
|
|
1120 }
|
|
1121 return ($start, $end);
|
|
1122 }
|
|
1123
|
|
1124
|
|
1125 1;
|