0
|
1 ###############################################################################
|
|
2 # Bio::Tools::BPlite::HSP
|
|
3 ###############################################################################
|
|
4 # HSP = High Scoring Pair (to all non-experts as I am)
|
|
5 #
|
|
6 # The original BPlite.pm module has been written by Ian Korf !
|
|
7 # see http://sapiens.wustl.edu/~ikorf
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11
|
|
12 #
|
|
13 # BioPerl module for Bio::Tools::BPlite::HSP
|
|
14 #
|
|
15 # Cared for by Peter Schattner <schattner@alum.mit.edu>
|
|
16 #
|
|
17 # Copyright Peter Schattner
|
|
18 #
|
|
19 # You may distribute this module under the same terms as perl itself
|
|
20
|
|
21 # POD documentation - main docs before the code
|
|
22
|
|
23 =head1 NAME
|
|
24
|
|
25 Bio::Tools::BPlite::HSP - Blast report High Scoring Pair (HSP)
|
|
26
|
|
27 =head1 SYNOPSIS
|
|
28
|
|
29 use Bio::Tools::BPlite;
|
|
30 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
|
|
31 {
|
|
32 while(my $sbjct = $report->nextSbjct) {
|
|
33 while (my $hsp = $sbjct->nextHSP) {
|
|
34 $hsp->score;
|
|
35 $hsp->bits;
|
|
36 $hsp->percent;
|
|
37 $hsp->P;
|
|
38 $hsp->match;
|
|
39 $hsp->positive;
|
|
40 $hsp->length;
|
|
41 $hsp->querySeq;
|
|
42 $hsp->sbjctSeq;
|
|
43 $hsp->homologySeq;
|
|
44 $hsp->query->start;
|
|
45 $hsp->query->end;
|
|
46 $hsp->hit->start;
|
|
47 $hsp->hit->end;
|
|
48 $hsp->hit->seq_id;
|
|
49 $hsp->hit->overlaps($exon);
|
|
50 }
|
|
51 }
|
|
52
|
|
53 # the following line takes you to the next report in the stream/file
|
|
54 # it will return 0 if that report is empty,
|
|
55 # but that is valid for an empty blast report.
|
|
56 # Returns -1 for EOF.
|
|
57
|
|
58 last if ($report->_parseHeader == -1));
|
|
59
|
|
60 redo
|
|
61 }
|
|
62
|
|
63 =head1 DESCRIPTION
|
|
64
|
|
65 This object handles the High Scoring Pair data for a Blast report.
|
|
66 This is where the percent identity, query and hit sequence length,
|
|
67 P value, etc are stored and where most of the necessary information is located when building logic around parsing a Blast report.
|
|
68
|
|
69 See L<Bio::Tools::BPlite> for more detailed information on the entire
|
|
70 BPlite Blast parsing system.
|
|
71
|
|
72 =head1 FEEDBACK
|
|
73
|
|
74 =head2 Mailing Lists
|
|
75
|
|
76 User feedback is an integral part of the evolution of this and other
|
|
77 Bioperl modules. Send your comments and suggestions preferably to
|
|
78 the Bioperl mailing list. Your participation is much appreciated.
|
|
79
|
|
80 bioperl-l@bioperl.org - General discussion
|
|
81 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
82
|
|
83 =head2 Reporting Bugs
|
|
84
|
|
85 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
86 of the bugs and their resolution. Bug reports can be submitted via
|
|
87 email or the web:
|
|
88
|
|
89 bioperl-bugs@bioperl.org
|
|
90 http://bugzilla.bioperl.org/
|
|
91
|
|
92 =head1 AUTHOR - Peter Schattner
|
|
93
|
|
94 Email: schattner@alum.mit.edu
|
|
95
|
|
96 =head1 CONTRIBUTORS
|
|
97
|
|
98 Jason Stajich, jason@bioperl.org
|
|
99
|
|
100 =head1 APPENDIX
|
|
101
|
|
102 The rest of the documentation details each of the object methods.
|
|
103 Internal methods are usually preceded with a _
|
|
104
|
|
105 =cut
|
|
106
|
|
107 # Let the code begin...
|
|
108
|
|
109 package Bio::Tools::BPlite::HSP;
|
|
110
|
|
111 use vars qw(@ISA);
|
|
112 use strict;
|
|
113
|
|
114 # to disable overloading comment this out:
|
|
115 #use overload '""' => '_overload';
|
|
116
|
|
117 # Object preamble - inheriets from Bio::SeqFeature::SimilarityPair
|
|
118
|
|
119 use Bio::SeqFeature::SimilarityPair;
|
|
120 use Bio::SeqFeature::Similarity;
|
|
121
|
|
122 @ISA = qw(Bio::SeqFeature::SimilarityPair);
|
|
123
|
|
124 sub new {
|
|
125 my ($class, @args) = @_;
|
|
126
|
|
127 # workaround to make sure frame is not set before strand is
|
|
128 # interpreted from query/hit info
|
|
129 # this workaround removes the key from the hash
|
|
130 # so the superclass does not try and work with it
|
|
131 # we'll take care of setting it in this module later on
|
|
132
|
|
133 my %newargs = @args;
|
|
134 foreach ( keys %newargs ) {
|
|
135 if( /frame$/i ) {
|
|
136 delete $newargs{$_};
|
|
137 }
|
|
138 }
|
|
139 # done with workaround
|
|
140
|
|
141 my $self = $class->SUPER::new(%newargs);
|
|
142
|
|
143 my ($score,$bits,$match,$hsplength,$positive,$gaps,$p,$exp,$qb,$qe,$sb,
|
|
144 $se,$qs,$ss,$hs,$qname,$sname,$qlength,$slength,$qframe,$sframe,
|
|
145 $blasttype) =
|
|
146 $self->_rearrange([qw(SCORE
|
|
147 BITS
|
|
148 MATCH
|
|
149 HSPLENGTH
|
|
150 POSITIVE
|
|
151 GAPS
|
|
152 P
|
|
153 EXP
|
|
154 QUERYBEGIN
|
|
155 QUERYEND
|
|
156 SBJCTBEGIN
|
|
157 SBJCTEND
|
|
158 QUERYSEQ
|
|
159 SBJCTSEQ
|
|
160 HOMOLOGYSEQ
|
|
161 QUERYNAME
|
|
162 SBJCTNAME
|
|
163 QUERYLENGTH
|
|
164 SBJCTLENGTH
|
|
165 QUERYFRAME
|
|
166 SBJCTFRAME
|
|
167 BLASTTYPE
|
|
168 )],@args);
|
|
169
|
|
170 $blasttype = 'UNKNOWN' unless $blasttype;
|
|
171 $self->report_type($blasttype);
|
|
172 # Determine strand meanings
|
|
173 my ($queryfactor, $sbjctfactor) = (1,0); # default
|
|
174 if ($blasttype eq 'BLASTP' || $blasttype eq 'TBLASTN' ) {
|
|
175 $queryfactor = 0;
|
|
176 }
|
|
177 if ($blasttype eq 'TBLASTN' || $blasttype eq 'TBLASTX' ||
|
|
178 $blasttype eq 'BLASTN' ) {
|
|
179 $sbjctfactor = 1;
|
|
180 }
|
|
181
|
|
182 # Set BLAST type
|
|
183 $self->{'BLAST_TYPE'} = $blasttype;
|
|
184
|
|
185 # Store the aligned query as sequence feature
|
|
186 my $strand;
|
|
187 if ($qe > $qb) { # normal query: start < end
|
|
188 if ($queryfactor) { $strand = 1; } else { $strand = undef; }
|
|
189 $self->query( Bio::SeqFeature::Similarity->new
|
|
190 (-start=>$qb, -end=>$qe, -strand=>$strand,
|
|
191 -source=>"BLAST" ) ) }
|
|
192 else { # reverse query (i dont know if this is possible, but feel free to correct)
|
|
193 if ($queryfactor) { $strand = -1; } else { $strand = undef; }
|
|
194 $self->query( Bio::SeqFeature::Similarity->new
|
|
195 (-start=>$qe, -end=>$qb, -strand=>$strand,
|
|
196 -source=>"BLAST" ) ) }
|
|
197
|
|
198 # store the aligned hit as sequence feature
|
|
199 if ($se > $sb) { # normal hit
|
|
200 if ($sbjctfactor) { $strand = 1; } else { $strand = undef; }
|
|
201 $self->hit( Bio::SeqFeature::Similarity->new
|
|
202 (-start=>$sb, -end=>$se, -strand=>$strand,
|
|
203 -source=>"BLAST" ) ) }
|
|
204 else { # reverse hit: start bigger than end
|
|
205 if ($sbjctfactor) { $strand = -1; } else { $strand = undef; }
|
|
206 $self->hit( Bio::SeqFeature::Similarity->new
|
|
207 (-start=>$se, -end=>$sb, -strand=>$strand,
|
|
208 -source=>"BLAST" ) ) }
|
|
209
|
|
210 # name the sequences
|
|
211 $self->query->seq_id($qname); # query name
|
|
212 $self->hit->seq_id($sname); # hit name
|
|
213
|
|
214 # set lengths
|
|
215 $self->query->seqlength($qlength); # query length
|
|
216 $self->hit->seqlength($slength); # hit length
|
|
217
|
|
218 # set object vars
|
|
219 $self->score($score);
|
|
220 $self->bits($bits);
|
|
221
|
|
222 $self->significance($p);
|
|
223 $self->{'EXP'} = $exp;
|
|
224
|
|
225 $self->query->frac_identical($match);
|
|
226 $self->hit->frac_identical($match);
|
|
227 $self->{'HSPLENGTH'} = $hsplength;
|
|
228 $self->{'PERCENT'} = int((1000 * $match)/$hsplength)/10;
|
|
229 $self->{'POSITIVE'} = $positive;
|
|
230 $self->{'GAPS'} = $gaps;
|
|
231 $self->{'QS'} = $qs;
|
|
232 $self->{'SS'} = $ss;
|
|
233 $self->{'HS'} = $hs;
|
|
234
|
|
235 $self->frame($qframe, $sframe);
|
|
236 return $self; # success - we hope!
|
|
237 }
|
|
238
|
|
239 # to disable overloading comment this out:
|
|
240 sub _overload {
|
|
241 my $self = shift;
|
|
242 return $self->start."..".$self->end." ".$self->bits;
|
|
243 }
|
|
244
|
|
245 =head2 report_type
|
|
246
|
|
247 Title : report_type
|
|
248 Usage : $type = $sbjct->report_type()
|
|
249 Function : Returns the type of report from which this hit was obtained.
|
|
250 This usually pertains only to BLAST and friends reports, for which
|
|
251 the report type denotes what type of sequence was aligned against
|
|
252 what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated dna-prt,
|
|
253 TBLASTN prt-translated dna, TBLASTX translated dna-translated dna).
|
|
254 Example :
|
|
255 Returns : A string (BLASTN, BLASTP, BLASTX, TBLASTN, TBLASTX, UNKNOWN)
|
|
256 Args : a string on set (you should know what you are doing)
|
|
257
|
|
258 =cut
|
|
259
|
|
260 sub report_type {
|
|
261 my ($self, $rpt) = @_;
|
|
262 if($rpt) {
|
|
263 $self->{'_report_type'} = $rpt;
|
|
264 }
|
|
265 return $self->{'_report_type'};
|
|
266 }
|
|
267
|
|
268 =head2 EXP
|
|
269
|
|
270 Title : EXP
|
|
271 Usage : my $exp = $hsp->EXP;
|
|
272 Function: returns the EXP value for the HSP
|
|
273 Returns : string value
|
|
274 Args : none
|
|
275 Note : Patch provided by Sami Ashour for BTK parsing
|
|
276
|
|
277
|
|
278 =cut
|
|
279
|
|
280 sub EXP{
|
|
281 return $_[0]->{'EXP'};
|
|
282 }
|
|
283
|
|
284
|
|
285 =head2 P
|
|
286
|
|
287 Title : P
|
|
288 Usage : $hsp->P();
|
|
289 Function : returns the P (significance) value for a HSP
|
|
290 Returns : (double) significance value
|
|
291 Args :
|
|
292
|
|
293 =cut
|
|
294
|
|
295 sub P {
|
|
296 my ($self, @args) = @_;
|
|
297 my $float = $self->significance(@args);
|
|
298 my $match = '([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?'; # Perl Cookbook 2.1
|
|
299 if ($float =~ /^$match$/) {
|
|
300 # Is a C float
|
|
301 return $float;
|
|
302 } elsif ("1$float" =~ /^$match$/) {
|
|
303 # Almost C float, Jitterbug 974
|
|
304 return "1$float";
|
|
305 } else {
|
|
306 $self->warn("[HSP::P()] '$float' is not a known number format. Returning zero (0) instead.");
|
|
307 return 0;
|
|
308 }
|
|
309 }
|
|
310
|
|
311 =head2 percent
|
|
312
|
|
313 Title : percent
|
|
314 Usage : $hsp->percent();
|
|
315 Function : returns the percent matching
|
|
316 Returns : (double) percent matching
|
|
317 Args : none
|
|
318
|
|
319 =cut
|
|
320
|
|
321 sub percent {shift->{'PERCENT'}}
|
|
322
|
|
323
|
|
324 =head2 match
|
|
325
|
|
326 Title : match
|
|
327 Usage : $hsp->match();
|
|
328 Function : returns the match
|
|
329 Example :
|
|
330 Returns : (double) frac_identical
|
|
331 Args :
|
|
332
|
|
333 =cut
|
|
334
|
|
335 sub match {shift->query->frac_identical(@_)}
|
|
336
|
|
337 =head2 hsplength
|
|
338
|
|
339 Title : hsplength
|
|
340 Usage : $hsp->hsplength();
|
|
341 Function : returns the HSP length (including gaps)
|
|
342 Returns : (integer) HSP length
|
|
343 Args : none
|
|
344
|
|
345 =cut
|
|
346
|
|
347 sub hsplength {shift->{'HSPLENGTH'}}
|
|
348
|
|
349 =head2 positive
|
|
350
|
|
351 Title : positive
|
|
352 Usage : $hsp->positive();
|
|
353 Function : returns the number of positive matches (symbols in the alignment
|
|
354 with a positive score)
|
|
355 Returns : (int) number of positive matches in the alignment
|
|
356 Args : none
|
|
357
|
|
358 =cut
|
|
359
|
|
360 sub positive {shift->{'POSITIVE'}}
|
|
361
|
|
362 =head2 gaps
|
|
363
|
|
364 Title : gaps
|
|
365 Usage : $hsp->gaps();
|
|
366 Function : returns the number of gaps or 0 if none
|
|
367 Returns : (int) number of gaps or 0 if none
|
|
368 Args : none
|
|
369
|
|
370 =cut
|
|
371
|
|
372 sub gaps {shift->{'GAPS'}}
|
|
373
|
|
374 =head2 querySeq
|
|
375
|
|
376 Title : querySeq
|
|
377 Usage : $hsp->querySeq();
|
|
378 Function : returns the query sequence
|
|
379 Returns : (string) the Query Sequence
|
|
380 Args : none
|
|
381
|
|
382 =cut
|
|
383
|
|
384 sub querySeq {shift->{'QS'}}
|
|
385
|
|
386 =head2 sbjctSeq
|
|
387
|
|
388 Title : sbjctSeq
|
|
389 Usage : $hsp->sbjctSeq();
|
|
390 Function : returns the Sbjct sequence
|
|
391 Returns : (string) the Sbjct Sequence
|
|
392 Args : none
|
|
393
|
|
394 =cut
|
|
395
|
|
396 sub sbjctSeq {shift->{'SS'}}
|
|
397
|
|
398 =head2 homologySeq
|
|
399
|
|
400 Title : homologySeq
|
|
401 Usage : $hsp->homologySeq();
|
|
402 Function : returns the homologous sequence
|
|
403 Returns : (string) homologous sequence
|
|
404 Args : none
|
|
405
|
|
406 =cut
|
|
407
|
|
408 sub homologySeq {shift->{'HS'}}
|
|
409
|
|
410 =head2 qs
|
|
411
|
|
412 Title : qs
|
|
413 Usage : $hsp->qs();
|
|
414 Function : returns the Query Sequence (same as querySeq)
|
|
415 Returns : (string) query Sequence
|
|
416 Args : none
|
|
417
|
|
418 =cut
|
|
419
|
|
420 sub qs {shift->{'QS'}}
|
|
421
|
|
422 =head2 ss
|
|
423
|
|
424 Title : ss
|
|
425 Usage : $hsp->ss();
|
|
426 Function : returns the subject sequence ( same as sbjctSeq)
|
|
427 Returns : (string) Sbjct Sequence
|
|
428 Args : none
|
|
429
|
|
430 =cut
|
|
431
|
|
432 sub ss {shift->{'SS'}}
|
|
433
|
|
434 =head2 hs
|
|
435
|
|
436 Title : hs
|
|
437 Usage : $hsp->hs();
|
|
438 Function : returns the Homologous Sequence (same as homologySeq )
|
|
439 Returns : (string) Homologous Sequence
|
|
440 Args : none
|
|
441
|
|
442 =cut
|
|
443
|
|
444 sub hs {shift->{'HS'}}
|
|
445
|
|
446 sub frame {
|
|
447 my ($self, $qframe, $sframe) = @_;
|
|
448 if( defined $qframe ) {
|
|
449 if( $qframe == 0 ) {
|
|
450 $qframe = undef;
|
|
451 } elsif( $qframe !~ /^([+-])?([1-3])/ ) {
|
|
452 $self->warn("Specifying an invalid query frame ($qframe)");
|
|
453 $qframe = undef;
|
|
454 } else {
|
|
455 if( ($1 eq '-' && $self->query->strand >= 0) ||
|
|
456 ($1 eq '+' && $self->query->strand <= 0) ) {
|
|
457 $self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")");
|
|
458 }
|
|
459 # Set frame to GFF [0-2]
|
|
460 $qframe = $2 - 1;
|
|
461 }
|
|
462 $self->{'QFRAME'} = $qframe;
|
|
463 }
|
|
464 if( defined $sframe ) {
|
|
465 if( $sframe == 0 ) {
|
|
466 $sframe = undef;
|
|
467 } elsif( $sframe !~ /^([+-])?([1-3])/ ) {
|
|
468 $self->warn("Specifying an invalid hit frame ($sframe)");
|
|
469 $sframe = undef;
|
|
470 } else {
|
|
471 if( ($1 eq '-' && $self->hit->strand >= 0) ||
|
|
472 ($1 eq '+' && $self->hit->strand <= 0) )
|
|
473 {
|
|
474 $self->warn("Hit frame ($sframe) did not match strand of hit (". $self->hit->strand() . ")");
|
|
475 }
|
|
476
|
|
477 # Set frame to GFF [0-2]
|
|
478 $sframe = $2 - 1;
|
|
479 }
|
|
480 $self->{'SFRAME'} = $sframe;
|
|
481 }
|
|
482
|
|
483 (defined $qframe && $self->SUPER::frame($qframe) &&
|
|
484 ($self->{'FRAME'} = $qframe)) ||
|
|
485 (defined $sframe && $self->SUPER::frame($sframe) &&
|
|
486 ($self->{'FRAME'} = $sframe));
|
|
487
|
|
488 if (wantarray() &&
|
|
489 $self->{'BLAST_TYPE'} eq 'TBLASTX')
|
|
490 {
|
|
491 return ($self->{'QFRAME'}, $self->{'SFRAME'});
|
|
492 } elsif (wantarray()) {
|
|
493 (defined $self->{'QFRAME'} &&
|
|
494 return ($self->{'QFRAME'}, undef)) ||
|
|
495 (defined $self->{'SFRAME'} &&
|
|
496 return (undef, $self->{'SFRAME'}));
|
|
497 } else {
|
|
498 (defined $self->{'QFRAME'} &&
|
|
499 return $self->{'QFRAME'}) ||
|
|
500 (defined $self->{'SFRAME'} &&
|
|
501 return $self->{'SFRAME'});
|
|
502 }
|
|
503 }
|
|
504
|
|
505 1;
|