Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/BPlite/HSP.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 ############################################################################### | |
| 2 # Bio::Tools::BPlite::HSP | |
| 3 ############################################################################### | |
| 4 # HSP = High Scoring Pair (to all non-experts as I am) | |
| 5 # | |
| 6 # The original BPlite.pm module has been written by Ian Korf ! | |
| 7 # see http://sapiens.wustl.edu/~ikorf | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 | |
| 12 # | |
| 13 # BioPerl module for Bio::Tools::BPlite::HSP | |
| 14 # | |
| 15 # Cared for by Peter Schattner <schattner@alum.mit.edu> | |
| 16 # | |
| 17 # Copyright Peter Schattner | |
| 18 # | |
| 19 # You may distribute this module under the same terms as perl itself | |
| 20 | |
| 21 # POD documentation - main docs before the code | |
| 22 | |
| 23 =head1 NAME | |
| 24 | |
| 25 Bio::Tools::BPlite::HSP - Blast report High Scoring Pair (HSP) | |
| 26 | |
| 27 =head1 SYNOPSIS | |
| 28 | |
| 29 use Bio::Tools::BPlite; | |
| 30 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); | |
| 31 { | |
| 32 while(my $sbjct = $report->nextSbjct) { | |
| 33 while (my $hsp = $sbjct->nextHSP) { | |
| 34 $hsp->score; | |
| 35 $hsp->bits; | |
| 36 $hsp->percent; | |
| 37 $hsp->P; | |
| 38 $hsp->match; | |
| 39 $hsp->positive; | |
| 40 $hsp->length; | |
| 41 $hsp->querySeq; | |
| 42 $hsp->sbjctSeq; | |
| 43 $hsp->homologySeq; | |
| 44 $hsp->query->start; | |
| 45 $hsp->query->end; | |
| 46 $hsp->hit->start; | |
| 47 $hsp->hit->end; | |
| 48 $hsp->hit->seq_id; | |
| 49 $hsp->hit->overlaps($exon); | |
| 50 } | |
| 51 } | |
| 52 | |
| 53 # the following line takes you to the next report in the stream/file | |
| 54 # it will return 0 if that report is empty, | |
| 55 # but that is valid for an empty blast report. | |
| 56 # Returns -1 for EOF. | |
| 57 | |
| 58 last if ($report->_parseHeader == -1)); | |
| 59 | |
| 60 redo | |
| 61 } | |
| 62 | |
| 63 =head1 DESCRIPTION | |
| 64 | |
| 65 This object handles the High Scoring Pair data for a Blast report. | |
| 66 This is where the percent identity, query and hit sequence length, | |
| 67 P value, etc are stored and where most of the necessary information is located when building logic around parsing a Blast report. | |
| 68 | |
| 69 See L<Bio::Tools::BPlite> for more detailed information on the entire | |
| 70 BPlite Blast parsing system. | |
| 71 | |
| 72 =head1 FEEDBACK | |
| 73 | |
| 74 =head2 Mailing Lists | |
| 75 | |
| 76 User feedback is an integral part of the evolution of this and other | |
| 77 Bioperl modules. Send your comments and suggestions preferably to | |
| 78 the Bioperl mailing list. Your participation is much appreciated. | |
| 79 | |
| 80 bioperl-l@bioperl.org - General discussion | |
| 81 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 82 | |
| 83 =head2 Reporting Bugs | |
| 84 | |
| 85 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 86 of the bugs and their resolution. Bug reports can be submitted via | |
| 87 email or the web: | |
| 88 | |
| 89 bioperl-bugs@bioperl.org | |
| 90 http://bugzilla.bioperl.org/ | |
| 91 | |
| 92 =head1 AUTHOR - Peter Schattner | |
| 93 | |
| 94 Email: schattner@alum.mit.edu | |
| 95 | |
| 96 =head1 CONTRIBUTORS | |
| 97 | |
| 98 Jason Stajich, jason@bioperl.org | |
| 99 | |
| 100 =head1 APPENDIX | |
| 101 | |
| 102 The rest of the documentation details each of the object methods. | |
| 103 Internal methods are usually preceded with a _ | |
| 104 | |
| 105 =cut | |
| 106 | |
| 107 # Let the code begin... | |
| 108 | |
| 109 package Bio::Tools::BPlite::HSP; | |
| 110 | |
| 111 use vars qw(@ISA); | |
| 112 use strict; | |
| 113 | |
| 114 # to disable overloading comment this out: | |
| 115 #use overload '""' => '_overload'; | |
| 116 | |
| 117 # Object preamble - inheriets from Bio::SeqFeature::SimilarityPair | |
| 118 | |
| 119 use Bio::SeqFeature::SimilarityPair; | |
| 120 use Bio::SeqFeature::Similarity; | |
| 121 | |
| 122 @ISA = qw(Bio::SeqFeature::SimilarityPair); | |
| 123 | |
| 124 sub new { | |
| 125 my ($class, @args) = @_; | |
| 126 | |
| 127 # workaround to make sure frame is not set before strand is | |
| 128 # interpreted from query/hit info | |
| 129 # this workaround removes the key from the hash | |
| 130 # so the superclass does not try and work with it | |
| 131 # we'll take care of setting it in this module later on | |
| 132 | |
| 133 my %newargs = @args; | |
| 134 foreach ( keys %newargs ) { | |
| 135 if( /frame$/i ) { | |
| 136 delete $newargs{$_}; | |
| 137 } | |
| 138 } | |
| 139 # done with workaround | |
| 140 | |
| 141 my $self = $class->SUPER::new(%newargs); | |
| 142 | |
| 143 my ($score,$bits,$match,$hsplength,$positive,$gaps,$p,$exp,$qb,$qe,$sb, | |
| 144 $se,$qs,$ss,$hs,$qname,$sname,$qlength,$slength,$qframe,$sframe, | |
| 145 $blasttype) = | |
| 146 $self->_rearrange([qw(SCORE | |
| 147 BITS | |
| 148 MATCH | |
| 149 HSPLENGTH | |
| 150 POSITIVE | |
| 151 GAPS | |
| 152 P | |
| 153 EXP | |
| 154 QUERYBEGIN | |
| 155 QUERYEND | |
| 156 SBJCTBEGIN | |
| 157 SBJCTEND | |
| 158 QUERYSEQ | |
| 159 SBJCTSEQ | |
| 160 HOMOLOGYSEQ | |
| 161 QUERYNAME | |
| 162 SBJCTNAME | |
| 163 QUERYLENGTH | |
| 164 SBJCTLENGTH | |
| 165 QUERYFRAME | |
| 166 SBJCTFRAME | |
| 167 BLASTTYPE | |
| 168 )],@args); | |
| 169 | |
| 170 $blasttype = 'UNKNOWN' unless $blasttype; | |
| 171 $self->report_type($blasttype); | |
| 172 # Determine strand meanings | |
| 173 my ($queryfactor, $sbjctfactor) = (1,0); # default | |
| 174 if ($blasttype eq 'BLASTP' || $blasttype eq 'TBLASTN' ) { | |
| 175 $queryfactor = 0; | |
| 176 } | |
| 177 if ($blasttype eq 'TBLASTN' || $blasttype eq 'TBLASTX' || | |
| 178 $blasttype eq 'BLASTN' ) { | |
| 179 $sbjctfactor = 1; | |
| 180 } | |
| 181 | |
| 182 # Set BLAST type | |
| 183 $self->{'BLAST_TYPE'} = $blasttype; | |
| 184 | |
| 185 # Store the aligned query as sequence feature | |
| 186 my $strand; | |
| 187 if ($qe > $qb) { # normal query: start < end | |
| 188 if ($queryfactor) { $strand = 1; } else { $strand = undef; } | |
| 189 $self->query( Bio::SeqFeature::Similarity->new | |
| 190 (-start=>$qb, -end=>$qe, -strand=>$strand, | |
| 191 -source=>"BLAST" ) ) } | |
| 192 else { # reverse query (i dont know if this is possible, but feel free to correct) | |
| 193 if ($queryfactor) { $strand = -1; } else { $strand = undef; } | |
| 194 $self->query( Bio::SeqFeature::Similarity->new | |
| 195 (-start=>$qe, -end=>$qb, -strand=>$strand, | |
| 196 -source=>"BLAST" ) ) } | |
| 197 | |
| 198 # store the aligned hit as sequence feature | |
| 199 if ($se > $sb) { # normal hit | |
| 200 if ($sbjctfactor) { $strand = 1; } else { $strand = undef; } | |
| 201 $self->hit( Bio::SeqFeature::Similarity->new | |
| 202 (-start=>$sb, -end=>$se, -strand=>$strand, | |
| 203 -source=>"BLAST" ) ) } | |
| 204 else { # reverse hit: start bigger than end | |
| 205 if ($sbjctfactor) { $strand = -1; } else { $strand = undef; } | |
| 206 $self->hit( Bio::SeqFeature::Similarity->new | |
| 207 (-start=>$se, -end=>$sb, -strand=>$strand, | |
| 208 -source=>"BLAST" ) ) } | |
| 209 | |
| 210 # name the sequences | |
| 211 $self->query->seq_id($qname); # query name | |
| 212 $self->hit->seq_id($sname); # hit name | |
| 213 | |
| 214 # set lengths | |
| 215 $self->query->seqlength($qlength); # query length | |
| 216 $self->hit->seqlength($slength); # hit length | |
| 217 | |
| 218 # set object vars | |
| 219 $self->score($score); | |
| 220 $self->bits($bits); | |
| 221 | |
| 222 $self->significance($p); | |
| 223 $self->{'EXP'} = $exp; | |
| 224 | |
| 225 $self->query->frac_identical($match); | |
| 226 $self->hit->frac_identical($match); | |
| 227 $self->{'HSPLENGTH'} = $hsplength; | |
| 228 $self->{'PERCENT'} = int((1000 * $match)/$hsplength)/10; | |
| 229 $self->{'POSITIVE'} = $positive; | |
| 230 $self->{'GAPS'} = $gaps; | |
| 231 $self->{'QS'} = $qs; | |
| 232 $self->{'SS'} = $ss; | |
| 233 $self->{'HS'} = $hs; | |
| 234 | |
| 235 $self->frame($qframe, $sframe); | |
| 236 return $self; # success - we hope! | |
| 237 } | |
| 238 | |
| 239 # to disable overloading comment this out: | |
| 240 sub _overload { | |
| 241 my $self = shift; | |
| 242 return $self->start."..".$self->end." ".$self->bits; | |
| 243 } | |
| 244 | |
| 245 =head2 report_type | |
| 246 | |
| 247 Title : report_type | |
| 248 Usage : $type = $sbjct->report_type() | |
| 249 Function : Returns the type of report from which this hit was obtained. | |
| 250 This usually pertains only to BLAST and friends reports, for which | |
| 251 the report type denotes what type of sequence was aligned against | |
| 252 what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated dna-prt, | |
| 253 TBLASTN prt-translated dna, TBLASTX translated dna-translated dna). | |
| 254 Example : | |
| 255 Returns : A string (BLASTN, BLASTP, BLASTX, TBLASTN, TBLASTX, UNKNOWN) | |
| 256 Args : a string on set (you should know what you are doing) | |
| 257 | |
| 258 =cut | |
| 259 | |
| 260 sub report_type { | |
| 261 my ($self, $rpt) = @_; | |
| 262 if($rpt) { | |
| 263 $self->{'_report_type'} = $rpt; | |
| 264 } | |
| 265 return $self->{'_report_type'}; | |
| 266 } | |
| 267 | |
| 268 =head2 EXP | |
| 269 | |
| 270 Title : EXP | |
| 271 Usage : my $exp = $hsp->EXP; | |
| 272 Function: returns the EXP value for the HSP | |
| 273 Returns : string value | |
| 274 Args : none | |
| 275 Note : Patch provided by Sami Ashour for BTK parsing | |
| 276 | |
| 277 | |
| 278 =cut | |
| 279 | |
| 280 sub EXP{ | |
| 281 return $_[0]->{'EXP'}; | |
| 282 } | |
| 283 | |
| 284 | |
| 285 =head2 P | |
| 286 | |
| 287 Title : P | |
| 288 Usage : $hsp->P(); | |
| 289 Function : returns the P (significance) value for a HSP | |
| 290 Returns : (double) significance value | |
| 291 Args : | |
| 292 | |
| 293 =cut | |
| 294 | |
| 295 sub P { | |
| 296 my ($self, @args) = @_; | |
| 297 my $float = $self->significance(@args); | |
| 298 my $match = '([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?'; # Perl Cookbook 2.1 | |
| 299 if ($float =~ /^$match$/) { | |
| 300 # Is a C float | |
| 301 return $float; | |
| 302 } elsif ("1$float" =~ /^$match$/) { | |
| 303 # Almost C float, Jitterbug 974 | |
| 304 return "1$float"; | |
| 305 } else { | |
| 306 $self->warn("[HSP::P()] '$float' is not a known number format. Returning zero (0) instead."); | |
| 307 return 0; | |
| 308 } | |
| 309 } | |
| 310 | |
| 311 =head2 percent | |
| 312 | |
| 313 Title : percent | |
| 314 Usage : $hsp->percent(); | |
| 315 Function : returns the percent matching | |
| 316 Returns : (double) percent matching | |
| 317 Args : none | |
| 318 | |
| 319 =cut | |
| 320 | |
| 321 sub percent {shift->{'PERCENT'}} | |
| 322 | |
| 323 | |
| 324 =head2 match | |
| 325 | |
| 326 Title : match | |
| 327 Usage : $hsp->match(); | |
| 328 Function : returns the match | |
| 329 Example : | |
| 330 Returns : (double) frac_identical | |
| 331 Args : | |
| 332 | |
| 333 =cut | |
| 334 | |
| 335 sub match {shift->query->frac_identical(@_)} | |
| 336 | |
| 337 =head2 hsplength | |
| 338 | |
| 339 Title : hsplength | |
| 340 Usage : $hsp->hsplength(); | |
| 341 Function : returns the HSP length (including gaps) | |
| 342 Returns : (integer) HSP length | |
| 343 Args : none | |
| 344 | |
| 345 =cut | |
| 346 | |
| 347 sub hsplength {shift->{'HSPLENGTH'}} | |
| 348 | |
| 349 =head2 positive | |
| 350 | |
| 351 Title : positive | |
| 352 Usage : $hsp->positive(); | |
| 353 Function : returns the number of positive matches (symbols in the alignment | |
| 354 with a positive score) | |
| 355 Returns : (int) number of positive matches in the alignment | |
| 356 Args : none | |
| 357 | |
| 358 =cut | |
| 359 | |
| 360 sub positive {shift->{'POSITIVE'}} | |
| 361 | |
| 362 =head2 gaps | |
| 363 | |
| 364 Title : gaps | |
| 365 Usage : $hsp->gaps(); | |
| 366 Function : returns the number of gaps or 0 if none | |
| 367 Returns : (int) number of gaps or 0 if none | |
| 368 Args : none | |
| 369 | |
| 370 =cut | |
| 371 | |
| 372 sub gaps {shift->{'GAPS'}} | |
| 373 | |
| 374 =head2 querySeq | |
| 375 | |
| 376 Title : querySeq | |
| 377 Usage : $hsp->querySeq(); | |
| 378 Function : returns the query sequence | |
| 379 Returns : (string) the Query Sequence | |
| 380 Args : none | |
| 381 | |
| 382 =cut | |
| 383 | |
| 384 sub querySeq {shift->{'QS'}} | |
| 385 | |
| 386 =head2 sbjctSeq | |
| 387 | |
| 388 Title : sbjctSeq | |
| 389 Usage : $hsp->sbjctSeq(); | |
| 390 Function : returns the Sbjct sequence | |
| 391 Returns : (string) the Sbjct Sequence | |
| 392 Args : none | |
| 393 | |
| 394 =cut | |
| 395 | |
| 396 sub sbjctSeq {shift->{'SS'}} | |
| 397 | |
| 398 =head2 homologySeq | |
| 399 | |
| 400 Title : homologySeq | |
| 401 Usage : $hsp->homologySeq(); | |
| 402 Function : returns the homologous sequence | |
| 403 Returns : (string) homologous sequence | |
| 404 Args : none | |
| 405 | |
| 406 =cut | |
| 407 | |
| 408 sub homologySeq {shift->{'HS'}} | |
| 409 | |
| 410 =head2 qs | |
| 411 | |
| 412 Title : qs | |
| 413 Usage : $hsp->qs(); | |
| 414 Function : returns the Query Sequence (same as querySeq) | |
| 415 Returns : (string) query Sequence | |
| 416 Args : none | |
| 417 | |
| 418 =cut | |
| 419 | |
| 420 sub qs {shift->{'QS'}} | |
| 421 | |
| 422 =head2 ss | |
| 423 | |
| 424 Title : ss | |
| 425 Usage : $hsp->ss(); | |
| 426 Function : returns the subject sequence ( same as sbjctSeq) | |
| 427 Returns : (string) Sbjct Sequence | |
| 428 Args : none | |
| 429 | |
| 430 =cut | |
| 431 | |
| 432 sub ss {shift->{'SS'}} | |
| 433 | |
| 434 =head2 hs | |
| 435 | |
| 436 Title : hs | |
| 437 Usage : $hsp->hs(); | |
| 438 Function : returns the Homologous Sequence (same as homologySeq ) | |
| 439 Returns : (string) Homologous Sequence | |
| 440 Args : none | |
| 441 | |
| 442 =cut | |
| 443 | |
| 444 sub hs {shift->{'HS'}} | |
| 445 | |
| 446 sub frame { | |
| 447 my ($self, $qframe, $sframe) = @_; | |
| 448 if( defined $qframe ) { | |
| 449 if( $qframe == 0 ) { | |
| 450 $qframe = undef; | |
| 451 } elsif( $qframe !~ /^([+-])?([1-3])/ ) { | |
| 452 $self->warn("Specifying an invalid query frame ($qframe)"); | |
| 453 $qframe = undef; | |
| 454 } else { | |
| 455 if( ($1 eq '-' && $self->query->strand >= 0) || | |
| 456 ($1 eq '+' && $self->query->strand <= 0) ) { | |
| 457 $self->warn("Query frame ($qframe) did not match strand of query (". $self->query->strand() . ")"); | |
| 458 } | |
| 459 # Set frame to GFF [0-2] | |
| 460 $qframe = $2 - 1; | |
| 461 } | |
| 462 $self->{'QFRAME'} = $qframe; | |
| 463 } | |
| 464 if( defined $sframe ) { | |
| 465 if( $sframe == 0 ) { | |
| 466 $sframe = undef; | |
| 467 } elsif( $sframe !~ /^([+-])?([1-3])/ ) { | |
| 468 $self->warn("Specifying an invalid hit frame ($sframe)"); | |
| 469 $sframe = undef; | |
| 470 } else { | |
| 471 if( ($1 eq '-' && $self->hit->strand >= 0) || | |
| 472 ($1 eq '+' && $self->hit->strand <= 0) ) | |
| 473 { | |
| 474 $self->warn("Hit frame ($sframe) did not match strand of hit (". $self->hit->strand() . ")"); | |
| 475 } | |
| 476 | |
| 477 # Set frame to GFF [0-2] | |
| 478 $sframe = $2 - 1; | |
| 479 } | |
| 480 $self->{'SFRAME'} = $sframe; | |
| 481 } | |
| 482 | |
| 483 (defined $qframe && $self->SUPER::frame($qframe) && | |
| 484 ($self->{'FRAME'} = $qframe)) || | |
| 485 (defined $sframe && $self->SUPER::frame($sframe) && | |
| 486 ($self->{'FRAME'} = $sframe)); | |
| 487 | |
| 488 if (wantarray() && | |
| 489 $self->{'BLAST_TYPE'} eq 'TBLASTX') | |
| 490 { | |
| 491 return ($self->{'QFRAME'}, $self->{'SFRAME'}); | |
| 492 } elsif (wantarray()) { | |
| 493 (defined $self->{'QFRAME'} && | |
| 494 return ($self->{'QFRAME'}, undef)) || | |
| 495 (defined $self->{'SFRAME'} && | |
| 496 return (undef, $self->{'SFRAME'})); | |
| 497 } else { | |
| 498 (defined $self->{'QFRAME'} && | |
| 499 return $self->{'QFRAME'}) || | |
| 500 (defined $self->{'SFRAME'} && | |
| 501 return $self->{'SFRAME'}); | |
| 502 } | |
| 503 } | |
| 504 | |
| 505 1; |
