Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Blast.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 #---------------------------------------------------------------------------- | |
| 2 # PACKAGE : Bio::Tools::Blast | |
| 3 # PURPOSE : To encapsulate code for running, parsing, and analyzing | |
| 4 # BLAST reports. | |
| 5 # AUTHOR : Steve Chervitz (sac@bioperl.org) | |
| 6 # CREATED : March 1996 | |
| 7 # REVISION: $Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $ | |
| 8 # STATUS : Alpha | |
| 9 # | |
| 10 # For the latest version and documentation, visit: | |
| 11 # http://bio.perl.org/Projects/Blast | |
| 12 # | |
| 13 # To generate documentation, run this module through pod2html | |
| 14 # (preferably from Perl v5.004 or better). | |
| 15 # | |
| 16 # Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. | |
| 17 # This module is free software; you can redistribute it and/or | |
| 18 # modify it under the same terms as Perl itself. | |
| 19 #---------------------------------------------------------------------------- | |
| 20 | |
| 21 package Bio::Tools::Blast; | |
| 22 use strict; | |
| 23 use Exporter; | |
| 24 | |
| 25 use Bio::Tools::SeqAnal; | |
| 26 use Bio::Root::Global qw(:std); | |
| 27 use Bio::Root::Utilities qw(:obj); | |
| 28 | |
| 29 require 5.002; | |
| 30 | |
| 31 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS | |
| 32 $ID $VERSION $Blast @Blast_programs $Revision $Newline); | |
| 33 | |
| 34 @ISA = qw( Bio::Tools::SeqAnal Exporter); | |
| 35 @EXPORT = qw(); | |
| 36 @EXPORT_OK = qw($VERSION $Blast); | |
| 37 %EXPORT_TAGS = ( obj => [qw($Blast)], | |
| 38 std => [qw($Blast)]); | |
| 39 | |
| 40 $ID = 'Bio::Tools::Blast'; | |
| 41 $VERSION = 0.09; | |
| 42 $Revision = '$Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $'; #' | |
| 43 | |
| 44 ## Static Blast object. | |
| 45 $Blast = {}; | |
| 46 bless $Blast, $ID; | |
| 47 $Blast->{'_name'} = "Static Blast object"; | |
| 48 | |
| 49 @Blast_programs = qw(blastp blastn blastx tblastn tblastx); | |
| 50 | |
| 51 use vars qw($DEFAULT_MATRIX $DEFAULT_SIGNIF); | |
| 52 my $DEFAULT_MATRIX = 'BLOSUM62'; | |
| 53 my $DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied. | |
| 54 my $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs. | |
| 55 | |
| 56 ## POD Documentation: | |
| 57 | |
| 58 =head1 NAME | |
| 59 | |
| 60 Bio::Tools::Blast - Bioperl BLAST sequence analysis object | |
| 61 | |
| 62 =head1 SYNOPSIS | |
| 63 | |
| 64 =head2 Parsing Blast reports | |
| 65 | |
| 66 Parse an existing Blast report from file: | |
| 67 | |
| 68 use Bio::Tools::Blast; | |
| 69 | |
| 70 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out', | |
| 71 -parse => 1, | |
| 72 -signif => '1e-10', | |
| 73 ); | |
| 74 | |
| 75 Parse an existing Blast report from STDIN: | |
| 76 | |
| 77 $blastObj = Bio::Tools::Blast->new( -parse => 1, | |
| 78 -signif => '1e-10', | |
| 79 ); | |
| 80 | |
| 81 Then send a Blast report to your script via STDIN. | |
| 82 | |
| 83 Full parameters for parsing Blast reports. | |
| 84 | |
| 85 %blastParam = ( | |
| 86 -run => \%runParam, | |
| 87 -file => '', | |
| 88 -parse => 1, | |
| 89 -signif => 1e-5, | |
| 90 -filt_func => \&my_filter, | |
| 91 -min_len => 15, | |
| 92 -check_all_hits => 0, | |
| 93 -strict => 0, | |
| 94 -stats => 1, | |
| 95 -best => 0, | |
| 96 -share => 0, | |
| 97 -exec_func => \&process_blast, | |
| 98 -save_array => \@blast_objs, # not used if -exce_func defined. | |
| 99 ); | |
| 100 | |
| 101 See L<parse()|parse> for a description of parameters and see L<USAGE | USAGE> for | |
| 102 more examples including how to parse streams containing multiple Blast | |
| 103 reports L<Using the Static $Blast Object>. | |
| 104 | |
| 105 See L<Memory Usage Issues> for information about how to make Blast | |
| 106 parsing be more memory efficient. | |
| 107 | |
| 108 =head2 Running Blast reports | |
| 109 | |
| 110 Run a new Blast2 at NCBI and then parse it: | |
| 111 | |
| 112 %runParam = ( | |
| 113 -method => 'remote', | |
| 114 -prog => 'blastp', | |
| 115 -database => 'swissprot', | |
| 116 -seqs => [ $seq ], # Bio::Seq.pm objects. | |
| 117 ); | |
| 118 | |
| 119 $blastObj = Bio::Tools::Blast->new( -run => \%runParam, | |
| 120 -parse => 1, | |
| 121 -signif => '1e-10', | |
| 122 -strict => 1, | |
| 123 ); | |
| 124 | |
| 125 Full parameters for running Blasts at NCBI using Webblast.pm: | |
| 126 | |
| 127 %runParam = ( | |
| 128 -method => 'remote', | |
| 129 -prog => 'blastp', | |
| 130 -version => 2, # BLAST2 | |
| 131 -database =>'swissprot', | |
| 132 -html => 0, | |
| 133 -seqs => [ $seqObject ], # Bio::Seq.pm object(s) | |
| 134 -descr => 250, | |
| 135 -align => 250, | |
| 136 -expect => 10, | |
| 137 -gap => 'on', | |
| 138 -matrix => 'PAM250', | |
| 139 -email => undef, # don't send report via e-mail if parsing. | |
| 140 -filter => undef, # use default | |
| 141 -gap_c => undef, # use default | |
| 142 -gap_e => undef, # use default | |
| 143 -word => undef, # use default | |
| 144 -min_len => undef, # use default | |
| 145 ); | |
| 146 | |
| 147 See L<run()|run> and L<USAGE | USAGE> for more information about running Blasts. | |
| 148 | |
| 149 =head2 HTML-formatting Blast reports | |
| 150 | |
| 151 Print an HTML-formatted version of a Blast report: | |
| 152 | |
| 153 use Bio::Tools::Blast qw(:obj); | |
| 154 | |
| 155 $Blast->to_html($filename); | |
| 156 $Blast->to_html(-file => $filename, | |
| 157 -header => "<H1>Blast Results</H1>"); | |
| 158 $Blast->to_html(-file => $filename, | |
| 159 -out => \@array); # store output | |
| 160 $Blast->to_html(); # use STDIN | |
| 161 | |
| 162 Results are sent directly to STDOUT unless an C<-out =E<gt> array_ref> | |
| 163 parameter is supplied. See L<to_html()|to_html> for details. | |
| 164 | |
| 165 =head1 INSTALLATION | |
| 166 | |
| 167 This module is included with the central Bioperl distribution: | |
| 168 | |
| 169 http://bio.perl.org/Core/Latest | |
| 170 ftp://bio.perl.org/pub/DIST | |
| 171 | |
| 172 Follow the installation instructions included in the README file. | |
| 173 | |
| 174 =head1 DESCRIPTION | |
| 175 | |
| 176 The Bio::Tools::Blast.pm module encapsulates data and methods for | |
| 177 running, parsing, and analyzing pre-existing BLAST reports. This | |
| 178 module defines an application programming interface (API) for working | |
| 179 with Blast reports. A Blast object is constructed from raw Blast | |
| 180 output and encapsulates the Blast results which can then be accessed | |
| 181 via the interface defined by the Blast object. | |
| 182 | |
| 183 The ways in which researchers use Blast data are many and varied. This | |
| 184 module attempts to be general and flexible enough to accommodate | |
| 185 different uses. The Blast module API is still at an early stage of | |
| 186 evolution and I expect it to continue to evolve as new uses for Blast | |
| 187 data are developed. Your L<FEEDBACK | FEEDBACK> is welcome. | |
| 188 | |
| 189 B<FEATURES:> | |
| 190 | |
| 191 =over 2 | |
| 192 | |
| 193 =item * Supports NCBI Blast1.x, Blast2.x, and WashU-Blast2.x, gapped | |
| 194 and ungapped. | |
| 195 | |
| 196 Can parse HTML-formatted as well as non-HTML-formatted reports. | |
| 197 | |
| 198 =item * Launch new Blast analyses remotely or locally. | |
| 199 | |
| 200 Blast objects can be constructed directly from the results of the | |
| 201 run. See L<run()|run>. | |
| 202 | |
| 203 =item * Construct Blast objects from pre-existing files or from a new run. | |
| 204 | |
| 205 Build a Blast object from a single file or build multiple Blast | |
| 206 objects from an input stream containing multiple reports. See | |
| 207 L<parse()|parse>. | |
| 208 | |
| 209 =item * Add hypertext links from a BLAST report. | |
| 210 | |
| 211 See L<to_html()|to_html>. | |
| 212 | |
| 213 =item * Generate sequence and sequence alignment objects from HSP | |
| 214 sequences. | |
| 215 | |
| 216 If you have Bio::Seq.pm and Bio::UnivAln.pm installed on your system, | |
| 217 they can be used for working with high-scoring segment pair (HSP) | |
| 218 sequences in the Blast alignment. (A new version of Bio::Seq.pm is | |
| 219 included in the distribution, see L<INSTALLATION | INSTALLATION>). For more | |
| 220 information about them, see: | |
| 221 | |
| 222 http://bio.perl.org/Projects/Sequence/ | |
| 223 http://bio.perl.org/Projects/SeqAlign/ | |
| 224 | |
| 225 =back | |
| 226 | |
| 227 A variety of different data can be extracted from the Blast report by | |
| 228 querying the Blast.pm object. Some basic examples are given in the | |
| 229 L<USAGE | USAGE> section. For some working scripts, see the links provided in | |
| 230 the L<the DEMO SCRIPTS section | DEMO> section. | |
| 231 | |
| 232 As a part of the incipient Bioperl framework, the Bio::Tools::Blast.pm | |
| 233 module inherits from B<Bio::Tools::SeqAnal.pm>, which provides some | |
| 234 generic functionality for biological sequence analysis. See the | |
| 235 documentation for that module for details | |
| 236 (L<Links to related modules>). | |
| 237 | |
| 238 =head2 The BLAST Program | |
| 239 | |
| 240 BLAST (Basic Local Alignment Search Tool) is a widely used algorithm | |
| 241 for performing rapid sequence similarity searches between a single DNA | |
| 242 or protein sequence and a large dataset of sequences. BLAST analyses | |
| 243 are typically performed by dedicated remote servers, such as the ones | |
| 244 at the NCBI. Individual groups may also run the program on local | |
| 245 machines. | |
| 246 | |
| 247 The Blast family includes 5 different programs: | |
| 248 | |
| 249 Query Seq Database | |
| 250 ------------ ---------- | |
| 251 blastp -- protein protein | |
| 252 blastn -- nucleotide nucleotide | |
| 253 blastx -- nucleotide* protein | |
| 254 tblastn -- protein nucleotide* | |
| 255 tblastx -- nucleotide* nucleotide* | |
| 256 | |
| 257 * = dynamically translated in all reading frames, both strands | |
| 258 | |
| 259 See L<References & Information about the BLAST program>. | |
| 260 | |
| 261 =head2 Versions Supported | |
| 262 | |
| 263 BLAST reports generated by different application front ends are similar | |
| 264 but not exactly the same. Blast reports are not intended to be exchange formats, | |
| 265 making parsing software susceptible to obsolescence. This module aims to | |
| 266 support BLAST reports generated by different implementations: | |
| 267 | |
| 268 Implementation Latest version tested | |
| 269 -------------- -------------------- | |
| 270 NCBI Blast1 1.4.11 [24-Nov-97] | |
| 271 NCBI Blast2 2.0.8 [Jan-5-1999] | |
| 272 WashU-BLAST2 2.0a19MP [05-Feb-1998] | |
| 273 GCG 1.4.8 [1-Feb-95] | |
| 274 | |
| 275 Support for both gapped and ungapped versions is included. Currently, there | |
| 276 is only rudimentary support for PSI-BLAST in that these reports can be parsed but | |
| 277 there is no special treatment of separate iteration rounds (they are all | |
| 278 merged together). | |
| 279 | |
| 280 =head2 References & Information about the BLAST program | |
| 281 | |
| 282 B<WEBSITES:> | |
| 283 | |
| 284 http://www.ncbi.nlm.nih.gov/BLAST/ - Homepage at NCBI | |
| 285 http://www.ncbi.nlm.nih.gov/BLAST/blast_help.html - Help manual | |
| 286 http://blast.wustl.edu/ - WashU-Blast2 | |
| 287 | |
| 288 B<PUBLICATIONS:> (with PubMed links) | |
| 289 | |
| 290 Altschul S.F., Gish W., Miller W., Myers E.W., Lipman D.J. (1990). | |
| 291 "Basic local alignment search tool", J Mol Biol 215: 403-410. | |
| 292 | |
| 293 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2231712&form=6&db=m&Dopt=r | |
| 294 | |
| 295 Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, | |
| 296 Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997). | |
| 297 "Gapped BLAST and PSI-BLAST: a new generation of protein database | |
| 298 search programs", Nucleic Acids Res. 25:3389-3402. | |
| 299 | |
| 300 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r | |
| 301 | |
| 302 Karlin, Samuel and Stephen F. Altschul (1990). Methods for | |
| 303 assessing the statistical significance of molecular sequence | |
| 304 features by using general scoring schemes. Proc. Natl. Acad. | |
| 305 Sci. USA 87:2264-68. | |
| 306 | |
| 307 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2315319&form=6&db=m&Dopt=b | |
| 308 | |
| 309 Karlin, Samuel and Stephen F. Altschul (1993). Applications | |
| 310 and statistics for multiple high-scoring segments in molecu- | |
| 311 lar sequences. Proc. Natl. Acad. Sci. USA 90:5873-7. | |
| 312 | |
| 313 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=8390686&form=6&db=m&Dopt=b | |
| 314 | |
| 315 =head1 USAGE | |
| 316 | |
| 317 =head2 Creating Blast objects | |
| 318 | |
| 319 A Blast object can be constructed from the contents of a Blast report | |
| 320 using a set of named parameters that specify significance criteria for | |
| 321 parsing. The report data can be read in from an existing file | |
| 322 specified with the C<-file =E<gt> 'filename'> parameter or from a | |
| 323 STDIN stream containing potentially multiple Blast reports. If the | |
| 324 C<-file> parameter does not contain a valid filename, STDIN will be | |
| 325 used. Separate Blast objects will be created for each report in the | |
| 326 stream. | |
| 327 | |
| 328 To parse the report, you must include a C<-parse =E<gt> 1> parameter | |
| 329 in addition to any other parsing parameters | |
| 330 See L<parse()|parse> for a full description of parsing parameters. | |
| 331 To run a new report and then parse it, include a C<-run =E<gt> \%runParams> | |
| 332 parameter containing a reference to a hash | |
| 333 that hold the parameters required by the L<run()|run> method. | |
| 334 | |
| 335 The constructor for Blast objects is inherited from Bio::Tools::SeqAnal.pm. | |
| 336 See the B<_initialize>() method of that package for general information | |
| 337 relevant to creating Blast objects. (The B<new>() method, inherited from | |
| 338 B<Bio::Root::Object.pm>, calls B<_initialize>(). See L<Links to related modules>). | |
| 339 | |
| 340 The Blast object can read compressed (gzipped) Blast report | |
| 341 files. Compression/decompression uses the gzip or compress programs | |
| 342 that are standard on Unix systems and should not require special | |
| 343 configuration. If you can't or don't want to use gzip as the file | |
| 344 compression tool, either pre-uncompress your files before parsing with | |
| 345 this module or modify B<Bio::Root::Utilities.pm> to your liking. | |
| 346 | |
| 347 Blast objects can be generated either by direct instantiation as in: | |
| 348 | |
| 349 use Bio::Tools::Blast; | |
| 350 $blast = new Bio::Tools::Blast (%parameters); | |
| 351 | |
| 352 =head2 Using the Static $Blast Object | |
| 353 | |
| 354 use Bio::Tools::Blast qw(:obj); | |
| 355 | |
| 356 This exports the static $Blast object into your namespace. "Static" | |
| 357 refers to the fact that it has class scope and there is one of these | |
| 358 created when you use this module. The static $Blast object is | |
| 359 basically an empty object that is provided for convenience and is also | |
| 360 used for various internal chores. | |
| 361 | |
| 362 It is exported by this module and can be used for | |
| 363 parsing and running reports as well as HTML-formatting without having | |
| 364 to first create an empty Blast object. | |
| 365 | |
| 366 Using the static $Blast object for parsing a STDIN stream of Blast reports: | |
| 367 | |
| 368 use Bio::Tools::Blast qw(:obj); | |
| 369 | |
| 370 sub process_blast { | |
| 371 my $blastObj = shift; | |
| 372 print $blastObj->table(); | |
| 373 $blastObj->destroy; | |
| 374 } | |
| 375 | |
| 376 $Blast->parse( -parse => 1, | |
| 377 -signif => '1e-10', | |
| 378 -exec_func => \&process_blast, | |
| 379 ); | |
| 380 | |
| 381 Then pipe a stream of Blast reports into your script via STDIN. For | |
| 382 each Blast report extracted from the input stream, the parser will | |
| 383 generate a new Blast object and pass it to the function specified by | |
| 384 C<-exec_func>. The destroy() call tells Perl to free the memory | |
| 385 associated with the object, important if you are crunching through | |
| 386 many reports. This method is inherited from B<Bio::Root::Object.pm> | |
| 387 (see L<Links to related modules>). See L<parse()|parse> for a full | |
| 388 description of parameters and L<the DEMO SCRIPTS section | DEMO> for | |
| 389 additional examples. | |
| 390 | |
| 391 =head2 Running Blasts | |
| 392 | |
| 393 To run a Blast, create a new Blast object with a C<-run =E<gt> | |
| 394 \%runParams> parameter. Remote Blasts are performed by including a | |
| 395 C<-method =E<gt> 'remote'> parameter; local Blasts are performed by | |
| 396 including a C<-method =E<gt> 'local'> parameter. See | |
| 397 L<Running Blast reports> as well as the | |
| 398 L<the DEMO SCRIPTS section | DEMO> for examples. | |
| 399 Note that running local Blasts is not yet supported, see below. | |
| 400 | |
| 401 Note that the C<-seqs =E<gt> [ $seqs ]> run parameter must contain a | |
| 402 reference to an array of B<Bio::Seq.pm> objects | |
| 403 (L<Links to related modules>). Encapsulating the sequence in an | |
| 404 object makes sequence information much easier to handle as it can | |
| 405 be supplied in a variety of formats. Bio::Seq.pm is included with | |
| 406 this distribution (L<INSTALLATION | INSTALLATION>). | |
| 407 | |
| 408 Remote Blasts are implemented using the | |
| 409 B<Bio::Tools::Blast::Run::Webblast.pm> module. Local Blasts require | |
| 410 that you customize the B<Bio::Tools::Blast::Run::LocalBlast.pm> | |
| 411 module. The version of LocalBlast.pm included with this distribution | |
| 412 provides the basic framework for running local Blasts. | |
| 413 See L<Links to related modules>. | |
| 414 | |
| 415 =head2 Significance screening | |
| 416 | |
| 417 A C<-signif> parameter can be used to screen out all hits with | |
| 418 P-values (or Expect values) above a certain cutoff. For example, to | |
| 419 exclude all hits with Expect values above 1.0e-10: C<-signif =E<gt> | |
| 420 1e-10>. Providing a C<-signif> cutoff can speed up processing | |
| 421 tremendously, since only a small fraction of the report need be | |
| 422 parsed. This is because the C<-signif> value is used to screen hits | |
| 423 based on the data in the "Description" section of the Blast report: | |
| 424 | |
| 425 For NCBI BLAST2 reports: | |
| 426 | |
| 427 Score E | |
| 428 Sequences producing significant alignments: (bits) Value | |
| 429 | |
| 430 sp|P31376|YAB1_YEAST HYPOTHETICAL 74.1 KD PROTEIN IN CYS3-MDM10... 957 0.0 | |
| 431 | |
| 432 For BLAST1 or WashU-BLAST2 reports: | |
| 433 | |
| 434 Smallest | |
| 435 Sum | |
| 436 High Probability | |
| 437 Sequences producing High-scoring Segment Pairs: Score P(N) N | |
| 438 | |
| 439 PDB:3PRK_E Proteinase K complexed with inhibitor ........... 504 1.8e-50 1 | |
| 440 | |
| 441 Thus, the C<-signif> parameter will screen based on Expect values for | |
| 442 BLAST2 reports and based on P-values for BLAST1/WashU-BLAST2 reports. | |
| 443 | |
| 444 To screen based on other criteria, you can supply a C<-filt_func> | |
| 445 parameter containing a function reference that takes a | |
| 446 B<Bio::Tools::Sbjct.pm> object as an argument and returns a boolean, | |
| 447 true if the hit is to be screened out. See example below for | |
| 448 L<Screening hits using arbitrary criteria>. | |
| 449 | |
| 450 =head2 Get the best hit. | |
| 451 | |
| 452 $hit = $blastObj->hit; | |
| 453 | |
| 454 A "hit" is contained by a B<Bio::Tools::Blast::Sbjct.pm> object. | |
| 455 | |
| 456 =head2 Get the P-value or Expect value of the most significant hit. | |
| 457 | |
| 458 $p = $blastObj->lowest_p; | |
| 459 $e = $blastObj->lowest_expect; | |
| 460 | |
| 461 Alternatively: | |
| 462 | |
| 463 $p = $blastObj->hit->p; | |
| 464 $e = $blastObj->hit->expect; | |
| 465 | |
| 466 Note that P-values are not reported in NCBI Blast2 reports. | |
| 467 | |
| 468 =head2 Iterate through all the hits | |
| 469 | |
| 470 foreach $hit ($blastObj->hits) { | |
| 471 printf "%s\t %.1e\t %d\t %.2f\t %d\n", | |
| 472 $hit->name, $hit->expect, $hit->num_hsps, | |
| 473 $hit->frac_identical, $hit->gaps; | |
| 474 } | |
| 475 | |
| 476 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm> | |
| 477 for other ways to work with hit objects (L<Links to related modules>). | |
| 478 | |
| 479 =head2 Screening hits using arbitrary criteria | |
| 480 | |
| 481 sub filter { $hit=shift; | |
| 482 return ($hit->gaps == 0 and | |
| 483 $hit->frac_conserved > 0.5); } | |
| 484 | |
| 485 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out', | |
| 486 -parse => 1, | |
| 487 -filt_func => \&filter ); | |
| 488 | |
| 489 While the Blast object is parsing the report, each hit checked by calling | |
| 490 &filter($hit). All hits that generate false return values from &filter | |
| 491 are screened out and will not be added to the Blast object. | |
| 492 Note that the Blast object will normally stop parsing the report after | |
| 493 the first non-significant hit or the first hit that does not pass the | |
| 494 filter function. To force the Blast object to check all hits, | |
| 495 include a C<-check_all_hits =E<gt> 1> parameter. | |
| 496 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm> | |
| 497 for other ways to work with hit objects. | |
| 498 | |
| 499 =over 4 | |
| 500 | |
| 501 =item Hit start, end coordinates. | |
| 502 | |
| 503 print $sbjct->start('query'); | |
| 504 print $sbjct->end('sbjct'); | |
| 505 | |
| 506 In array context, you can get information for both query and sbjct with one call: | |
| 507 | |
| 508 ($qstart, $sstart) = $sbjct->start(); | |
| 509 ($qend, $send) = $sbjct->end(); | |
| 510 | |
| 511 For important information regarding coordinate information, see | |
| 512 the L<HSP start, end, and strand> section below. | |
| 513 Also check out documentation for the start and end methods in B<Bio::Tools::Blast::Sbjct.pm>, | |
| 514 which explains what happens if there is more than one HSP. | |
| 515 | |
| 516 =back | |
| 517 | |
| 518 =head2 Working with HSPs | |
| 519 | |
| 520 =over 4 | |
| 521 | |
| 522 =item Iterate through all the HSPs of every hit | |
| 523 | |
| 524 foreach $hit ($blastObj->hits) { | |
| 525 foreach $hsp ($hit->hsps) { | |
| 526 printf "%.1e\t %d\t %.1f\t %.2f\t %.2f\t %d\t %d\n", | |
| 527 $hsp->expect, $hsp->score, $hsp->bits, | |
| 528 $hsp->frac_identical, $hsp->frac_conserved, | |
| 529 $hsp->gaps('query'), $hsp->gaps('sbjct'); | |
| 530 } | |
| 531 | |
| 532 Refer to the documentation for B<Bio::Tools::Blast::HSP.pm> | |
| 533 for other ways to work with hit objects (L<Links to related modules>). | |
| 534 | |
| 535 =back | |
| 536 | |
| 537 =over 4 | |
| 538 | |
| 539 =item Extract HSP sequence data as strings or sequence objects | |
| 540 | |
| 541 Get the first HSP of the first hit and the sequences | |
| 542 of the query and sbjct as strings. | |
| 543 | |
| 544 $hsp = $blast_obj->hit->hsp; | |
| 545 $query_seq = $hsp->seq_str('query'); | |
| 546 $hsp_seq = $hsp->seq_str('sbjct'); | |
| 547 | |
| 548 Get the indices of identical and conserved positions in the HSP query seq. | |
| 549 | |
| 550 @query_iden_indices = $hsp->seq_inds('query', 'identical'); | |
| 551 @query_cons_indices = $hsp->seq_inds('query', 'conserved'); | |
| 552 | |
| 553 Similarly for the sbjct sequence. | |
| 554 | |
| 555 @sbjct_iden_indices = $hsp->seq_inds('sbjct', 'identical'); | |
| 556 @sbjct_cons_indices = $hsp->seq_inds('sbjct', 'conserved'); | |
| 557 | |
| 558 print "Query in Fasta format:\n", $hsp->seq('query')->layout('fasta'); | |
| 559 print "Sbjct in Fasta format:\n", $hsp->seq('sbjct')->layout('fasta'); | |
| 560 | |
| 561 See the B<Bio::Seq.pm> package for more information about using these sequence objects | |
| 562 (L<Links to related modules>). | |
| 563 | |
| 564 =back | |
| 565 | |
| 566 =over 4 | |
| 567 | |
| 568 =item Create sequence alignment objects using HSP sequences | |
| 569 | |
| 570 $aln = $hsp->get_aln; | |
| 571 print " consensus:\n", $aln->consensus(); | |
| 572 print $hsp->get_aln->layout('fasta'); | |
| 573 | |
| 574 $ENV{READSEQ_DIR} = '/home/users/sac/bin/solaris'; | |
| 575 $ENV{READSEQ} = 'readseq'; | |
| 576 print $hsp->get_aln->layout('msf'); | |
| 577 | |
| 578 MSF formated layout requires Don Gilbert's ReadSeq program (not included). | |
| 579 See the B<Bio::UnivAln.pm> for more information about using these alignment objects | |
| 580 (L<Links to related modules>)'. | |
| 581 | |
| 582 =back | |
| 583 | |
| 584 =over 4 | |
| 585 | |
| 586 =item HSP start, end, and strand | |
| 587 | |
| 588 To facilitate HSP processing, endpoint data for each HSP sequence are | |
| 589 normalized so that B<start is always less than end>. This affects TBLASTN | |
| 590 and TBLASTX HSPs on the reverse complement or "Minus" strand. | |
| 591 | |
| 592 Some examples of obtaining start, end coordinates for HSP objects: | |
| 593 | |
| 594 print $hsp->start('query'); | |
| 595 print $hsp->end('sbjct'); | |
| 596 ($qstart, $sstart) = $hsp->start(); | |
| 597 ($qend, $send) = $hsp->end(); | |
| 598 | |
| 599 Strandedness of the HSP can be assessed using the strand() method | |
| 600 on the HSP object: | |
| 601 | |
| 602 print $hsp->strand('query'); | |
| 603 print $hsp->strand('sbjct'); | |
| 604 | |
| 605 These will return 'Minus' or 'Plus'. | |
| 606 Or, to get strand information for both query and sbjct with a single call: | |
| 607 | |
| 608 ($qstrand, $sstrand) = $hsp->strand(); | |
| 609 | |
| 610 =back | |
| 611 | |
| 612 =head2 Report Generation | |
| 613 | |
| 614 =over 4 | |
| 615 | |
| 616 =item Generate a tab-delimited table of all results. | |
| 617 | |
| 618 print $blastObj->table; | |
| 619 print $blastObj->table(0); # don't include hit descriptions. | |
| 620 print $blastObj->table_tiled; | |
| 621 | |
| 622 The L<table()|table> method returns data for each B<HSP> of each hit listed one per | |
| 623 line. The L<table_tiled()|table_tiled> method returns data for each B<hit, i.e., Sbjct> | |
| 624 listed one per line; data from multiple HSPs are combined after tiling to | |
| 625 reduce overlaps. See B<Bio::Tools::Blast::Sbjct.pm> for more information about | |
| 626 HSP tiling. These methods generate stereotypical, tab-delimited data for each | |
| 627 hit of the Blast report. The output is suitable for importation into | |
| 628 spreadsheets or database tables. Feel free to roll your own table function if | |
| 629 you need a custom table. | |
| 630 | |
| 631 For either table method, descriptions of each hit can be included if a | |
| 632 single, true argument is supplied (e.g., $blastObj-E<gt>table(1)). The description | |
| 633 will be added as the last field. This will significantly increase the size of | |
| 634 the table. Labels for the table columns can be obtained with L<table_labels()|table_labels> | |
| 635 and L<table_labels_tiled()|table_labels_tiled>. | |
| 636 | |
| 637 =back | |
| 638 | |
| 639 =over 4 | |
| 640 | |
| 641 =item Print a summary of the Blast report | |
| 642 | |
| 643 $blastObj->display(); | |
| 644 $blastObj->display(-show=>'hits'); | |
| 645 | |
| 646 L<display()|display> prints various statistics extracted from the Blast report | |
| 647 such as database name, database size, matrix used, etc. The | |
| 648 C<display(-show=E<gt>'hits')> call prints a non-tab-delimited table | |
| 649 attempting to line the data up into more readable columns. The output | |
| 650 generated is similar to L<table_tiled()|table_tiled>. | |
| 651 | |
| 652 =back | |
| 653 | |
| 654 =over 4 | |
| 655 | |
| 656 =item HTML-format an existing report | |
| 657 | |
| 658 use Bio::Tools::Blast qw(:obj); | |
| 659 | |
| 660 # Going straight from a non HTML report file to HTML output using | |
| 661 # the static $Blast object exported by Bio::Tools::Blast.pm | |
| 662 $Blast->to_html(-file => '/usr/people/me/blast.output.txt', | |
| 663 -header => qq|<H1>BLASTP Results</H1><A HREF="home.html">Home</A>| | |
| 664 ); | |
| 665 | |
| 666 # You can also use a specific Blast object created previously. | |
| 667 $blastObj->to_html; | |
| 668 | |
| 669 L<to_html()|to_html> will send HTML output, line-by-line, directly to STDOUT | |
| 670 unless an C<-out =E<gt> array_ref> parameter is supplied (e.g., C<-out | |
| 671 =E<gt> \@array>), in which case the HTML will be stored in @array, one | |
| 672 line per array element. The direct outputting permits faster response | |
| 673 time since Blast reports can be huge. The -header tag can contain a | |
| 674 string containing any HTML that you want to appear at the top of the | |
| 675 Blast report. | |
| 676 | |
| 677 =back | |
| 678 | |
| 679 =head1 DEMO SCRIPTS | |
| 680 | |
| 681 Sample Scripts are included in the central bioperl distribution in the | |
| 682 'examples/blast/' directory (see L<INSTALLATION | INSTALLATION>): | |
| 683 | |
| 684 =head2 Handy library for working with Bio::Tools::Blast.pm | |
| 685 | |
| 686 examples/blast/blast_config.pl | |
| 687 | |
| 688 =head2 Parsing Blast reports one at a time. | |
| 689 | |
| 690 examples/blast/parse_blast.pl | |
| 691 examples/blast/parse_blast2.pl | |
| 692 examples/blast/parse_positions.pl | |
| 693 | |
| 694 =head2 Parsing sets of Blast reports. | |
| 695 | |
| 696 examples/blast/parse_blast.pl | |
| 697 examples/blast/parse_multi.pl | |
| 698 | |
| 699 B<Warning:> See note about L<Memory Usage Issues>. | |
| 700 | |
| 701 =head2 Running Blast analyses one at a time. | |
| 702 | |
| 703 examples/blast/run_blast_remote.pl | |
| 704 | |
| 705 =head2 Running Blast analyses given a set of sequences. | |
| 706 | |
| 707 examples/blast/blast_seq.pl | |
| 708 | |
| 709 =head2 HTML-formatting Blast reports. | |
| 710 | |
| 711 examples/blast/html.pl | |
| 712 | |
| 713 =head1 TECHNICAL DETAILS | |
| 714 | |
| 715 =head2 Blast Modes | |
| 716 | |
| 717 A BLAST object may be created using one of three different modes as | |
| 718 defined by the B<Bio::Tools::SeqAnal.pm> package | |
| 719 (See L<Links to related modules>): | |
| 720 | |
| 721 -- parse - Load a BLAST report and parse it, storing parsed data in | |
| 722 Blast.pm object. | |
| 723 -- run - Run the BLAST program to generate a new report. | |
| 724 -- read - Load a BLAST report into the Blast object without parsing. | |
| 725 | |
| 726 B<Run mode support has recently been added>. The module | |
| 727 B<Bio::Tools::Blast::Run::Webblast.pm> is an modularized adaptation of | |
| 728 the webblast script by Alex Dong Li: | |
| 729 | |
| 730 http://www.genet.sickkids.on.ca/bioinfo_resources/software.html#webblast | |
| 731 | |
| 732 for running remote Blast analyses and saving the results locally. Run | |
| 733 mode can be combined with a parse mode to generate a Blast report and | |
| 734 then build the Blast object from the parsed results of this report | |
| 735 (see L<run()|run> and L<SYNOPSIS | SYNOPSIS>). | |
| 736 | |
| 737 In read mode, the BLAST report is read in by the Blast object but is | |
| 738 not parsed. This could be used to internalize a Blast report but not | |
| 739 parse it for results (e.g., generating HTML formatted output). | |
| 740 | |
| 741 =head2 Significant Hits | |
| 742 | |
| 743 This module permits the screening of hits on the basis of | |
| 744 user-specified criteria for significance. Currently, Blast reports can | |
| 745 be screened based on: | |
| 746 | |
| 747 CRITERIA PARAMETER VALUE | |
| 748 ---------------------------------- --------- ---------------- | |
| 749 1) the best Expect (or P) value -signif float or sci-notation | |
| 750 2) the length of the query sequence -min_length integer | |
| 751 3) arbitrary criteria -filt_func function reference | |
| 752 | |
| 753 The parameters are used for construction of the BLAST object or when | |
| 754 running the L<parse()|parse> method on the static $Blast object. The | |
| 755 -SIGNIF value represents the number listed in the description section | |
| 756 at the top of the Blast report. For Blast2, this is an Expect value, | |
| 757 for Blast1 and WashU-Blast2, this is a P-value. The idea behind the | |
| 758 C<-filt_func> parameter is that the hit has to pass through a filter | |
| 759 to be considered significant. Refer to the documentation for | |
| 760 B<Bio::Tools::Blast::Sbjct.pm> for ways to work with hit objects. | |
| 761 | |
| 762 Using a C<-signif> parameter allows for the following: | |
| 763 | |
| 764 =over 2 | |
| 765 | |
| 766 =item Faster parsing. | |
| 767 | |
| 768 Each hit can be screened by examination of the description line alone | |
| 769 without fully parsing the HSP alignment section. | |
| 770 | |
| 771 =item Flexibility. | |
| 772 | |
| 773 The C<-signif> tag provides a more semantic-free way to specify the | |
| 774 value to be used as a basis for screening hits. Thus, C<-signif> can | |
| 775 be used for screening Blast1 or Blast2 reports. It is up to the user | |
| 776 to understand whether C<-signif> represents a P-value or an Expect | |
| 777 value. | |
| 778 | |
| 779 =back | |
| 780 | |
| 781 Any hit not meeting the significance criteria will not be added to the | |
| 782 "hit list" of the BLAST object. Also, a BLAST object without any hits | |
| 783 meeting the significance criteria will throw an exception during | |
| 784 object construction (a fatal event). | |
| 785 | |
| 786 =head2 Statistical Parameters | |
| 787 | |
| 788 There are numerous parameters which define the behavior of the BLAST | |
| 789 program and which are useful for interpreting the search | |
| 790 results. These parameters are extracted from the Blast report: | |
| 791 | |
| 792 filter -- for masking out low-complexity sequences or short repeats | |
| 793 matrix -- name of the substitution scoring matrix (e.g., BLOSUM62) | |
| 794 E -- Expect filter (screens out frequent scores) | |
| 795 S -- Cutoff score for segment pairs | |
| 796 W -- Word length | |
| 797 T -- Threshold score for word pairs | |
| 798 Lambda, -- Karlin-Altschul "sum" statistical parameters dependent on | |
| 799 K, H sequence composition. | |
| 800 G -- Gap creation penalty. | |
| 801 E -- Gap extension penalty. | |
| 802 | |
| 803 These parameters are not always needed. Extraction may be turned off | |
| 804 explicitly by including a C<-stats =E<gt> 0> parameter during object | |
| 805 construction. Support for all statistical parameters is not complete. | |
| 806 | |
| 807 For more about the meaning of parameters, check out the NCBI URLs given above. | |
| 808 | |
| 809 =head2 Module Organization | |
| 810 | |
| 811 The modules that comprise this Bioperl Blast distribution are location in the | |
| 812 Bio:: hierarchy as shown in the diagram below. | |
| 813 | |
| 814 Bio/ | |
| 815 | | |
| 816 +--------------------------+ | |
| 817 | | | |
| 818 Bio::Tools Bio::Root | |
| 819 | | | |
| 820 +----------------------+ Object.pm | |
| 821 | | | | |
| 822 SeqAnal.pm Blast.pm Blast/ | |
| 823 | | |
| 824 +---------+---------+------------+ | |
| 825 | | | | | |
| 826 Sbjct.pm HSP.pm HTML.pm Run/ | |
| 827 | | |
| 828 +------------+ | |
| 829 | | | |
| 830 Webblast.pm LocalBlast.pm | |
| 831 | |
| 832 Bio::Tools::Blast.pm is a concrete class that inherits from | |
| 833 B<Bio::Tools::SeqAnal.pm> and relies on other modules for parsing and | |
| 834 managing BLAST data. Worth mentioning about this hierarchy is the | |
| 835 lack of a "Parse.pm" module. Since parsing is considered central to | |
| 836 the purpose of the Bioperl Blast module (and Bioperl in general), it | |
| 837 seems somewhat unnatural to segregate out all parsing code. This | |
| 838 segregation could also lead to inefficiencies and harder to maintain | |
| 839 code. I consider this issue still open for debate. | |
| 840 | |
| 841 Bio::Tools::Blast.pm, B<Bio::Tools::Blast::Sbjct.pm>, and | |
| 842 B<Bio::Tools::Blast::HSP.pm> are mostly dedicated to parsing and all | |
| 843 can be used to instantiate objects. Blast.pm is the main "command and | |
| 844 control" module, inheriting some basic behaviors from SeqAnal.pm | |
| 845 (things that are not specific to Blast I<per se>). | |
| 846 | |
| 847 B<Bio::Tools::Blast::HTML.pm> contains functions dedicated to | |
| 848 generating HTML-formatted Blast reports and does not generate objects. | |
| 849 | |
| 850 =head2 Running Blasts: Details | |
| 851 | |
| 852 B<Bio::Tools::Blast::Run::Webblast.pm> contains a set of functions for | |
| 853 running Blast analyses at a remote server and also does not | |
| 854 instantiate objects. It uses a helper script called postclient.pl, | |
| 855 located in the Run directory. The proposed LocalBlast.pm module would | |
| 856 be used for running Blast reports on local machines and thus would be | |
| 857 customizable for different sites. It would operate in a parallel | |
| 858 fashion to Webblast.pm (i.e., being a collection of functions, taking | |
| 859 in sequence objects or files, returning result files). | |
| 860 | |
| 861 The Run modules are considered experimental. In particular, | |
| 862 Webblast.pm catures an HTML-formatted version of the Blast report from | |
| 863 the NCBI server and strips out the HTML in preparation for parsing. A | |
| 864 more direct approach would be to capture the Blast results directly | |
| 865 from the server using an interface to the NCBI toolkit. This approach | |
| 866 was recently proposed on the Bioperl mailing list: | |
| 867 http://www.uni-bielefeld.de/mailinglists/BCD/vsns-bcd-perl/9805/0000.html | |
| 868 | |
| 869 =head2 Memory Usage Issues | |
| 870 | |
| 871 Parsing large numbers of Blast reports (a few thousand or so) with | |
| 872 Bio::Tools::Blast.pm may lead to unacceptable memory usage situations. | |
| 873 This is somewhat dependent of the size and complexity of the reports. | |
| 874 | |
| 875 While this problem is under investigation, here are some workarounds | |
| 876 that fix the memory usage problem: | |
| 877 | |
| 878 =over 4 | |
| 879 | |
| 880 =item 1 Don't specify a -signif criterion when calling L<parse()|parse>. | |
| 881 | |
| 882 The C<-signif> value is used for imposing a upper limit to the expect- or | |
| 883 P-value for Blast hits to be parsed. For reasons that are still under | |
| 884 investigation, specifying a value for C<-signif> in the L<parse()|parse> | |
| 885 method prevents Blast objects from being fully | |
| 886 garbage collected. When using the B<parse_blast.pl> or B<parse_multi.pl> | |
| 887 scripts in C<examples/blast/> of the bioperl distribution), don't supply | |
| 888 a C<-signif> command-line parameter. | |
| 889 | |
| 890 =item 2 If you want to impose a -signif criterion, put it inside a | |
| 891 -filt_func. | |
| 892 | |
| 893 For the L<parse()|parse> method, a -signif =E<gt> 1e-5 parameter is equivalent | |
| 894 to using a filter function parameter of | |
| 895 | |
| 896 -filt_func => sub { my $hit = shift; return $hit->signif <= 1e-5; } | |
| 897 | |
| 898 Using the B<examples/blast/parse_multi.pl> script, you can supply a | |
| 899 command-line argument of | |
| 900 | |
| 901 -filt_func '$hit->signif <= 1e-5' | |
| 902 | |
| 903 For more information, see L<parse()|parse> and the section | |
| 904 L<Screening hits using arbitrary criteria>. | |
| 905 | |
| 906 =back | |
| 907 | |
| 908 =head1 TODO | |
| 909 | |
| 910 =over 4 | |
| 911 | |
| 912 =item * Develop a functional, prototype Bio::Tools::Blast::Run::LocalBlast.pm module. | |
| 913 | |
| 914 =item * Add support for PSI-BLAST and PHI-BLAST | |
| 915 | |
| 916 =item * Parse histogram of expectations and retrieve gif image in | |
| 917 Blast report (if present). | |
| 918 | |
| 919 =item * Further investigate memory leak that occurs when parsing Blast | |
| 920 streams whe supplying a -signif parameter to L<parse()|parse>. | |
| 921 | |
| 922 =item * Access Blast results directly from the NCBI server using a | |
| 923 Perl interface to the NCBI toolkit or XML formated Blast reports (when | |
| 924 available). | |
| 925 | |
| 926 =item * Further exploit Bio::UnivAln.pm and multiple-sequence | |
| 927 alignment programs using HSP sequence data. Some of this may best go | |
| 928 into a separate, dedicated module or script as opposed to burdening | |
| 929 Blast.pm, Sbjct.pm, and HSP.pm with additional functionality that is | |
| 930 not always required. | |
| 931 | |
| 932 =item * Add an example script for parsing Blast reports containing | |
| 933 HTML formatting. | |
| 934 | |
| 935 =back | |
| 936 | |
| 937 =head1 VERSION | |
| 938 | |
| 939 Bio::Tools::Blast.pm, 0.09 | |
| 940 | |
| 941 =head1 FEEDBACK | |
| 942 | |
| 943 =head2 Mailing Lists | |
| 944 | |
| 945 User feedback is an integral part of the evolution of this and other | |
| 946 Bioperl modules. Send your comments and suggestions preferably to one | |
| 947 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 948 | |
| 949 bioperl-l@bioperl.org - General discussion | |
| 950 http://bio.perl.org/MailList.html - About the mailing lists | |
| 951 | |
| 952 =head2 Reporting Bugs | |
| 953 | |
| 954 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 955 the bugs and their resolution. Bug reports can be submitted via email | |
| 956 or the web: | |
| 957 | |
| 958 bioperl-bugs@bio.perl.org | |
| 959 http://bugzilla.bioperl.org/ | |
| 960 | |
| 961 =head1 AUTHOR | |
| 962 | |
| 963 Steve Chervitz, sac@bioperl.org | |
| 964 | |
| 965 See the L<FEEDBACK | FEEDBACK> section for where to send bug reports and comments. | |
| 966 | |
| 967 =head1 ACKNOWLEDGEMENTS | |
| 968 | |
| 969 This module was developed under the auspices of the Saccharomyces Genome | |
| 970 Database: | |
| 971 http://genome-www.stanford.edu/Saccharomyces | |
| 972 | |
| 973 Other contributors include: Alex Dong Li (webblast), Chris Dagdigian | |
| 974 (Seq.pm), Steve Brenner (Seq.pm), Georg Fuellen (Seq.pm, UnivAln.pm), | |
| 975 and untold others who have offered comments (noted in the | |
| 976 Bio/Tools/Blast/CHANGES file of the distribution). | |
| 977 | |
| 978 =head1 COPYRIGHT | |
| 979 | |
| 980 Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. This | |
| 981 module is free software; you can redistribute it and/or modify it | |
| 982 under the same terms as Perl itself. | |
| 983 | |
| 984 =head1 SEE ALSO | |
| 985 | |
| 986 Bio::Tools::SeqAnal.pm - Sequence analysis object base class. | |
| 987 Bio::Tools::Blast::Sbjct.pm - Blast hit object. | |
| 988 Bio::Tools::Blast::HSP.pm - Blast HSP object. | |
| 989 Bio::Tools::Blast::HTML.pm - Blast HTML-formating utility class. | |
| 990 Bio::Tools::Blast::Run::Webblast.pm - Utility module for running Blasts remotely. | |
| 991 Bio::Tools::Blast::Run::LocalBlast.pm - Utility module for running Blasts locally. | |
| 992 Bio::Seq.pm - Biosequence object | |
| 993 Bio::UnivAln.pm - Biosequence alignment object. | |
| 994 Bio::Root::Object.pm - Proposed base class for all Bioperl objects. | |
| 995 | |
| 996 =head2 Links to related modules | |
| 997 | |
| 998 Bio::Tools::SeqAnal.pm | |
| 999 http://bio.perl.org/Core/POD/Bio/Tools/SeqAnal.html | |
| 1000 | |
| 1001 Bio::Tools::Blast::Sbjct.pm | |
| 1002 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Sbjct.html | |
| 1003 | |
| 1004 Bio::Tools::Blast::HSP.pm | |
| 1005 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HSP.html | |
| 1006 | |
| 1007 Bio::Tools::Blast::HTML.pm | |
| 1008 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HTML.html | |
| 1009 | |
| 1010 Bio::Tools::Blast::Run::Webblast.pm | |
| 1011 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/Webblast.html | |
| 1012 | |
| 1013 Bio::Tools::Blast::Run::LocalBlast.pm | |
| 1014 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/LocalBlast.html | |
| 1015 | |
| 1016 Bio::Seq.pm | |
| 1017 http://bio.perl.org/Core/POD/Seq.html | |
| 1018 | |
| 1019 Bio::UnivAln.pm | |
| 1020 http://bio.perl.org/Projects/SeqAlign/ | |
| 1021 Europe: http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/#univaln | |
| 1022 | |
| 1023 Bio::Root::Object.pm | |
| 1024 http://bio.perl.org/Core/POD/Root/Object.html | |
| 1025 | |
| 1026 http://bio.perl.org/Projects/modules.html - Online module documentation | |
| 1027 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project | |
| 1028 http://bio.perl.org/ - Bioperl Project Homepage | |
| 1029 | |
| 1030 L<References & Information about the BLAST program>. | |
| 1031 | |
| 1032 =head1 KNOWN BUGS | |
| 1033 | |
| 1034 There is a memory leak that occurs when parsing parsing streams | |
| 1035 containing large numbers of Blast reports (a few thousand or so) and | |
| 1036 specifying a -signif parameter to the L<parse()|parse> method. For a | |
| 1037 workaround, see L<Memory Usage Issues>. | |
| 1038 | |
| 1039 Not sharing statistical parameters between different Blast objects | |
| 1040 when parsing a multi-report stream has not been completely tested and | |
| 1041 may be a little buggy. | |
| 1042 | |
| 1043 Documentation inconsistencies or inaccuracies may exist since this | |
| 1044 module underwend a fair bit of re-working going from 0.75 to 0.80 | |
| 1045 (corresponds to versions 0.04.4 to 0.05 of the bioperl distribution). | |
| 1046 | |
| 1047 =cut | |
| 1048 | |
| 1049 # | |
| 1050 ## | |
| 1051 ### | |
| 1052 #### END of main POD documentation. | |
| 1053 ### | |
| 1054 ## | |
| 1055 # | |
| 1056 | |
| 1057 =head1 APPENDIX | |
| 1058 | |
| 1059 Methods beginning with a leading underscore are considered private and | |
| 1060 are intended for internal use by this module. They are B<not> | |
| 1061 considered part of the public interface and are described here for | |
| 1062 documentation purposes only. | |
| 1063 | |
| 1064 =cut | |
| 1065 | |
| 1066 ############################################################################## | |
| 1067 ## CONSTRUCTOR ## | |
| 1068 ############################################################################## | |
| 1069 | |
| 1070 | |
| 1071 sub new { | |
| 1072 my ($class,@args) = @_; | |
| 1073 $class->warn("Bio::Tools::BLAST is deprecated, use Bio::SearchIO system or Bio::Tools::BPlite"); | |
| 1074 return $class->SUPER::new(@args); | |
| 1075 } | |
| 1076 | |
| 1077 ## The Blast.pm object relies on the the superclass constructor: | |
| 1078 ## Bio::Tools::SeqAnal::_initialize(). See that module for details. | |
| 1079 | |
| 1080 #------------- | |
| 1081 sub destroy { | |
| 1082 #------------- | |
| 1083 my $self=shift; | |
| 1084 $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}"; | |
| 1085 if($self->{'_hits'}) { | |
| 1086 foreach($self->hits) { | |
| 1087 $_->destroy; | |
| 1088 undef $_; | |
| 1089 } | |
| 1090 undef $self->{'_hits'}; | |
| 1091 #$self->{'_hits'}->remove_all; ## When and if this member becomes a vector. | |
| 1092 } | |
| 1093 | |
| 1094 $self->SUPER::destroy; | |
| 1095 } | |
| 1096 | |
| 1097 ##################################################################################### | |
| 1098 ## ACCESSORS ## | |
| 1099 ##################################################################################### | |
| 1100 | |
| 1101 =head2 run | |
| 1102 | |
| 1103 Usage : $object->run( %named_parameters ) | |
| 1104 Purpose : Run a local or remote Blast analysis on one or more sequences. | |
| 1105 Returns : String containing name of Blast output file if a single Blast | |
| 1106 : is run. | |
| 1107 : -- OR -- | |
| 1108 : List of Blast objects if multiple Blasts are being run as a group. | |
| 1109 Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). | |
| 1110 : -METHOD => 'local' or 'remote' (default = remote), | |
| 1111 : -PARSE => boolean, (true if the results are to be parsed after the run) | |
| 1112 : -STRICT => boolean, the strict mode to use for the resulting Blast objects. | |
| 1113 : ADDITIONAL PARAMETERS: | |
| 1114 : See methods _run_remote() and _run_local() for required | |
| 1115 : parameters necessary for running the blast report. | |
| 1116 Throws : Exception if no Blast output file was obtained. | |
| 1117 Comments : This method is called automatically during construction of a | |
| 1118 : Blast.pm object when run parameters are sent to the constructor: | |
| 1119 : $blastObj = new Bio::Tools::Blast (-RUN =>\%runParam, | |
| 1120 : %parseParam ); | |
| 1121 : | |
| 1122 : The specific run methods (local or remote) called by run() | |
| 1123 : must return a list containing the file name(s) with the Blast output. | |
| 1124 : | |
| 1125 : The run() method can perform single or multiple Blast runs | |
| 1126 : (analogous to the way parse() works) depending on how many | |
| 1127 : sequences are submitted. However, the running of multiple | |
| 1128 : Blasts is probably better handled at the script level. See notes in | |
| 1129 : the "TODO" section below. | |
| 1130 : | |
| 1131 : As for what to do with the Blast result file, that decision is | |
| 1132 : left for the user who can direct the Blast object to delete, compress, | |
| 1133 : or leave it alone. | |
| 1134 : | |
| 1135 : This method does not worry about load balancing, which | |
| 1136 : is probably best handled at the server level. | |
| 1137 : | |
| 1138 TODO: : Support for running+parsing multiple Blast analyses with a | |
| 1139 : single run() call is incomplete. One can generate multiple | |
| 1140 : reports by placing more than one sequence object in the -seqs | |
| 1141 : reference parameter. This saves some overhead in the code | |
| 1142 : that executes the Blasts since all options are configured once. | |
| 1143 : (This is analogous to parsing using the static $Blast object | |
| 1144 : see parse() and _parse_stream()). | |
| 1145 : | |
| 1146 : The trouble is that Blast objects for all runs are constructed, | |
| 1147 : parsed (if necessary), and then returned as a group | |
| 1148 : This can require lots of memory when run+parsing many Blasts | |
| 1149 : but should be fine if you just want to run a bunch Blasts. | |
| 1150 : | |
| 1151 : For now, when running+parsing Blasts, stick to running one | |
| 1152 : Blast at a time, building the Blast object with the results | |
| 1153 : of that report, and processing as necessary. | |
| 1154 : | |
| 1155 : Support for running PSI-Blast is not complete. | |
| 1156 | |
| 1157 See Also: L<_run_remote()|_run_remote>, L<_run_local()|_run_local>, L<parse()|parse> | |
| 1158 | |
| 1159 =cut | |
| 1160 | |
| 1161 #--------- | |
| 1162 sub run { | |
| 1163 #--------- | |
| 1164 my ($self, %param) = @_; | |
| 1165 my($method, $parse, $strict) = | |
| 1166 $self->_rearrange([qw(METHOD PARSE STRICT)], %param); | |
| 1167 | |
| 1168 $strict = $self->strict($strict) if $strict; | |
| 1169 | |
| 1170 my (@files); | |
| 1171 if($method =~ /loc/i) { | |
| 1172 @files = $self->_run_local(%param); | |
| 1173 | |
| 1174 } else { | |
| 1175 @files = $self->_run_remote(%param); | |
| 1176 } | |
| 1177 | |
| 1178 $self->throw("Run Blast failed: no Blast output created.") if !@files; | |
| 1179 | |
| 1180 if(scalar(@files) == 1) { | |
| 1181 # If there was just one Blast output file, prepare to incorporate it | |
| 1182 # into the current Blast object. run() is called before parse() in the | |
| 1183 # SeqAnal.pm constructor. | |
| 1184 if($files[0] ne 'email') { | |
| 1185 $self->file($files[0]); | |
| 1186 } else { | |
| 1187 # Can't do anything with the report. | |
| 1188 $self->throw("Blast report to be sent via e-mail."); | |
| 1189 } | |
| 1190 | |
| 1191 } else { | |
| 1192 # If there are multiple report files, build individual Blast objects foreach. | |
| 1193 # In this situation, the static $Blast object is being used to run | |
| 1194 # a set of related Blasts, similar to the way parse() can be used. | |
| 1195 # This strategy is not optimal since all reports are generated first | |
| 1196 # before any are parsed. | |
| 1197 # Untested. | |
| 1198 | |
| 1199 my(@objs); | |
| 1200 foreach(@files) { | |
| 1201 push @objs, new Bio::Tools::Blast(-FILE => $_, | |
| 1202 -PARSE => $parse || 0, | |
| 1203 -STRICT => $strict, | |
| 1204 ); | |
| 1205 } | |
| 1206 return @objs; | |
| 1207 } | |
| 1208 } | |
| 1209 | |
| 1210 =head2 _run_remote | |
| 1211 | |
| 1212 Usage : n/a; internal method called by run() | |
| 1213 : $object->_run_remote( %named_parameters ) | |
| 1214 Purpose : Run Blast on a remote server. | |
| 1215 Argument : Named parameters: | |
| 1216 : See documentation for function &blast_remote in | |
| 1217 : Bio::Tools::Blast::Run::Webblast.pm for description | |
| 1218 : of parameters. | |
| 1219 Comments : This method requires the Bio::Tools::Blast::Run::Webblast.pm | |
| 1220 : which conforms to this minimal API: | |
| 1221 : * export a method called &blast_remote that accepts a | |
| 1222 : Bio::Tools::Blast.pm object + named parameters | |
| 1223 : (specified in the Webblast.pm module). | |
| 1224 : * return a list of names of files containing the raw Blast reports. | |
| 1225 : (When building a Blast object, this list would contain a | |
| 1226 : single file from which the Blast object is to be constructed). | |
| 1227 | |
| 1228 See Also : L<run()|run>, L<_run_local()|_run_local>, B<Bio::Tools::Blast::Run::Webblast.pm::blast_remote()>, L<Links to related modules> | |
| 1229 | |
| 1230 =cut | |
| 1231 | |
| 1232 #---------------- | |
| 1233 sub _run_remote { | |
| 1234 #---------------- | |
| 1235 my ($self, %param) = @_; | |
| 1236 | |
| 1237 require Bio::Tools::Blast::Run::Webblast; | |
| 1238 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_remote)); | |
| 1239 | |
| 1240 &blast_remote($self, %param); | |
| 1241 } | |
| 1242 | |
| 1243 =head2 _run_local | |
| 1244 | |
| 1245 Usage : n/a; internal method called by run() | |
| 1246 : $object->_run_local(%named_parameters) | |
| 1247 Purpose : Run Blast on a local machine. | |
| 1248 Argument : Named parameters: | |
| 1249 : See documentation for function &blast_local in | |
| 1250 : Bio::Tools::Blast::Run::LocalBlast.pm for description | |
| 1251 : of parameters. | |
| 1252 Comments : This method requires the Bio::Tools::Blast::Run::LocalBlast.pm | |
| 1253 : module which should be customized for your site. This module would | |
| 1254 : contain all the commands, paths, environment variables, and other | |
| 1255 : data necessary to run Blast commands on a local machine, but should | |
| 1256 : not contain any semantics for specific query sequences. | |
| 1257 : | |
| 1258 : LocalBlast.pm should also conform to this minimal API: | |
| 1259 : * export a method called &blast_local that accepts a | |
| 1260 : Bio::Tools::Blast.pm object + named parameters | |
| 1261 : (specified in the LocalBlast.pm module). | |
| 1262 : * return a list of names of files containing the raw Blast reports. | |
| 1263 : (When building a Blast object, this list would contain a | |
| 1264 : single file from which the Blast object is to be constructed). | |
| 1265 | |
| 1266 See Also : L<run()|run>, L<_run_remote()|_run_remote>, B<Bio::Tools::Blast::Run::LocalBlast::blast_local()>, L<Links to related modules> | |
| 1267 | |
| 1268 =cut | |
| 1269 | |
| 1270 #-------------- | |
| 1271 sub _run_local { | |
| 1272 #-------------- | |
| 1273 my ($self, %param) = @_; | |
| 1274 | |
| 1275 require Bio::Tools::Blast::Run::Webblast; | |
| 1276 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_local)); | |
| 1277 | |
| 1278 &blast_local($self, %param); | |
| 1279 } | |
| 1280 | |
| 1281 =head2 db_remote | |
| 1282 | |
| 1283 Usage : @dbs = $Blast->db_remote( [seq_type] ); | |
| 1284 Purpose : Get a list of available sequence databases for remote Blast analysis. | |
| 1285 Returns : Array of strings | |
| 1286 Argument : seq_type = 'p' or 'n' | |
| 1287 : 'p' = Gets databases for peptide searches (default) | |
| 1288 : 'n' = Gets databases for nucleotide searches | |
| 1289 Throws : n/a | |
| 1290 Comments : Peptide databases are a subset of the nucleotide databases. | |
| 1291 : It is convenient to call this method on the static $Blast object | |
| 1292 : as shown in Usage. | |
| 1293 | |
| 1294 See Also : L<db_local()|db_local> | |
| 1295 | |
| 1296 =cut | |
| 1297 | |
| 1298 #---------------- | |
| 1299 sub db_remote { | |
| 1300 #---------------- | |
| 1301 my ($self, $type) = @_; | |
| 1302 $type ||= 'p'; | |
| 1303 | |
| 1304 require Bio::Tools::Blast::Run::Webblast; | |
| 1305 Bio::Tools::Blast::Run::Webblast->import(qw(@Blast_dbp_remote | |
| 1306 @Blast_dbn_remote)); | |
| 1307 | |
| 1308 # We shouldn't have to fully qualify the Blast_dbX_remote arrays. Hm. | |
| 1309 | |
| 1310 my(@dbs); | |
| 1311 if( $type =~ /^p|amino/i) { | |
| 1312 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbp_remote; | |
| 1313 } else { | |
| 1314 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbn_remote; | |
| 1315 } | |
| 1316 @dbs; | |
| 1317 } | |
| 1318 | |
| 1319 =head2 db_local | |
| 1320 | |
| 1321 Usage : @dbs = $Blast->db_local( [seq_type] ); | |
| 1322 Purpose : Get a list of available sequence databases for local Blast analysis. | |
| 1323 Returns : Array of strings | |
| 1324 Argument : seq_type = 'p' or 'n' | |
| 1325 : 'p' = Gets databases for peptide searches (default) | |
| 1326 : 'n' = Gets databases for nucleotide searches | |
| 1327 Throws : n/a | |
| 1328 Comments : Peptide databases are a subset of the nucleotide databases. | |
| 1329 : It is convenient to call this method on the static $Blast object. | |
| 1330 as shown in Usage. | |
| 1331 | |
| 1332 See Also : L<db_remote()|db_remote> | |
| 1333 | |
| 1334 =cut | |
| 1335 | |
| 1336 #---------------- | |
| 1337 sub db_local { | |
| 1338 #---------------- | |
| 1339 my ($self, $type) = @_; | |
| 1340 $type ||= 'p'; | |
| 1341 | |
| 1342 require Bio::Tools::Blast::Run::LocalBlast; | |
| 1343 Bio::Tools::Blast::Run::LocalBlast->import(qw(@Blast_dbp_local | |
| 1344 @Blast_dbn_local)); | |
| 1345 | |
| 1346 # We shouldn't have to fully qualify the Blast_dbX_local arrays. Hm. | |
| 1347 | |
| 1348 my(@dbs); | |
| 1349 if( $type =~ /^p|amino/i) { | |
| 1350 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbp_local; | |
| 1351 } else { | |
| 1352 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbn_local; | |
| 1353 } | |
| 1354 @dbs; | |
| 1355 } | |
| 1356 | |
| 1357 =head2 parse | |
| 1358 | |
| 1359 Usage : $blast_object->parse( %named_parameters ) | |
| 1360 Purpose : Parse a Blast report from a file or STDIN. | |
| 1361 : * Parses a raw BLAST data, populating Blast object with report data. | |
| 1362 : * Sets the significance cutoff. | |
| 1363 : * Extracts statistical parameters about the BLAST run. | |
| 1364 : * Handles both single files and streams containing multiple reports. | |
| 1365 Returns : integer (number of Blast reports parsed) | |
| 1366 Argument : <named parameters>: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). | |
| 1367 : -FILE => string (name of file containing raw Blast output. | |
| 1368 : Optional. If a valid file is not supplied, | |
| 1369 : STDIN will be used). | |
| 1370 : -SIGNIF => number (float or scientific notation number to be used | |
| 1371 : as a P- or Expect value cutoff; | |
| 1372 : default = $DEFAULT_SIGNIF (999)). | |
| 1373 : -FILT_FUNC => func_ref (reference to a function to be used for | |
| 1374 : filtering out hits based on arbitrary criteria. | |
| 1375 : This function should take a | |
| 1376 : Bio::Tools::Blast::Sbjct.pm object as its first | |
| 1377 : argument and return a boolean value, | |
| 1378 : true if the hit should be filtered out). | |
| 1379 : Sample filter function: | |
| 1380 : -FILT_FUNC => sub { $hit = shift; | |
| 1381 : $hit->gaps == 0; }, | |
| 1382 : -CHECK_ALL_HITS => boolean (check all hits for significance against | |
| 1383 : significance criteria. Default = false. | |
| 1384 : If false, stops processing hits after the first | |
| 1385 : non-significant hit or the first hit that fails | |
| 1386 : the filt_func call. This speeds parsing, | |
| 1387 : taking advantage of the fact that the hits | |
| 1388 : are processed in the order they are ranked.) | |
| 1389 : -MIN_LEN => integer (to be used as a minimum query sequence length | |
| 1390 : sequences below this length will not be processed). | |
| 1391 : default = no minimum length). | |
| 1392 : -STATS => boolean (collect stats for report: matrix, filters, etc. | |
| 1393 : default = false). | |
| 1394 : -BEST => boolean (only process the best hit of each report; | |
| 1395 : default = false). | |
| 1396 : -OVERLAP => integer (the amount of overlap to permit between | |
| 1397 : adjacent HSPs when tiling HSPs, | |
| 1398 : Default = $MAX_HSP_OVERLAP (2)) | |
| 1399 : | |
| 1400 : PARAMETERS USED WHEN PARSING MULTI-REPORT STREAMS: | |
| 1401 : -------------------------------------------------- | |
| 1402 : -SHARE => boolean (set this to true if all reports in stream | |
| 1403 : share the same stats. Default = true) | |
| 1404 : Must be set to false when parsing both Blast1 and | |
| 1405 : Blast2 reports in the same run or if you need | |
| 1406 : statistical params for each report, Lambda, K, H). | |
| 1407 : -STRICT => boolean (use strict mode for all Blast objects created. | |
| 1408 : Increases sensitivity to errors. For single | |
| 1409 : Blasts, this is parameter is sent to new().) | |
| 1410 : -EXEC_FUNC => func_ref (reference to a function for processing each | |
| 1411 : Blast object after it is parsed. Should accept a | |
| 1412 : Blast object as its sole argument. Return value | |
| 1413 : is ignored. If an -EXEC_FUNC parameter is supplied, | |
| 1414 : the -SAVE_ARRAY parameter will be ignored.) | |
| 1415 : -SAVE_ARRAY =>array_ref, (reference to an array for storing all | |
| 1416 : Blast objects as they are created. | |
| 1417 : Experimental. Not recommended.) | |
| 1418 : -SIGNIF_FMT => boolean String of 'exp' or 'parts'. Sets the format | |
| 1419 : for reporting P/Expect values. 'exp' reports | |
| 1420 : only the exponent portion. 'parts' reports | |
| 1421 : them as a 2 element list. See signif_fmt().. | |
| 1422 : | |
| 1423 Throws : Exception if BLAST report contains a FATAL: error. | |
| 1424 : Propagates any exception thrown by read(). | |
| 1425 : Propagates any exception thrown by called parsing methods. | |
| 1426 Comments : This method can be called either directly using the static $Blast object | |
| 1427 : or indirectly (by Bio::Tools::SeqAnal.pm) during constuction of an | |
| 1428 : individual Blast object. | |
| 1429 : | |
| 1430 : HTML-formatted reports can be parsed as well. No special flag is required | |
| 1431 : since it is detected automatically. The presence of HTML-formatting | |
| 1432 : will result in slower performace, however, since it must be removed | |
| 1433 : prior to parsing. Parsing HTML-formatted reports is highly | |
| 1434 : error prone and is generally not recommended. | |
| 1435 : | |
| 1436 : If one has an HTML report, do NOT remove the HTML from it by using the | |
| 1437 : "Save As" option of a web browser to save it as text. This renders the | |
| 1438 : report unparsable. | |
| 1439 : HTML-formatted reports can be parsed after running through the strip_html | |
| 1440 : function of Blast::HTML.pm as in: | |
| 1441 : require Bio::Tools::Blast::HTML; | |
| 1442 : Bio::Tools::Blast::HTML->import(&strip_html); | |
| 1443 : &strip_html(\$data); | |
| 1444 : # where data contains full contents of an HTML-formatted report. | |
| 1445 : TODO: write a demo script that does this. | |
| 1446 | |
| 1447 See Also : L<_init_parse_params()|_init_parse_params>, L<_parse_blast_stream()|_parse_blast_stream>, L<overlap()|overlap>, L<signif_fmt()|signif_fmt>, B<Bio::Root::Object::read()>, B<Bio::Tools::Blast::HTML.pm::strip_html()>, L<Links to related modules> | |
| 1448 | |
| 1449 =cut | |
| 1450 | |
| 1451 #--------- | |
| 1452 sub parse { | |
| 1453 #--------- | |
| 1454 # $self might be the static $Blast object. | |
| 1455 my ($self, @param) = @_; | |
| 1456 | |
| 1457 my($signif, $filt_func, $min_len, $check_all, $overlap, $stats, | |
| 1458 $share, $strict, $best, $signif_fmt, $no_aligns) = | |
| 1459 $self->_rearrange([qw(SIGNIF FILT_FUNC MIN_LEN CHECK_ALL_HITS | |
| 1460 OVERLAP STATS SHARE STRICT | |
| 1461 BEST EXPONENT NO_ALIGNS )], @param); | |
| 1462 | |
| 1463 ## Initialize the static Blast object with parameters that | |
| 1464 ## apply to all Blast objects within a parsing session. | |
| 1465 | |
| 1466 &_init_parse_params($share, $filt_func, $check_all, | |
| 1467 $signif, $min_len, $strict, | |
| 1468 $best, $signif_fmt, $stats, $no_aligns | |
| 1469 ); | |
| 1470 | |
| 1471 my $count = $self->_parse_blast_stream(@param); | |
| 1472 | |
| 1473 # print STDERR "\nDONE PARSING STREAM.\n"; | |
| 1474 | |
| 1475 if($Blast->{'_blast_errs'}) { | |
| 1476 my @errs = @{$Blast->{'_blast_errs'}}; | |
| 1477 printf STDERR "\n*** %d BLAST REPORTS HAD FATAL ERRORS:\n", scalar(@errs); | |
| 1478 foreach(@errs) { print STDERR "$_\n"; } | |
| 1479 @{$Blast->{'_blast_errs'}} = (); | |
| 1480 } | |
| 1481 | |
| 1482 return $count; | |
| 1483 } | |
| 1484 | |
| 1485 =head2 _init_parse_params | |
| 1486 | |
| 1487 Title : _init_parse_params | |
| 1488 Usage : n/a; called automatically by parse() | |
| 1489 Purpose : Initializes parameters used during parsing of Blast reports. | |
| 1490 : This is a static method used by the $Blast object. | |
| 1491 : Calls _set_signif(). | |
| 1492 Example : | |
| 1493 Returns : n/a | |
| 1494 Args : Args extracted by parse(). | |
| 1495 | |
| 1496 See Also: L<parse()|parse>, L<_set_signif()|_set_signif> | |
| 1497 | |
| 1498 =cut | |
| 1499 | |
| 1500 #---------------------- | |
| 1501 sub _init_parse_params { | |
| 1502 #---------------------- | |
| 1503 my ($share, $filt_func, $check_all, | |
| 1504 $signif, $min_len, $strict, | |
| 1505 $best, $signif_fmt, $stats, $no_aligns) = @_; | |
| 1506 | |
| 1507 ## Default is to share stats. | |
| 1508 $Blast->{'_share'} = defined($share) ? $share : 1; | |
| 1509 $Blast->{'_filt_func'} = $filt_func || 0; | |
| 1510 $Blast->{'_check_all'} = $check_all || 0; | |
| 1511 $Blast->{'_signif_fmt'} ||= $signif_fmt || ''; | |
| 1512 $Blast->{'_no_aligns'} = $no_aligns || 0; | |
| 1513 | |
| 1514 &_set_signif($signif, $min_len, $filt_func); | |
| 1515 $Blast->strict($strict) if defined $strict; | |
| 1516 $Blast->best($best) if $best; | |
| 1517 $Blast->{'_blast_count'} = 0; | |
| 1518 | |
| 1519 ## If $stats is false, miscellaneous statistical and other parameters | |
| 1520 ## are NOT extracted from the Blast report (e.g., matrix name, filter used, etc.). | |
| 1521 ## This can speed processing when crunching tons of Blast reports. | |
| 1522 ## Default is to NOT get stats. | |
| 1523 $Blast->{'_get_stats'} = defined($stats) ? $stats : 0; | |
| 1524 | |
| 1525 # Clear any errors from previous parse. | |
| 1526 undef $Blast->{'_blast_errs'}; | |
| 1527 } | |
| 1528 | |
| 1529 =head2 _set_signif | |
| 1530 | |
| 1531 Usage : n/a; called automatically by _init_parse_params() | |
| 1532 : This is now a "static" method used only by $Blast. | |
| 1533 : _set_signif($signif, $min_len, $filt_func); | |
| 1534 Purpose : Sets significance criteria for the BLAST object. | |
| 1535 Argument : Obligatory three arguments: | |
| 1536 : $signif = float or sci-notation number or undef | |
| 1537 : $min_len = integer or undef | |
| 1538 : $filt_func = function reference or undef | |
| 1539 : | |
| 1540 : If $signif is undefined, a default value is set | |
| 1541 : (see $DEFAULT_SIGNIF; min_length = not set). | |
| 1542 Throws : Exception if significance value is defined but appears | |
| 1543 : out of range or invalid. | |
| 1544 : Exception if $filt_func if defined and is not a func ref. | |
| 1545 Comments : The significance of a BLAST report can be based on | |
| 1546 : the P (or Expect) value and/or the length of the query sequence. | |
| 1547 : P (or Expect) values GREATER than '_significance' are not significant. | |
| 1548 : Query sequence lengths LESS than '_min_length' are not significant. | |
| 1549 : | |
| 1550 : Hits can also be screened using arbitrary significance criteria | |
| 1551 : as discussed in the parse() method. | |
| 1552 : | |
| 1553 : If no $signif is defined, the '_significance' level is set to | |
| 1554 : $Bio::Tools::Blast::DEFAULT_SIGNIF (999). | |
| 1555 | |
| 1556 See Also : L<signif()|signif>, L<min_length()|min_length>, L<_init_parse_params()|_init_parse_params>, L<parse()|parse> | |
| 1557 | |
| 1558 =cut | |
| 1559 | |
| 1560 #----------------- | |
| 1561 sub _set_signif { | |
| 1562 #----------------- | |
| 1563 my( $sig, $len, $func ) = @_; | |
| 1564 | |
| 1565 if(defined $sig) { | |
| 1566 $Blast->{'_confirm_significance'} = 1; | |
| 1567 if( $sig =~ /[^\d.e-]/ or $sig <= 0) { | |
| 1568 $Blast->throw("Invalid significance value: $sig", | |
| 1569 "Must be greater than zero."); | |
| 1570 } | |
| 1571 $Blast->{'_significance'} = $sig; | |
| 1572 } else { | |
| 1573 $Blast->{'_significance'} = $DEFAULT_SIGNIF; | |
| 1574 $Blast->{'_check_all'} = 1 if not $Blast->{'_filt_func'}; | |
| 1575 } | |
| 1576 | |
| 1577 if(defined $len) { | |
| 1578 if($len =~ /\D/ or $len <= 0) { | |
| 1579 $Blast->warn("Invalid minimum length value: $len", | |
| 1580 "Value must be an integer > 0. Value not set."); | |
| 1581 } else { | |
| 1582 $Blast->{'_min_length'} = $len; | |
| 1583 } | |
| 1584 } | |
| 1585 | |
| 1586 if(defined $func) { | |
| 1587 $Blast->{'_confirm_significance'} = 1; | |
| 1588 if($func and not ref $func eq 'CODE') { | |
| 1589 $Blast->throw("Not a function reference: $func", | |
| 1590 "The -filt_func parameter must be function reference."); | |
| 1591 } | |
| 1592 } | |
| 1593 } | |
| 1594 | |
| 1595 =head2 _parse_blast_stream | |
| 1596 | |
| 1597 Usage : n/a. Internal method called by parse() | |
| 1598 Purpose : Obtains the function to be used during parsing and calls read(). | |
| 1599 Returns : Integer (the number of blast reports read) | |
| 1600 Argument : Named parameters (forwarded from parse()) | |
| 1601 Throws : Propagates any exception thrown by _get_parse_blast_func() and read(). | |
| 1602 | |
| 1603 See Also : L<_get_parse_blast_func()|_get_parse_blast_func>, B<Bio::Root::Object::read()> | |
| 1604 | |
| 1605 =cut | |
| 1606 | |
| 1607 #---------------------- | |
| 1608 sub _parse_blast_stream { | |
| 1609 #---------------------- | |
| 1610 my ($self, %param) = @_; | |
| 1611 | |
| 1612 my $func = $self->_get_parse_blast_func(%param); | |
| 1613 # my $func = sub { my $data = shift; | |
| 1614 # printf STDERR "Chunk length = %d\n", length($data); | |
| 1615 # sleep(3); | |
| 1616 # }; | |
| 1617 | |
| 1618 # Only setting the newline character once per session. | |
| 1619 $Newline ||= $Util->get_newline(-client => $self, %param); | |
| 1620 | |
| 1621 $self->read(-REC_SEP =>"$Newline>", | |
| 1622 -FUNC => $func, | |
| 1623 %param); | |
| 1624 | |
| 1625 return $Blast->{'_blast_count'}; | |
| 1626 } | |
| 1627 | |
| 1628 =head2 _get_parse_blast_func | |
| 1629 | |
| 1630 Usage : n/a; internal method used by _parse_blast_stream() | |
| 1631 : $func_ref = $blast_object->_get_parse_blast_func() | |
| 1632 Purpose : Generates a function ref to be used as a closure for parsing | |
| 1633 : raw data as it is being loaded by Bio::Root::IOManager::read(). | |
| 1634 Returns : Function reference (closure). | |
| 1635 Comments : The the function reference contains a fair bit of logic | |
| 1636 : at present. It could perhaps be split up into separate | |
| 1637 : functions to make it more 'digestible'. | |
| 1638 | |
| 1639 See Also : L<_parse_blast_stream()|_parse_blast_stream> | |
| 1640 | |
| 1641 =cut | |
| 1642 | |
| 1643 #-------------------------- | |
| 1644 sub _get_parse_blast_func { | |
| 1645 #-------------------------- | |
| 1646 my ($self, @param) = @_; | |
| 1647 | |
| 1648 my ($save_a, $exec_func) = | |
| 1649 $self->_rearrange([qw(SAVE_ARRAY EXEC_FUNC)], @param); | |
| 1650 | |
| 1651 # $MONITOR && print STDERR "\nParsing Blast stream (5/dot, 250/line)\n"; | |
| 1652 my $count = 0; | |
| 1653 my $strict = $self->strict(); | |
| 1654 | |
| 1655 # Some parameter validation. | |
| 1656 # Remember, all Blast parsing will use this function now. | |
| 1657 # You won't need a exec-func or save_array when just creating a Blast object | |
| 1658 # as in: $blast = new Bio::Tools::Blast(); | |
| 1659 if($exec_func and not ref($exec_func) eq 'CODE') { | |
| 1660 $self->throw("The -EXEC_FUNC parameter must be function reference.", | |
| 1661 "exec_func = $exec_func"); | |
| 1662 | |
| 1663 } elsif($save_a and not ref($save_a) eq 'ARRAY') { | |
| 1664 $self->throw("The -SAVE_ARRAY parameter must supply an array reference". | |
| 1665 "when not using an -EXEC_FUNC parameter."); | |
| 1666 } | |
| 1667 | |
| 1668 ## Might consider breaking this closure up if possible. | |
| 1669 | |
| 1670 return sub { | |
| 1671 my ($data) = @_; | |
| 1672 ## $data should contain one of three possible fragment types | |
| 1673 ## from a Blast report: | |
| 1674 ## 1. Header with description section, | |
| 1675 ## 2. An alignment section for a single hit, or | |
| 1676 ## 3. The final alignment section plus the footer section. | |
| 1677 ## (record separator = "Newline>"). | |
| 1678 | |
| 1679 # print STDERR "\n(BLAST) DATA CHUNK: $data\n"; | |
| 1680 | |
| 1681 my ($current_blast, $current_prog, $current_vers, $current_db); | |
| 1682 my $prev_blast; | |
| 1683 my $contains_translation = 0; | |
| 1684 | |
| 1685 ### steve --- Wed Mar 15 02:48:07 2000 | |
| 1686 ### In the process of addressing bug PR#95. Tricky. | |
| 1687 ### Using the $contains_translation to do so. Not complete | |
| 1688 ### and possibly won't fix. We'll see. | |
| 1689 | |
| 1690 # Check for header section. Start a new Blast object and | |
| 1691 # parse the description section. | |
| 1692 # if ($data =~ /\sQuery\s?=/s || ($contains_translation && $data =~ /Database:/s)) { | |
| 1693 if ($data =~ /\sQuery\s?=/s) { | |
| 1694 $Blast->{'_blast_count'}++; | |
| 1695 print STDERR ".", $Blast->{'_blast_count'} % 50 ? '' : "\n" if $MONITOR; | |
| 1696 | |
| 1697 if($data =~ /$Newline\s+Translating/so) { | |
| 1698 print STDERR "\nCONTAINS TRANSLATION\n"; | |
| 1699 $contains_translation = 1; | |
| 1700 } | |
| 1701 | |
| 1702 # If we're parsing a stream containing multiple reports, | |
| 1703 # all subsequent header sections will contain the last hit of | |
| 1704 # the previous report which needs to be parsed and added to that | |
| 1705 # report if signifcant. It also contains the run parameters | |
| 1706 # at the bottom of the Blast report. | |
| 1707 # if($Blast->{'_blast_count'} > 1 || $contains_translation) { | |
| 1708 if($Blast->{'_blast_count'} > 1) { | |
| 1709 # print STDERR "\nMULTI-BLAST STREAM.\n"; | |
| 1710 $Blast->{'_multi_stream'} = 1; | |
| 1711 | |
| 1712 if($data =~ /(.+?)$Newline(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so) { | |
| 1713 ($current_prog, $current_vers, $data) = ($3, $4, $5); | |
| 1714 # Final chunk containing last hit and last footer. | |
| 1715 $Blast->{'_current_blast'}->_parse_alignment($1); | |
| 1716 $prev_blast = $Blast->{'_current_blast'}; # finalized. | |
| 1717 # } elsif($contains_translation) { | |
| 1718 # $data =~ /(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so; | |
| 1719 # ($current_prog, $current_vers, $data) = ($1, $2, $3); | |
| 1720 } else { | |
| 1721 $Blast->throw("Can't determine program type from BLAST report.", | |
| 1722 "Checked for: @Blast_programs."); | |
| 1723 # This has important implications for how to handle interval | |
| 1724 # information for HSPs. TBLASTN uses nucleotides in query HSP | |
| 1725 # but amino acids in the sbjct HSP sequence. | |
| 1726 } | |
| 1727 | |
| 1728 if($data =~ m/Database:\s+(.+?)$Newline/so ) { | |
| 1729 $current_db = $1; | |
| 1730 } else { | |
| 1731 # In some reports, the Database is only listed at end. | |
| 1732 #$Blast->warn("Can't determine database name from BLAST report."); | |
| 1733 } | |
| 1734 | |
| 1735 # Incyte_Fix: Nasty Invisible Bug. | |
| 1736 # Records in blast report are delimited by '>', but... when | |
| 1737 # there are no hits for a query, there won't be a '>'. That | |
| 1738 # causes several blast reports to run together in the data | |
| 1739 # passed to this routine. Need to get rid of non-hits in data | |
| 1740 if ($data =~ /.+(No hits? found.+Sequences.+)/so) { | |
| 1741 $data = $1; | |
| 1742 } | |
| 1743 # End Incyte_Fix | |
| 1744 | |
| 1745 } | |
| 1746 | |
| 1747 # Determine if we need to create a new Blast object | |
| 1748 # or use the $self object for this method. | |
| 1749 | |
| 1750 if($Blast->{'_multi_stream'} or $self->name eq 'Static Blast object') { | |
| 1751 # Strict mode is not object-specific but may be someday. | |
| 1752 # print STDERR "\nCreating new Blast object.\n"; | |
| 1753 $current_blast = new Bio::Tools::Blast(-STRICT => $strict); | |
| 1754 } else { | |
| 1755 $current_blast = $self; | |
| 1756 } | |
| 1757 $Blast->{'_current_blast'} = $current_blast; | |
| 1758 | |
| 1759 # If we're not sharing stats, set data on current blast object. | |
| 1760 if(defined $current_prog and not $Blast->{'_share'}) { | |
| 1761 $current_blast->program($current_prog); | |
| 1762 $current_blast->program_version($current_vers); | |
| 1763 $current_blast->database($current_db); | |
| 1764 } | |
| 1765 | |
| 1766 # print STDERR "CURRENT BLAST = ", $current_blast->name, "\n"; | |
| 1767 $current_blast->_parse_header($data); | |
| 1768 | |
| 1769 # If there were any descriptions in the header, | |
| 1770 # we know if there are any significant hits. | |
| 1771 # No longer throwing exception if there were no significant hits | |
| 1772 # and a -signif parameter was specified. Doing so prevents the | |
| 1773 # construction of a Blast object, which could still be useful. | |
| 1774 # if($current_blast->{'_has_descriptions'} and $Blast->{'_confirm_significance'} and not $current_blast->is_signif) { | |
| 1775 # $current_blast->throw("No significant BLAST hits for ${\$current_blast->name}"); | |
| 1776 | |
| 1777 # } | |
| 1778 | |
| 1779 } # Done parsing header/description section | |
| 1780 | |
| 1781 ### For use with $contains_translation - not right - breaks regular report parsing. | |
| 1782 # elsif(ref $Blast->{'_current_blast'} && $data !~ /\s*\w*\s*/s) { | |
| 1783 elsif(ref $Blast->{'_current_blast'} ) { | |
| 1784 # Process an alignment section. | |
| 1785 $current_blast = $Blast->{'_current_blast'}; | |
| 1786 # print STDERR "\nCONTINUING PROCESSING ALN WITH ", $current_blast->name, "\n"; | |
| 1787 # print STDERR "DATA: $data\n"; | |
| 1788 eval { | |
| 1789 $current_blast->_parse_alignment($data); | |
| 1790 }; | |
| 1791 if($@) { | |
| 1792 # push @{$self->{'_blast_errs'}}, $@; | |
| 1793 } | |
| 1794 } | |
| 1795 | |
| 1796 # If the current Blast object has been completely parsed | |
| 1797 # (occurs with a single Blast stream), or if there is a previous | |
| 1798 # Blast object (occurs with a multi Blast stream), | |
| 1799 # execute a supplied function on it or store it in a supplied array. | |
| 1800 | |
| 1801 if( defined $prev_blast or $current_blast->{'_found_params'}) { | |
| 1802 my $finished_blast = defined($prev_blast) ? $prev_blast : $current_blast; | |
| 1803 | |
| 1804 $finished_blast->_report_errors(); | |
| 1805 # print STDERR "\nNEW BLAST OBJECT: ${\$finished_blast->name}\n"; | |
| 1806 | |
| 1807 if($exec_func) { | |
| 1808 # print STDERR " RUNNING EXEC_FUNC...\n"; | |
| 1809 &$exec_func($finished_blast); # ignoring any return value. | |
| 1810 # Report processed, no longer need object. | |
| 1811 $finished_blast->destroy; | |
| 1812 undef $finished_blast; | |
| 1813 } elsif($save_a) { | |
| 1814 # print STDERR " SAVING IN ARRAY...\n"; | |
| 1815 # We've already verified that if there is no exec_func | |
| 1816 # then there must be a $save_array | |
| 1817 push @$save_a, $finished_blast; | |
| 1818 } | |
| 1819 } | |
| 1820 1; | |
| 1821 } | |
| 1822 } | |
| 1823 | |
| 1824 =head2 _report_errors | |
| 1825 | |
| 1826 Title : _report_errors | |
| 1827 Usage : n/a; Internal method called by _get_parse_blast_func(). | |
| 1828 Purpose : Throw or warn about any errors encountered. | |
| 1829 Returns : n/a | |
| 1830 Args : n/a | |
| 1831 Throws : If all hits generated exceptions, raise exception | |
| 1832 : (a fatal event for the Blast object.) | |
| 1833 : If some hits were okay but some were bad, generate a warning | |
| 1834 : (a few bad applies should not spoil the bunch). | |
| 1835 : This usually indicates a limiting B-value. | |
| 1836 : When the parsing code fails, it is either all or nothing. | |
| 1837 | |
| 1838 =cut | |
| 1839 | |
| 1840 #------------------- | |
| 1841 sub _report_errors { | |
| 1842 #------------------- | |
| 1843 my $self = shift; | |
| 1844 | |
| 1845 return unless ref($self->{'_blast_errs'}); | |
| 1846 # ref($self->{'_blast_errs'}) || (print STDERR "\nNO ERRORS\n", return ); | |
| 1847 | |
| 1848 my @errs = @{$self->{'_blast_errs'}}; | |
| 1849 | |
| 1850 if(scalar @errs) { | |
| 1851 my ($str); | |
| 1852 @{$self->{'_blast_errs'}} = (); # clear the errs on the object. | |
| 1853 # When there are many errors, in most of the cases, they are | |
| 1854 # caused by the same problem. Only need to see full data for | |
| 1855 # the first one. | |
| 1856 if(scalar @errs > 2) { | |
| 1857 $str = "SHOWING FIRST EXCEPTION ONLY:\n$errs[0]"; | |
| 1858 $self->clear_err(); # clearing the existing set of errors (conserve memory). | |
| 1859 # Not necessary, unless the -RECORD_ERR =>1 | |
| 1860 # constructor option was used for Blast object. | |
| 1861 } else { | |
| 1862 $str = join("\n",@errs); | |
| 1863 } | |
| 1864 | |
| 1865 if(not $self->{'_num_hits_significant'}) { | |
| 1866 $self->throw(sprintf("Failed to parse any hit data (n=%d).", scalar(@errs)), | |
| 1867 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" | |
| 1868 ); | |
| 1869 } else { | |
| 1870 $self->warn(sprintf("Some potential hits were not parsed (n=%d).", scalar(@errs)), | |
| 1871 @errs > 2 ? "This may be due to a limiting B value (max alignment listings)." : "", | |
| 1872 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" | |
| 1873 ); | |
| 1874 } | |
| 1875 } | |
| 1876 } | |
| 1877 | |
| 1878 =head2 _parse_header | |
| 1879 | |
| 1880 Usage : n/a; called automatically by the _get_parse_blast_func(). | |
| 1881 Purpose : Parses the header section of a BLAST report. | |
| 1882 Argument : String containing the header+description section of a BLAST report. | |
| 1883 Throws : Exception if description data cannot be parsed properly. | |
| 1884 : Exception if there is a 'FATAL' error in the Blast report. | |
| 1885 : Warning if there is a 'WARNING' in the Blast report. | |
| 1886 : Warning if there are no significant hits. | |
| 1887 Comments : Description section contains a single line for each hit listing | |
| 1888 : the seq id, description, score, Expect or P-value, etc. | |
| 1889 | |
| 1890 See Also : L<_get_parse_blast_func()|_get_parse_blast_func> | |
| 1891 | |
| 1892 =cut | |
| 1893 | |
| 1894 #---------------------- | |
| 1895 sub _parse_header { | |
| 1896 #---------------------- | |
| 1897 my( $self, $data ) = @_; | |
| 1898 | |
| 1899 # print STDERR "\n$ID: PARSING HEADER\n"; #$data\n"; | |
| 1900 | |
| 1901 $data =~ s/^\s+|\s+>?$//sg; | |
| 1902 | |
| 1903 if($data =~ /<HTML/i) { | |
| 1904 $self->throw("Can't parse HTML-formatted BLAST reports.", | |
| 1905 # "Such reports can be parsed with a special parsing \n". | |
| 1906 # "script included in the examples/blast directory \n". | |
| 1907 # "of the Bioperl distribution. (TODO)" | |
| 1908 ); | |
| 1909 # This was the old strategy, can't do it with new strategy | |
| 1910 # since we don't have the whole report in one chunk. | |
| 1911 # This could be the basis for the "special parsing script". | |
| 1912 # require Bio::Tools::Blast::HTML; | |
| 1913 # Bio::Tools::Blast::HTML->import(&strip_html); | |
| 1914 # &strip_html(\$data); | |
| 1915 } | |
| 1916 | |
| 1917 $data =~ /WARNING: (.+?)$Newline$Newline/so and $self->warn("$1") if $self->strict; | |
| 1918 $data =~ /FATAL: (.+?)$Newline$Newline/so and $self->throw("FATAL BLAST ERROR = $1"); | |
| 1919 # No longer throwing exception when no hits were found. Still reporting it. | |
| 1920 $data =~ /No hits? found/i and $self->warn("No hits were found.") if $self->strict; | |
| 1921 | |
| 1922 # If this is the first Blast, the program, version, and database info | |
| 1923 # pertain to it. Otherwise, they are for the previous report and have | |
| 1924 # already been parsed out. | |
| 1925 # Data is stored in the static Blast object. Data for subsequent reports | |
| 1926 # will be stored in separate objects if the -share parameter is not set. | |
| 1927 # See _get_parse_blast_func(). | |
| 1928 | |
| 1929 if($Blast->{'_blast_count'} == 1) { | |
| 1930 if($data =~ /(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline/so) { | |
| 1931 $Blast->program($2); | |
| 1932 $Blast->program_version($3); | |
| 1933 } else { | |
| 1934 $self->throw("Can't determine program type from BLAST report.", | |
| 1935 "Checked for: @Blast_programs."); | |
| 1936 # This has important implications for how to handle interval | |
| 1937 # information for HSPs. TBLASTN uses nucleotides in query HSP | |
| 1938 # but amino acids in the sbjct HSP sequence. | |
| 1939 } | |
| 1940 | |
| 1941 if($data =~ m/Database:\s+(.+?)$Newline/so ) { | |
| 1942 $Blast->database($1); | |
| 1943 } else { | |
| 1944 # In some reports, the Database is only listed at end. | |
| 1945 #$self->warn("Can't determine database name from BLAST report (_parse_header)\n$data\n."); | |
| 1946 } | |
| 1947 } | |
| 1948 | |
| 1949 my ($header, $descriptions); | |
| 1950 | |
| 1951 ## For efficiency reasons, we want to to avoid using $' and $`. | |
| 1952 ## Therefore using single-line mode pattern matching. | |
| 1953 | |
| 1954 if($data =~ /(.+?)\nSequences producing.+?\n(.+)/s ) { | |
| 1955 ($header, $descriptions) = ($1, $2); | |
| 1956 $self->{'_has_descriptions'} = 1; | |
| 1957 } else { | |
| 1958 $header = $data; | |
| 1959 $self->{'_has_descriptions'} = 0; | |
| 1960 # Blast reports can legally lack description section. No need to warn. | |
| 1961 #push @{$self->{'_blast_errs'}}, "Can't parse description data."; | |
| 1962 } | |
| 1963 | |
| 1964 $self->_set_query($header); # The name of the sequence will appear in error report. | |
| 1965 # print STDERR "\nQUERY = ", $Blast->{'_current_blast'}->query, "\n"; | |
| 1966 | |
| 1967 $self->_set_date($header) if $Blast->{'_get_stats'}; | |
| 1968 $self->_set_length($header); | |
| 1969 | |
| 1970 # not $Blast->{'_confirm_significance'} and print STDERR "\nNOT PARSING DESCRIPTIONS.\n"; | |
| 1971 | |
| 1972 # Setting the absolute max and min significance levels. | |
| 1973 $self->{'_highestSignif'} = 0; | |
| 1974 $self->{'_lowestSignif'} = $DEFAULT_SIGNIF; | |
| 1975 | |
| 1976 if ($Blast->{'_confirm_significance'} || $Blast->{'_no_aligns'}) { | |
| 1977 $self->_parse_descriptions($descriptions) if $descriptions; | |
| 1978 } else { | |
| 1979 $self->{'_is_significant'} = 1; | |
| 1980 } | |
| 1981 } | |
| 1982 | |
| 1983 #----------------------- | |
| 1984 sub _parse_descriptions { | |
| 1985 #----------------------- | |
| 1986 my ($self, $desc) = @_; | |
| 1987 | |
| 1988 # NOTE: This method will not be called if the report lacks | |
| 1989 # a description section. | |
| 1990 | |
| 1991 # print STDERR "\nPARSING DESCRIPTION DATA\n"; | |
| 1992 | |
| 1993 my @descriptions = split( $Newline, $desc); | |
| 1994 my($line); | |
| 1995 | |
| 1996 # NOW step through each line parsing out the P/Expect value | |
| 1997 # All we really need to do is check the first one, if it doesn't | |
| 1998 # meet the significance requirement, we can skip the report. | |
| 1999 # BUT: we want to collect data for all hits anyway to get min/max signif. | |
| 2000 | |
| 2001 my $my_signif = $self->signif; | |
| 2002 my $layout_set = $Blast->{'_layout'} || 0; | |
| 2003 my $layout; | |
| 2004 my $count = 0; | |
| 2005 my $sig; | |
| 2006 | |
| 2007 desc_loop: | |
| 2008 foreach $line (@descriptions) { | |
| 2009 $count++; | |
| 2010 last desc_loop if $line =~ / NONE |End of List/; | |
| 2011 next desc_loop if $line =~ /^\s*$/; | |
| 2012 next desc_loop if $line =~ /^\.\./; | |
| 2013 | |
| 2014 ## Checking the significance value (P- or Expect value) of the hit | |
| 2015 ## in the description line. | |
| 2016 | |
| 2017 # These regexps need testing on a variety of reports. | |
| 2018 if ( $line =~ /\d+\s{1,5}[\de.-]+\s*$/) { | |
| 2019 $layout = 2; | |
| 2020 } elsif( $line =~ /\d+\s{1,5}[\de.-]+\s{1,}\d+\s*$/) { | |
| 2021 $layout = 1; | |
| 2022 } else { | |
| 2023 $self->warn("Can't parse significance data in description line $line"); | |
| 2024 next desc_loop; | |
| 2025 } | |
| 2026 not $layout_set and ($self->_layout($layout), $layout_set = 1); | |
| 2027 | |
| 2028 $sig = &_parse_signif( $line, $layout ); | |
| 2029 | |
| 2030 # print STDERR " Parsed signif ($layout) = $sig\n"; | |
| 2031 | |
| 2032 last desc_loop if ($sig > $my_signif and not $Blast->{'_check_all'}); | |
| 2033 $self->_process_significance($sig, $my_signif); | |
| 2034 } | |
| 2035 | |
| 2036 # printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'}; | |
| 2037 } | |
| 2038 | |
| 2039 sub _process_significance { | |
| 2040 my($self, $sig, $my_signif) = @_; | |
| 2041 | |
| 2042 $self->{'_highestSignif'} = ($sig > $self->{'_highestSignif'}) | |
| 2043 ? $sig : $self->{'_highestSignif'}; | |
| 2044 | |
| 2045 $self->{'_lowestSignif'} = ($sig < $self->{'_lowestSignif'}) | |
| 2046 ? $sig : $self->{'_lowestSignif'}; | |
| 2047 | |
| 2048 # Significance value assessment. | |
| 2049 $sig <= $my_signif and $self->{'_num_hits_significant'}++; | |
| 2050 $self->{'_num_hits'}++; | |
| 2051 | |
| 2052 $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'}; | |
| 2053 } | |
| 2054 | |
| 2055 =head2 _parse_alignment | |
| 2056 | |
| 2057 Usage : n/a; called automatically by the _get_parse_blast_func(). | |
| 2058 Purpose : Parses a single alignment section of a BLAST report. | |
| 2059 Argument : String containing the alignment section. | |
| 2060 Throws : n/a; All errors are trapped while parsing the hit data | |
| 2061 : and are processed as a group when the report is | |
| 2062 : completely processed (See _report_errors()). | |
| 2063 : | |
| 2064 Comments : Alignment section contains all HSPs for a hit. | |
| 2065 : Requires Bio::Tools::Blast::Sbjct.pm. | |
| 2066 : Optionally calls a filter function to screen the hit on arbitrary | |
| 2067 : criteria. If the filter function returns true for a given hit, | |
| 2068 : that hit will be skipped. | |
| 2069 : | |
| 2070 : If the Blast object was created with -check_all_hits set to true, | |
| 2071 : all hits will be checked for significance and processed if necessary. | |
| 2072 : If this field is false, the parsing will stop after the first | |
| 2073 : non-significant hit. | |
| 2074 : See parse() for description of parsing parameters. | |
| 2075 | |
| 2076 See Also : L<parse()|parse>, L<_get_parse_blast_func()|_get_parse_blast_func>, L<_report_errors()|_report_errors>, B<Bio::Tools::Blast::Sbjct()>, L<Links to related modules> | |
| 2077 | |
| 2078 =cut | |
| 2079 | |
| 2080 #---------------------- | |
| 2081 sub _parse_alignment { | |
| 2082 #---------------------- | |
| 2083 # This method always needs to check detect if the $data argument | |
| 2084 # contains the footer of a Blast report, indicating the last chunk | |
| 2085 # of a single Blast stream. | |
| 2086 | |
| 2087 my( $self, $data ) = @_; | |
| 2088 | |
| 2089 # printf STDERR "\nPARSING ALIGNMENT DATA for %s $self.\n", $self->name; | |
| 2090 | |
| 2091 # NOTE: $self->{'_current_hit'} is an instance variable | |
| 2092 # The $Blast object will not have this member. | |
| 2093 | |
| 2094 # If all of the significant hits have been parsed, | |
| 2095 # return if we're not checking all or if we don't need to get | |
| 2096 # the Blast stats (parameters at footer of report). | |
| 2097 if(defined $self->{'_current_hit'} and | |
| 2098 defined $self->{'_num_hits_significant'}) { | |
| 2099 return if $self->{'_current_hit'} >= $self->{'_num_hits_significant'} and | |
| 2100 not ($Blast->{'_check_all'} or $Blast->{'_get_stats'}); | |
| 2101 } | |
| 2102 | |
| 2103 # Check for the presence of the Blast footer section. | |
| 2104 # _parse_footer returns the alignment section. | |
| 2105 $data = $self->_parse_footer($data); | |
| 2106 | |
| 2107 # Return if we're only interested in the best hit. | |
| 2108 # This has to occur after checking for the parameters section | |
| 2109 # in the footer (since we may still be interested in them). | |
| 2110 return if $Blast->best and ( defined $self->{'_current_hit'} and $self->{'_current_hit'} >=1); | |
| 2111 | |
| 2112 # print "RETURNED FROM _parse_footer (", $self->to_string, ")"; | |
| 2113 # print "\n --> FOUND PARAMS.\n" if $self->{'_found_params'}; | |
| 2114 # print "\n --> DID NOT FIND PARAMS.\n" unless $self->{'_found_params'}; | |
| 2115 | |
| 2116 require Bio::Tools::Blast::Sbjct; | |
| 2117 | |
| 2118 $data =~ s/^\s+|\s+>?$//sg; | |
| 2119 $data =~ s/$Newline$Newline/$Newline/sog; # remove blank lines. | |
| 2120 my @data = split($Newline, $data); | |
| 2121 push @data, 'end'; | |
| 2122 | |
| 2123 # print STDERR "\nALIGNMENT DATA:\n$data\n"; | |
| 2124 | |
| 2125 my $prog = $self->program; | |
| 2126 my $check_all = $Blast->{'_check_all'}; | |
| 2127 my $filt_func = $Blast->{'_filt_func'} || 0; | |
| 2128 my $signif_fmt = $Blast->{'_signif_fmt'}; | |
| 2129 my $my_signif = $self->signif; | |
| 2130 my $err; | |
| 2131 | |
| 2132 # Now construct the Sbjct objects from the alignment section | |
| 2133 | |
| 2134 # debug(1); | |
| 2135 | |
| 2136 $self->{'_current_hit'}++; | |
| 2137 | |
| 2138 # If not confirming significance, _parse_descriptions will not have been run, | |
| 2139 # so we need to count the total number of hits here. | |
| 2140 if( not $Blast->{'_confirm_significance'}) { | |
| 2141 $self->{'_num_hits'}++; | |
| 2142 } | |
| 2143 | |
| 2144 if($Blast->{'_no_aligns'}) { | |
| 2145 # printf STDERR "\nNOT PARSING ALIGNMENT DATA\n"; | |
| 2146 return; | |
| 2147 } | |
| 2148 | |
| 2149 my $hit; # Must be my'ed within hit_loop. | |
| 2150 eval { | |
| 2151 $hit = new Bio::Tools::Blast::Sbjct (-DATA =>\@data, | |
| 2152 -PARENT =>$self, | |
| 2153 -NAME =>$self->{'_current_hit'}, | |
| 2154 -RANK =>$self->{'_current_hit'}, | |
| 2155 -RANK_BY =>'order', | |
| 2156 -PROGRAM =>$prog, | |
| 2157 -SIGNIF_FMT=>$signif_fmt, | |
| 2158 -OVERLAP =>$Blast->{'_overlap'} || $MAX_HSP_OVERLAP, | |
| 2159 ); | |
| 2160 # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>; | |
| 2161 # The BLAST report may have not had a description section. | |
| 2162 if(not $self->{'_has_descriptions'}) { | |
| 2163 $self->_process_significance($hit->signif, $my_signif); | |
| 2164 } | |
| 2165 }; | |
| 2166 | |
| 2167 if($@) { | |
| 2168 # Throwing lots of errors can slow down the code substantially. | |
| 2169 # Error handling code is not that efficient. | |
| 2170 #print STDERR "\nERROR _parse_alignment: $@\n"; | |
| 2171 push @{$self->{'_blast_errs'}}, $@; | |
| 2172 $hit->destroy if ref $hit; | |
| 2173 undef $hit; | |
| 2174 } else { | |
| 2175 # Collect overall signif data if we don't already have it, | |
| 2176 # (as occurs if no -signif parameter is supplied). | |
| 2177 my $hit_signif = $hit->signif; | |
| 2178 | |
| 2179 if (not $Blast->{'_confirm_significance'} ) { | |
| 2180 $self->{'_highestSignif'} = ($hit_signif > $self->{'_highestSignif'}) | |
| 2181 ? $hit_signif : $self->{'_highestSignif'}; | |
| 2182 | |
| 2183 $self->{'_lowestSignif'} = ($hit_signif < $self->{'_lowestSignif'}) | |
| 2184 ? $hit_signif : $self->{'_lowestSignif'}; | |
| 2185 } | |
| 2186 | |
| 2187 # Test significance using custom function (if supplied) | |
| 2188 if($filt_func) { | |
| 2189 if(&$filt_func($hit)) { | |
| 2190 push @{$self->{'_hits'}}, $hit; | |
| 2191 } else { | |
| 2192 $hit->destroy; undef $hit; | |
| 2193 } | |
| 2194 } elsif($hit_signif <= $my_signif) { | |
| 2195 push @{$self->{'_hits'}}, $hit; | |
| 2196 } | |
| 2197 } | |
| 2198 | |
| 2199 } | |
| 2200 | |
| 2201 =head2 _parse_footer | |
| 2202 | |
| 2203 Usage : n/a; internal function. called by _parse_alignment() | |
| 2204 Purpose : Extracts statistical and other parameters from the BLAST report. | |
| 2205 : Sets various key elements such as the program and version, | |
| 2206 : gapping, and the layout for the report (blast1 or blast2). | |
| 2207 Argument : Data to be parsed. | |
| 2208 Returns : String containing an alignment section for processing by | |
| 2209 : _parse_alignment(). | |
| 2210 Throws : Exception if cannot find the parameters section of report. | |
| 2211 : Warning if cannot determine if gapping was used. | |
| 2212 : Warning if cannot determine the scoring matrix used. | |
| 2213 Comments : This method must always get called, even if the -STATS | |
| 2214 : parse() parameter is false. The reason is that the layout | |
| 2215 : of the report and the presence of gapping must always be set. | |
| 2216 : The determination whether to set additional stats is made | |
| 2217 : by methods called by _parse_footer(). | |
| 2218 | |
| 2219 See Also : L<parse()|parse>, L<_parse_alignment()|_parse_alignment>, L<_set_database()|_set_database> | |
| 2220 | |
| 2221 =cut | |
| 2222 | |
| 2223 #--------------------- | |
| 2224 sub _parse_footer { | |
| 2225 #--------------------- | |
| 2226 # Basic strategy: | |
| 2227 # 1. figure out if we're supposed to get the stats, | |
| 2228 # 2. figure out if the stats are to be shared. some, not all can be shared | |
| 2229 # (eg., db info and matrix can be shared, karlin altschul params cannot. | |
| 2230 # However, this method assumes they are all sharable.) | |
| 2231 # 3. Parse the stats. | |
| 2232 # 4. return the block before the parameters section if the supplied data | |
| 2233 # contains a footer parameters section. | |
| 2234 | |
| 2235 my ($self, $data) = @_; | |
| 2236 my ($client, $last_align, $params); | |
| 2237 | |
| 2238 # printf STDERR "\nPARSING PARAMETERS for %s $self.\n", $self->name; | |
| 2239 | |
| 2240 # Should the parameters be shared? | |
| 2241 # If so, set $self to be the static $Blast object and return if | |
| 2242 # the parameters were already set. | |
| 2243 # Before returning, we need to extract the last alignment section | |
| 2244 # from the parameter section, if any. | |
| 2245 | |
| 2246 if ($Blast->{'_share'}) { | |
| 2247 $client = $self; | |
| 2248 $self = $Blast if $Blast->{'_share'}; | |
| 2249 } | |
| 2250 | |
| 2251 my $get_stats = $Blast->{'_get_stats'}; | |
| 2252 | |
| 2253 if( $data =~ /(.+?)${Newline}CPU time: (.*)/so) { | |
| 2254 # NCBI-Blast2 format (v2.04). | |
| 2255 ($last_align, $params) = ($1, $2); | |
| 2256 return $last_align if $client->{'_found_params'}; | |
| 2257 $self->_set_blast2_stats($params); | |
| 2258 | |
| 2259 } elsif( $data =~ /(.+?)${Newline}Parameters:(.*)/so) { | |
| 2260 # NCBI-Blast1 or WashU-Blast2 format. | |
| 2261 ($last_align, $params) = ($1, $2); | |
| 2262 return $last_align if $client->{'_found_params'}; | |
| 2263 $self->_set_blast1_stats($params); | |
| 2264 | |
| 2265 } elsif( $data =~ /(.+?)$Newline\s+Database:(.*)/so) { | |
| 2266 # Gotta watch out for confusion with the Database: line in the header | |
| 2267 # which will be present in the last hit of an internal Blast report | |
| 2268 # in a multi-report stream. | |
| 2269 | |
| 2270 # NCBI-Blast2 format (v2.05). | |
| 2271 ($last_align, $params) = ($1, $2); | |
| 2272 return $last_align if $client->{'_found_params'}; | |
| 2273 $self->_set_blast2_stats($params); | |
| 2274 | |
| 2275 } elsif( $data =~ /(.+?)$Newline\s*Searching/so) { | |
| 2276 # trying to detect a Searching at the end of a PSI-blast round. | |
| 2277 # Gotta watch out for confusion with the Searching line in the header | |
| 2278 # which will be present in the last hit of an internal Blast report | |
| 2279 # in a multi-report, non-PSI-blast stream. | |
| 2280 | |
| 2281 # PSI-Blast format (v2.08). | |
| 2282 ($last_align) = ($1); | |
| 2283 return $last_align; # if $client->{'_found_params'}; | |
| 2284 } | |
| 2285 | |
| 2286 # If parameter section was found, set a boolean, | |
| 2287 # otherwise return original data. | |
| 2288 | |
| 2289 if (defined($params)) { | |
| 2290 $client->{'_found_params'} = 1; | |
| 2291 } else { | |
| 2292 return $data; | |
| 2293 } | |
| 2294 | |
| 2295 $self->_set_database($params) if $get_stats; | |
| 2296 | |
| 2297 # The {'_gapped'} member should be set in the _set_blast?_stats() call. | |
| 2298 # This is a last minute attempt to deduce it. | |
| 2299 | |
| 2300 if(!defined($self->{'_gapped'})) { | |
| 2301 if($self->program_version() =~ /^1/) { | |
| 2302 $self->{'_gapped'} = 0; | |
| 2303 } else { | |
| 2304 if($self->strict > 0) { | |
| 2305 $self->warn("Can't determine if gapping was used. Assuming gapped."); | |
| 2306 } | |
| 2307 $self->{'_gapped'} = 1; | |
| 2308 } | |
| 2309 } | |
| 2310 | |
| 2311 return $last_align; | |
| 2312 } | |
| 2313 | |
| 2314 =head2 _set_blast2_stats | |
| 2315 | |
| 2316 Usage : n/a; internal function called by _parse_footer() | |
| 2317 Purpose : Extracts statistical and other parameters from BLAST2 report footer. | |
| 2318 : Stats collected: database release, gapping, | |
| 2319 : posted date, matrix used, filter used, Karlin-Altschul parameters, | |
| 2320 : E, S, T, X, W. | |
| 2321 Throws : Exception if cannot get "Parameters" section of Blast report. | |
| 2322 | |
| 2323 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> | |
| 2324 | |
| 2325 =cut | |
| 2326 | |
| 2327 #---------------------' | |
| 2328 sub _set_blast2_stats { | |
| 2329 #--------------------- | |
| 2330 my ($self, $data) = (@_); | |
| 2331 | |
| 2332 if($data =~ /$Newline\s*Gapped/so) { | |
| 2333 $self->{'_gapped'} = 1; | |
| 2334 } else { | |
| 2335 $self->{'_gapped'} = 0; | |
| 2336 } | |
| 2337 | |
| 2338 # Other stats are not always essential. | |
| 2339 return unless $Blast->{'_get_stats'}; | |
| 2340 | |
| 2341 # Blast2 Doesn't report what filter was used in the parameters section. | |
| 2342 # It just gives a warning that *some* filter was used in the header. | |
| 2343 # You just have to know the defaults (currently: protein = SEG, nucl = DUST). | |
| 2344 if($data =~ /\bfiltered\b/si) { | |
| 2345 $self->{'_filter'} = 'DEFAULT FILTER'; | |
| 2346 } else { | |
| 2347 $self->{'_filter'} = 'NONE'; | |
| 2348 } | |
| 2349 | |
| 2350 if($data =~ /Gapped$Newline\s*Lambda +K +H$Newline +(.+?)$Newline/so) { | |
| 2351 my ($l, $k, $h) = split(/\s+/, $1); | |
| 2352 $self->{'_lambda'} = $l || 'UNKNOWN'; | |
| 2353 $self->{'_k'} = $k || 'UNKNOWN'; | |
| 2354 $self->{'_h'} = $h || 'UNKNOWN'; | |
| 2355 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) { | |
| 2356 my ($l, $k, $h) = split(/\s+/, $1); | |
| 2357 $self->{'_lambda'} = $l || 'UNKNOWN'; | |
| 2358 $self->{'_k'} = $k || 'UNKNOWN'; | |
| 2359 $self->{'_h'} = $h || 'UNKNOWN'; | |
| 2360 } | |
| 2361 | |
| 2362 if($data =~ /$Newline\s*Matrix: (.+?)$Newline/so) { | |
| 2363 $self->{'_matrix'} = $1; | |
| 2364 } else { | |
| 2365 $self->{'_matrix'} = $DEFAULT_MATRIX.'?'; | |
| 2366 if($self->strict > 0) { | |
| 2367 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX."); | |
| 2368 } | |
| 2369 } | |
| 2370 | |
| 2371 if($data =~ /$Newline\s*Gap Penalties: Existence: +(\d+), +Extension: (\d+)$Newline/so) { | |
| 2372 $self->{'_gapCreation'} = $1; | |
| 2373 $self->{'_gapExtension'} = $2; | |
| 2374 } | |
| 2375 if($data =~ /sequences better than (\d+):/s) { | |
| 2376 $self->{'_expect'} = $1; | |
| 2377 } | |
| 2378 | |
| 2379 if($data =~ /$Newline\s*T: (\d+)/o) { $self->{'_word_size'} = $1; } | |
| 2380 if($data =~ /$Newline\s*A: (\d+)/o) { $self->{'_a'} = $1; } | |
| 2381 if($data =~ /$Newline\s*S1: (\d+)/o) { $self->{'_s'} = $1; } | |
| 2382 if($data =~ /$Newline\s*S2: (\d+)/o) { $self->{'_s'} .= ", $1"; } | |
| 2383 if($data =~ /$Newline\s*X1: (\d+)/o) { $self->{'_x1'} = $1; } | |
| 2384 if($data =~ /$Newline\s*X2: (\d+)/o) { $self->{'_x2'} = $1; } | |
| 2385 } | |
| 2386 | |
| 2387 =head2 _set_blast1_stats | |
| 2388 | |
| 2389 Usage : n/a; internal function called by _parse_footer() | |
| 2390 Purpose : Extracts statistical and other parameters from BLAST 1.x style eports. | |
| 2391 : Handles NCBI Blast1 and WashU-Blast2 formats. | |
| 2392 : Stats collected: database release, gapping, | |
| 2393 : posted date, matrix used, filter used, Karlin-Altschul parameters, | |
| 2394 : E, S, T, X, W. | |
| 2395 | |
| 2396 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> | |
| 2397 | |
| 2398 =cut | |
| 2399 | |
| 2400 #---------------------- | |
| 2401 sub _set_blast1_stats { | |
| 2402 #---------------------- | |
| 2403 my ($self, $data) = (@_); | |
| 2404 | |
| 2405 if(!$self->{'_gapped'} and $self->program_version() =~ /^2[\w\-\.]+WashU/) { | |
| 2406 $self->_set_gapping_wu($data); | |
| 2407 } else { | |
| 2408 $self->{'_gapped'} = 0; | |
| 2409 } | |
| 2410 | |
| 2411 # Other stats are not always essential. | |
| 2412 return unless $Blast->{'_get_stats'}; | |
| 2413 | |
| 2414 if($data =~ /filter=(.+?)$Newline/so) { | |
| 2415 $self->{'_filter'} = $1; | |
| 2416 } elsif($data =~ /filter$Newline +(.+?)$Newline/so) { | |
| 2417 $self->{'_filter'} = $1; | |
| 2418 } else { | |
| 2419 $self->{'_filter'} = 'NONE'; | |
| 2420 } | |
| 2421 | |
| 2422 if($data =~ /$Newline\s*E=(\d+)$Newline/so) { $self->{'_expect'} = $1; } | |
| 2423 | |
| 2424 if($data =~ /$Newline\s*M=(\w+)$Newline/so) { $self->{'_matrix'} = $1; } | |
| 2425 | |
| 2426 if($data =~ /\s*Frame MatID Matrix name .+?$Newline +(.+?)$Newline/so) { | |
| 2427 ## WU-Blast2. | |
| 2428 my ($fr, $mid, $mat, $lu, $ku, $hu, $lc, $kc, $hc) = split(/\s+/,$1); | |
| 2429 $self->{'_matrix'} = $mat || 'UNKNOWN'; | |
| 2430 $self->{'_lambda'} = $lu || 'UNKNOWN'; | |
| 2431 $self->{'_k'} = $ku || 'UNKNOWN'; | |
| 2432 $self->{'_h'} = $hu || 'UNKNOWN'; | |
| 2433 | |
| 2434 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) { | |
| 2435 ## NCBI-Blast1. | |
| 2436 my ($l, $k, $h) = split(/\s+/, $1); | |
| 2437 $self->{'_lambda'} = $l || 'UNKNOWN'; | |
| 2438 $self->{'_k'} = $k || 'UNKNOWN'; | |
| 2439 $self->{'_h'} = $h || 'UNKNOWN'; | |
| 2440 } | |
| 2441 | |
| 2442 if($data =~ /E +S +W +T +X.+?$Newline +(.+?)$Newline/so) { | |
| 2443 # WashU-Blast2 | |
| 2444 my ($fr, $mid, $len, $elen, $e, $s, $w, $t, $x, $e2, $s2) = split(/\s+/,$1); | |
| 2445 $self->{'_expect'} ||= $e || 'UNKNOWN'; | |
| 2446 $self->{'_s'} = $s || 'UNKNOWN'; | |
| 2447 $self->{'_word_size'} = $w || 'UNKNOWN'; | |
| 2448 $self->{'_t'} = $t || 'UNKNOWN'; | |
| 2449 $self->{'_x'} = $x || 'UNKNOWN'; | |
| 2450 | |
| 2451 } elsif($data =~ /E +S +T1 +T2 +X1 +X2 +W +Gap$Newline +(.+?)$Newline/so) { | |
| 2452 ## NCBI-Blast1. | |
| 2453 my ($e, $s, $t1, $t2, $x1, $x2, $w, $gap) = split(/\s+/,$1); | |
| 2454 $self->{'_expect'} ||= $e || 'UNKNOWN'; | |
| 2455 $self->{'_s'} = $s || 'UNKNOWN'; | |
| 2456 $self->{'_word_size'} = $w || 'UNKNOWN'; | |
| 2457 $self->{'_t1'} = $t1 || 'UNKNOWN'; | |
| 2458 $self->{'_t2'} = $t2 || 'UNKNOWN'; | |
| 2459 $self->{'_x1'} = $x1 || 'UNKNOWN'; | |
| 2460 $self->{'_x2'} = $x2 || 'UNKNOWN'; | |
| 2461 $self->{'_gap'} = $gap || 'UNKNOWN'; | |
| 2462 } | |
| 2463 | |
| 2464 if(!$self->{'_matrix'}) { | |
| 2465 $self->{'_matrix'} = $DEFAULT_MATRIX.'?'; | |
| 2466 if($self->strict > 0) { | |
| 2467 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX."); | |
| 2468 } | |
| 2469 } | |
| 2470 } | |
| 2471 | |
| 2472 =head2 _set_gapping_wu | |
| 2473 | |
| 2474 Usage : n/a; internal function called by _set_blast1_stats() | |
| 2475 Purpose : Determine if gapping_wu was on for WashU Blast reports. | |
| 2476 Comments : In earlier versions, gapping was always specified | |
| 2477 : but in the current version (2.0a19MP), gapping is on by default | |
| 2478 : and there is no positive "gapping" indicator in the Parameters | |
| 2479 : section. | |
| 2480 | |
| 2481 See Also : L<_set_blast1_stats()|_set_blast1_stats> | |
| 2482 | |
| 2483 =cut | |
| 2484 | |
| 2485 #-------------------- | |
| 2486 sub _set_gapping_wu { | |
| 2487 #-------------------- | |
| 2488 my ($self, $data) = @_; | |
| 2489 | |
| 2490 if($data =~ /gaps?$Newline/so) { | |
| 2491 $self->{'_gapped'} = ($data =~ /nogaps?$Newline/so) ? 0 : 1; | |
| 2492 } else { | |
| 2493 $self->{'_gapped'} = 1; | |
| 2494 } | |
| 2495 } | |
| 2496 | |
| 2497 =head2 _set_date | |
| 2498 | |
| 2499 Usage : n/a; internal function called by _parse_footer() | |
| 2500 Purpose : Determine the date on which the Blast analysis was performed. | |
| 2501 Comments : Date information is not consistently added to Blast output. | |
| 2502 : Uses superclass method set_date() to set date from the file, | |
| 2503 : (if any). | |
| 2504 | |
| 2505 See Also : L<_parse_footer()|_parse_footer>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> | |
| 2506 | |
| 2507 =cut | |
| 2508 | |
| 2509 #-------------- | |
| 2510 sub _set_date { | |
| 2511 #-------------- | |
| 2512 my $self = shift; | |
| 2513 my $data = shift; | |
| 2514 | |
| 2515 ### Network BLAST reports from NCBI are time stamped as follows: | |
| 2516 #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77 | |
| 2517 if($data =~ /Start:\s+(.+?)\s+End:/s) { | |
| 2518 ## Calling superclass method to set the date. | |
| 2519 ## If we can't get date from the report, file date is obtained. | |
| 2520 $self->set_date($1); | |
| 2521 } elsif($data =~ /Date:\s+(.*?)$Newline/so) { | |
| 2522 ## E-mailed reports have a Date: field | |
| 2523 $self->set_date($1); | |
| 2524 } elsif( $data =~ /done\s+at (.+?)$Newline/so ) { | |
| 2525 $self->set_date($1); | |
| 2526 } elsif( $data =~ /$Newline([\w:, ]+), Up \d+/so ) { | |
| 2527 $self->set_date($1); | |
| 2528 } else { | |
| 2529 ## Otherwise, let superclass attempt to get the file creation date. | |
| 2530 $self->set_date() if $self->file; | |
| 2531 } | |
| 2532 } | |
| 2533 | |
| 2534 =head2 _set_length | |
| 2535 | |
| 2536 Usage : n/a; called automatically during Blast report parsing. | |
| 2537 Purpose : Sets the length of the query sequence (extracted from report). | |
| 2538 Returns : integer (length of the query sequence) | |
| 2539 Throws : Exception if cannot determine the query sequence length from | |
| 2540 : the BLAST report. | |
| 2541 : Exception if the length is below the min_length cutoff (if any). | |
| 2542 Comments : The logic here is a bit different from the other _set_XXX() | |
| 2543 : methods since the significance of the BLAST report is assessed | |
| 2544 : if MIN_LENGTH is set. | |
| 2545 | |
| 2546 See Also : B<Bio::Tools::SeqAnal::length()>, L<Links to related modules> | |
| 2547 | |
| 2548 =cut | |
| 2549 | |
| 2550 #--------------- | |
| 2551 sub _set_length { | |
| 2552 #--------------- | |
| 2553 my ($self, $data) = @_; | |
| 2554 | |
| 2555 my ($length); | |
| 2556 if( $data =~ m/$Newline\s+\(([\d|,]+) letters[\);]/so ) { | |
| 2557 $length = $1; | |
| 2558 $length =~ s/,//g; | |
| 2559 # printf "Length = $length in BLAST for %s$Newline",$self->name; <STDIN>; | |
| 2560 } else { | |
| 2561 $self->throw("Can't determine sequence length from BLAST report."); | |
| 2562 } | |
| 2563 | |
| 2564 my($sig_len); | |
| 2565 if(defined($Blast->{'_min_length'})) { | |
| 2566 local $^W = 0; | |
| 2567 if($length < $Blast->{'_min_len'}) { | |
| 2568 $self->throw("Query sequence too short for ${\$self->name} ($length)", | |
| 2569 "Minimum length is $Blast->{'_min_len'}"); | |
| 2570 } | |
| 2571 } | |
| 2572 | |
| 2573 $self->length($length); # defined in superclass. | |
| 2574 } | |
| 2575 | |
| 2576 =head2 _set_database | |
| 2577 | |
| 2578 Usage : n/a; called automatically during Blast report parsing. | |
| 2579 Purpose : Sets the name of the database used by the BLAST analysis. | |
| 2580 : Extracted from raw BLAST report. | |
| 2581 Throws : Exception if the name of the database cannot be determined. | |
| 2582 Comments : The database name is used by methods or related objects | |
| 2583 : for database-specific parsing. | |
| 2584 | |
| 2585 See Also : L<parse()|parse>, B<Bio::Tools::SeqAnal::database()>,B<Bio::Tools::SeqAnal::_set_db_stats()>,L<Links to related modules> | |
| 2586 | |
| 2587 =cut | |
| 2588 | |
| 2589 #------------------ | |
| 2590 sub _set_database { | |
| 2591 #------------------ | |
| 2592 # This now only sets data base information extracted from the report footer. | |
| 2593 | |
| 2594 my ($self, $data) = @_; | |
| 2595 | |
| 2596 my ($name, $date, $lets, $seqs); | |
| 2597 | |
| 2598 my $strict = $self->strict > 0; | |
| 2599 | |
| 2600 # This is fail-safe since DB name usually gets set in _parse_header() | |
| 2601 # In some reports, the database is only listed at bottom (NCBI 2.0.8). | |
| 2602 if($data =~ m/Database: +(.+?)$Newline/so ) { | |
| 2603 $name = $1; | |
| 2604 } elsif(not $self->database) { | |
| 2605 $self->warn("Can't determine database name from BLAST report."); | |
| 2606 } | |
| 2607 | |
| 2608 if($data =~ m/Posted date: +(.+?)$Newline/so ) { | |
| 2609 $date = $1; | |
| 2610 } elsif($data =~ m/Release date: +(.+?)$Newline/so ) { | |
| 2611 $date = $1; | |
| 2612 } elsif($strict) { | |
| 2613 $self->warn("Can't determine database release date."); | |
| 2614 } | |
| 2615 | |
| 2616 if($data =~ m/letters in database: +([\d,]+)/si || | |
| 2617 $data =~ m/length of database: +([\d,]+)/si ) { | |
| 2618 $lets = $1; | |
| 2619 } elsif($strict) { | |
| 2620 $self->warn("Can't determine number of letters in database.\n$data\n"); | |
| 2621 } | |
| 2622 | |
| 2623 if($data =~ m/sequences in database: +([\d,]+)/si || | |
| 2624 $data =~ m/number of sequences: +([\d,]+)/si ) { | |
| 2625 $seqs = $1; | |
| 2626 } elsif($strict) { | |
| 2627 $self->warn("Can't determine number of sequences in database.\n$data\n"); | |
| 2628 } | |
| 2629 | |
| 2630 $self->_set_db_stats( -NAME => $name, | |
| 2631 -RELEASE => $date || '', | |
| 2632 -LETTERS => $lets || '', | |
| 2633 -SEQS => $seqs || '' | |
| 2634 ); | |
| 2635 } | |
| 2636 | |
| 2637 =head2 _set_query | |
| 2638 | |
| 2639 Usage : n/a; called automatically during Blast report parsing. | |
| 2640 Purpose : Set the name of the query and the query description. | |
| 2641 : Extracted from the raw BLAST report. | |
| 2642 Returns : String containing name of query extracted from report. | |
| 2643 Throws : Warning if the name of the query cannont be obtained. | |
| 2644 | |
| 2645 See Also : B<Bio::Tools::SeqAnal::query_desc()>,L<Links to related modules> | |
| 2646 | |
| 2647 =cut | |
| 2648 | |
| 2649 #--------------- | |
| 2650 sub _set_query { | |
| 2651 #--------------- | |
| 2652 my $self = shift; | |
| 2653 my $data = shift; | |
| 2654 | |
| 2655 if($data =~ m/${Newline}Query= *(.+?)$Newline/so ) { | |
| 2656 my $info = $1; | |
| 2657 $info =~ s/TITLE //; | |
| 2658 # Split the query line into two parts. | |
| 2659 # Using \s instead of ' ' | |
| 2660 $info =~ /(\S+?)\s(.*)/; | |
| 2661 $self->query_desc($2 || ''); | |
| 2662 # set name of Blast object and return. | |
| 2663 $self->name($1 || 'UNKNOWN'); | |
| 2664 } else { | |
| 2665 $self->warn("Can't determine query sequence name from BLAST report."); | |
| 2666 } | |
| 2667 # print STDERR "$Newline NAME = ${\$self->name}$Newline"; | |
| 2668 } | |
| 2669 | |
| 2670 =head2 _parse_signif | |
| 2671 | |
| 2672 Usage : &_parse_signif(string, layout, gapped); | |
| 2673 : This is a class function. | |
| 2674 Purpose : Extracts the P- or Expect value from a single line of a BLAST description section. | |
| 2675 Example : &_parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1); | |
| 2676 : &_parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2); | |
| 2677 Argument : string = line from BLAST description section | |
| 2678 : layout = integer (1 or 2) | |
| 2679 : gapped = boolean (true if gapped Blast). | |
| 2680 Returns : Float (0.001 or 1e-03) | |
| 2681 Status : Static | |
| 2682 | |
| 2683 =cut | |
| 2684 | |
| 2685 #------------------ | |
| 2686 sub _parse_signif { | |
| 2687 #------------------ | |
| 2688 my ($line, $layout, $gapped) = @_; | |
| 2689 | |
| 2690 local $_ = $line; | |
| 2691 my @linedat = split(); | |
| 2692 | |
| 2693 # When processing both Blast1 and Blast2 reports | |
| 2694 # in the same run, offset needs to be configured each time. | |
| 2695 | |
| 2696 my $offset = 0; | |
| 2697 $offset = 1 if $layout == 1 or not $gapped; | |
| 2698 | |
| 2699 my $signif = $linedat[ $#linedat - $offset ]; | |
| 2700 | |
| 2701 # fail-safe check | |
| 2702 if(not $signif =~ /[.-]/) { | |
| 2703 $offset = ($offset == 0 ? 1 : 0); | |
| 2704 $signif = $linedat[ $#linedat - $offset ]; | |
| 2705 } | |
| 2706 | |
| 2707 $signif = "1$signif" if $signif =~ /^e/i; | |
| 2708 return $signif; | |
| 2709 } | |
| 2710 | |
| 2711 ## | |
| 2712 ## BEGIN ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT. | |
| 2713 ## | |
| 2714 | |
| 2715 sub program { | |
| 2716 ## Overridden method to incorporate the BLAST object. | |
| 2717 my $self = shift; | |
| 2718 return $self->SUPER::program(@_) if @_; # set | |
| 2719 $self->SUPER::program || $Blast->SUPER::program; # get | |
| 2720 } | |
| 2721 | |
| 2722 sub program_version { | |
| 2723 ## Overridden method to incorporate the BLAST object. | |
| 2724 my $self = shift; | |
| 2725 return $self->SUPER::program_version(@_) if @_; # set | |
| 2726 $self->SUPER::program_version || $Blast->SUPER::program_version; # get | |
| 2727 } | |
| 2728 | |
| 2729 sub database { | |
| 2730 ## Overridden method to incorporate the BLAST object. | |
| 2731 my $self = shift; | |
| 2732 return $self->SUPER::database(@_) if @_; # set | |
| 2733 $self->SUPER::database || $Blast->SUPER::database; # get | |
| 2734 } | |
| 2735 | |
| 2736 sub database_letters { | |
| 2737 ## Overridden method to incorporate the BLAST object. | |
| 2738 my $self = shift; | |
| 2739 return $self->SUPER::database_letters(@_) if @_; # set | |
| 2740 $self->SUPER::database_letters || $Blast->SUPER::database_letters; # get | |
| 2741 } | |
| 2742 | |
| 2743 sub database_release { | |
| 2744 ## Overridden method to incorporate the BLAST object. | |
| 2745 my $self = shift; | |
| 2746 return $self->SUPER::database_release(@_) if @_; # set | |
| 2747 $self->SUPER::database_release || $Blast->SUPER::database_release; # get | |
| 2748 } | |
| 2749 | |
| 2750 sub database_seqs { | |
| 2751 ## Overridden method to incorporate the BLAST object. | |
| 2752 my $self = shift; | |
| 2753 return $self->SUPER::database_seqs(@_) if @_; # set | |
| 2754 $self->SUPER::database_seqs || $Blast->SUPER::database_seqs; # get | |
| 2755 } | |
| 2756 | |
| 2757 sub date { | |
| 2758 ## Overridden method to incorporate the BLAST object. | |
| 2759 my $self = shift; | |
| 2760 return $self->SUPER::date(@_) if @_; # set | |
| 2761 $self->SUPER::date || $Blast->SUPER::date; # get | |
| 2762 } | |
| 2763 | |
| 2764 sub best { | |
| 2765 ## Overridden method to incorporate the BLAST object. | |
| 2766 my $self = shift; | |
| 2767 return $Blast->SUPER::best(@_) if @_; # set | |
| 2768 $Blast->SUPER::best; # get | |
| 2769 } | |
| 2770 | |
| 2771 =head2 signif | |
| 2772 | |
| 2773 Usage : $blast->signif(); | |
| 2774 Purpose : Gets the P or Expect value used as significance screening cutoff. | |
| 2775 Returns : Scientific notation number with this format: 1.0e-05. | |
| 2776 Argument : n/a | |
| 2777 Comments : Screening of significant hits uses the data provided on the | |
| 2778 : description line. For Blast1 and WU-Blast2, this data is P-value. | |
| 2779 : for Blast2 it is an Expect value. | |
| 2780 : | |
| 2781 : Obtains info from the static $Blast object if it has not been set | |
| 2782 : for the current object. | |
| 2783 | |
| 2784 See Also : L<_set_signif()|_set_signif> | |
| 2785 | |
| 2786 =cut | |
| 2787 | |
| 2788 #----------- | |
| 2789 sub signif { | |
| 2790 #----------- | |
| 2791 my $self = shift; | |
| 2792 my $sig = $self->{'_significance'} || $Blast->{'_significance'}; | |
| 2793 sprintf "%.1e", $sig; | |
| 2794 } | |
| 2795 | |
| 2796 =head2 is_signif | |
| 2797 | |
| 2798 Usage : $blast->is_signif(); | |
| 2799 Purpose : Determine if the BLAST report contains significant hits. | |
| 2800 Returns : Boolean | |
| 2801 Argument : n/a | |
| 2802 Comments : BLAST reports without significant hits but with defined | |
| 2803 : significance criteria will throw exceptions during construction. | |
| 2804 : This obviates the need to check significant() for | |
| 2805 : such objects. | |
| 2806 | |
| 2807 See Also : L<_set_signif()|_set_signif> | |
| 2808 | |
| 2809 =cut | |
| 2810 | |
| 2811 #------------ | |
| 2812 sub is_signif { my $self = shift; return $self->{'_is_significant'}; } | |
| 2813 #------------ | |
| 2814 | |
| 2815 # is_signif() doesn't incorporate the static $Blast object but is included | |
| 2816 # here to be with the other 'signif' methods. | |
| 2817 | |
| 2818 =head2 signif_fmt | |
| 2819 | |
| 2820 Usage : $blast->signif_fmt( [FMT] ); | |
| 2821 Purpose : Allows retrieval of the P/Expect exponent values only | |
| 2822 : or as a two-element list (mantissa, exponent). | |
| 2823 Usage : $blast_obj->signif_fmt('exp'); | |
| 2824 : $blast_obj->signif_fmt('parts'); | |
| 2825 Returns : String or '' if not set. | |
| 2826 Argument : String, FMT = 'exp' (return the exponent only) | |
| 2827 : = 'parts'(return exponent + mantissa in 2-elem list) | |
| 2828 : = undefined (return the raw value) | |
| 2829 Comments : P/Expect values are still stored internally as the full, | |
| 2830 : scientific notation value. | |
| 2831 : This method uses the static $Blast object since this issue | |
| 2832 : will pertain to all Blast reports within a given set. | |
| 2833 : This setting is propagated to Bio::Tools::Blast::Sbjct.pm. | |
| 2834 | |
| 2835 =cut | |
| 2836 | |
| 2837 #------------- | |
| 2838 sub signif_fmt { | |
| 2839 #------------- | |
| 2840 my $self = shift; | |
| 2841 if(@_) { $Blast->{'_signif_fmt'} = shift; } | |
| 2842 $Blast->{'_signif_fmt'} || ''; | |
| 2843 } | |
| 2844 | |
| 2845 =head2 min_length | |
| 2846 | |
| 2847 Usage : $blast->min_length(); | |
| 2848 Purpose : Gets the query sequence length used as significance screening criteria. | |
| 2849 Returns : Integer | |
| 2850 Argument : n/a | |
| 2851 Comments : Obtains info from the static $Blast object if it has not been set | |
| 2852 : for the current object. | |
| 2853 | |
| 2854 See Also : L<_set_signif()|_set_signif>, L<signif()|signif> | |
| 2855 | |
| 2856 =cut | |
| 2857 | |
| 2858 #-------------- | |
| 2859 sub min_length { | |
| 2860 #-------------- | |
| 2861 my $self = shift; | |
| 2862 $self->{'_min_length'} || $Blast->{'_min_length'}; | |
| 2863 } | |
| 2864 | |
| 2865 =head2 gapped | |
| 2866 | |
| 2867 Usage : $blast->gapped(); | |
| 2868 Purpose : Set/Get boolean indicator for gapped BLAST. | |
| 2869 Returns : Boolean | |
| 2870 Argument : n/a | |
| 2871 Comments : Obtains info from the static $Blast object if it has not been set | |
| 2872 : for the current object. | |
| 2873 | |
| 2874 =cut | |
| 2875 | |
| 2876 #----------- | |
| 2877 sub gapped { | |
| 2878 #----------- | |
| 2879 my $self = shift; | |
| 2880 if(@_) { $self->{'_gapped'} = shift; } | |
| 2881 $self->{'_gapped'} || $Blast->{'_gapped'}; | |
| 2882 } | |
| 2883 | |
| 2884 =head2 _get_stats | |
| 2885 | |
| 2886 Usage : n/a; internal method. | |
| 2887 Purpose : Set/Get indicator for collecting full statistics from report. | |
| 2888 Returns : Boolean (0 | 1) | |
| 2889 Comments : Obtains info from the static $Blast object which gets set | |
| 2890 : by _init_parse_params(). | |
| 2891 | |
| 2892 =cut | |
| 2893 | |
| 2894 #--------------- | |
| 2895 sub _get_stats { | |
| 2896 #--------------- | |
| 2897 my $self = shift; | |
| 2898 $Blast->{'_get_stats'}; | |
| 2899 } | |
| 2900 | |
| 2901 =head2 _layout | |
| 2902 | |
| 2903 Usage : n/a; internal method. | |
| 2904 Purpose : Set/Get indicator for the layout of the report. | |
| 2905 Returns : Integer (1 | 2) | |
| 2906 : Defaults to 2 if not set. | |
| 2907 Comments : Blast1 and WashU-Blast2 have a layout = 1. | |
| 2908 : This is intended for internal use by this and closely | |
| 2909 : allied modules like Sbjct.pm and HSP.pm. | |
| 2910 : | |
| 2911 : Obtains info from the static $Blast object if it has not been set | |
| 2912 : for the current object. | |
| 2913 | |
| 2914 =cut | |
| 2915 | |
| 2916 #------------ | |
| 2917 sub _layout { | |
| 2918 #------------ | |
| 2919 my $self = shift; | |
| 2920 if(@_) { | |
| 2921 # Optimization if we know all reports share the same stats. | |
| 2922 if($Blast->{'_share'}) { | |
| 2923 $Blast->{'_layout'} = shift; | |
| 2924 } else { | |
| 2925 $self->{'_layout'} = shift; | |
| 2926 } | |
| 2927 } | |
| 2928 $self->{'_layout'} || $Blast->{'_layout'} || 2; | |
| 2929 } | |
| 2930 | |
| 2931 ## | |
| 2932 ## END ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT. | |
| 2933 ## | |
| 2934 | |
| 2935 =head2 hits | |
| 2936 | |
| 2937 Usage : $blast->hits(); | |
| 2938 Purpose : Get a list containing all BLAST hit (Sbjct) objects. | |
| 2939 : Get the numbers of significant hits. | |
| 2940 Examples : @hits = $blast->hits(); | |
| 2941 : $num_signif = $blast->hits(); | |
| 2942 Returns : List context : list of Bio::Tools::Blast::Sbjct.pm objects | |
| 2943 : or an empty list if there are no hits. | |
| 2944 : Scalar context: integer (number of significant hits) | |
| 2945 : or zero if there are no hits. | |
| 2946 : (Equivalent to num_hits()). | |
| 2947 Argument : n/a. Relies on wantarray. | |
| 2948 Throws : n/a. | |
| 2949 : Not throwing exception because the absence of hits may have | |
| 2950 : resulted from stringent significance criteria, not a failure | |
| 2951 : set the hits. | |
| 2952 | |
| 2953 See Also : L<hit()|hit>, L<num_hits()|num_hits>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif> | |
| 2954 | |
| 2955 =cut | |
| 2956 | |
| 2957 #---------- | |
| 2958 sub hits { | |
| 2959 #---------- | |
| 2960 my $self = shift; | |
| 2961 | |
| 2962 if(wantarray) { | |
| 2963 my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : (); | |
| 2964 return @ary; | |
| 2965 } else { | |
| 2966 return $self->num_hits(); | |
| 2967 } | |
| 2968 | |
| 2969 # my $num = ref($self->{'_hits'}) ? scalar(@{$self->{'_hits'}}) : 0; | |
| 2970 # my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : (); | |
| 2971 # | |
| 2972 # return wantarray | |
| 2973 # # returning list containing all hits or empty list. | |
| 2974 # ? $self->{'_is_significant'} ? @ary : () | |
| 2975 # # returning number of hits or 0. | |
| 2976 # : $self->{'_is_significant'} ? $num : 0; | |
| 2977 } | |
| 2978 | |
| 2979 =head2 hit | |
| 2980 | |
| 2981 Example : $blast_obj->hit( [class] ) | |
| 2982 Purpose : Get a specific hit object. | |
| 2983 : Provides some syntactic sugar for the hits() method. | |
| 2984 Usage : $hitObj = $blast->hit(); | |
| 2985 : $hitObj = $blast->hit('best'); | |
| 2986 : $hitObj = $blast->hit('worst'); | |
| 2987 : $hitObj = $blast->hit( $name ); | |
| 2988 Returns : Object reference for a Bio::Tools::Blast::Sbjct.pm object. | |
| 2989 : undef if there are no hit (Sbjct) objects defined. | |
| 2990 Argument : Class (or no argument). | |
| 2991 : No argument (default) = highest scoring hit (same as 'best'). | |
| 2992 : 'best' or 'first' = highest scoring hit. | |
| 2993 : 'worst' or 'last' = lowest scoring hit. | |
| 2994 : $name = retrieve a hit by seq id (case-insensitive). | |
| 2995 Throws : Exception if the Blast object has no significant hits. | |
| 2996 : Exception if a hit cannot be found when supplying a specific | |
| 2997 : hit sequence identifier as an argument. | |
| 2998 Comments : 'best' = lowest significance value (P or Expect) among significant hits. | |
| 2999 : 'worst' = highest sigificance value (P or Expect) among significant hits. | |
| 3000 | |
| 3001 See Also : L<hits()|hits>, L<num_hits()|num_hits>, L<is_signif()|is_signif> | |
| 3002 | |
| 3003 =cut | |
| 3004 | |
| 3005 #--------- | |
| 3006 sub hit { | |
| 3007 #--------- | |
| 3008 my( $self, $option) = @_; | |
| 3009 $option ||= 'best'; | |
| 3010 | |
| 3011 if($Blast->{'_no_aligns'} || ! ref($self->{'_hits'})) { | |
| 3012 return undef; | |
| 3013 } | |
| 3014 | |
| 3015 $self->{'_is_significant'} or | |
| 3016 $self->throw("There were no significant hits.", | |
| 3017 "Use num_hits(), hits(), is_signif() to check."); | |
| 3018 | |
| 3019 my @hits = @{$self->{'_hits'}}; | |
| 3020 | |
| 3021 return $hits[0] if $option =~ /^(best|first|1)$/i; | |
| 3022 return $hits[$#hits] if $option =~ /^(worst|last)$/i; | |
| 3023 | |
| 3024 # Get hit by name. | |
| 3025 foreach ( @hits ) { | |
| 3026 return $_ if $_->name() =~ /$option/i; | |
| 3027 } | |
| 3028 | |
| 3029 $self->throw("Can't get hit for: $option"); | |
| 3030 } | |
| 3031 | |
| 3032 =head2 num_hits | |
| 3033 | |
| 3034 Usage : $blast->num_hits( ['total'] ); | |
| 3035 Purpose : Get number of significant hits or number of total hits. | |
| 3036 Examples : $num_signif = $blast-num_hits; | |
| 3037 : $num_total = $blast->num_hits('total'); | |
| 3038 Returns : Integer | |
| 3039 Argument : String = 'total' (or no argument). | |
| 3040 : No argument (Default) = return number of significant hits. | |
| 3041 : 'total' = number of total hits. | |
| 3042 Throws : n/a. | |
| 3043 : Not throwing exception because the absence of hits may have | |
| 3044 : resulted from stringent significance criteria, not a failure | |
| 3045 : set the hits. | |
| 3046 Comments : A significant hit is defined as a hit with an expect value | |
| 3047 : (or P value for WU-Blast) at or below the -signif parameter | |
| 3048 : used when parsing the report. Additionally, if a filter function | |
| 3049 : was supplied, the significant hit must also pass that | |
| 3050 : criteria. | |
| 3051 | |
| 3052 See Also : L<hits()|hits>, L<hit()|hit>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif>, L<parse()|parse> | |
| 3053 | |
| 3054 =cut | |
| 3055 | |
| 3056 #------------- | |
| 3057 sub num_hits { | |
| 3058 #------------- | |
| 3059 my( $self, $option) = @_; | |
| 3060 $option ||= ''; | |
| 3061 | |
| 3062 $option =~ /total/i and return $self->{'_num_hits'} || 0; | |
| 3063 | |
| 3064 # Default: returning number of significant hits. | |
| 3065 # return $self->{'_num_hits_significant'} || 0; | |
| 3066 # return 0 if not ref $self->{'_hits'}; | |
| 3067 | |
| 3068 if(ref $self->{'_hits'}) { | |
| 3069 return scalar(@{$self->{'_hits'}}); | |
| 3070 } else { | |
| 3071 return $self->{'_num_hits_significant'} || 0; | |
| 3072 } | |
| 3073 } | |
| 3074 | |
| 3075 =head2 lowest_p | |
| 3076 | |
| 3077 Usage : $blast->lowest_p() | |
| 3078 Purpose : Get the lowest P-value among all hits in a BLAST report. | |
| 3079 : Syntactic sugar for $blast->hit('best')->p(). | |
| 3080 Returns : Float or scientific notation number. | |
| 3081 : Returns -1.0 if lowest_p has not been set. | |
| 3082 Argument : n/a. | |
| 3083 Throws : Exception if the Blast report does not report P-values | |
| 3084 : (as is the case for NCBI Blast2). | |
| 3085 Comments : A value is returned regardless of whether or not there were | |
| 3086 : significant hits ($DEFAULT_SIGNIF, currently 999). | |
| 3087 | |
| 3088 See Also : L<lowest_expect()|lowest_expect>, L<lowest_signif()|lowest_signif>, L<highest_p()|highest_p>, L<signif_fmt()|signif_fmt> | |
| 3089 | |
| 3090 =cut | |
| 3091 | |
| 3092 #------------ | |
| 3093 sub lowest_p { | |
| 3094 #------------ | |
| 3095 my $self = shift; | |
| 3096 | |
| 3097 # Layout 2 = NCBI Blast 2.x does not report P-values. | |
| 3098 $self->_layout == 2 and | |
| 3099 $self->throw("Can't get P-value with BLAST2.", | |
| 3100 "Use lowest_signif() or lowest_expect()"); | |
| 3101 | |
| 3102 return $self->{'_lowestSignif'} || -1.0; | |
| 3103 } | |
| 3104 | |
| 3105 =head2 lowest_expect | |
| 3106 | |
| 3107 Usage : $blast->lowest_expect() | |
| 3108 Purpose : Get the lowest Expect value among all hits in a BLAST report. | |
| 3109 : Syntactic sugar for $blast->hit('best')->expect() | |
| 3110 Returns : Float or scientific notation number. | |
| 3111 : Returns -1.0 if lowest_expect has not been set. | |
| 3112 Argument : n/a. | |
| 3113 Throws : Exception if there were no significant hits and the report | |
| 3114 : does not have Expect values on the description lines | |
| 3115 : (i.e., Blast1, WashU-Blast2). | |
| 3116 | |
| 3117 See Also : L<lowest_p()|lowest_p>, L<lowest_signif()|lowest_signif>, L<highest_expect()|highest_expect>, L<signif_fmt()|signif_fmt> | |
| 3118 | |
| 3119 =cut | |
| 3120 | |
| 3121 #------------------ | |
| 3122 sub lowest_expect { | |
| 3123 #------------------ | |
| 3124 my $self = shift; | |
| 3125 | |
| 3126 if ($self->_layout == 2) { | |
| 3127 return $self->{'_lowestSignif'} || -1.0; | |
| 3128 } | |
| 3129 | |
| 3130 if($self->{'_is_significant'}) { | |
| 3131 my $bestHit = $self->{'_hits'}->[0]; | |
| 3132 return $bestHit->expect(); | |
| 3133 } else { | |
| 3134 $self->throw("Can't get lowest expect value: no significant hits ", | |
| 3135 "The format of this report requires expect values to be extracted$Newline". | |
| 3136 "from the hits themselves."); | |
| 3137 } | |
| 3138 } | |
| 3139 | |
| 3140 =head2 highest_p | |
| 3141 | |
| 3142 Example : $blast->highest_p( ['overall']) | |
| 3143 Purpose : Get the highest P-value among all hits in a BLAST report. | |
| 3144 : Syntactic sugar for $blast->hit('worst')->p() | |
| 3145 : Can also get the highest P-value overall (not just among signif hits). | |
| 3146 Usage : $p_signif = $blast->highest_p(); | |
| 3147 : $p_all = $blast->highest_p('overall'); | |
| 3148 Returns : Float or scientific notation number. | |
| 3149 : Returns -1.0 if highest_p has not been set. | |
| 3150 Argument : String 'overall' or no argument. | |
| 3151 : No argument = get highest P-value among significant hits. | |
| 3152 Throws : Exception if object is created from a Blast2 report | |
| 3153 : (which does not report P-values). | |
| 3154 | |
| 3155 See Also : L<highest_signif()|highest_signif>, L<lowest_p()|lowest_p>, L<_set_signif()|_set_signif>, L<signif_fmt()|signif_fmt> | |
| 3156 | |
| 3157 =cut | |
| 3158 | |
| 3159 #--------------- | |
| 3160 sub highest_p { | |
| 3161 #--------------- | |
| 3162 my ($self, $overall) = @_; | |
| 3163 | |
| 3164 # Layout 2 = NCBI Blast 2.x does not report P-values. | |
| 3165 $self->_layout == 2 and | |
| 3166 $self->throw("Can't get P-value with BLAST2.", | |
| 3167 "Use highest_signif() or highest_expect()"); | |
| 3168 | |
| 3169 $overall and return $self->{'_highestSignif'} || -1.0; | |
| 3170 $self->hit('worst')->p(); | |
| 3171 } | |
| 3172 | |
| 3173 =head2 highest_expect | |
| 3174 | |
| 3175 Usage : $blast_object->highest_expect( ['overall']) | |
| 3176 Purpose : Get the highest Expect value among all significant hits in a BLAST report. | |
| 3177 : Syntactic sugar for $blast->hit('worst')->expect() | |
| 3178 Examples : $e_sig = $blast->highest_expect(); | |
| 3179 : $e_all = $blast->highest_expect('overall'); | |
| 3180 Returns : Float or scientific notation number. | |
| 3181 : Returns -1.0 if highest_exoect has not been set. | |
| 3182 Argument : String 'overall' or no argument. | |
| 3183 : No argument = get highest Expect-value among significant hits. | |
| 3184 Throws : Exception if there were no significant hits and the report | |
| 3185 : does not have Expect values on the description lines | |
| 3186 : (i.e., Blast1, WashU-Blast2). | |
| 3187 | |
| 3188 See Also : L<lowest_expect()|lowest_expect>, L<highest_signif()|highest_signif>, L<signif_fmt()|signif_fmt> | |
| 3189 | |
| 3190 =cut | |
| 3191 | |
| 3192 #------------------- | |
| 3193 sub highest_expect { | |
| 3194 #------------------- | |
| 3195 my ($self, $overall) = @_; | |
| 3196 | |
| 3197 if ( $overall and $self->_layout == 2) { | |
| 3198 return $self->{'_highestSignif'} || -1.0; | |
| 3199 } | |
| 3200 | |
| 3201 if($self->{'_is_significant'}) { | |
| 3202 return $self->hit('worst')->expect; | |
| 3203 } else { | |
| 3204 $self->throw("Can't get highest expect value: no significant hits ", | |
| 3205 "The format of this report requires expect values to be extracted$Newline". | |
| 3206 "from the hits themselves."); | |
| 3207 } | |
| 3208 } | |
| 3209 | |
| 3210 =head2 lowest_signif | |
| 3211 | |
| 3212 Usage : $blast_obj->lowest_signif(); | |
| 3213 : Syntactic sugar for $blast->hit('best')->signif() | |
| 3214 Purpose : Get the lowest P or Expect value among all hits | |
| 3215 : in a BLAST report. | |
| 3216 : This method is syntactic sugar for $blast->hit('best')->signif() | |
| 3217 : The value returned is the one which is reported in the decription | |
| 3218 : section of the Blast report. | |
| 3219 : For Blast1 and WU-Blast2, this is a P-value, | |
| 3220 : for NCBI Blast2, it is an Expect value. | |
| 3221 Example : $blast->lowest_signif(); | |
| 3222 Returns : Float or scientific notation number. | |
| 3223 : Returns -1.0 if lowest_signif has not been set. | |
| 3224 Argument : n/a. | |
| 3225 Throws : n/a. | |
| 3226 Status : Deprecated. Use lowest_expect() or lowest_p(). | |
| 3227 Comments : The signif() method provides a way to deal with the fact that | |
| 3228 : Blast1 and Blast2 formats differ in what is reported in the | |
| 3229 : description lines of each hit in the Blast report. The signif() | |
| 3230 : method frees any client code from having to know if this is a P-value | |
| 3231 : or an Expect value, making it easier to write code that can process | |
| 3232 : both Blast1 and Blast2 reports. This is not necessarily a good thing, since | |
| 3233 : one should always know when one is working with P-values or | |
| 3234 : Expect values (hence the deprecated status). | |
| 3235 : Use of lowest_expect() is recommended since all hits will have an Expect value. | |
| 3236 | |
| 3237 See Also : L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif> | |
| 3238 | |
| 3239 =cut | |
| 3240 | |
| 3241 #------------------ | |
| 3242 sub lowest_signif { | |
| 3243 #------------------ | |
| 3244 my ($self) = @_; | |
| 3245 | |
| 3246 return $self->{'_lowestSignif'} || -1.0; | |
| 3247 } | |
| 3248 | |
| 3249 =head2 highest_signif | |
| 3250 | |
| 3251 Usage : $blast_obj->highest_signif('overall'); | |
| 3252 : Syntactic sugar for $blast->hit('worst')->signif() | |
| 3253 Purpose : Get the highest P or Expect value among all hits | |
| 3254 : in a BLAST report. | |
| 3255 : The value returned is the one which is reported in the decription | |
| 3256 : section of the Blast report. | |
| 3257 : For Blast1 and WU-Blast2, this is a P-value, | |
| 3258 : for NCBI Blast2, it is an Expect value. | |
| 3259 Example : $blast->highest_signif(); | |
| 3260 Returns : Float or scientific notation number. | |
| 3261 : Returns -1.0 if highest_signif has not been set. | |
| 3262 Argument : Optional string 'overall' to get the highest overall significance value. | |
| 3263 Throws : n/a. | |
| 3264 Status : Deprecated. Use highest_expect() or highest_p(). | |
| 3265 Comments : Analogous to lowest_signif(), q.v. | |
| 3266 | |
| 3267 See Also : L<lowest_signif()|lowest_signif>, L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif> | |
| 3268 | |
| 3269 =cut | |
| 3270 | |
| 3271 #--------------------- | |
| 3272 sub highest_signif { | |
| 3273 #--------------------- | |
| 3274 my ($self, $overall) = @_; | |
| 3275 | |
| 3276 $overall and return $self->{'_highestSignif'} || -1.0; | |
| 3277 | |
| 3278 if($self->{'_is_significant'}) { | |
| 3279 my $worst_hit = $self->hit('worst'); | |
| 3280 if(defined $worst_hit) { | |
| 3281 return $worst_hit->signif; | |
| 3282 } else { | |
| 3283 return $self->{'_highestSignif'}; | |
| 3284 } | |
| 3285 } | |
| 3286 } | |
| 3287 | |
| 3288 =head2 matrix | |
| 3289 | |
| 3290 Usage : $blast_object->matrix(); | |
| 3291 Purpose : Get the name of the scoring matrix used. | |
| 3292 : This is extracted from the report. | |
| 3293 Argument : n/a | |
| 3294 Returns : string or undef if not defined | |
| 3295 | |
| 3296 =cut | |
| 3297 | |
| 3298 #------------ | |
| 3299 sub matrix { my $self = shift; $self->{'_matrix'} || $Blast->{'_matrix'}; } | |
| 3300 #------------ | |
| 3301 | |
| 3302 =head2 filter | |
| 3303 | |
| 3304 Usage : $blast_object->filter(); | |
| 3305 Purpose : Get the name of the low-complexity sequence filter used. | |
| 3306 : (SEG, SEG+XNU, DUST, NONE). | |
| 3307 : This is extracted from the report. | |
| 3308 Argument : n/a | |
| 3309 Returns : string or undef if not defined | |
| 3310 | |
| 3311 =cut | |
| 3312 | |
| 3313 #---------- | |
| 3314 sub filter { my $self = shift; $self->{'_filter'} || $Blast->{'_filter'}; } | |
| 3315 #---------- | |
| 3316 | |
| 3317 =head2 expect | |
| 3318 | |
| 3319 Usage : $blast_object->expect(); | |
| 3320 Purpose : Get the expect parameter (E) used for the Blast analysis. | |
| 3321 : This is extracted from the report. | |
| 3322 Argument : n/a | |
| 3323 Returns : string or undef if not defined. | |
| 3324 | |
| 3325 =cut | |
| 3326 | |
| 3327 #----------- | |
| 3328 sub expect { my $self = shift; $self->{'_expect'} || $Blast->{'_expect'}; } | |
| 3329 #----------- | |
| 3330 | |
| 3331 =head2 karlin_altschul | |
| 3332 | |
| 3333 Usage : $blast_object->karlin_altschul(); | |
| 3334 Purpose : Get the Karlin_Altschul sum statistics (Lambda, K, H) | |
| 3335 : These are extracted from the report. | |
| 3336 Argument : n/a | |
| 3337 Returns : list of three floats (Lambda, K, H) | |
| 3338 : If not defined, returns list of three zeros) | |
| 3339 | |
| 3340 =cut | |
| 3341 | |
| 3342 #--------------------- | |
| 3343 sub karlin_altschul { | |
| 3344 #--------------------- | |
| 3345 my $self = shift; | |
| 3346 if(defined($self->{'_lambda'})) { | |
| 3347 ($self->{'_lambda'}, $self->{'_k'}, $self->{'_h'}); | |
| 3348 } elsif(defined($Blast->{'_lambda'})) { | |
| 3349 ($Blast->{'_lambda'}, $Blast->{'_k'}, $Blast->{'_h'}); | |
| 3350 } else { | |
| 3351 (0, 0, 0); | |
| 3352 } | |
| 3353 } | |
| 3354 | |
| 3355 =head2 word_size | |
| 3356 | |
| 3357 Usage : $blast_object->word_size(); | |
| 3358 Purpose : Get the word_size used during the Blast analysis. | |
| 3359 : This is extracted from the report. | |
| 3360 Argument : n/a | |
| 3361 Returns : integer or undef if not defined. | |
| 3362 | |
| 3363 =cut | |
| 3364 | |
| 3365 #-------------- | |
| 3366 sub word_size { | |
| 3367 #-------------- | |
| 3368 my $self = shift; | |
| 3369 $self->{'_word_size'} || $Blast->{'_word_size'}; | |
| 3370 } | |
| 3371 | |
| 3372 =head2 s | |
| 3373 | |
| 3374 Usage : $blast_object->s(); | |
| 3375 Purpose : Get the s statistic for the Blast analysis. | |
| 3376 : This is extracted from the report. | |
| 3377 Argument : n/a | |
| 3378 Returns : integer or undef if not defined. | |
| 3379 | |
| 3380 =cut | |
| 3381 | |
| 3382 #------ | |
| 3383 sub s { my $self = shift; $self->{'_s'} || $Blast->{'_s'}; } | |
| 3384 #------ | |
| 3385 | |
| 3386 =head2 gap_creation | |
| 3387 | |
| 3388 Usage : $blast_object->gap_creation(); | |
| 3389 Purpose : Get the gap creation penalty used for a gapped Blast analysis. | |
| 3390 : This is extracted from the report. | |
| 3391 Argument : n/a | |
| 3392 Returns : integer or undef if not defined. | |
| 3393 | |
| 3394 See Also : L<gap_extension()|gap_extension> | |
| 3395 | |
| 3396 =cut | |
| 3397 | |
| 3398 #----------------- | |
| 3399 sub gap_creation { | |
| 3400 #----------------- | |
| 3401 my $self = shift; | |
| 3402 $self->{'_gapCreation'} || $Blast->{'_gapCreation'}; | |
| 3403 } | |
| 3404 | |
| 3405 =head2 gap_extension | |
| 3406 | |
| 3407 Usage : $blast_object->gap_extension(); | |
| 3408 Purpose : Get the gap extension penalty used for a gapped Blast analysis. | |
| 3409 : This is extracted from the report. | |
| 3410 Argument : n/a | |
| 3411 Returns : integer or undef if not defined. | |
| 3412 | |
| 3413 See Also : L<gap_extension()|gap_extension> | |
| 3414 | |
| 3415 =cut | |
| 3416 | |
| 3417 #------------------- | |
| 3418 sub gap_extension { | |
| 3419 #------------------- | |
| 3420 my $self = shift; | |
| 3421 $self->{'_gapExtension'} || $Blast->{'_gapExtension'}; | |
| 3422 } | |
| 3423 | |
| 3424 =head2 ambiguous_aln | |
| 3425 | |
| 3426 Usage : $blast_object->ambiguous_aln(); | |
| 3427 Purpose : Test all hits and determine if any have an ambiguous alignment. | |
| 3428 Example : print "ambiguous" if $blast->ambiguous_aln(); | |
| 3429 Returns : Boolean (true if ANY significant hit has an ambiguous alignment) | |
| 3430 Argument : n/a | |
| 3431 Throws : n/a | |
| 3432 Status : Experimental | |
| 3433 Comments : An ambiguous BLAST alignment is defined as one where two or more | |
| 3434 : different HSPs have significantly overlapping sequences such | |
| 3435 : that it is not possible to create a unique alignment | |
| 3436 : by simply concatenating HSPs. This may indicate the presence | |
| 3437 : of multiple domains in one sequence relative to another. | |
| 3438 : This method only indicates the presence of ambiguity in at | |
| 3439 : least one significant hit. To determine the nature of the | |
| 3440 : ambiguity, each hit must be examined. | |
| 3441 | |
| 3442 See Also : B<Bio::Tools::Blast::Sbjct::ambiguous_aln()>,L<Links to related modules> | |
| 3443 | |
| 3444 =cut | |
| 3445 | |
| 3446 #---------------- | |
| 3447 sub ambiguous_aln { | |
| 3448 #---------------- | |
| 3449 my $self = shift; | |
| 3450 foreach($self->hits()) { | |
| 3451 return 1 if ($_->ambiguous_aln() ne '-'); | |
| 3452 } | |
| 3453 0; | |
| 3454 } | |
| 3455 | |
| 3456 =head2 overlap | |
| 3457 | |
| 3458 Usage : $blast_object->overlap([integer]); | |
| 3459 Purpose : Set/Get the number of overlapping residues allowed when tiling multiple HSPs. | |
| 3460 : Delegates to Bio::Tools::Blast::Sbjct::overlap(). | |
| 3461 Throws : Exception if there are no significant hits. | |
| 3462 Status : Experimental | |
| 3463 | |
| 3464 See Also : B<Bio::Tools::Blast::Sbjct::overlap()>,L<Links to related modules> | |
| 3465 | |
| 3466 =cut | |
| 3467 | |
| 3468 #------------ | |
| 3469 sub overlap { | |
| 3470 #------------ | |
| 3471 my $self = shift; | |
| 3472 if(not $self->hits) { | |
| 3473 $self->throw("Can't get overlap data without significant hits."); | |
| 3474 } | |
| 3475 $self->hit->overlap(); | |
| 3476 } | |
| 3477 | |
| 3478 =head2 homol_data | |
| 3479 | |
| 3480 Usage : @data = $blast_object->homo_data( %named_params ); | |
| 3481 Purpose : Gets specific similarity data about each significant hit. | |
| 3482 Returns : Array of strings: | |
| 3483 : "Homology data" for each HSP is in the format: | |
| 3484 : "<integer> <start> <stop>" | |
| 3485 : Data for different HSPs are tab-delimited. | |
| 3486 Argument : named parameters passed along to the hit objects. | |
| 3487 Throws : n/a | |
| 3488 Status : Experimental | |
| 3489 Comments : This is a very experimental method used for obtaining an | |
| 3490 : indication of: | |
| 3491 : 1) how many HSPs are in a Blast alignment | |
| 3492 : 2) how strong the similarity is between sequences in the HSP | |
| 3493 : 3) the endpoints of the alignment (sequence monomer numbers) | |
| 3494 | |
| 3495 See Also : B<Bio::Tools::Blast::Sbjct::homol_data()>,L<Links to related modules> | |
| 3496 | |
| 3497 =cut | |
| 3498 | |
| 3499 #---------------- | |
| 3500 sub homol_data { | |
| 3501 #---------------- | |
| 3502 | |
| 3503 my ($self, %param) = @_; | |
| 3504 my @hits = $self->hits(); | |
| 3505 my @data = (); | |
| 3506 | |
| 3507 ## Note: Homology data can be either for the query sequence or the hit | |
| 3508 ## (Sbjct) sequence. Default is for sbjct. This is specifyable via | |
| 3509 ## $param{-SEQ}='sbjct' || 'query'. | |
| 3510 | |
| 3511 foreach ( @hits ) { | |
| 3512 push @data, $_->homol_data(%param); | |
| 3513 } | |
| 3514 @data; | |
| 3515 } | |
| 3516 | |
| 3517 =head1 REPORT GENERATING METHODS | |
| 3518 | |
| 3519 =head2 table | |
| 3520 | |
| 3521 Usage : $blast_obj->table( [get_desc]); | |
| 3522 Purpose : Output data for each HSP of each hit in tab-delimited format. | |
| 3523 Example : print $blast->table; | |
| 3524 : print $blast->table(0); | |
| 3525 : # Call table_labels() to print labels. | |
| 3526 Argument : get_desc = boolean, if false the description of each hit is not included. | |
| 3527 : Default: true (if not defined, include description column). | |
| 3528 Returns : String containing tab-delimited set of data for each HSP | |
| 3529 : of each significant hit. Different HSPs are separated by newlines. | |
| 3530 : Left-to-Right order of fields: | |
| 3531 : 1 QUERY_NAME # Sequence identifier of the query. | |
| 3532 : 2 QUERY_LENGTH # Full length of the query sequence. | |
| 3533 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit". | |
| 3534 : 4 SBJCT_LENGTH # Full length of the sbjct sequence. | |
| 3535 : 5 EXPECT # Expect value for the alignment. | |
| 3536 : 6 SCORE # Blast score for the alignment. | |
| 3537 : 7 BITS # Bit score for the alignment. | |
| 3538 : 8 NUM_HSPS # Number of HSPs (not the "N" value). | |
| 3539 : 9 HSP_FRAC_IDENTICAL # fraction of identical substitutions. | |
| 3540 : 10 HSP_FRAC_CONSERVED # fraction of conserved ("positive") substitutions. | |
| 3541 : 11 HSP_QUERY_ALN_LENGTH # Length of the aligned portion of the query sequence. | |
| 3542 : 12 HSP_SBJCT_ALN_LENGTH # Length of the aligned portion of the sbjct sequence. | |
| 3543 : 13 HSP_QUERY_GAPS # Number of gaps in the aligned query sequence. | |
| 3544 : 14 HSP_SBJCT_GAPS # Number of gaps in the aligned sbjct sequence. | |
| 3545 : 15 HSP_QUERY_START # Starting coordinate of the query sequence. | |
| 3546 : 16 HSP_QUERY_END # Ending coordinate of the query sequence. | |
| 3547 : 17 HSP_SBJCT_START # Starting coordinate of the sbjct sequence. | |
| 3548 : 18 HSP_SBJCT_END # Ending coordinate of the sbjct sequence. | |
| 3549 : 19 HSP_QUERY_STRAND # Strand of the query sequence (TBLASTN/X only) | |
| 3550 : 20 HSP_SBJCT_STRAND # Strand of the sbjct sequence (TBLASTN/X only) | |
| 3551 : 21 HSP_FRAME # Frame for the sbjct translation (TBLASTN/X only) | |
| 3552 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from | |
| 3553 : # the alignment section. | |
| 3554 Throws : n/a | |
| 3555 Comments : This method does not collect data based on tiling of the HSPs. | |
| 3556 : The table will contains redundant information since the hit name, | |
| 3557 : id, and other info for the hit are listed for each HSP. | |
| 3558 : If you need more flexibility in the output format than this | |
| 3559 : method provides, design a custom function. | |
| 3560 | |
| 3561 See Also : L<table_tiled()|table_tiled>, L<table_labels()|table_labels>, L<_display_hits()|_display_hits> | |
| 3562 | |
| 3563 =cut | |
| 3564 | |
| 3565 #----------- | |
| 3566 sub table { | |
| 3567 #----------- | |
| 3568 my ($self, $get_desc) = @_; | |
| 3569 my $str = ''; | |
| 3570 | |
| 3571 $get_desc = defined($get_desc) ? $get_desc : 1; | |
| 3572 # $str .= $self->_table_labels($get_desc) unless $self->{'_labels'}; | |
| 3573 | |
| 3574 my $sigfmt = $self->signif_fmt(); | |
| 3575 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table. | |
| 3576 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e'; | |
| 3577 | |
| 3578 my ($hit, $hsp); | |
| 3579 foreach $hit($self->hits) { | |
| 3580 foreach $hsp($hit->hsps) { | |
| 3581 # Note: range() returns a 2-element list. | |
| 3582 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s$Newline", | |
| 3583 $self->name, $self->length, $hit->name, $hit->length, | |
| 3584 $hit->expect($sigfmt), $hit->score, $hit->bits, | |
| 3585 $hit->num_hsps, $hsp->frac_identical, $hsp->frac_conserved, | |
| 3586 $hsp->length('query'), $hsp->length('sbjct'), | |
| 3587 $hsp->gaps('list'), | |
| 3588 $hsp->range('query'), $hsp->range('sbjct'), | |
| 3589 $hsp->strand('query'), $hsp->strand('sbjct'), $hsp->frame, | |
| 3590 ($get_desc ? $hit->desc : ''); | |
| 3591 } | |
| 3592 } | |
| 3593 $str =~ s/\t$Newline/$Newline/gs; | |
| 3594 $str; | |
| 3595 } | |
| 3596 | |
| 3597 =head2 table_labels | |
| 3598 | |
| 3599 Usage : print $blast_obj->table_labels( [get_desc] ); | |
| 3600 Purpose : Get column labels for table(). | |
| 3601 Returns : String containing column labels. Tab-delimited. | |
| 3602 Argument : get_desc = boolean, if false the description column is not included. | |
| 3603 : Default: true (if not defined, include description column). | |
| 3604 Throws : n/a | |
| 3605 | |
| 3606 See Also : L<table()|table> | |
| 3607 | |
| 3608 =cut | |
| 3609 | |
| 3610 #---------------- | |
| 3611 sub table_labels { | |
| 3612 #---------------- | |
| 3613 my ($self, $get_desc) = @_; | |
| 3614 $get_desc = defined($get_desc) ? $get_desc : 1; | |
| 3615 my $descstr = $get_desc ? 'DESC' : ''; | |
| 3616 my $descln = $get_desc ? '-----' : ''; | |
| 3617 | |
| 3618 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
| 3619 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', 'HSPS', | |
| 3620 'IDEN', 'CONSV', 'Q_ALN', 'S_ALN', 'Q_GAP', 'S_GAP', | |
| 3621 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', 'Q_STR', 'S_STR', 'FRAM', $descstr; | |
| 3622 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
| 3623 '-----', '-----', '-----', '-----', '-----', '-----', '-----', '-----', | |
| 3624 '-----', '-----', '-----', '-----', '-----', '-----', | |
| 3625 '-----', '-----', '-----','-----', '-----', '-----','-----', $descln; | |
| 3626 | |
| 3627 $self->{'_labels'} = 1; | |
| 3628 $str =~ s/\t$Newline/$Newline/gs; | |
| 3629 $str; | |
| 3630 } | |
| 3631 | |
| 3632 =head2 table_tiled | |
| 3633 | |
| 3634 Purpose : Get data from tiled HSPs in tab-delimited format. | |
| 3635 : Allows only minimal flexibility in the output format. | |
| 3636 : If you need more flexibility, design a custom function. | |
| 3637 Usage : $blast_obj->table_tiled( [get_desc]); | |
| 3638 Example : print $blast->table_tiled; | |
| 3639 : print $blast->table_tiled(0); | |
| 3640 : # Call table_labels_tiled() if you want labels. | |
| 3641 Argument : get_desc = boolean, if false the description of each hit is not included. | |
| 3642 : Default: true (include description). | |
| 3643 Returns : String containing tab-delimited set of data for each HSP | |
| 3644 : of each significant hit. Multiple hits are separated by newlines. | |
| 3645 : Left-to-Right order of fields: | |
| 3646 : 1 QUERY_NAME # Sequence identifier of the query. | |
| 3647 : 2 QUERY_LENGTH # Full length of the query sequence. | |
| 3648 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit". | |
| 3649 : 4 SBJCT_LENGTH # Full length of the sbjct sequence. | |
| 3650 : 5 EXPECT # Expect value for the alignment. | |
| 3651 : 6 SCORE # Blast score for the alignment. | |
| 3652 : 7 BITS # Bit score for the alignment. | |
| 3653 : 8 NUM_HSPS # Number of HSPs (not the "N" value). | |
| 3654 : 9 FRAC_IDENTICAL* # fraction of identical substitutions. | |
| 3655 : 10 FRAC_CONSERVED* # fraction of conserved ("positive") substitutions . | |
| 3656 : 11 FRAC_ALN_QUERY* # fraction of the query sequence that is aligned. | |
| 3657 : 12 FRAC_ALN_SBJCT* # fraction of the sbjct sequence that is aligned. | |
| 3658 : 13 QUERY_ALN_LENGTH* # Length of the aligned portion of the query sequence. | |
| 3659 : 14 SBJCT_ALN_LENGTH* # Length of the aligned portion of the sbjct sequence. | |
| 3660 : 15 QUERY_GAPS* # Number of gaps in the aligned query sequence. | |
| 3661 : 16 SBJCT_GAPS* # Number of gaps in the aligned sbjct sequence. | |
| 3662 : 17 QUERY_START* # Starting coordinate of the query sequence. | |
| 3663 : 18 QUERY_END* # Ending coordinate of the query sequence. | |
| 3664 : 19 SBJCT_START* # Starting coordinate of the sbjct sequence. | |
| 3665 : 20 SBJCT_END* # Ending coordinate of the sbjct sequence. | |
| 3666 : 21 AMBIGUOUS_ALN # Ambiguous alignment indicator ('qs', 'q', 's'). | |
| 3667 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from | |
| 3668 : # the alignment section. | |
| 3669 : | |
| 3670 : * Items marked with a "*" report data summed across all HSPs | |
| 3671 : after tiling them to avoid counting data from overlapping regions | |
| 3672 : multiple times. | |
| 3673 Throws : n/a | |
| 3674 Comments : This function relies on tiling of the HSPs since it calls | |
| 3675 : frac_identical() etc. on the hit as opposed to each HSP individually. | |
| 3676 | |
| 3677 See Also : L<table()|table>, L<table_labels_tiled()|table_labels_tiled>, B<Bio::Tools::Blast::Sbjct::"HSP Tiling and Ambiguous Alignments">, L<Links to related modules> | |
| 3678 | |
| 3679 =cut | |
| 3680 | |
| 3681 #---------------- | |
| 3682 sub table_tiled { | |
| 3683 #---------------- | |
| 3684 my ($self, $get_desc) = @_; | |
| 3685 my $str = ''; | |
| 3686 | |
| 3687 $get_desc = defined($get_desc) ? $get_desc : 1; | |
| 3688 | |
| 3689 my ($hit); | |
| 3690 my $sigfmt = $self->signif_fmt(); | |
| 3691 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table. | |
| 3692 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e'; | |
| 3693 | |
| 3694 foreach $hit($self->hits) { | |
| 3695 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s$Newline", | |
| 3696 $self->name, $self->length, $hit->name, $hit->length, | |
| 3697 $hit->expect($sigfmt), $hit->score, $hit->bits, | |
| 3698 $hit->num_hsps, $hit->frac_identical, $hit->frac_conserved, | |
| 3699 $hit->frac_aligned_query, $hit->frac_aligned_hit, | |
| 3700 $hit->length_aln('query'), $hit->length_aln('sbjct'), | |
| 3701 $hit->gaps('list'), $hit->range('query'), $hit->range('sbjct'), | |
| 3702 $hit->ambiguous_aln, ($get_desc ? $hit->desc : ''); | |
| 3703 } | |
| 3704 $str =~ s/\t$Newline/$Newline/gs; | |
| 3705 $str; | |
| 3706 } | |
| 3707 | |
| 3708 =head2 table_labels_tiled | |
| 3709 | |
| 3710 Usage : print $blast_obj->table_labels_tiled( [get_desc] ); | |
| 3711 Purpose : Get column labels for table_tiled(). | |
| 3712 Returns : String containing column labels. Tab-delimited. | |
| 3713 Argument : get_desc = boolean, if false the description column is not included. | |
| 3714 : Default: true (include description column). | |
| 3715 Throws : n/a | |
| 3716 | |
| 3717 See Also : L<table_tiled()|table_tiled> | |
| 3718 | |
| 3719 =cut | |
| 3720 | |
| 3721 #--------------------- | |
| 3722 sub table_labels_tiled { | |
| 3723 #--------------------- | |
| 3724 my ($self, $get_desc) = @_; | |
| 3725 my $descstr = $get_desc ? 'DESC' : ''; | |
| 3726 my $descln = $get_desc ? '-----' : ''; | |
| 3727 | |
| 3728 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
| 3729 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', | |
| 3730 'HSPS', 'FR_ID', 'FR_CN', 'FR_ALQ', 'FR_ALS', 'Q_ALN', | |
| 3731 'S_ALN', 'Q_GAP', 'S_GAP', 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', | |
| 3732 'AMBIG', $descstr; | |
| 3733 $str =~ s/\t$Newline/$Newline/; | |
| 3734 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
| 3735 '-----', '-----', '------', '-----', '-----','-----', '-----', | |
| 3736 '-----', '-----', '-----', '-----', '-----', '-----', | |
| 3737 '-----', '-----', '-----','-----','-----', '-----', | |
| 3738 '-----','-----', $descln; | |
| 3739 | |
| 3740 $self->{'_labels_tiled'} = 1; | |
| 3741 $str =~ s/\t$Newline/$Newline/gs; | |
| 3742 $str; | |
| 3743 } | |
| 3744 | |
| 3745 =head2 display | |
| 3746 | |
| 3747 Usage : $blast_object->display( %named_parameters ); | |
| 3748 Purpose : Display information about Bio::Tools::Blast.pm data members, | |
| 3749 : E.g., parameters of the report, data for each hit., etc. | |
| 3750 : Overrides Bio::Root::Object::display(). | |
| 3751 Example : $object->display(-SHOW=>'stats'); | |
| 3752 : $object->display(-SHOW=>'stats,hits'); | |
| 3753 Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE) | |
| 3754 : -SHOW => 'file' | 'hits' | 'homol' | |
| 3755 : -WHERE => filehandle (default = STDOUT) | |
| 3756 Returns : n/a (print/printf is called) | |
| 3757 Status : Experimental | |
| 3758 Comments : For tab-delimited output, see table(). | |
| 3759 | |
| 3760 See Also : L<_display_homol()|_display_homol>, L<_display_hits()|_display_hits>, L<_display_stats()|_display_stats>, L<table()|table>, B<Bio::Root::Tools::SeqAnal::display()>,L<Links to related modules>, | |
| 3761 | |
| 3762 =cut | |
| 3763 | |
| 3764 #-------------- | |
| 3765 sub display { | |
| 3766 #-------------- | |
| 3767 my( $self, %param ) = @_; | |
| 3768 | |
| 3769 $self->SUPER::display(%param); | |
| 3770 my $OUT = $self->fh(); | |
| 3771 | |
| 3772 $self->show =~ /homol/i and $self->_display_homol($OUT); | |
| 3773 $self->show =~ /hits/i and $self->_display_hits( %param ); | |
| 3774 1; | |
| 3775 } | |
| 3776 | |
| 3777 =head2 _display_homol | |
| 3778 | |
| 3779 Usage : n/a; called automatically by display() | |
| 3780 Purpose : Print homology data for hits in the BLAST report. | |
| 3781 Example : n/a | |
| 3782 Argument : one argument = filehandle object. | |
| 3783 Returns : printf call. | |
| 3784 Status : Experimental | |
| 3785 | |
| 3786 See Also : L<homol_data()|homol_data>, L<display()|display> | |
| 3787 | |
| 3788 =cut | |
| 3789 | |
| 3790 #------------------- | |
| 3791 sub _display_homol { | |
| 3792 #------------------- | |
| 3793 my( $self, $OUT ) = @_; | |
| 3794 | |
| 3795 print $OUT "${Newline}BLAST HOMOLOGY DATA FOR: ${\$self->name()}$Newline"; | |
| 3796 print $OUT '-'x40,"$Newline"; | |
| 3797 | |
| 3798 foreach ( $self->homol_data()) { | |
| 3799 print $OUT "$_$Newline"; | |
| 3800 } | |
| 3801 } | |
| 3802 | |
| 3803 =head2 _display_stats | |
| 3804 | |
| 3805 Usage : n/a; called automatically by display() | |
| 3806 Purpose : Display information about the Blast report "meta" data. | |
| 3807 : Overrides Bio::Tools::SeqAnal::_display_stats() calling it first. | |
| 3808 Example : n/a | |
| 3809 Argument : one argument = filehandle object. | |
| 3810 Returns : printf call. | |
| 3811 Status : Experimental | |
| 3812 | |
| 3813 See Also : L<display()|display>, B<Bio::Tools::SeqAnal::_display_stats()>,L<Links to related modules> | |
| 3814 | |
| 3815 =cut | |
| 3816 | |
| 3817 #-------------------- | |
| 3818 sub _display_stats { | |
| 3819 #-------------------- | |
| 3820 my( $self, $OUT ) = @_; | |
| 3821 | |
| 3822 $self->SUPER::_display_stats($OUT); | |
| 3823 printf( $OUT "%-15s: %s$Newline", "GAPPED", $self->gapped ? 'YES' : 'NO'); | |
| 3824 printf( $OUT "%-15s: %d$Newline", "TOTAL HITS", $self->num_hits('total')); | |
| 3825 printf( $OUT "%-15s: %s$Newline", "CHECKED ALL", $Blast->{'_check_all'} ? 'YES' : 'NO'); | |
| 3826 printf( $OUT "%-15s: %s$Newline", "FILT FUNC", $Blast->{'_filt_func'} ? 'YES' : 'NO'); | |
| 3827 if($self->min_length) { | |
| 3828 printf( $OUT "%-15s: Length >= %s$Newline", "MIN_LENGTH", $self->min_length); | |
| 3829 } | |
| 3830 | |
| 3831 my $num_hits = $self->num_hits; | |
| 3832 my $signif_str = ($self->_layout == 1) ? 'P' : 'EXPECT'; | |
| 3833 | |
| 3834 printf( $OUT "%-15s: %d$Newline", "SIGNIF HITS", $num_hits); | |
| 3835 # Blast1: signif = P-value, Blast2: signif = Expect value. | |
| 3836 | |
| 3837 printf( $OUT "%-15s: %s ($signif_str-VALUE)$Newline", "SIGNIF CUTOFF", $self->signif); | |
| 3838 printf( $OUT "%-15s: %s$Newline", "LOWEST $signif_str", $self->lowest_signif()); | |
| 3839 printf( $OUT "%-15s: %s$Newline", "HIGHEST $signif_str", $self->highest_signif()); | |
| 3840 | |
| 3841 printf( $OUT "%-15s: %s (OVERALL)$Newline", "HIGHEST $signif_str", $self->highest_signif('overall')); | |
| 3842 | |
| 3843 if($Blast->_get_stats) { | |
| 3844 my $warn = ($Blast->{'_share'}) ? '(SHARED STATS)' : ''; | |
| 3845 printf( $OUT "%-15s: %s$Newline", "MATRIX", $self->matrix() || 'UNKNOWN'); | |
| 3846 printf( $OUT "%-15s: %s$Newline", "FILTER", $self->filter() || 'UNKNOWN'); | |
| 3847 printf( $OUT "%-15s: %s$Newline", "EXPECT", $self->expect() || 'UNKNOWN'); | |
| 3848 printf( $OUT "%-15s: %s, %s, %s %s$Newline", "LAMBDA, K, H", $self->karlin_altschul(), $warn); | |
| 3849 printf( $OUT "%-15s: %s$Newline", "WORD SIZE", $self->word_size() || 'UNKNOWN'); | |
| 3850 printf( $OUT "%-15s: %s %s$Newline", "S", $self->s() || 'UNKNOWN', $warn); | |
| 3851 if($self->gapped) { | |
| 3852 printf( $OUT "%-15s: %s$Newline", "GAP CREATION", $self->gap_creation() || 'UNKNOWN'); | |
| 3853 printf( $OUT "%-15s: %s$Newline", "GAP EXTENSION", $self->gap_extension() || 'UNKNOWN'); | |
| 3854 } | |
| 3855 } | |
| 3856 print $OUT "$Newline"; | |
| 3857 } | |
| 3858 | |
| 3859 =head2 _display_hits | |
| 3860 | |
| 3861 Usage : n/a; called automatically by display() | |
| 3862 Purpose : Display data for each hit. Not tab-delimited. | |
| 3863 Example : n/a | |
| 3864 Argument : one argument = filehandle object. | |
| 3865 Returns : printf call. | |
| 3866 Status : Experimental | |
| 3867 Comments : For tab-delimited output, see table(). | |
| 3868 | |
| 3869 See Also : L<display()|display>, B<Bio::Tools::Blast::Sbjct::display()>, L<table()|table>, L<Links to related modules> | |
| 3870 | |
| 3871 =cut | |
| 3872 | |
| 3873 sub _display_hits { | |
| 3874 | |
| 3875 my( $self, %param ) = @_; | |
| 3876 my $OUT = $self->fh(); | |
| 3877 my @hits = $self->hits(); | |
| 3878 | |
| 3879 ## You need a wide screen to see this properly. | |
| 3880 # Header. | |
| 3881 print $OUT "${Newline}BLAST HITS FOR: ${\$self->name()} length = ${\$self->length}$Newline"; | |
| 3882 print "(This table requires a wide display.)$Newline"; | |
| 3883 print $OUT '-'x80,"$Newline"; | |
| 3884 | |
| 3885 print $self->table_labels_tiled(0); | |
| 3886 print $self->table_tiled(0); | |
| 3887 | |
| 3888 ## Doing this interactively since there is potentially a lot of data here. | |
| 3889 ## Not quite satisfied with this approach. | |
| 3890 | |
| 3891 if (not $param{-INTERACTIVE}) { | |
| 3892 return 1; | |
| 3893 } else { | |
| 3894 my ($reply); | |
| 3895 print "${Newline}DISPLAY FULL HSP DATA? (y/n): [n] "; | |
| 3896 chomp( $reply = <STDIN> ); | |
| 3897 $reply =~ /^y.*/i; | |
| 3898 | |
| 3899 my $count = 0; | |
| 3900 foreach ( @hits ) { | |
| 3901 $count++; | |
| 3902 print $OUT "$Newline$Newline",'-'x80,"$Newline"; | |
| 3903 print $OUT "HSP DATA FOR HIT #$count (hit <RETURN>)"; | |
| 3904 print $OUT "$Newline",'-'x80;<STDIN>; | |
| 3905 $param{-SHOW} = 'hsp'; | |
| 3906 $_->display( %param ); | |
| 3907 } | |
| 3908 } | |
| 3909 1; | |
| 3910 } | |
| 3911 | |
| 3912 =head2 to_html | |
| 3913 | |
| 3914 Usage : $blast_object->to_html( [%named_parameters] ) | |
| 3915 Purpose : To produce an HTML-formatted version of a BLAST report | |
| 3916 : for efficient navigation of the report using a web browser. | |
| 3917 Example : # Using the static Blast object: | |
| 3918 : # Can read from STDIN or from a designated file: | |
| 3919 : $Blast->to_html($file); | |
| 3920 : $Blast->to_html(-FILE=>$file, -HEADER=>$header); | |
| 3921 : (if no file is supplied, STDIN will be used). | |
| 3922 : # saving HTML to an array: | |
| 3923 : $Blast->to_html(-FILE=>$file, -OUT =>\@out); | |
| 3924 : # Using a pre-existing blast object (must have been built from | |
| 3925 : # a file, not STDIN: | |
| 3926 : $blastObj->to_html(); | |
| 3927 Returns : n/a, either prints report to STDOUT or saves to a supplied array | |
| 3928 : if an '-OUT' parameter is defined (see below). | |
| 3929 Argument : %named_parameters: (TAGS ARE AND CASE INSENSITIVE). | |
| 3930 : -FILE => string containing name of a file to be processed. | |
| 3931 : If not a valid file or undefined, STDIN will be used. | |
| 3932 : Can skip the -FILE tag if supplying a filename | |
| 3933 : as a single argument. | |
| 3934 : -HEADER => string | |
| 3935 : This should be an HTML-formatted string to be used | |
| 3936 : as a header for the page, typically describing query sequence, | |
| 3937 : database searched, the date of the analysis, and any | |
| 3938 : additional links. | |
| 3939 : If not supplied, no special header is used. | |
| 3940 : Regardless of whether a header is supplied, the | |
| 3941 : standard info at the top of the report is highlighted. | |
| 3942 : This should include the <HEADER></HEADER> section | |
| 3943 : of the page as well. | |
| 3944 : | |
| 3945 : -IN => array reference containing a raw Blast report. | |
| 3946 : each line in a separate element in the array. | |
| 3947 : If -IN is not supplied, read() is called | |
| 3948 : and data is then read either from STDIN or a file. | |
| 3949 : | |
| 3950 : -OUT => array reference to hold the HTML output. | |
| 3951 : If not supplied, output is sent to STDOUT. | |
| 3952 Throws : Exception is propagated from $HTML::get_html_func() | |
| 3953 : and Bio::Root::Object::read(). | |
| 3954 Comments : The code that does the actual work is located in | |
| 3955 : Bio::Tools::Blast::HTML::get_html_func(). | |
| 3956 Bugs : Some hypertext links to external databases may not be | |
| 3957 : correct. This due in part to the dynamic nature of | |
| 3958 : the web. | |
| 3959 : Hypertext links are not added to hits without database ids. | |
| 3960 TODO : Possibly create a function to produce fancy default header | |
| 3961 : using data extracted from the report (requires some parsing). | |
| 3962 : For example, it would be nice to always include a date | |
| 3963 | |
| 3964 See Also : B<Bio::Tools::Blast::HTML::get_html_func()>, B<Bio::Root::Object::read()>, L<Links to related modules> | |
| 3965 | |
| 3966 =cut | |
| 3967 | |
| 3968 #------------ | |
| 3969 sub to_html { | |
| 3970 #------------ | |
| 3971 my ($self, @param) = @_; | |
| 3972 | |
| 3973 # Permits syntax such as: $blast->to_html($filename); | |
| 3974 my ($file, $header_html, $in_aref, $out_aref) = | |
| 3975 $self->_rearrange([qw(FILE HEADER IN OUT)], @param); | |
| 3976 | |
| 3977 $self->file($file) if $file; | |
| 3978 | |
| 3979 # Only setting the newline character once for efficiency. | |
| 3980 $Newline ||= $Util->get_newline(-client => $self, @param); | |
| 3981 | |
| 3982 $header_html ||= ''; | |
| 3983 (ref($out_aref) eq 'ARRAY') ? push(@$out_aref, $header_html) : print "$header_html$Newline"; | |
| 3984 | |
| 3985 require Bio::Tools::Blast::HTML; | |
| 3986 Bio::Tools::Blast::HTML->import(qw(&get_html_func)); | |
| 3987 | |
| 3988 my ($func); | |
| 3989 eval{ $func = &get_html_func($out_aref); }; | |
| 3990 if($@) { | |
| 3991 my $err = $@; | |
| 3992 $self->throw($err); | |
| 3993 } | |
| 3994 | |
| 3995 eval { | |
| 3996 if(!$header_html) { | |
| 3997 $out_aref ? push(@$out_aref, "<html><body>$Newline") : print "<html><body>$Newline"; | |
| 3998 } | |
| 3999 | |
| 4000 if (ref ($in_aref) =~ /ARRAY/) { | |
| 4001 # If data is being supplied, process it. | |
| 4002 foreach(@$in_aref) { | |
| 4003 &$func($_); | |
| 4004 } | |
| 4005 } else { | |
| 4006 # Otherwise, read it, processing as we go. | |
| 4007 | |
| 4008 $self->read(-FUNC => $func, @param); | |
| 4009 } | |
| 4010 $out_aref ? push(@$out_aref, "$Newline</pre></body></html>") : print "$Newline</pre></body></html>"; | |
| 4011 }; | |
| 4012 | |
| 4013 if($@) { | |
| 4014 # Check for trivial error (report already HTML formatted). | |
| 4015 if($@ =~ /HTML formatted/) { | |
| 4016 print STDERR "\a${Newline}Blast report appears to be HTML formatted already.$Newline$Newline"; | |
| 4017 } else { | |
| 4018 my $err = $@; | |
| 4019 $self->throw($err); | |
| 4020 } | |
| 4021 } | |
| 4022 } | |
| 4023 | |
| 4024 1; | |
| 4025 __END__ | |
| 4026 | |
| 4027 ##################################################################################### | |
| 4028 # END OF CLASS # | |
| 4029 ##################################################################################### | |
| 4030 | |
| 4031 =head1 FOR DEVELOPERS ONLY | |
| 4032 | |
| 4033 =head2 Data Members | |
| 4034 | |
| 4035 Information about the various data members of this module is provided for those | |
| 4036 wishing to modify or understand the code. Two things to bear in mind: | |
| 4037 | |
| 4038 =over 4 | |
| 4039 | |
| 4040 =item 1 Do NOT rely on these in any code outside of this module. | |
| 4041 | |
| 4042 All data members are prefixed with an underscore to signify that they are private. | |
| 4043 Always use accessor methods. If the accessor doesn't exist or is inadequate, | |
| 4044 create or modify an accessor (and let me know, too!). (An exception to this might | |
| 4045 be for Sbjct.pm or HSP.pm which are more tightly coupled to Blast.pm and | |
| 4046 may access Blast data members directly for efficiency purposes, but probably | |
| 4047 should not). | |
| 4048 | |
| 4049 =item 2 This documentation may be incomplete and out of date. | |
| 4050 | |
| 4051 It is easy for these data member descriptions to become obsolete as | |
| 4052 this module is still evolving. Always double check this info and search | |
| 4053 for members not described here. | |
| 4054 | |
| 4055 =back | |
| 4056 | |
| 4057 An instance of Bio::Tools::Blast.pm is a blessed reference to a hash containing | |
| 4058 all or some of the following fields: | |
| 4059 | |
| 4060 FIELD VALUE | |
| 4061 -------------------------------------------------------------- | |
| 4062 _significance P-value or Expect value cutoff (depends on Blast version: | |
| 4063 Blast1/WU-Blast2 = P-value; Blast2 = Expect value). | |
| 4064 Values GREATER than this are deemed not significant. | |
| 4065 | |
| 4066 _significant Boolean. True if the query has one or more significant hit. | |
| 4067 | |
| 4068 _min_length Integer. Query sequences less than this will be skipped. | |
| 4069 | |
| 4070 _confirm_significance Boolean. True if client has supplied significance criteria. | |
| 4071 | |
| 4072 _gapped Boolean. True if BLAST analysis has gapping turned on. | |
| 4073 | |
| 4074 _hits List of Sbjct.pm objects. | |
| 4075 | |
| 4076 _num_hits Number of hits obtained from the BLAST report. | |
| 4077 | |
| 4078 _num_hits_significant Number of significant based on Significant data members. | |
| 4079 | |
| 4080 _highestSignif Highest P or Expect value overall (not just what is stored in _hits). | |
| 4081 | |
| 4082 _lowestSignif Lowest P or Expect value overall (not just what is stored in _hits). | |
| 4083 | |
| 4084 The static $Blast object has a special set of members: | |
| 4085 | |
| 4086 _errs | |
| 4087 _share | |
| 4088 _stream | |
| 4089 _get_stats | |
| 4090 _gapped | |
| 4091 _filt_func | |
| 4092 | |
| 4093 Miscellaneous statistical parameters: | |
| 4094 ------------------------------------- | |
| 4095 _filter, _matrix, _word_size, _expect, _gapCreation, _gapExtension, _s, | |
| 4096 _lambda, _k, _h | |
| 4097 | |
| 4098 INHERITED DATA MEMBERS | |
| 4099 ----------------------- | |
| 4100 (See Bio::Tools::SeqAnal.pm for inherited data members.) | |
| 4101 | |
| 4102 =cut | |
| 4103 | |
| 4104 1; |
