Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/Blast.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 #---------------------------------------------------------------------------- | |
2 # PACKAGE : Bio::Tools::Blast | |
3 # PURPOSE : To encapsulate code for running, parsing, and analyzing | |
4 # BLAST reports. | |
5 # AUTHOR : Steve Chervitz (sac@bioperl.org) | |
6 # CREATED : March 1996 | |
7 # REVISION: $Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $ | |
8 # STATUS : Alpha | |
9 # | |
10 # For the latest version and documentation, visit: | |
11 # http://bio.perl.org/Projects/Blast | |
12 # | |
13 # To generate documentation, run this module through pod2html | |
14 # (preferably from Perl v5.004 or better). | |
15 # | |
16 # Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved. | |
17 # This module is free software; you can redistribute it and/or | |
18 # modify it under the same terms as Perl itself. | |
19 #---------------------------------------------------------------------------- | |
20 | |
21 package Bio::Tools::Blast; | |
22 use strict; | |
23 use Exporter; | |
24 | |
25 use Bio::Tools::SeqAnal; | |
26 use Bio::Root::Global qw(:std); | |
27 use Bio::Root::Utilities qw(:obj); | |
28 | |
29 require 5.002; | |
30 | |
31 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS | |
32 $ID $VERSION $Blast @Blast_programs $Revision $Newline); | |
33 | |
34 @ISA = qw( Bio::Tools::SeqAnal Exporter); | |
35 @EXPORT = qw(); | |
36 @EXPORT_OK = qw($VERSION $Blast); | |
37 %EXPORT_TAGS = ( obj => [qw($Blast)], | |
38 std => [qw($Blast)]); | |
39 | |
40 $ID = 'Bio::Tools::Blast'; | |
41 $VERSION = 0.09; | |
42 $Revision = '$Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $'; #' | |
43 | |
44 ## Static Blast object. | |
45 $Blast = {}; | |
46 bless $Blast, $ID; | |
47 $Blast->{'_name'} = "Static Blast object"; | |
48 | |
49 @Blast_programs = qw(blastp blastn blastx tblastn tblastx); | |
50 | |
51 use vars qw($DEFAULT_MATRIX $DEFAULT_SIGNIF); | |
52 my $DEFAULT_MATRIX = 'BLOSUM62'; | |
53 my $DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied. | |
54 my $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs. | |
55 | |
56 ## POD Documentation: | |
57 | |
58 =head1 NAME | |
59 | |
60 Bio::Tools::Blast - Bioperl BLAST sequence analysis object | |
61 | |
62 =head1 SYNOPSIS | |
63 | |
64 =head2 Parsing Blast reports | |
65 | |
66 Parse an existing Blast report from file: | |
67 | |
68 use Bio::Tools::Blast; | |
69 | |
70 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out', | |
71 -parse => 1, | |
72 -signif => '1e-10', | |
73 ); | |
74 | |
75 Parse an existing Blast report from STDIN: | |
76 | |
77 $blastObj = Bio::Tools::Blast->new( -parse => 1, | |
78 -signif => '1e-10', | |
79 ); | |
80 | |
81 Then send a Blast report to your script via STDIN. | |
82 | |
83 Full parameters for parsing Blast reports. | |
84 | |
85 %blastParam = ( | |
86 -run => \%runParam, | |
87 -file => '', | |
88 -parse => 1, | |
89 -signif => 1e-5, | |
90 -filt_func => \&my_filter, | |
91 -min_len => 15, | |
92 -check_all_hits => 0, | |
93 -strict => 0, | |
94 -stats => 1, | |
95 -best => 0, | |
96 -share => 0, | |
97 -exec_func => \&process_blast, | |
98 -save_array => \@blast_objs, # not used if -exce_func defined. | |
99 ); | |
100 | |
101 See L<parse()|parse> for a description of parameters and see L<USAGE | USAGE> for | |
102 more examples including how to parse streams containing multiple Blast | |
103 reports L<Using the Static $Blast Object>. | |
104 | |
105 See L<Memory Usage Issues> for information about how to make Blast | |
106 parsing be more memory efficient. | |
107 | |
108 =head2 Running Blast reports | |
109 | |
110 Run a new Blast2 at NCBI and then parse it: | |
111 | |
112 %runParam = ( | |
113 -method => 'remote', | |
114 -prog => 'blastp', | |
115 -database => 'swissprot', | |
116 -seqs => [ $seq ], # Bio::Seq.pm objects. | |
117 ); | |
118 | |
119 $blastObj = Bio::Tools::Blast->new( -run => \%runParam, | |
120 -parse => 1, | |
121 -signif => '1e-10', | |
122 -strict => 1, | |
123 ); | |
124 | |
125 Full parameters for running Blasts at NCBI using Webblast.pm: | |
126 | |
127 %runParam = ( | |
128 -method => 'remote', | |
129 -prog => 'blastp', | |
130 -version => 2, # BLAST2 | |
131 -database =>'swissprot', | |
132 -html => 0, | |
133 -seqs => [ $seqObject ], # Bio::Seq.pm object(s) | |
134 -descr => 250, | |
135 -align => 250, | |
136 -expect => 10, | |
137 -gap => 'on', | |
138 -matrix => 'PAM250', | |
139 -email => undef, # don't send report via e-mail if parsing. | |
140 -filter => undef, # use default | |
141 -gap_c => undef, # use default | |
142 -gap_e => undef, # use default | |
143 -word => undef, # use default | |
144 -min_len => undef, # use default | |
145 ); | |
146 | |
147 See L<run()|run> and L<USAGE | USAGE> for more information about running Blasts. | |
148 | |
149 =head2 HTML-formatting Blast reports | |
150 | |
151 Print an HTML-formatted version of a Blast report: | |
152 | |
153 use Bio::Tools::Blast qw(:obj); | |
154 | |
155 $Blast->to_html($filename); | |
156 $Blast->to_html(-file => $filename, | |
157 -header => "<H1>Blast Results</H1>"); | |
158 $Blast->to_html(-file => $filename, | |
159 -out => \@array); # store output | |
160 $Blast->to_html(); # use STDIN | |
161 | |
162 Results are sent directly to STDOUT unless an C<-out =E<gt> array_ref> | |
163 parameter is supplied. See L<to_html()|to_html> for details. | |
164 | |
165 =head1 INSTALLATION | |
166 | |
167 This module is included with the central Bioperl distribution: | |
168 | |
169 http://bio.perl.org/Core/Latest | |
170 ftp://bio.perl.org/pub/DIST | |
171 | |
172 Follow the installation instructions included in the README file. | |
173 | |
174 =head1 DESCRIPTION | |
175 | |
176 The Bio::Tools::Blast.pm module encapsulates data and methods for | |
177 running, parsing, and analyzing pre-existing BLAST reports. This | |
178 module defines an application programming interface (API) for working | |
179 with Blast reports. A Blast object is constructed from raw Blast | |
180 output and encapsulates the Blast results which can then be accessed | |
181 via the interface defined by the Blast object. | |
182 | |
183 The ways in which researchers use Blast data are many and varied. This | |
184 module attempts to be general and flexible enough to accommodate | |
185 different uses. The Blast module API is still at an early stage of | |
186 evolution and I expect it to continue to evolve as new uses for Blast | |
187 data are developed. Your L<FEEDBACK | FEEDBACK> is welcome. | |
188 | |
189 B<FEATURES:> | |
190 | |
191 =over 2 | |
192 | |
193 =item * Supports NCBI Blast1.x, Blast2.x, and WashU-Blast2.x, gapped | |
194 and ungapped. | |
195 | |
196 Can parse HTML-formatted as well as non-HTML-formatted reports. | |
197 | |
198 =item * Launch new Blast analyses remotely or locally. | |
199 | |
200 Blast objects can be constructed directly from the results of the | |
201 run. See L<run()|run>. | |
202 | |
203 =item * Construct Blast objects from pre-existing files or from a new run. | |
204 | |
205 Build a Blast object from a single file or build multiple Blast | |
206 objects from an input stream containing multiple reports. See | |
207 L<parse()|parse>. | |
208 | |
209 =item * Add hypertext links from a BLAST report. | |
210 | |
211 See L<to_html()|to_html>. | |
212 | |
213 =item * Generate sequence and sequence alignment objects from HSP | |
214 sequences. | |
215 | |
216 If you have Bio::Seq.pm and Bio::UnivAln.pm installed on your system, | |
217 they can be used for working with high-scoring segment pair (HSP) | |
218 sequences in the Blast alignment. (A new version of Bio::Seq.pm is | |
219 included in the distribution, see L<INSTALLATION | INSTALLATION>). For more | |
220 information about them, see: | |
221 | |
222 http://bio.perl.org/Projects/Sequence/ | |
223 http://bio.perl.org/Projects/SeqAlign/ | |
224 | |
225 =back | |
226 | |
227 A variety of different data can be extracted from the Blast report by | |
228 querying the Blast.pm object. Some basic examples are given in the | |
229 L<USAGE | USAGE> section. For some working scripts, see the links provided in | |
230 the L<the DEMO SCRIPTS section | DEMO> section. | |
231 | |
232 As a part of the incipient Bioperl framework, the Bio::Tools::Blast.pm | |
233 module inherits from B<Bio::Tools::SeqAnal.pm>, which provides some | |
234 generic functionality for biological sequence analysis. See the | |
235 documentation for that module for details | |
236 (L<Links to related modules>). | |
237 | |
238 =head2 The BLAST Program | |
239 | |
240 BLAST (Basic Local Alignment Search Tool) is a widely used algorithm | |
241 for performing rapid sequence similarity searches between a single DNA | |
242 or protein sequence and a large dataset of sequences. BLAST analyses | |
243 are typically performed by dedicated remote servers, such as the ones | |
244 at the NCBI. Individual groups may also run the program on local | |
245 machines. | |
246 | |
247 The Blast family includes 5 different programs: | |
248 | |
249 Query Seq Database | |
250 ------------ ---------- | |
251 blastp -- protein protein | |
252 blastn -- nucleotide nucleotide | |
253 blastx -- nucleotide* protein | |
254 tblastn -- protein nucleotide* | |
255 tblastx -- nucleotide* nucleotide* | |
256 | |
257 * = dynamically translated in all reading frames, both strands | |
258 | |
259 See L<References & Information about the BLAST program>. | |
260 | |
261 =head2 Versions Supported | |
262 | |
263 BLAST reports generated by different application front ends are similar | |
264 but not exactly the same. Blast reports are not intended to be exchange formats, | |
265 making parsing software susceptible to obsolescence. This module aims to | |
266 support BLAST reports generated by different implementations: | |
267 | |
268 Implementation Latest version tested | |
269 -------------- -------------------- | |
270 NCBI Blast1 1.4.11 [24-Nov-97] | |
271 NCBI Blast2 2.0.8 [Jan-5-1999] | |
272 WashU-BLAST2 2.0a19MP [05-Feb-1998] | |
273 GCG 1.4.8 [1-Feb-95] | |
274 | |
275 Support for both gapped and ungapped versions is included. Currently, there | |
276 is only rudimentary support for PSI-BLAST in that these reports can be parsed but | |
277 there is no special treatment of separate iteration rounds (they are all | |
278 merged together). | |
279 | |
280 =head2 References & Information about the BLAST program | |
281 | |
282 B<WEBSITES:> | |
283 | |
284 http://www.ncbi.nlm.nih.gov/BLAST/ - Homepage at NCBI | |
285 http://www.ncbi.nlm.nih.gov/BLAST/blast_help.html - Help manual | |
286 http://blast.wustl.edu/ - WashU-Blast2 | |
287 | |
288 B<PUBLICATIONS:> (with PubMed links) | |
289 | |
290 Altschul S.F., Gish W., Miller W., Myers E.W., Lipman D.J. (1990). | |
291 "Basic local alignment search tool", J Mol Biol 215: 403-410. | |
292 | |
293 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2231712&form=6&db=m&Dopt=r | |
294 | |
295 Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, | |
296 Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997). | |
297 "Gapped BLAST and PSI-BLAST: a new generation of protein database | |
298 search programs", Nucleic Acids Res. 25:3389-3402. | |
299 | |
300 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r | |
301 | |
302 Karlin, Samuel and Stephen F. Altschul (1990). Methods for | |
303 assessing the statistical significance of molecular sequence | |
304 features by using general scoring schemes. Proc. Natl. Acad. | |
305 Sci. USA 87:2264-68. | |
306 | |
307 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2315319&form=6&db=m&Dopt=b | |
308 | |
309 Karlin, Samuel and Stephen F. Altschul (1993). Applications | |
310 and statistics for multiple high-scoring segments in molecu- | |
311 lar sequences. Proc. Natl. Acad. Sci. USA 90:5873-7. | |
312 | |
313 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=8390686&form=6&db=m&Dopt=b | |
314 | |
315 =head1 USAGE | |
316 | |
317 =head2 Creating Blast objects | |
318 | |
319 A Blast object can be constructed from the contents of a Blast report | |
320 using a set of named parameters that specify significance criteria for | |
321 parsing. The report data can be read in from an existing file | |
322 specified with the C<-file =E<gt> 'filename'> parameter or from a | |
323 STDIN stream containing potentially multiple Blast reports. If the | |
324 C<-file> parameter does not contain a valid filename, STDIN will be | |
325 used. Separate Blast objects will be created for each report in the | |
326 stream. | |
327 | |
328 To parse the report, you must include a C<-parse =E<gt> 1> parameter | |
329 in addition to any other parsing parameters | |
330 See L<parse()|parse> for a full description of parsing parameters. | |
331 To run a new report and then parse it, include a C<-run =E<gt> \%runParams> | |
332 parameter containing a reference to a hash | |
333 that hold the parameters required by the L<run()|run> method. | |
334 | |
335 The constructor for Blast objects is inherited from Bio::Tools::SeqAnal.pm. | |
336 See the B<_initialize>() method of that package for general information | |
337 relevant to creating Blast objects. (The B<new>() method, inherited from | |
338 B<Bio::Root::Object.pm>, calls B<_initialize>(). See L<Links to related modules>). | |
339 | |
340 The Blast object can read compressed (gzipped) Blast report | |
341 files. Compression/decompression uses the gzip or compress programs | |
342 that are standard on Unix systems and should not require special | |
343 configuration. If you can't or don't want to use gzip as the file | |
344 compression tool, either pre-uncompress your files before parsing with | |
345 this module or modify B<Bio::Root::Utilities.pm> to your liking. | |
346 | |
347 Blast objects can be generated either by direct instantiation as in: | |
348 | |
349 use Bio::Tools::Blast; | |
350 $blast = new Bio::Tools::Blast (%parameters); | |
351 | |
352 =head2 Using the Static $Blast Object | |
353 | |
354 use Bio::Tools::Blast qw(:obj); | |
355 | |
356 This exports the static $Blast object into your namespace. "Static" | |
357 refers to the fact that it has class scope and there is one of these | |
358 created when you use this module. The static $Blast object is | |
359 basically an empty object that is provided for convenience and is also | |
360 used for various internal chores. | |
361 | |
362 It is exported by this module and can be used for | |
363 parsing and running reports as well as HTML-formatting without having | |
364 to first create an empty Blast object. | |
365 | |
366 Using the static $Blast object for parsing a STDIN stream of Blast reports: | |
367 | |
368 use Bio::Tools::Blast qw(:obj); | |
369 | |
370 sub process_blast { | |
371 my $blastObj = shift; | |
372 print $blastObj->table(); | |
373 $blastObj->destroy; | |
374 } | |
375 | |
376 $Blast->parse( -parse => 1, | |
377 -signif => '1e-10', | |
378 -exec_func => \&process_blast, | |
379 ); | |
380 | |
381 Then pipe a stream of Blast reports into your script via STDIN. For | |
382 each Blast report extracted from the input stream, the parser will | |
383 generate a new Blast object and pass it to the function specified by | |
384 C<-exec_func>. The destroy() call tells Perl to free the memory | |
385 associated with the object, important if you are crunching through | |
386 many reports. This method is inherited from B<Bio::Root::Object.pm> | |
387 (see L<Links to related modules>). See L<parse()|parse> for a full | |
388 description of parameters and L<the DEMO SCRIPTS section | DEMO> for | |
389 additional examples. | |
390 | |
391 =head2 Running Blasts | |
392 | |
393 To run a Blast, create a new Blast object with a C<-run =E<gt> | |
394 \%runParams> parameter. Remote Blasts are performed by including a | |
395 C<-method =E<gt> 'remote'> parameter; local Blasts are performed by | |
396 including a C<-method =E<gt> 'local'> parameter. See | |
397 L<Running Blast reports> as well as the | |
398 L<the DEMO SCRIPTS section | DEMO> for examples. | |
399 Note that running local Blasts is not yet supported, see below. | |
400 | |
401 Note that the C<-seqs =E<gt> [ $seqs ]> run parameter must contain a | |
402 reference to an array of B<Bio::Seq.pm> objects | |
403 (L<Links to related modules>). Encapsulating the sequence in an | |
404 object makes sequence information much easier to handle as it can | |
405 be supplied in a variety of formats. Bio::Seq.pm is included with | |
406 this distribution (L<INSTALLATION | INSTALLATION>). | |
407 | |
408 Remote Blasts are implemented using the | |
409 B<Bio::Tools::Blast::Run::Webblast.pm> module. Local Blasts require | |
410 that you customize the B<Bio::Tools::Blast::Run::LocalBlast.pm> | |
411 module. The version of LocalBlast.pm included with this distribution | |
412 provides the basic framework for running local Blasts. | |
413 See L<Links to related modules>. | |
414 | |
415 =head2 Significance screening | |
416 | |
417 A C<-signif> parameter can be used to screen out all hits with | |
418 P-values (or Expect values) above a certain cutoff. For example, to | |
419 exclude all hits with Expect values above 1.0e-10: C<-signif =E<gt> | |
420 1e-10>. Providing a C<-signif> cutoff can speed up processing | |
421 tremendously, since only a small fraction of the report need be | |
422 parsed. This is because the C<-signif> value is used to screen hits | |
423 based on the data in the "Description" section of the Blast report: | |
424 | |
425 For NCBI BLAST2 reports: | |
426 | |
427 Score E | |
428 Sequences producing significant alignments: (bits) Value | |
429 | |
430 sp|P31376|YAB1_YEAST HYPOTHETICAL 74.1 KD PROTEIN IN CYS3-MDM10... 957 0.0 | |
431 | |
432 For BLAST1 or WashU-BLAST2 reports: | |
433 | |
434 Smallest | |
435 Sum | |
436 High Probability | |
437 Sequences producing High-scoring Segment Pairs: Score P(N) N | |
438 | |
439 PDB:3PRK_E Proteinase K complexed with inhibitor ........... 504 1.8e-50 1 | |
440 | |
441 Thus, the C<-signif> parameter will screen based on Expect values for | |
442 BLAST2 reports and based on P-values for BLAST1/WashU-BLAST2 reports. | |
443 | |
444 To screen based on other criteria, you can supply a C<-filt_func> | |
445 parameter containing a function reference that takes a | |
446 B<Bio::Tools::Sbjct.pm> object as an argument and returns a boolean, | |
447 true if the hit is to be screened out. See example below for | |
448 L<Screening hits using arbitrary criteria>. | |
449 | |
450 =head2 Get the best hit. | |
451 | |
452 $hit = $blastObj->hit; | |
453 | |
454 A "hit" is contained by a B<Bio::Tools::Blast::Sbjct.pm> object. | |
455 | |
456 =head2 Get the P-value or Expect value of the most significant hit. | |
457 | |
458 $p = $blastObj->lowest_p; | |
459 $e = $blastObj->lowest_expect; | |
460 | |
461 Alternatively: | |
462 | |
463 $p = $blastObj->hit->p; | |
464 $e = $blastObj->hit->expect; | |
465 | |
466 Note that P-values are not reported in NCBI Blast2 reports. | |
467 | |
468 =head2 Iterate through all the hits | |
469 | |
470 foreach $hit ($blastObj->hits) { | |
471 printf "%s\t %.1e\t %d\t %.2f\t %d\n", | |
472 $hit->name, $hit->expect, $hit->num_hsps, | |
473 $hit->frac_identical, $hit->gaps; | |
474 } | |
475 | |
476 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm> | |
477 for other ways to work with hit objects (L<Links to related modules>). | |
478 | |
479 =head2 Screening hits using arbitrary criteria | |
480 | |
481 sub filter { $hit=shift; | |
482 return ($hit->gaps == 0 and | |
483 $hit->frac_conserved > 0.5); } | |
484 | |
485 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out', | |
486 -parse => 1, | |
487 -filt_func => \&filter ); | |
488 | |
489 While the Blast object is parsing the report, each hit checked by calling | |
490 &filter($hit). All hits that generate false return values from &filter | |
491 are screened out and will not be added to the Blast object. | |
492 Note that the Blast object will normally stop parsing the report after | |
493 the first non-significant hit or the first hit that does not pass the | |
494 filter function. To force the Blast object to check all hits, | |
495 include a C<-check_all_hits =E<gt> 1> parameter. | |
496 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm> | |
497 for other ways to work with hit objects. | |
498 | |
499 =over 4 | |
500 | |
501 =item Hit start, end coordinates. | |
502 | |
503 print $sbjct->start('query'); | |
504 print $sbjct->end('sbjct'); | |
505 | |
506 In array context, you can get information for both query and sbjct with one call: | |
507 | |
508 ($qstart, $sstart) = $sbjct->start(); | |
509 ($qend, $send) = $sbjct->end(); | |
510 | |
511 For important information regarding coordinate information, see | |
512 the L<HSP start, end, and strand> section below. | |
513 Also check out documentation for the start and end methods in B<Bio::Tools::Blast::Sbjct.pm>, | |
514 which explains what happens if there is more than one HSP. | |
515 | |
516 =back | |
517 | |
518 =head2 Working with HSPs | |
519 | |
520 =over 4 | |
521 | |
522 =item Iterate through all the HSPs of every hit | |
523 | |
524 foreach $hit ($blastObj->hits) { | |
525 foreach $hsp ($hit->hsps) { | |
526 printf "%.1e\t %d\t %.1f\t %.2f\t %.2f\t %d\t %d\n", | |
527 $hsp->expect, $hsp->score, $hsp->bits, | |
528 $hsp->frac_identical, $hsp->frac_conserved, | |
529 $hsp->gaps('query'), $hsp->gaps('sbjct'); | |
530 } | |
531 | |
532 Refer to the documentation for B<Bio::Tools::Blast::HSP.pm> | |
533 for other ways to work with hit objects (L<Links to related modules>). | |
534 | |
535 =back | |
536 | |
537 =over 4 | |
538 | |
539 =item Extract HSP sequence data as strings or sequence objects | |
540 | |
541 Get the first HSP of the first hit and the sequences | |
542 of the query and sbjct as strings. | |
543 | |
544 $hsp = $blast_obj->hit->hsp; | |
545 $query_seq = $hsp->seq_str('query'); | |
546 $hsp_seq = $hsp->seq_str('sbjct'); | |
547 | |
548 Get the indices of identical and conserved positions in the HSP query seq. | |
549 | |
550 @query_iden_indices = $hsp->seq_inds('query', 'identical'); | |
551 @query_cons_indices = $hsp->seq_inds('query', 'conserved'); | |
552 | |
553 Similarly for the sbjct sequence. | |
554 | |
555 @sbjct_iden_indices = $hsp->seq_inds('sbjct', 'identical'); | |
556 @sbjct_cons_indices = $hsp->seq_inds('sbjct', 'conserved'); | |
557 | |
558 print "Query in Fasta format:\n", $hsp->seq('query')->layout('fasta'); | |
559 print "Sbjct in Fasta format:\n", $hsp->seq('sbjct')->layout('fasta'); | |
560 | |
561 See the B<Bio::Seq.pm> package for more information about using these sequence objects | |
562 (L<Links to related modules>). | |
563 | |
564 =back | |
565 | |
566 =over 4 | |
567 | |
568 =item Create sequence alignment objects using HSP sequences | |
569 | |
570 $aln = $hsp->get_aln; | |
571 print " consensus:\n", $aln->consensus(); | |
572 print $hsp->get_aln->layout('fasta'); | |
573 | |
574 $ENV{READSEQ_DIR} = '/home/users/sac/bin/solaris'; | |
575 $ENV{READSEQ} = 'readseq'; | |
576 print $hsp->get_aln->layout('msf'); | |
577 | |
578 MSF formated layout requires Don Gilbert's ReadSeq program (not included). | |
579 See the B<Bio::UnivAln.pm> for more information about using these alignment objects | |
580 (L<Links to related modules>)'. | |
581 | |
582 =back | |
583 | |
584 =over 4 | |
585 | |
586 =item HSP start, end, and strand | |
587 | |
588 To facilitate HSP processing, endpoint data for each HSP sequence are | |
589 normalized so that B<start is always less than end>. This affects TBLASTN | |
590 and TBLASTX HSPs on the reverse complement or "Minus" strand. | |
591 | |
592 Some examples of obtaining start, end coordinates for HSP objects: | |
593 | |
594 print $hsp->start('query'); | |
595 print $hsp->end('sbjct'); | |
596 ($qstart, $sstart) = $hsp->start(); | |
597 ($qend, $send) = $hsp->end(); | |
598 | |
599 Strandedness of the HSP can be assessed using the strand() method | |
600 on the HSP object: | |
601 | |
602 print $hsp->strand('query'); | |
603 print $hsp->strand('sbjct'); | |
604 | |
605 These will return 'Minus' or 'Plus'. | |
606 Or, to get strand information for both query and sbjct with a single call: | |
607 | |
608 ($qstrand, $sstrand) = $hsp->strand(); | |
609 | |
610 =back | |
611 | |
612 =head2 Report Generation | |
613 | |
614 =over 4 | |
615 | |
616 =item Generate a tab-delimited table of all results. | |
617 | |
618 print $blastObj->table; | |
619 print $blastObj->table(0); # don't include hit descriptions. | |
620 print $blastObj->table_tiled; | |
621 | |
622 The L<table()|table> method returns data for each B<HSP> of each hit listed one per | |
623 line. The L<table_tiled()|table_tiled> method returns data for each B<hit, i.e., Sbjct> | |
624 listed one per line; data from multiple HSPs are combined after tiling to | |
625 reduce overlaps. See B<Bio::Tools::Blast::Sbjct.pm> for more information about | |
626 HSP tiling. These methods generate stereotypical, tab-delimited data for each | |
627 hit of the Blast report. The output is suitable for importation into | |
628 spreadsheets or database tables. Feel free to roll your own table function if | |
629 you need a custom table. | |
630 | |
631 For either table method, descriptions of each hit can be included if a | |
632 single, true argument is supplied (e.g., $blastObj-E<gt>table(1)). The description | |
633 will be added as the last field. This will significantly increase the size of | |
634 the table. Labels for the table columns can be obtained with L<table_labels()|table_labels> | |
635 and L<table_labels_tiled()|table_labels_tiled>. | |
636 | |
637 =back | |
638 | |
639 =over 4 | |
640 | |
641 =item Print a summary of the Blast report | |
642 | |
643 $blastObj->display(); | |
644 $blastObj->display(-show=>'hits'); | |
645 | |
646 L<display()|display> prints various statistics extracted from the Blast report | |
647 such as database name, database size, matrix used, etc. The | |
648 C<display(-show=E<gt>'hits')> call prints a non-tab-delimited table | |
649 attempting to line the data up into more readable columns. The output | |
650 generated is similar to L<table_tiled()|table_tiled>. | |
651 | |
652 =back | |
653 | |
654 =over 4 | |
655 | |
656 =item HTML-format an existing report | |
657 | |
658 use Bio::Tools::Blast qw(:obj); | |
659 | |
660 # Going straight from a non HTML report file to HTML output using | |
661 # the static $Blast object exported by Bio::Tools::Blast.pm | |
662 $Blast->to_html(-file => '/usr/people/me/blast.output.txt', | |
663 -header => qq|<H1>BLASTP Results</H1><A HREF="home.html">Home</A>| | |
664 ); | |
665 | |
666 # You can also use a specific Blast object created previously. | |
667 $blastObj->to_html; | |
668 | |
669 L<to_html()|to_html> will send HTML output, line-by-line, directly to STDOUT | |
670 unless an C<-out =E<gt> array_ref> parameter is supplied (e.g., C<-out | |
671 =E<gt> \@array>), in which case the HTML will be stored in @array, one | |
672 line per array element. The direct outputting permits faster response | |
673 time since Blast reports can be huge. The -header tag can contain a | |
674 string containing any HTML that you want to appear at the top of the | |
675 Blast report. | |
676 | |
677 =back | |
678 | |
679 =head1 DEMO SCRIPTS | |
680 | |
681 Sample Scripts are included in the central bioperl distribution in the | |
682 'examples/blast/' directory (see L<INSTALLATION | INSTALLATION>): | |
683 | |
684 =head2 Handy library for working with Bio::Tools::Blast.pm | |
685 | |
686 examples/blast/blast_config.pl | |
687 | |
688 =head2 Parsing Blast reports one at a time. | |
689 | |
690 examples/blast/parse_blast.pl | |
691 examples/blast/parse_blast2.pl | |
692 examples/blast/parse_positions.pl | |
693 | |
694 =head2 Parsing sets of Blast reports. | |
695 | |
696 examples/blast/parse_blast.pl | |
697 examples/blast/parse_multi.pl | |
698 | |
699 B<Warning:> See note about L<Memory Usage Issues>. | |
700 | |
701 =head2 Running Blast analyses one at a time. | |
702 | |
703 examples/blast/run_blast_remote.pl | |
704 | |
705 =head2 Running Blast analyses given a set of sequences. | |
706 | |
707 examples/blast/blast_seq.pl | |
708 | |
709 =head2 HTML-formatting Blast reports. | |
710 | |
711 examples/blast/html.pl | |
712 | |
713 =head1 TECHNICAL DETAILS | |
714 | |
715 =head2 Blast Modes | |
716 | |
717 A BLAST object may be created using one of three different modes as | |
718 defined by the B<Bio::Tools::SeqAnal.pm> package | |
719 (See L<Links to related modules>): | |
720 | |
721 -- parse - Load a BLAST report and parse it, storing parsed data in | |
722 Blast.pm object. | |
723 -- run - Run the BLAST program to generate a new report. | |
724 -- read - Load a BLAST report into the Blast object without parsing. | |
725 | |
726 B<Run mode support has recently been added>. The module | |
727 B<Bio::Tools::Blast::Run::Webblast.pm> is an modularized adaptation of | |
728 the webblast script by Alex Dong Li: | |
729 | |
730 http://www.genet.sickkids.on.ca/bioinfo_resources/software.html#webblast | |
731 | |
732 for running remote Blast analyses and saving the results locally. Run | |
733 mode can be combined with a parse mode to generate a Blast report and | |
734 then build the Blast object from the parsed results of this report | |
735 (see L<run()|run> and L<SYNOPSIS | SYNOPSIS>). | |
736 | |
737 In read mode, the BLAST report is read in by the Blast object but is | |
738 not parsed. This could be used to internalize a Blast report but not | |
739 parse it for results (e.g., generating HTML formatted output). | |
740 | |
741 =head2 Significant Hits | |
742 | |
743 This module permits the screening of hits on the basis of | |
744 user-specified criteria for significance. Currently, Blast reports can | |
745 be screened based on: | |
746 | |
747 CRITERIA PARAMETER VALUE | |
748 ---------------------------------- --------- ---------------- | |
749 1) the best Expect (or P) value -signif float or sci-notation | |
750 2) the length of the query sequence -min_length integer | |
751 3) arbitrary criteria -filt_func function reference | |
752 | |
753 The parameters are used for construction of the BLAST object or when | |
754 running the L<parse()|parse> method on the static $Blast object. The | |
755 -SIGNIF value represents the number listed in the description section | |
756 at the top of the Blast report. For Blast2, this is an Expect value, | |
757 for Blast1 and WashU-Blast2, this is a P-value. The idea behind the | |
758 C<-filt_func> parameter is that the hit has to pass through a filter | |
759 to be considered significant. Refer to the documentation for | |
760 B<Bio::Tools::Blast::Sbjct.pm> for ways to work with hit objects. | |
761 | |
762 Using a C<-signif> parameter allows for the following: | |
763 | |
764 =over 2 | |
765 | |
766 =item Faster parsing. | |
767 | |
768 Each hit can be screened by examination of the description line alone | |
769 without fully parsing the HSP alignment section. | |
770 | |
771 =item Flexibility. | |
772 | |
773 The C<-signif> tag provides a more semantic-free way to specify the | |
774 value to be used as a basis for screening hits. Thus, C<-signif> can | |
775 be used for screening Blast1 or Blast2 reports. It is up to the user | |
776 to understand whether C<-signif> represents a P-value or an Expect | |
777 value. | |
778 | |
779 =back | |
780 | |
781 Any hit not meeting the significance criteria will not be added to the | |
782 "hit list" of the BLAST object. Also, a BLAST object without any hits | |
783 meeting the significance criteria will throw an exception during | |
784 object construction (a fatal event). | |
785 | |
786 =head2 Statistical Parameters | |
787 | |
788 There are numerous parameters which define the behavior of the BLAST | |
789 program and which are useful for interpreting the search | |
790 results. These parameters are extracted from the Blast report: | |
791 | |
792 filter -- for masking out low-complexity sequences or short repeats | |
793 matrix -- name of the substitution scoring matrix (e.g., BLOSUM62) | |
794 E -- Expect filter (screens out frequent scores) | |
795 S -- Cutoff score for segment pairs | |
796 W -- Word length | |
797 T -- Threshold score for word pairs | |
798 Lambda, -- Karlin-Altschul "sum" statistical parameters dependent on | |
799 K, H sequence composition. | |
800 G -- Gap creation penalty. | |
801 E -- Gap extension penalty. | |
802 | |
803 These parameters are not always needed. Extraction may be turned off | |
804 explicitly by including a C<-stats =E<gt> 0> parameter during object | |
805 construction. Support for all statistical parameters is not complete. | |
806 | |
807 For more about the meaning of parameters, check out the NCBI URLs given above. | |
808 | |
809 =head2 Module Organization | |
810 | |
811 The modules that comprise this Bioperl Blast distribution are location in the | |
812 Bio:: hierarchy as shown in the diagram below. | |
813 | |
814 Bio/ | |
815 | | |
816 +--------------------------+ | |
817 | | | |
818 Bio::Tools Bio::Root | |
819 | | | |
820 +----------------------+ Object.pm | |
821 | | | | |
822 SeqAnal.pm Blast.pm Blast/ | |
823 | | |
824 +---------+---------+------------+ | |
825 | | | | | |
826 Sbjct.pm HSP.pm HTML.pm Run/ | |
827 | | |
828 +------------+ | |
829 | | | |
830 Webblast.pm LocalBlast.pm | |
831 | |
832 Bio::Tools::Blast.pm is a concrete class that inherits from | |
833 B<Bio::Tools::SeqAnal.pm> and relies on other modules for parsing and | |
834 managing BLAST data. Worth mentioning about this hierarchy is the | |
835 lack of a "Parse.pm" module. Since parsing is considered central to | |
836 the purpose of the Bioperl Blast module (and Bioperl in general), it | |
837 seems somewhat unnatural to segregate out all parsing code. This | |
838 segregation could also lead to inefficiencies and harder to maintain | |
839 code. I consider this issue still open for debate. | |
840 | |
841 Bio::Tools::Blast.pm, B<Bio::Tools::Blast::Sbjct.pm>, and | |
842 B<Bio::Tools::Blast::HSP.pm> are mostly dedicated to parsing and all | |
843 can be used to instantiate objects. Blast.pm is the main "command and | |
844 control" module, inheriting some basic behaviors from SeqAnal.pm | |
845 (things that are not specific to Blast I<per se>). | |
846 | |
847 B<Bio::Tools::Blast::HTML.pm> contains functions dedicated to | |
848 generating HTML-formatted Blast reports and does not generate objects. | |
849 | |
850 =head2 Running Blasts: Details | |
851 | |
852 B<Bio::Tools::Blast::Run::Webblast.pm> contains a set of functions for | |
853 running Blast analyses at a remote server and also does not | |
854 instantiate objects. It uses a helper script called postclient.pl, | |
855 located in the Run directory. The proposed LocalBlast.pm module would | |
856 be used for running Blast reports on local machines and thus would be | |
857 customizable for different sites. It would operate in a parallel | |
858 fashion to Webblast.pm (i.e., being a collection of functions, taking | |
859 in sequence objects or files, returning result files). | |
860 | |
861 The Run modules are considered experimental. In particular, | |
862 Webblast.pm catures an HTML-formatted version of the Blast report from | |
863 the NCBI server and strips out the HTML in preparation for parsing. A | |
864 more direct approach would be to capture the Blast results directly | |
865 from the server using an interface to the NCBI toolkit. This approach | |
866 was recently proposed on the Bioperl mailing list: | |
867 http://www.uni-bielefeld.de/mailinglists/BCD/vsns-bcd-perl/9805/0000.html | |
868 | |
869 =head2 Memory Usage Issues | |
870 | |
871 Parsing large numbers of Blast reports (a few thousand or so) with | |
872 Bio::Tools::Blast.pm may lead to unacceptable memory usage situations. | |
873 This is somewhat dependent of the size and complexity of the reports. | |
874 | |
875 While this problem is under investigation, here are some workarounds | |
876 that fix the memory usage problem: | |
877 | |
878 =over 4 | |
879 | |
880 =item 1 Don't specify a -signif criterion when calling L<parse()|parse>. | |
881 | |
882 The C<-signif> value is used for imposing a upper limit to the expect- or | |
883 P-value for Blast hits to be parsed. For reasons that are still under | |
884 investigation, specifying a value for C<-signif> in the L<parse()|parse> | |
885 method prevents Blast objects from being fully | |
886 garbage collected. When using the B<parse_blast.pl> or B<parse_multi.pl> | |
887 scripts in C<examples/blast/> of the bioperl distribution), don't supply | |
888 a C<-signif> command-line parameter. | |
889 | |
890 =item 2 If you want to impose a -signif criterion, put it inside a | |
891 -filt_func. | |
892 | |
893 For the L<parse()|parse> method, a -signif =E<gt> 1e-5 parameter is equivalent | |
894 to using a filter function parameter of | |
895 | |
896 -filt_func => sub { my $hit = shift; return $hit->signif <= 1e-5; } | |
897 | |
898 Using the B<examples/blast/parse_multi.pl> script, you can supply a | |
899 command-line argument of | |
900 | |
901 -filt_func '$hit->signif <= 1e-5' | |
902 | |
903 For more information, see L<parse()|parse> and the section | |
904 L<Screening hits using arbitrary criteria>. | |
905 | |
906 =back | |
907 | |
908 =head1 TODO | |
909 | |
910 =over 4 | |
911 | |
912 =item * Develop a functional, prototype Bio::Tools::Blast::Run::LocalBlast.pm module. | |
913 | |
914 =item * Add support for PSI-BLAST and PHI-BLAST | |
915 | |
916 =item * Parse histogram of expectations and retrieve gif image in | |
917 Blast report (if present). | |
918 | |
919 =item * Further investigate memory leak that occurs when parsing Blast | |
920 streams whe supplying a -signif parameter to L<parse()|parse>. | |
921 | |
922 =item * Access Blast results directly from the NCBI server using a | |
923 Perl interface to the NCBI toolkit or XML formated Blast reports (when | |
924 available). | |
925 | |
926 =item * Further exploit Bio::UnivAln.pm and multiple-sequence | |
927 alignment programs using HSP sequence data. Some of this may best go | |
928 into a separate, dedicated module or script as opposed to burdening | |
929 Blast.pm, Sbjct.pm, and HSP.pm with additional functionality that is | |
930 not always required. | |
931 | |
932 =item * Add an example script for parsing Blast reports containing | |
933 HTML formatting. | |
934 | |
935 =back | |
936 | |
937 =head1 VERSION | |
938 | |
939 Bio::Tools::Blast.pm, 0.09 | |
940 | |
941 =head1 FEEDBACK | |
942 | |
943 =head2 Mailing Lists | |
944 | |
945 User feedback is an integral part of the evolution of this and other | |
946 Bioperl modules. Send your comments and suggestions preferably to one | |
947 of the Bioperl mailing lists. Your participation is much appreciated. | |
948 | |
949 bioperl-l@bioperl.org - General discussion | |
950 http://bio.perl.org/MailList.html - About the mailing lists | |
951 | |
952 =head2 Reporting Bugs | |
953 | |
954 Report bugs to the Bioperl bug tracking system to help us keep track | |
955 the bugs and their resolution. Bug reports can be submitted via email | |
956 or the web: | |
957 | |
958 bioperl-bugs@bio.perl.org | |
959 http://bugzilla.bioperl.org/ | |
960 | |
961 =head1 AUTHOR | |
962 | |
963 Steve Chervitz, sac@bioperl.org | |
964 | |
965 See the L<FEEDBACK | FEEDBACK> section for where to send bug reports and comments. | |
966 | |
967 =head1 ACKNOWLEDGEMENTS | |
968 | |
969 This module was developed under the auspices of the Saccharomyces Genome | |
970 Database: | |
971 http://genome-www.stanford.edu/Saccharomyces | |
972 | |
973 Other contributors include: Alex Dong Li (webblast), Chris Dagdigian | |
974 (Seq.pm), Steve Brenner (Seq.pm), Georg Fuellen (Seq.pm, UnivAln.pm), | |
975 and untold others who have offered comments (noted in the | |
976 Bio/Tools/Blast/CHANGES file of the distribution). | |
977 | |
978 =head1 COPYRIGHT | |
979 | |
980 Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. This | |
981 module is free software; you can redistribute it and/or modify it | |
982 under the same terms as Perl itself. | |
983 | |
984 =head1 SEE ALSO | |
985 | |
986 Bio::Tools::SeqAnal.pm - Sequence analysis object base class. | |
987 Bio::Tools::Blast::Sbjct.pm - Blast hit object. | |
988 Bio::Tools::Blast::HSP.pm - Blast HSP object. | |
989 Bio::Tools::Blast::HTML.pm - Blast HTML-formating utility class. | |
990 Bio::Tools::Blast::Run::Webblast.pm - Utility module for running Blasts remotely. | |
991 Bio::Tools::Blast::Run::LocalBlast.pm - Utility module for running Blasts locally. | |
992 Bio::Seq.pm - Biosequence object | |
993 Bio::UnivAln.pm - Biosequence alignment object. | |
994 Bio::Root::Object.pm - Proposed base class for all Bioperl objects. | |
995 | |
996 =head2 Links to related modules | |
997 | |
998 Bio::Tools::SeqAnal.pm | |
999 http://bio.perl.org/Core/POD/Bio/Tools/SeqAnal.html | |
1000 | |
1001 Bio::Tools::Blast::Sbjct.pm | |
1002 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Sbjct.html | |
1003 | |
1004 Bio::Tools::Blast::HSP.pm | |
1005 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HSP.html | |
1006 | |
1007 Bio::Tools::Blast::HTML.pm | |
1008 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HTML.html | |
1009 | |
1010 Bio::Tools::Blast::Run::Webblast.pm | |
1011 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/Webblast.html | |
1012 | |
1013 Bio::Tools::Blast::Run::LocalBlast.pm | |
1014 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/LocalBlast.html | |
1015 | |
1016 Bio::Seq.pm | |
1017 http://bio.perl.org/Core/POD/Seq.html | |
1018 | |
1019 Bio::UnivAln.pm | |
1020 http://bio.perl.org/Projects/SeqAlign/ | |
1021 Europe: http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/#univaln | |
1022 | |
1023 Bio::Root::Object.pm | |
1024 http://bio.perl.org/Core/POD/Root/Object.html | |
1025 | |
1026 http://bio.perl.org/Projects/modules.html - Online module documentation | |
1027 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project | |
1028 http://bio.perl.org/ - Bioperl Project Homepage | |
1029 | |
1030 L<References & Information about the BLAST program>. | |
1031 | |
1032 =head1 KNOWN BUGS | |
1033 | |
1034 There is a memory leak that occurs when parsing parsing streams | |
1035 containing large numbers of Blast reports (a few thousand or so) and | |
1036 specifying a -signif parameter to the L<parse()|parse> method. For a | |
1037 workaround, see L<Memory Usage Issues>. | |
1038 | |
1039 Not sharing statistical parameters between different Blast objects | |
1040 when parsing a multi-report stream has not been completely tested and | |
1041 may be a little buggy. | |
1042 | |
1043 Documentation inconsistencies or inaccuracies may exist since this | |
1044 module underwend a fair bit of re-working going from 0.75 to 0.80 | |
1045 (corresponds to versions 0.04.4 to 0.05 of the bioperl distribution). | |
1046 | |
1047 =cut | |
1048 | |
1049 # | |
1050 ## | |
1051 ### | |
1052 #### END of main POD documentation. | |
1053 ### | |
1054 ## | |
1055 # | |
1056 | |
1057 =head1 APPENDIX | |
1058 | |
1059 Methods beginning with a leading underscore are considered private and | |
1060 are intended for internal use by this module. They are B<not> | |
1061 considered part of the public interface and are described here for | |
1062 documentation purposes only. | |
1063 | |
1064 =cut | |
1065 | |
1066 ############################################################################## | |
1067 ## CONSTRUCTOR ## | |
1068 ############################################################################## | |
1069 | |
1070 | |
1071 sub new { | |
1072 my ($class,@args) = @_; | |
1073 $class->warn("Bio::Tools::BLAST is deprecated, use Bio::SearchIO system or Bio::Tools::BPlite"); | |
1074 return $class->SUPER::new(@args); | |
1075 } | |
1076 | |
1077 ## The Blast.pm object relies on the the superclass constructor: | |
1078 ## Bio::Tools::SeqAnal::_initialize(). See that module for details. | |
1079 | |
1080 #------------- | |
1081 sub destroy { | |
1082 #------------- | |
1083 my $self=shift; | |
1084 $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}"; | |
1085 if($self->{'_hits'}) { | |
1086 foreach($self->hits) { | |
1087 $_->destroy; | |
1088 undef $_; | |
1089 } | |
1090 undef $self->{'_hits'}; | |
1091 #$self->{'_hits'}->remove_all; ## When and if this member becomes a vector. | |
1092 } | |
1093 | |
1094 $self->SUPER::destroy; | |
1095 } | |
1096 | |
1097 ##################################################################################### | |
1098 ## ACCESSORS ## | |
1099 ##################################################################################### | |
1100 | |
1101 =head2 run | |
1102 | |
1103 Usage : $object->run( %named_parameters ) | |
1104 Purpose : Run a local or remote Blast analysis on one or more sequences. | |
1105 Returns : String containing name of Blast output file if a single Blast | |
1106 : is run. | |
1107 : -- OR -- | |
1108 : List of Blast objects if multiple Blasts are being run as a group. | |
1109 Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). | |
1110 : -METHOD => 'local' or 'remote' (default = remote), | |
1111 : -PARSE => boolean, (true if the results are to be parsed after the run) | |
1112 : -STRICT => boolean, the strict mode to use for the resulting Blast objects. | |
1113 : ADDITIONAL PARAMETERS: | |
1114 : See methods _run_remote() and _run_local() for required | |
1115 : parameters necessary for running the blast report. | |
1116 Throws : Exception if no Blast output file was obtained. | |
1117 Comments : This method is called automatically during construction of a | |
1118 : Blast.pm object when run parameters are sent to the constructor: | |
1119 : $blastObj = new Bio::Tools::Blast (-RUN =>\%runParam, | |
1120 : %parseParam ); | |
1121 : | |
1122 : The specific run methods (local or remote) called by run() | |
1123 : must return a list containing the file name(s) with the Blast output. | |
1124 : | |
1125 : The run() method can perform single or multiple Blast runs | |
1126 : (analogous to the way parse() works) depending on how many | |
1127 : sequences are submitted. However, the running of multiple | |
1128 : Blasts is probably better handled at the script level. See notes in | |
1129 : the "TODO" section below. | |
1130 : | |
1131 : As for what to do with the Blast result file, that decision is | |
1132 : left for the user who can direct the Blast object to delete, compress, | |
1133 : or leave it alone. | |
1134 : | |
1135 : This method does not worry about load balancing, which | |
1136 : is probably best handled at the server level. | |
1137 : | |
1138 TODO: : Support for running+parsing multiple Blast analyses with a | |
1139 : single run() call is incomplete. One can generate multiple | |
1140 : reports by placing more than one sequence object in the -seqs | |
1141 : reference parameter. This saves some overhead in the code | |
1142 : that executes the Blasts since all options are configured once. | |
1143 : (This is analogous to parsing using the static $Blast object | |
1144 : see parse() and _parse_stream()). | |
1145 : | |
1146 : The trouble is that Blast objects for all runs are constructed, | |
1147 : parsed (if necessary), and then returned as a group | |
1148 : This can require lots of memory when run+parsing many Blasts | |
1149 : but should be fine if you just want to run a bunch Blasts. | |
1150 : | |
1151 : For now, when running+parsing Blasts, stick to running one | |
1152 : Blast at a time, building the Blast object with the results | |
1153 : of that report, and processing as necessary. | |
1154 : | |
1155 : Support for running PSI-Blast is not complete. | |
1156 | |
1157 See Also: L<_run_remote()|_run_remote>, L<_run_local()|_run_local>, L<parse()|parse> | |
1158 | |
1159 =cut | |
1160 | |
1161 #--------- | |
1162 sub run { | |
1163 #--------- | |
1164 my ($self, %param) = @_; | |
1165 my($method, $parse, $strict) = | |
1166 $self->_rearrange([qw(METHOD PARSE STRICT)], %param); | |
1167 | |
1168 $strict = $self->strict($strict) if $strict; | |
1169 | |
1170 my (@files); | |
1171 if($method =~ /loc/i) { | |
1172 @files = $self->_run_local(%param); | |
1173 | |
1174 } else { | |
1175 @files = $self->_run_remote(%param); | |
1176 } | |
1177 | |
1178 $self->throw("Run Blast failed: no Blast output created.") if !@files; | |
1179 | |
1180 if(scalar(@files) == 1) { | |
1181 # If there was just one Blast output file, prepare to incorporate it | |
1182 # into the current Blast object. run() is called before parse() in the | |
1183 # SeqAnal.pm constructor. | |
1184 if($files[0] ne 'email') { | |
1185 $self->file($files[0]); | |
1186 } else { | |
1187 # Can't do anything with the report. | |
1188 $self->throw("Blast report to be sent via e-mail."); | |
1189 } | |
1190 | |
1191 } else { | |
1192 # If there are multiple report files, build individual Blast objects foreach. | |
1193 # In this situation, the static $Blast object is being used to run | |
1194 # a set of related Blasts, similar to the way parse() can be used. | |
1195 # This strategy is not optimal since all reports are generated first | |
1196 # before any are parsed. | |
1197 # Untested. | |
1198 | |
1199 my(@objs); | |
1200 foreach(@files) { | |
1201 push @objs, new Bio::Tools::Blast(-FILE => $_, | |
1202 -PARSE => $parse || 0, | |
1203 -STRICT => $strict, | |
1204 ); | |
1205 } | |
1206 return @objs; | |
1207 } | |
1208 } | |
1209 | |
1210 =head2 _run_remote | |
1211 | |
1212 Usage : n/a; internal method called by run() | |
1213 : $object->_run_remote( %named_parameters ) | |
1214 Purpose : Run Blast on a remote server. | |
1215 Argument : Named parameters: | |
1216 : See documentation for function &blast_remote in | |
1217 : Bio::Tools::Blast::Run::Webblast.pm for description | |
1218 : of parameters. | |
1219 Comments : This method requires the Bio::Tools::Blast::Run::Webblast.pm | |
1220 : which conforms to this minimal API: | |
1221 : * export a method called &blast_remote that accepts a | |
1222 : Bio::Tools::Blast.pm object + named parameters | |
1223 : (specified in the Webblast.pm module). | |
1224 : * return a list of names of files containing the raw Blast reports. | |
1225 : (When building a Blast object, this list would contain a | |
1226 : single file from which the Blast object is to be constructed). | |
1227 | |
1228 See Also : L<run()|run>, L<_run_local()|_run_local>, B<Bio::Tools::Blast::Run::Webblast.pm::blast_remote()>, L<Links to related modules> | |
1229 | |
1230 =cut | |
1231 | |
1232 #---------------- | |
1233 sub _run_remote { | |
1234 #---------------- | |
1235 my ($self, %param) = @_; | |
1236 | |
1237 require Bio::Tools::Blast::Run::Webblast; | |
1238 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_remote)); | |
1239 | |
1240 &blast_remote($self, %param); | |
1241 } | |
1242 | |
1243 =head2 _run_local | |
1244 | |
1245 Usage : n/a; internal method called by run() | |
1246 : $object->_run_local(%named_parameters) | |
1247 Purpose : Run Blast on a local machine. | |
1248 Argument : Named parameters: | |
1249 : See documentation for function &blast_local in | |
1250 : Bio::Tools::Blast::Run::LocalBlast.pm for description | |
1251 : of parameters. | |
1252 Comments : This method requires the Bio::Tools::Blast::Run::LocalBlast.pm | |
1253 : module which should be customized for your site. This module would | |
1254 : contain all the commands, paths, environment variables, and other | |
1255 : data necessary to run Blast commands on a local machine, but should | |
1256 : not contain any semantics for specific query sequences. | |
1257 : | |
1258 : LocalBlast.pm should also conform to this minimal API: | |
1259 : * export a method called &blast_local that accepts a | |
1260 : Bio::Tools::Blast.pm object + named parameters | |
1261 : (specified in the LocalBlast.pm module). | |
1262 : * return a list of names of files containing the raw Blast reports. | |
1263 : (When building a Blast object, this list would contain a | |
1264 : single file from which the Blast object is to be constructed). | |
1265 | |
1266 See Also : L<run()|run>, L<_run_remote()|_run_remote>, B<Bio::Tools::Blast::Run::LocalBlast::blast_local()>, L<Links to related modules> | |
1267 | |
1268 =cut | |
1269 | |
1270 #-------------- | |
1271 sub _run_local { | |
1272 #-------------- | |
1273 my ($self, %param) = @_; | |
1274 | |
1275 require Bio::Tools::Blast::Run::Webblast; | |
1276 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_local)); | |
1277 | |
1278 &blast_local($self, %param); | |
1279 } | |
1280 | |
1281 =head2 db_remote | |
1282 | |
1283 Usage : @dbs = $Blast->db_remote( [seq_type] ); | |
1284 Purpose : Get a list of available sequence databases for remote Blast analysis. | |
1285 Returns : Array of strings | |
1286 Argument : seq_type = 'p' or 'n' | |
1287 : 'p' = Gets databases for peptide searches (default) | |
1288 : 'n' = Gets databases for nucleotide searches | |
1289 Throws : n/a | |
1290 Comments : Peptide databases are a subset of the nucleotide databases. | |
1291 : It is convenient to call this method on the static $Blast object | |
1292 : as shown in Usage. | |
1293 | |
1294 See Also : L<db_local()|db_local> | |
1295 | |
1296 =cut | |
1297 | |
1298 #---------------- | |
1299 sub db_remote { | |
1300 #---------------- | |
1301 my ($self, $type) = @_; | |
1302 $type ||= 'p'; | |
1303 | |
1304 require Bio::Tools::Blast::Run::Webblast; | |
1305 Bio::Tools::Blast::Run::Webblast->import(qw(@Blast_dbp_remote | |
1306 @Blast_dbn_remote)); | |
1307 | |
1308 # We shouldn't have to fully qualify the Blast_dbX_remote arrays. Hm. | |
1309 | |
1310 my(@dbs); | |
1311 if( $type =~ /^p|amino/i) { | |
1312 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbp_remote; | |
1313 } else { | |
1314 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbn_remote; | |
1315 } | |
1316 @dbs; | |
1317 } | |
1318 | |
1319 =head2 db_local | |
1320 | |
1321 Usage : @dbs = $Blast->db_local( [seq_type] ); | |
1322 Purpose : Get a list of available sequence databases for local Blast analysis. | |
1323 Returns : Array of strings | |
1324 Argument : seq_type = 'p' or 'n' | |
1325 : 'p' = Gets databases for peptide searches (default) | |
1326 : 'n' = Gets databases for nucleotide searches | |
1327 Throws : n/a | |
1328 Comments : Peptide databases are a subset of the nucleotide databases. | |
1329 : It is convenient to call this method on the static $Blast object. | |
1330 as shown in Usage. | |
1331 | |
1332 See Also : L<db_remote()|db_remote> | |
1333 | |
1334 =cut | |
1335 | |
1336 #---------------- | |
1337 sub db_local { | |
1338 #---------------- | |
1339 my ($self, $type) = @_; | |
1340 $type ||= 'p'; | |
1341 | |
1342 require Bio::Tools::Blast::Run::LocalBlast; | |
1343 Bio::Tools::Blast::Run::LocalBlast->import(qw(@Blast_dbp_local | |
1344 @Blast_dbn_local)); | |
1345 | |
1346 # We shouldn't have to fully qualify the Blast_dbX_local arrays. Hm. | |
1347 | |
1348 my(@dbs); | |
1349 if( $type =~ /^p|amino/i) { | |
1350 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbp_local; | |
1351 } else { | |
1352 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbn_local; | |
1353 } | |
1354 @dbs; | |
1355 } | |
1356 | |
1357 =head2 parse | |
1358 | |
1359 Usage : $blast_object->parse( %named_parameters ) | |
1360 Purpose : Parse a Blast report from a file or STDIN. | |
1361 : * Parses a raw BLAST data, populating Blast object with report data. | |
1362 : * Sets the significance cutoff. | |
1363 : * Extracts statistical parameters about the BLAST run. | |
1364 : * Handles both single files and streams containing multiple reports. | |
1365 Returns : integer (number of Blast reports parsed) | |
1366 Argument : <named parameters>: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE). | |
1367 : -FILE => string (name of file containing raw Blast output. | |
1368 : Optional. If a valid file is not supplied, | |
1369 : STDIN will be used). | |
1370 : -SIGNIF => number (float or scientific notation number to be used | |
1371 : as a P- or Expect value cutoff; | |
1372 : default = $DEFAULT_SIGNIF (999)). | |
1373 : -FILT_FUNC => func_ref (reference to a function to be used for | |
1374 : filtering out hits based on arbitrary criteria. | |
1375 : This function should take a | |
1376 : Bio::Tools::Blast::Sbjct.pm object as its first | |
1377 : argument and return a boolean value, | |
1378 : true if the hit should be filtered out). | |
1379 : Sample filter function: | |
1380 : -FILT_FUNC => sub { $hit = shift; | |
1381 : $hit->gaps == 0; }, | |
1382 : -CHECK_ALL_HITS => boolean (check all hits for significance against | |
1383 : significance criteria. Default = false. | |
1384 : If false, stops processing hits after the first | |
1385 : non-significant hit or the first hit that fails | |
1386 : the filt_func call. This speeds parsing, | |
1387 : taking advantage of the fact that the hits | |
1388 : are processed in the order they are ranked.) | |
1389 : -MIN_LEN => integer (to be used as a minimum query sequence length | |
1390 : sequences below this length will not be processed). | |
1391 : default = no minimum length). | |
1392 : -STATS => boolean (collect stats for report: matrix, filters, etc. | |
1393 : default = false). | |
1394 : -BEST => boolean (only process the best hit of each report; | |
1395 : default = false). | |
1396 : -OVERLAP => integer (the amount of overlap to permit between | |
1397 : adjacent HSPs when tiling HSPs, | |
1398 : Default = $MAX_HSP_OVERLAP (2)) | |
1399 : | |
1400 : PARAMETERS USED WHEN PARSING MULTI-REPORT STREAMS: | |
1401 : -------------------------------------------------- | |
1402 : -SHARE => boolean (set this to true if all reports in stream | |
1403 : share the same stats. Default = true) | |
1404 : Must be set to false when parsing both Blast1 and | |
1405 : Blast2 reports in the same run or if you need | |
1406 : statistical params for each report, Lambda, K, H). | |
1407 : -STRICT => boolean (use strict mode for all Blast objects created. | |
1408 : Increases sensitivity to errors. For single | |
1409 : Blasts, this is parameter is sent to new().) | |
1410 : -EXEC_FUNC => func_ref (reference to a function for processing each | |
1411 : Blast object after it is parsed. Should accept a | |
1412 : Blast object as its sole argument. Return value | |
1413 : is ignored. If an -EXEC_FUNC parameter is supplied, | |
1414 : the -SAVE_ARRAY parameter will be ignored.) | |
1415 : -SAVE_ARRAY =>array_ref, (reference to an array for storing all | |
1416 : Blast objects as they are created. | |
1417 : Experimental. Not recommended.) | |
1418 : -SIGNIF_FMT => boolean String of 'exp' or 'parts'. Sets the format | |
1419 : for reporting P/Expect values. 'exp' reports | |
1420 : only the exponent portion. 'parts' reports | |
1421 : them as a 2 element list. See signif_fmt().. | |
1422 : | |
1423 Throws : Exception if BLAST report contains a FATAL: error. | |
1424 : Propagates any exception thrown by read(). | |
1425 : Propagates any exception thrown by called parsing methods. | |
1426 Comments : This method can be called either directly using the static $Blast object | |
1427 : or indirectly (by Bio::Tools::SeqAnal.pm) during constuction of an | |
1428 : individual Blast object. | |
1429 : | |
1430 : HTML-formatted reports can be parsed as well. No special flag is required | |
1431 : since it is detected automatically. The presence of HTML-formatting | |
1432 : will result in slower performace, however, since it must be removed | |
1433 : prior to parsing. Parsing HTML-formatted reports is highly | |
1434 : error prone and is generally not recommended. | |
1435 : | |
1436 : If one has an HTML report, do NOT remove the HTML from it by using the | |
1437 : "Save As" option of a web browser to save it as text. This renders the | |
1438 : report unparsable. | |
1439 : HTML-formatted reports can be parsed after running through the strip_html | |
1440 : function of Blast::HTML.pm as in: | |
1441 : require Bio::Tools::Blast::HTML; | |
1442 : Bio::Tools::Blast::HTML->import(&strip_html); | |
1443 : &strip_html(\$data); | |
1444 : # where data contains full contents of an HTML-formatted report. | |
1445 : TODO: write a demo script that does this. | |
1446 | |
1447 See Also : L<_init_parse_params()|_init_parse_params>, L<_parse_blast_stream()|_parse_blast_stream>, L<overlap()|overlap>, L<signif_fmt()|signif_fmt>, B<Bio::Root::Object::read()>, B<Bio::Tools::Blast::HTML.pm::strip_html()>, L<Links to related modules> | |
1448 | |
1449 =cut | |
1450 | |
1451 #--------- | |
1452 sub parse { | |
1453 #--------- | |
1454 # $self might be the static $Blast object. | |
1455 my ($self, @param) = @_; | |
1456 | |
1457 my($signif, $filt_func, $min_len, $check_all, $overlap, $stats, | |
1458 $share, $strict, $best, $signif_fmt, $no_aligns) = | |
1459 $self->_rearrange([qw(SIGNIF FILT_FUNC MIN_LEN CHECK_ALL_HITS | |
1460 OVERLAP STATS SHARE STRICT | |
1461 BEST EXPONENT NO_ALIGNS )], @param); | |
1462 | |
1463 ## Initialize the static Blast object with parameters that | |
1464 ## apply to all Blast objects within a parsing session. | |
1465 | |
1466 &_init_parse_params($share, $filt_func, $check_all, | |
1467 $signif, $min_len, $strict, | |
1468 $best, $signif_fmt, $stats, $no_aligns | |
1469 ); | |
1470 | |
1471 my $count = $self->_parse_blast_stream(@param); | |
1472 | |
1473 # print STDERR "\nDONE PARSING STREAM.\n"; | |
1474 | |
1475 if($Blast->{'_blast_errs'}) { | |
1476 my @errs = @{$Blast->{'_blast_errs'}}; | |
1477 printf STDERR "\n*** %d BLAST REPORTS HAD FATAL ERRORS:\n", scalar(@errs); | |
1478 foreach(@errs) { print STDERR "$_\n"; } | |
1479 @{$Blast->{'_blast_errs'}} = (); | |
1480 } | |
1481 | |
1482 return $count; | |
1483 } | |
1484 | |
1485 =head2 _init_parse_params | |
1486 | |
1487 Title : _init_parse_params | |
1488 Usage : n/a; called automatically by parse() | |
1489 Purpose : Initializes parameters used during parsing of Blast reports. | |
1490 : This is a static method used by the $Blast object. | |
1491 : Calls _set_signif(). | |
1492 Example : | |
1493 Returns : n/a | |
1494 Args : Args extracted by parse(). | |
1495 | |
1496 See Also: L<parse()|parse>, L<_set_signif()|_set_signif> | |
1497 | |
1498 =cut | |
1499 | |
1500 #---------------------- | |
1501 sub _init_parse_params { | |
1502 #---------------------- | |
1503 my ($share, $filt_func, $check_all, | |
1504 $signif, $min_len, $strict, | |
1505 $best, $signif_fmt, $stats, $no_aligns) = @_; | |
1506 | |
1507 ## Default is to share stats. | |
1508 $Blast->{'_share'} = defined($share) ? $share : 1; | |
1509 $Blast->{'_filt_func'} = $filt_func || 0; | |
1510 $Blast->{'_check_all'} = $check_all || 0; | |
1511 $Blast->{'_signif_fmt'} ||= $signif_fmt || ''; | |
1512 $Blast->{'_no_aligns'} = $no_aligns || 0; | |
1513 | |
1514 &_set_signif($signif, $min_len, $filt_func); | |
1515 $Blast->strict($strict) if defined $strict; | |
1516 $Blast->best($best) if $best; | |
1517 $Blast->{'_blast_count'} = 0; | |
1518 | |
1519 ## If $stats is false, miscellaneous statistical and other parameters | |
1520 ## are NOT extracted from the Blast report (e.g., matrix name, filter used, etc.). | |
1521 ## This can speed processing when crunching tons of Blast reports. | |
1522 ## Default is to NOT get stats. | |
1523 $Blast->{'_get_stats'} = defined($stats) ? $stats : 0; | |
1524 | |
1525 # Clear any errors from previous parse. | |
1526 undef $Blast->{'_blast_errs'}; | |
1527 } | |
1528 | |
1529 =head2 _set_signif | |
1530 | |
1531 Usage : n/a; called automatically by _init_parse_params() | |
1532 : This is now a "static" method used only by $Blast. | |
1533 : _set_signif($signif, $min_len, $filt_func); | |
1534 Purpose : Sets significance criteria for the BLAST object. | |
1535 Argument : Obligatory three arguments: | |
1536 : $signif = float or sci-notation number or undef | |
1537 : $min_len = integer or undef | |
1538 : $filt_func = function reference or undef | |
1539 : | |
1540 : If $signif is undefined, a default value is set | |
1541 : (see $DEFAULT_SIGNIF; min_length = not set). | |
1542 Throws : Exception if significance value is defined but appears | |
1543 : out of range or invalid. | |
1544 : Exception if $filt_func if defined and is not a func ref. | |
1545 Comments : The significance of a BLAST report can be based on | |
1546 : the P (or Expect) value and/or the length of the query sequence. | |
1547 : P (or Expect) values GREATER than '_significance' are not significant. | |
1548 : Query sequence lengths LESS than '_min_length' are not significant. | |
1549 : | |
1550 : Hits can also be screened using arbitrary significance criteria | |
1551 : as discussed in the parse() method. | |
1552 : | |
1553 : If no $signif is defined, the '_significance' level is set to | |
1554 : $Bio::Tools::Blast::DEFAULT_SIGNIF (999). | |
1555 | |
1556 See Also : L<signif()|signif>, L<min_length()|min_length>, L<_init_parse_params()|_init_parse_params>, L<parse()|parse> | |
1557 | |
1558 =cut | |
1559 | |
1560 #----------------- | |
1561 sub _set_signif { | |
1562 #----------------- | |
1563 my( $sig, $len, $func ) = @_; | |
1564 | |
1565 if(defined $sig) { | |
1566 $Blast->{'_confirm_significance'} = 1; | |
1567 if( $sig =~ /[^\d.e-]/ or $sig <= 0) { | |
1568 $Blast->throw("Invalid significance value: $sig", | |
1569 "Must be greater than zero."); | |
1570 } | |
1571 $Blast->{'_significance'} = $sig; | |
1572 } else { | |
1573 $Blast->{'_significance'} = $DEFAULT_SIGNIF; | |
1574 $Blast->{'_check_all'} = 1 if not $Blast->{'_filt_func'}; | |
1575 } | |
1576 | |
1577 if(defined $len) { | |
1578 if($len =~ /\D/ or $len <= 0) { | |
1579 $Blast->warn("Invalid minimum length value: $len", | |
1580 "Value must be an integer > 0. Value not set."); | |
1581 } else { | |
1582 $Blast->{'_min_length'} = $len; | |
1583 } | |
1584 } | |
1585 | |
1586 if(defined $func) { | |
1587 $Blast->{'_confirm_significance'} = 1; | |
1588 if($func and not ref $func eq 'CODE') { | |
1589 $Blast->throw("Not a function reference: $func", | |
1590 "The -filt_func parameter must be function reference."); | |
1591 } | |
1592 } | |
1593 } | |
1594 | |
1595 =head2 _parse_blast_stream | |
1596 | |
1597 Usage : n/a. Internal method called by parse() | |
1598 Purpose : Obtains the function to be used during parsing and calls read(). | |
1599 Returns : Integer (the number of blast reports read) | |
1600 Argument : Named parameters (forwarded from parse()) | |
1601 Throws : Propagates any exception thrown by _get_parse_blast_func() and read(). | |
1602 | |
1603 See Also : L<_get_parse_blast_func()|_get_parse_blast_func>, B<Bio::Root::Object::read()> | |
1604 | |
1605 =cut | |
1606 | |
1607 #---------------------- | |
1608 sub _parse_blast_stream { | |
1609 #---------------------- | |
1610 my ($self, %param) = @_; | |
1611 | |
1612 my $func = $self->_get_parse_blast_func(%param); | |
1613 # my $func = sub { my $data = shift; | |
1614 # printf STDERR "Chunk length = %d\n", length($data); | |
1615 # sleep(3); | |
1616 # }; | |
1617 | |
1618 # Only setting the newline character once per session. | |
1619 $Newline ||= $Util->get_newline(-client => $self, %param); | |
1620 | |
1621 $self->read(-REC_SEP =>"$Newline>", | |
1622 -FUNC => $func, | |
1623 %param); | |
1624 | |
1625 return $Blast->{'_blast_count'}; | |
1626 } | |
1627 | |
1628 =head2 _get_parse_blast_func | |
1629 | |
1630 Usage : n/a; internal method used by _parse_blast_stream() | |
1631 : $func_ref = $blast_object->_get_parse_blast_func() | |
1632 Purpose : Generates a function ref to be used as a closure for parsing | |
1633 : raw data as it is being loaded by Bio::Root::IOManager::read(). | |
1634 Returns : Function reference (closure). | |
1635 Comments : The the function reference contains a fair bit of logic | |
1636 : at present. It could perhaps be split up into separate | |
1637 : functions to make it more 'digestible'. | |
1638 | |
1639 See Also : L<_parse_blast_stream()|_parse_blast_stream> | |
1640 | |
1641 =cut | |
1642 | |
1643 #-------------------------- | |
1644 sub _get_parse_blast_func { | |
1645 #-------------------------- | |
1646 my ($self, @param) = @_; | |
1647 | |
1648 my ($save_a, $exec_func) = | |
1649 $self->_rearrange([qw(SAVE_ARRAY EXEC_FUNC)], @param); | |
1650 | |
1651 # $MONITOR && print STDERR "\nParsing Blast stream (5/dot, 250/line)\n"; | |
1652 my $count = 0; | |
1653 my $strict = $self->strict(); | |
1654 | |
1655 # Some parameter validation. | |
1656 # Remember, all Blast parsing will use this function now. | |
1657 # You won't need a exec-func or save_array when just creating a Blast object | |
1658 # as in: $blast = new Bio::Tools::Blast(); | |
1659 if($exec_func and not ref($exec_func) eq 'CODE') { | |
1660 $self->throw("The -EXEC_FUNC parameter must be function reference.", | |
1661 "exec_func = $exec_func"); | |
1662 | |
1663 } elsif($save_a and not ref($save_a) eq 'ARRAY') { | |
1664 $self->throw("The -SAVE_ARRAY parameter must supply an array reference". | |
1665 "when not using an -EXEC_FUNC parameter."); | |
1666 } | |
1667 | |
1668 ## Might consider breaking this closure up if possible. | |
1669 | |
1670 return sub { | |
1671 my ($data) = @_; | |
1672 ## $data should contain one of three possible fragment types | |
1673 ## from a Blast report: | |
1674 ## 1. Header with description section, | |
1675 ## 2. An alignment section for a single hit, or | |
1676 ## 3. The final alignment section plus the footer section. | |
1677 ## (record separator = "Newline>"). | |
1678 | |
1679 # print STDERR "\n(BLAST) DATA CHUNK: $data\n"; | |
1680 | |
1681 my ($current_blast, $current_prog, $current_vers, $current_db); | |
1682 my $prev_blast; | |
1683 my $contains_translation = 0; | |
1684 | |
1685 ### steve --- Wed Mar 15 02:48:07 2000 | |
1686 ### In the process of addressing bug PR#95. Tricky. | |
1687 ### Using the $contains_translation to do so. Not complete | |
1688 ### and possibly won't fix. We'll see. | |
1689 | |
1690 # Check for header section. Start a new Blast object and | |
1691 # parse the description section. | |
1692 # if ($data =~ /\sQuery\s?=/s || ($contains_translation && $data =~ /Database:/s)) { | |
1693 if ($data =~ /\sQuery\s?=/s) { | |
1694 $Blast->{'_blast_count'}++; | |
1695 print STDERR ".", $Blast->{'_blast_count'} % 50 ? '' : "\n" if $MONITOR; | |
1696 | |
1697 if($data =~ /$Newline\s+Translating/so) { | |
1698 print STDERR "\nCONTAINS TRANSLATION\n"; | |
1699 $contains_translation = 1; | |
1700 } | |
1701 | |
1702 # If we're parsing a stream containing multiple reports, | |
1703 # all subsequent header sections will contain the last hit of | |
1704 # the previous report which needs to be parsed and added to that | |
1705 # report if signifcant. It also contains the run parameters | |
1706 # at the bottom of the Blast report. | |
1707 # if($Blast->{'_blast_count'} > 1 || $contains_translation) { | |
1708 if($Blast->{'_blast_count'} > 1) { | |
1709 # print STDERR "\nMULTI-BLAST STREAM.\n"; | |
1710 $Blast->{'_multi_stream'} = 1; | |
1711 | |
1712 if($data =~ /(.+?)$Newline(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so) { | |
1713 ($current_prog, $current_vers, $data) = ($3, $4, $5); | |
1714 # Final chunk containing last hit and last footer. | |
1715 $Blast->{'_current_blast'}->_parse_alignment($1); | |
1716 $prev_blast = $Blast->{'_current_blast'}; # finalized. | |
1717 # } elsif($contains_translation) { | |
1718 # $data =~ /(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so; | |
1719 # ($current_prog, $current_vers, $data) = ($1, $2, $3); | |
1720 } else { | |
1721 $Blast->throw("Can't determine program type from BLAST report.", | |
1722 "Checked for: @Blast_programs."); | |
1723 # This has important implications for how to handle interval | |
1724 # information for HSPs. TBLASTN uses nucleotides in query HSP | |
1725 # but amino acids in the sbjct HSP sequence. | |
1726 } | |
1727 | |
1728 if($data =~ m/Database:\s+(.+?)$Newline/so ) { | |
1729 $current_db = $1; | |
1730 } else { | |
1731 # In some reports, the Database is only listed at end. | |
1732 #$Blast->warn("Can't determine database name from BLAST report."); | |
1733 } | |
1734 | |
1735 # Incyte_Fix: Nasty Invisible Bug. | |
1736 # Records in blast report are delimited by '>', but... when | |
1737 # there are no hits for a query, there won't be a '>'. That | |
1738 # causes several blast reports to run together in the data | |
1739 # passed to this routine. Need to get rid of non-hits in data | |
1740 if ($data =~ /.+(No hits? found.+Sequences.+)/so) { | |
1741 $data = $1; | |
1742 } | |
1743 # End Incyte_Fix | |
1744 | |
1745 } | |
1746 | |
1747 # Determine if we need to create a new Blast object | |
1748 # or use the $self object for this method. | |
1749 | |
1750 if($Blast->{'_multi_stream'} or $self->name eq 'Static Blast object') { | |
1751 # Strict mode is not object-specific but may be someday. | |
1752 # print STDERR "\nCreating new Blast object.\n"; | |
1753 $current_blast = new Bio::Tools::Blast(-STRICT => $strict); | |
1754 } else { | |
1755 $current_blast = $self; | |
1756 } | |
1757 $Blast->{'_current_blast'} = $current_blast; | |
1758 | |
1759 # If we're not sharing stats, set data on current blast object. | |
1760 if(defined $current_prog and not $Blast->{'_share'}) { | |
1761 $current_blast->program($current_prog); | |
1762 $current_blast->program_version($current_vers); | |
1763 $current_blast->database($current_db); | |
1764 } | |
1765 | |
1766 # print STDERR "CURRENT BLAST = ", $current_blast->name, "\n"; | |
1767 $current_blast->_parse_header($data); | |
1768 | |
1769 # If there were any descriptions in the header, | |
1770 # we know if there are any significant hits. | |
1771 # No longer throwing exception if there were no significant hits | |
1772 # and a -signif parameter was specified. Doing so prevents the | |
1773 # construction of a Blast object, which could still be useful. | |
1774 # if($current_blast->{'_has_descriptions'} and $Blast->{'_confirm_significance'} and not $current_blast->is_signif) { | |
1775 # $current_blast->throw("No significant BLAST hits for ${\$current_blast->name}"); | |
1776 | |
1777 # } | |
1778 | |
1779 } # Done parsing header/description section | |
1780 | |
1781 ### For use with $contains_translation - not right - breaks regular report parsing. | |
1782 # elsif(ref $Blast->{'_current_blast'} && $data !~ /\s*\w*\s*/s) { | |
1783 elsif(ref $Blast->{'_current_blast'} ) { | |
1784 # Process an alignment section. | |
1785 $current_blast = $Blast->{'_current_blast'}; | |
1786 # print STDERR "\nCONTINUING PROCESSING ALN WITH ", $current_blast->name, "\n"; | |
1787 # print STDERR "DATA: $data\n"; | |
1788 eval { | |
1789 $current_blast->_parse_alignment($data); | |
1790 }; | |
1791 if($@) { | |
1792 # push @{$self->{'_blast_errs'}}, $@; | |
1793 } | |
1794 } | |
1795 | |
1796 # If the current Blast object has been completely parsed | |
1797 # (occurs with a single Blast stream), or if there is a previous | |
1798 # Blast object (occurs with a multi Blast stream), | |
1799 # execute a supplied function on it or store it in a supplied array. | |
1800 | |
1801 if( defined $prev_blast or $current_blast->{'_found_params'}) { | |
1802 my $finished_blast = defined($prev_blast) ? $prev_blast : $current_blast; | |
1803 | |
1804 $finished_blast->_report_errors(); | |
1805 # print STDERR "\nNEW BLAST OBJECT: ${\$finished_blast->name}\n"; | |
1806 | |
1807 if($exec_func) { | |
1808 # print STDERR " RUNNING EXEC_FUNC...\n"; | |
1809 &$exec_func($finished_blast); # ignoring any return value. | |
1810 # Report processed, no longer need object. | |
1811 $finished_blast->destroy; | |
1812 undef $finished_blast; | |
1813 } elsif($save_a) { | |
1814 # print STDERR " SAVING IN ARRAY...\n"; | |
1815 # We've already verified that if there is no exec_func | |
1816 # then there must be a $save_array | |
1817 push @$save_a, $finished_blast; | |
1818 } | |
1819 } | |
1820 1; | |
1821 } | |
1822 } | |
1823 | |
1824 =head2 _report_errors | |
1825 | |
1826 Title : _report_errors | |
1827 Usage : n/a; Internal method called by _get_parse_blast_func(). | |
1828 Purpose : Throw or warn about any errors encountered. | |
1829 Returns : n/a | |
1830 Args : n/a | |
1831 Throws : If all hits generated exceptions, raise exception | |
1832 : (a fatal event for the Blast object.) | |
1833 : If some hits were okay but some were bad, generate a warning | |
1834 : (a few bad applies should not spoil the bunch). | |
1835 : This usually indicates a limiting B-value. | |
1836 : When the parsing code fails, it is either all or nothing. | |
1837 | |
1838 =cut | |
1839 | |
1840 #------------------- | |
1841 sub _report_errors { | |
1842 #------------------- | |
1843 my $self = shift; | |
1844 | |
1845 return unless ref($self->{'_blast_errs'}); | |
1846 # ref($self->{'_blast_errs'}) || (print STDERR "\nNO ERRORS\n", return ); | |
1847 | |
1848 my @errs = @{$self->{'_blast_errs'}}; | |
1849 | |
1850 if(scalar @errs) { | |
1851 my ($str); | |
1852 @{$self->{'_blast_errs'}} = (); # clear the errs on the object. | |
1853 # When there are many errors, in most of the cases, they are | |
1854 # caused by the same problem. Only need to see full data for | |
1855 # the first one. | |
1856 if(scalar @errs > 2) { | |
1857 $str = "SHOWING FIRST EXCEPTION ONLY:\n$errs[0]"; | |
1858 $self->clear_err(); # clearing the existing set of errors (conserve memory). | |
1859 # Not necessary, unless the -RECORD_ERR =>1 | |
1860 # constructor option was used for Blast object. | |
1861 } else { | |
1862 $str = join("\n",@errs); | |
1863 } | |
1864 | |
1865 if(not $self->{'_num_hits_significant'}) { | |
1866 $self->throw(sprintf("Failed to parse any hit data (n=%d).", scalar(@errs)), | |
1867 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" | |
1868 ); | |
1869 } else { | |
1870 $self->warn(sprintf("Some potential hits were not parsed (n=%d).", scalar(@errs)), | |
1871 @errs > 2 ? "This may be due to a limiting B value (max alignment listings)." : "", | |
1872 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n" | |
1873 ); | |
1874 } | |
1875 } | |
1876 } | |
1877 | |
1878 =head2 _parse_header | |
1879 | |
1880 Usage : n/a; called automatically by the _get_parse_blast_func(). | |
1881 Purpose : Parses the header section of a BLAST report. | |
1882 Argument : String containing the header+description section of a BLAST report. | |
1883 Throws : Exception if description data cannot be parsed properly. | |
1884 : Exception if there is a 'FATAL' error in the Blast report. | |
1885 : Warning if there is a 'WARNING' in the Blast report. | |
1886 : Warning if there are no significant hits. | |
1887 Comments : Description section contains a single line for each hit listing | |
1888 : the seq id, description, score, Expect or P-value, etc. | |
1889 | |
1890 See Also : L<_get_parse_blast_func()|_get_parse_blast_func> | |
1891 | |
1892 =cut | |
1893 | |
1894 #---------------------- | |
1895 sub _parse_header { | |
1896 #---------------------- | |
1897 my( $self, $data ) = @_; | |
1898 | |
1899 # print STDERR "\n$ID: PARSING HEADER\n"; #$data\n"; | |
1900 | |
1901 $data =~ s/^\s+|\s+>?$//sg; | |
1902 | |
1903 if($data =~ /<HTML/i) { | |
1904 $self->throw("Can't parse HTML-formatted BLAST reports.", | |
1905 # "Such reports can be parsed with a special parsing \n". | |
1906 # "script included in the examples/blast directory \n". | |
1907 # "of the Bioperl distribution. (TODO)" | |
1908 ); | |
1909 # This was the old strategy, can't do it with new strategy | |
1910 # since we don't have the whole report in one chunk. | |
1911 # This could be the basis for the "special parsing script". | |
1912 # require Bio::Tools::Blast::HTML; | |
1913 # Bio::Tools::Blast::HTML->import(&strip_html); | |
1914 # &strip_html(\$data); | |
1915 } | |
1916 | |
1917 $data =~ /WARNING: (.+?)$Newline$Newline/so and $self->warn("$1") if $self->strict; | |
1918 $data =~ /FATAL: (.+?)$Newline$Newline/so and $self->throw("FATAL BLAST ERROR = $1"); | |
1919 # No longer throwing exception when no hits were found. Still reporting it. | |
1920 $data =~ /No hits? found/i and $self->warn("No hits were found.") if $self->strict; | |
1921 | |
1922 # If this is the first Blast, the program, version, and database info | |
1923 # pertain to it. Otherwise, they are for the previous report and have | |
1924 # already been parsed out. | |
1925 # Data is stored in the static Blast object. Data for subsequent reports | |
1926 # will be stored in separate objects if the -share parameter is not set. | |
1927 # See _get_parse_blast_func(). | |
1928 | |
1929 if($Blast->{'_blast_count'} == 1) { | |
1930 if($data =~ /(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline/so) { | |
1931 $Blast->program($2); | |
1932 $Blast->program_version($3); | |
1933 } else { | |
1934 $self->throw("Can't determine program type from BLAST report.", | |
1935 "Checked for: @Blast_programs."); | |
1936 # This has important implications for how to handle interval | |
1937 # information for HSPs. TBLASTN uses nucleotides in query HSP | |
1938 # but amino acids in the sbjct HSP sequence. | |
1939 } | |
1940 | |
1941 if($data =~ m/Database:\s+(.+?)$Newline/so ) { | |
1942 $Blast->database($1); | |
1943 } else { | |
1944 # In some reports, the Database is only listed at end. | |
1945 #$self->warn("Can't determine database name from BLAST report (_parse_header)\n$data\n."); | |
1946 } | |
1947 } | |
1948 | |
1949 my ($header, $descriptions); | |
1950 | |
1951 ## For efficiency reasons, we want to to avoid using $' and $`. | |
1952 ## Therefore using single-line mode pattern matching. | |
1953 | |
1954 if($data =~ /(.+?)\nSequences producing.+?\n(.+)/s ) { | |
1955 ($header, $descriptions) = ($1, $2); | |
1956 $self->{'_has_descriptions'} = 1; | |
1957 } else { | |
1958 $header = $data; | |
1959 $self->{'_has_descriptions'} = 0; | |
1960 # Blast reports can legally lack description section. No need to warn. | |
1961 #push @{$self->{'_blast_errs'}}, "Can't parse description data."; | |
1962 } | |
1963 | |
1964 $self->_set_query($header); # The name of the sequence will appear in error report. | |
1965 # print STDERR "\nQUERY = ", $Blast->{'_current_blast'}->query, "\n"; | |
1966 | |
1967 $self->_set_date($header) if $Blast->{'_get_stats'}; | |
1968 $self->_set_length($header); | |
1969 | |
1970 # not $Blast->{'_confirm_significance'} and print STDERR "\nNOT PARSING DESCRIPTIONS.\n"; | |
1971 | |
1972 # Setting the absolute max and min significance levels. | |
1973 $self->{'_highestSignif'} = 0; | |
1974 $self->{'_lowestSignif'} = $DEFAULT_SIGNIF; | |
1975 | |
1976 if ($Blast->{'_confirm_significance'} || $Blast->{'_no_aligns'}) { | |
1977 $self->_parse_descriptions($descriptions) if $descriptions; | |
1978 } else { | |
1979 $self->{'_is_significant'} = 1; | |
1980 } | |
1981 } | |
1982 | |
1983 #----------------------- | |
1984 sub _parse_descriptions { | |
1985 #----------------------- | |
1986 my ($self, $desc) = @_; | |
1987 | |
1988 # NOTE: This method will not be called if the report lacks | |
1989 # a description section. | |
1990 | |
1991 # print STDERR "\nPARSING DESCRIPTION DATA\n"; | |
1992 | |
1993 my @descriptions = split( $Newline, $desc); | |
1994 my($line); | |
1995 | |
1996 # NOW step through each line parsing out the P/Expect value | |
1997 # All we really need to do is check the first one, if it doesn't | |
1998 # meet the significance requirement, we can skip the report. | |
1999 # BUT: we want to collect data for all hits anyway to get min/max signif. | |
2000 | |
2001 my $my_signif = $self->signif; | |
2002 my $layout_set = $Blast->{'_layout'} || 0; | |
2003 my $layout; | |
2004 my $count = 0; | |
2005 my $sig; | |
2006 | |
2007 desc_loop: | |
2008 foreach $line (@descriptions) { | |
2009 $count++; | |
2010 last desc_loop if $line =~ / NONE |End of List/; | |
2011 next desc_loop if $line =~ /^\s*$/; | |
2012 next desc_loop if $line =~ /^\.\./; | |
2013 | |
2014 ## Checking the significance value (P- or Expect value) of the hit | |
2015 ## in the description line. | |
2016 | |
2017 # These regexps need testing on a variety of reports. | |
2018 if ( $line =~ /\d+\s{1,5}[\de.-]+\s*$/) { | |
2019 $layout = 2; | |
2020 } elsif( $line =~ /\d+\s{1,5}[\de.-]+\s{1,}\d+\s*$/) { | |
2021 $layout = 1; | |
2022 } else { | |
2023 $self->warn("Can't parse significance data in description line $line"); | |
2024 next desc_loop; | |
2025 } | |
2026 not $layout_set and ($self->_layout($layout), $layout_set = 1); | |
2027 | |
2028 $sig = &_parse_signif( $line, $layout ); | |
2029 | |
2030 # print STDERR " Parsed signif ($layout) = $sig\n"; | |
2031 | |
2032 last desc_loop if ($sig > $my_signif and not $Blast->{'_check_all'}); | |
2033 $self->_process_significance($sig, $my_signif); | |
2034 } | |
2035 | |
2036 # printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'}; | |
2037 } | |
2038 | |
2039 sub _process_significance { | |
2040 my($self, $sig, $my_signif) = @_; | |
2041 | |
2042 $self->{'_highestSignif'} = ($sig > $self->{'_highestSignif'}) | |
2043 ? $sig : $self->{'_highestSignif'}; | |
2044 | |
2045 $self->{'_lowestSignif'} = ($sig < $self->{'_lowestSignif'}) | |
2046 ? $sig : $self->{'_lowestSignif'}; | |
2047 | |
2048 # Significance value assessment. | |
2049 $sig <= $my_signif and $self->{'_num_hits_significant'}++; | |
2050 $self->{'_num_hits'}++; | |
2051 | |
2052 $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'}; | |
2053 } | |
2054 | |
2055 =head2 _parse_alignment | |
2056 | |
2057 Usage : n/a; called automatically by the _get_parse_blast_func(). | |
2058 Purpose : Parses a single alignment section of a BLAST report. | |
2059 Argument : String containing the alignment section. | |
2060 Throws : n/a; All errors are trapped while parsing the hit data | |
2061 : and are processed as a group when the report is | |
2062 : completely processed (See _report_errors()). | |
2063 : | |
2064 Comments : Alignment section contains all HSPs for a hit. | |
2065 : Requires Bio::Tools::Blast::Sbjct.pm. | |
2066 : Optionally calls a filter function to screen the hit on arbitrary | |
2067 : criteria. If the filter function returns true for a given hit, | |
2068 : that hit will be skipped. | |
2069 : | |
2070 : If the Blast object was created with -check_all_hits set to true, | |
2071 : all hits will be checked for significance and processed if necessary. | |
2072 : If this field is false, the parsing will stop after the first | |
2073 : non-significant hit. | |
2074 : See parse() for description of parsing parameters. | |
2075 | |
2076 See Also : L<parse()|parse>, L<_get_parse_blast_func()|_get_parse_blast_func>, L<_report_errors()|_report_errors>, B<Bio::Tools::Blast::Sbjct()>, L<Links to related modules> | |
2077 | |
2078 =cut | |
2079 | |
2080 #---------------------- | |
2081 sub _parse_alignment { | |
2082 #---------------------- | |
2083 # This method always needs to check detect if the $data argument | |
2084 # contains the footer of a Blast report, indicating the last chunk | |
2085 # of a single Blast stream. | |
2086 | |
2087 my( $self, $data ) = @_; | |
2088 | |
2089 # printf STDERR "\nPARSING ALIGNMENT DATA for %s $self.\n", $self->name; | |
2090 | |
2091 # NOTE: $self->{'_current_hit'} is an instance variable | |
2092 # The $Blast object will not have this member. | |
2093 | |
2094 # If all of the significant hits have been parsed, | |
2095 # return if we're not checking all or if we don't need to get | |
2096 # the Blast stats (parameters at footer of report). | |
2097 if(defined $self->{'_current_hit'} and | |
2098 defined $self->{'_num_hits_significant'}) { | |
2099 return if $self->{'_current_hit'} >= $self->{'_num_hits_significant'} and | |
2100 not ($Blast->{'_check_all'} or $Blast->{'_get_stats'}); | |
2101 } | |
2102 | |
2103 # Check for the presence of the Blast footer section. | |
2104 # _parse_footer returns the alignment section. | |
2105 $data = $self->_parse_footer($data); | |
2106 | |
2107 # Return if we're only interested in the best hit. | |
2108 # This has to occur after checking for the parameters section | |
2109 # in the footer (since we may still be interested in them). | |
2110 return if $Blast->best and ( defined $self->{'_current_hit'} and $self->{'_current_hit'} >=1); | |
2111 | |
2112 # print "RETURNED FROM _parse_footer (", $self->to_string, ")"; | |
2113 # print "\n --> FOUND PARAMS.\n" if $self->{'_found_params'}; | |
2114 # print "\n --> DID NOT FIND PARAMS.\n" unless $self->{'_found_params'}; | |
2115 | |
2116 require Bio::Tools::Blast::Sbjct; | |
2117 | |
2118 $data =~ s/^\s+|\s+>?$//sg; | |
2119 $data =~ s/$Newline$Newline/$Newline/sog; # remove blank lines. | |
2120 my @data = split($Newline, $data); | |
2121 push @data, 'end'; | |
2122 | |
2123 # print STDERR "\nALIGNMENT DATA:\n$data\n"; | |
2124 | |
2125 my $prog = $self->program; | |
2126 my $check_all = $Blast->{'_check_all'}; | |
2127 my $filt_func = $Blast->{'_filt_func'} || 0; | |
2128 my $signif_fmt = $Blast->{'_signif_fmt'}; | |
2129 my $my_signif = $self->signif; | |
2130 my $err; | |
2131 | |
2132 # Now construct the Sbjct objects from the alignment section | |
2133 | |
2134 # debug(1); | |
2135 | |
2136 $self->{'_current_hit'}++; | |
2137 | |
2138 # If not confirming significance, _parse_descriptions will not have been run, | |
2139 # so we need to count the total number of hits here. | |
2140 if( not $Blast->{'_confirm_significance'}) { | |
2141 $self->{'_num_hits'}++; | |
2142 } | |
2143 | |
2144 if($Blast->{'_no_aligns'}) { | |
2145 # printf STDERR "\nNOT PARSING ALIGNMENT DATA\n"; | |
2146 return; | |
2147 } | |
2148 | |
2149 my $hit; # Must be my'ed within hit_loop. | |
2150 eval { | |
2151 $hit = new Bio::Tools::Blast::Sbjct (-DATA =>\@data, | |
2152 -PARENT =>$self, | |
2153 -NAME =>$self->{'_current_hit'}, | |
2154 -RANK =>$self->{'_current_hit'}, | |
2155 -RANK_BY =>'order', | |
2156 -PROGRAM =>$prog, | |
2157 -SIGNIF_FMT=>$signif_fmt, | |
2158 -OVERLAP =>$Blast->{'_overlap'} || $MAX_HSP_OVERLAP, | |
2159 ); | |
2160 # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>; | |
2161 # The BLAST report may have not had a description section. | |
2162 if(not $self->{'_has_descriptions'}) { | |
2163 $self->_process_significance($hit->signif, $my_signif); | |
2164 } | |
2165 }; | |
2166 | |
2167 if($@) { | |
2168 # Throwing lots of errors can slow down the code substantially. | |
2169 # Error handling code is not that efficient. | |
2170 #print STDERR "\nERROR _parse_alignment: $@\n"; | |
2171 push @{$self->{'_blast_errs'}}, $@; | |
2172 $hit->destroy if ref $hit; | |
2173 undef $hit; | |
2174 } else { | |
2175 # Collect overall signif data if we don't already have it, | |
2176 # (as occurs if no -signif parameter is supplied). | |
2177 my $hit_signif = $hit->signif; | |
2178 | |
2179 if (not $Blast->{'_confirm_significance'} ) { | |
2180 $self->{'_highestSignif'} = ($hit_signif > $self->{'_highestSignif'}) | |
2181 ? $hit_signif : $self->{'_highestSignif'}; | |
2182 | |
2183 $self->{'_lowestSignif'} = ($hit_signif < $self->{'_lowestSignif'}) | |
2184 ? $hit_signif : $self->{'_lowestSignif'}; | |
2185 } | |
2186 | |
2187 # Test significance using custom function (if supplied) | |
2188 if($filt_func) { | |
2189 if(&$filt_func($hit)) { | |
2190 push @{$self->{'_hits'}}, $hit; | |
2191 } else { | |
2192 $hit->destroy; undef $hit; | |
2193 } | |
2194 } elsif($hit_signif <= $my_signif) { | |
2195 push @{$self->{'_hits'}}, $hit; | |
2196 } | |
2197 } | |
2198 | |
2199 } | |
2200 | |
2201 =head2 _parse_footer | |
2202 | |
2203 Usage : n/a; internal function. called by _parse_alignment() | |
2204 Purpose : Extracts statistical and other parameters from the BLAST report. | |
2205 : Sets various key elements such as the program and version, | |
2206 : gapping, and the layout for the report (blast1 or blast2). | |
2207 Argument : Data to be parsed. | |
2208 Returns : String containing an alignment section for processing by | |
2209 : _parse_alignment(). | |
2210 Throws : Exception if cannot find the parameters section of report. | |
2211 : Warning if cannot determine if gapping was used. | |
2212 : Warning if cannot determine the scoring matrix used. | |
2213 Comments : This method must always get called, even if the -STATS | |
2214 : parse() parameter is false. The reason is that the layout | |
2215 : of the report and the presence of gapping must always be set. | |
2216 : The determination whether to set additional stats is made | |
2217 : by methods called by _parse_footer(). | |
2218 | |
2219 See Also : L<parse()|parse>, L<_parse_alignment()|_parse_alignment>, L<_set_database()|_set_database> | |
2220 | |
2221 =cut | |
2222 | |
2223 #--------------------- | |
2224 sub _parse_footer { | |
2225 #--------------------- | |
2226 # Basic strategy: | |
2227 # 1. figure out if we're supposed to get the stats, | |
2228 # 2. figure out if the stats are to be shared. some, not all can be shared | |
2229 # (eg., db info and matrix can be shared, karlin altschul params cannot. | |
2230 # However, this method assumes they are all sharable.) | |
2231 # 3. Parse the stats. | |
2232 # 4. return the block before the parameters section if the supplied data | |
2233 # contains a footer parameters section. | |
2234 | |
2235 my ($self, $data) = @_; | |
2236 my ($client, $last_align, $params); | |
2237 | |
2238 # printf STDERR "\nPARSING PARAMETERS for %s $self.\n", $self->name; | |
2239 | |
2240 # Should the parameters be shared? | |
2241 # If so, set $self to be the static $Blast object and return if | |
2242 # the parameters were already set. | |
2243 # Before returning, we need to extract the last alignment section | |
2244 # from the parameter section, if any. | |
2245 | |
2246 if ($Blast->{'_share'}) { | |
2247 $client = $self; | |
2248 $self = $Blast if $Blast->{'_share'}; | |
2249 } | |
2250 | |
2251 my $get_stats = $Blast->{'_get_stats'}; | |
2252 | |
2253 if( $data =~ /(.+?)${Newline}CPU time: (.*)/so) { | |
2254 # NCBI-Blast2 format (v2.04). | |
2255 ($last_align, $params) = ($1, $2); | |
2256 return $last_align if $client->{'_found_params'}; | |
2257 $self->_set_blast2_stats($params); | |
2258 | |
2259 } elsif( $data =~ /(.+?)${Newline}Parameters:(.*)/so) { | |
2260 # NCBI-Blast1 or WashU-Blast2 format. | |
2261 ($last_align, $params) = ($1, $2); | |
2262 return $last_align if $client->{'_found_params'}; | |
2263 $self->_set_blast1_stats($params); | |
2264 | |
2265 } elsif( $data =~ /(.+?)$Newline\s+Database:(.*)/so) { | |
2266 # Gotta watch out for confusion with the Database: line in the header | |
2267 # which will be present in the last hit of an internal Blast report | |
2268 # in a multi-report stream. | |
2269 | |
2270 # NCBI-Blast2 format (v2.05). | |
2271 ($last_align, $params) = ($1, $2); | |
2272 return $last_align if $client->{'_found_params'}; | |
2273 $self->_set_blast2_stats($params); | |
2274 | |
2275 } elsif( $data =~ /(.+?)$Newline\s*Searching/so) { | |
2276 # trying to detect a Searching at the end of a PSI-blast round. | |
2277 # Gotta watch out for confusion with the Searching line in the header | |
2278 # which will be present in the last hit of an internal Blast report | |
2279 # in a multi-report, non-PSI-blast stream. | |
2280 | |
2281 # PSI-Blast format (v2.08). | |
2282 ($last_align) = ($1); | |
2283 return $last_align; # if $client->{'_found_params'}; | |
2284 } | |
2285 | |
2286 # If parameter section was found, set a boolean, | |
2287 # otherwise return original data. | |
2288 | |
2289 if (defined($params)) { | |
2290 $client->{'_found_params'} = 1; | |
2291 } else { | |
2292 return $data; | |
2293 } | |
2294 | |
2295 $self->_set_database($params) if $get_stats; | |
2296 | |
2297 # The {'_gapped'} member should be set in the _set_blast?_stats() call. | |
2298 # This is a last minute attempt to deduce it. | |
2299 | |
2300 if(!defined($self->{'_gapped'})) { | |
2301 if($self->program_version() =~ /^1/) { | |
2302 $self->{'_gapped'} = 0; | |
2303 } else { | |
2304 if($self->strict > 0) { | |
2305 $self->warn("Can't determine if gapping was used. Assuming gapped."); | |
2306 } | |
2307 $self->{'_gapped'} = 1; | |
2308 } | |
2309 } | |
2310 | |
2311 return $last_align; | |
2312 } | |
2313 | |
2314 =head2 _set_blast2_stats | |
2315 | |
2316 Usage : n/a; internal function called by _parse_footer() | |
2317 Purpose : Extracts statistical and other parameters from BLAST2 report footer. | |
2318 : Stats collected: database release, gapping, | |
2319 : posted date, matrix used, filter used, Karlin-Altschul parameters, | |
2320 : E, S, T, X, W. | |
2321 Throws : Exception if cannot get "Parameters" section of Blast report. | |
2322 | |
2323 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> | |
2324 | |
2325 =cut | |
2326 | |
2327 #---------------------' | |
2328 sub _set_blast2_stats { | |
2329 #--------------------- | |
2330 my ($self, $data) = (@_); | |
2331 | |
2332 if($data =~ /$Newline\s*Gapped/so) { | |
2333 $self->{'_gapped'} = 1; | |
2334 } else { | |
2335 $self->{'_gapped'} = 0; | |
2336 } | |
2337 | |
2338 # Other stats are not always essential. | |
2339 return unless $Blast->{'_get_stats'}; | |
2340 | |
2341 # Blast2 Doesn't report what filter was used in the parameters section. | |
2342 # It just gives a warning that *some* filter was used in the header. | |
2343 # You just have to know the defaults (currently: protein = SEG, nucl = DUST). | |
2344 if($data =~ /\bfiltered\b/si) { | |
2345 $self->{'_filter'} = 'DEFAULT FILTER'; | |
2346 } else { | |
2347 $self->{'_filter'} = 'NONE'; | |
2348 } | |
2349 | |
2350 if($data =~ /Gapped$Newline\s*Lambda +K +H$Newline +(.+?)$Newline/so) { | |
2351 my ($l, $k, $h) = split(/\s+/, $1); | |
2352 $self->{'_lambda'} = $l || 'UNKNOWN'; | |
2353 $self->{'_k'} = $k || 'UNKNOWN'; | |
2354 $self->{'_h'} = $h || 'UNKNOWN'; | |
2355 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) { | |
2356 my ($l, $k, $h) = split(/\s+/, $1); | |
2357 $self->{'_lambda'} = $l || 'UNKNOWN'; | |
2358 $self->{'_k'} = $k || 'UNKNOWN'; | |
2359 $self->{'_h'} = $h || 'UNKNOWN'; | |
2360 } | |
2361 | |
2362 if($data =~ /$Newline\s*Matrix: (.+?)$Newline/so) { | |
2363 $self->{'_matrix'} = $1; | |
2364 } else { | |
2365 $self->{'_matrix'} = $DEFAULT_MATRIX.'?'; | |
2366 if($self->strict > 0) { | |
2367 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX."); | |
2368 } | |
2369 } | |
2370 | |
2371 if($data =~ /$Newline\s*Gap Penalties: Existence: +(\d+), +Extension: (\d+)$Newline/so) { | |
2372 $self->{'_gapCreation'} = $1; | |
2373 $self->{'_gapExtension'} = $2; | |
2374 } | |
2375 if($data =~ /sequences better than (\d+):/s) { | |
2376 $self->{'_expect'} = $1; | |
2377 } | |
2378 | |
2379 if($data =~ /$Newline\s*T: (\d+)/o) { $self->{'_word_size'} = $1; } | |
2380 if($data =~ /$Newline\s*A: (\d+)/o) { $self->{'_a'} = $1; } | |
2381 if($data =~ /$Newline\s*S1: (\d+)/o) { $self->{'_s'} = $1; } | |
2382 if($data =~ /$Newline\s*S2: (\d+)/o) { $self->{'_s'} .= ", $1"; } | |
2383 if($data =~ /$Newline\s*X1: (\d+)/o) { $self->{'_x1'} = $1; } | |
2384 if($data =~ /$Newline\s*X2: (\d+)/o) { $self->{'_x2'} = $1; } | |
2385 } | |
2386 | |
2387 =head2 _set_blast1_stats | |
2388 | |
2389 Usage : n/a; internal function called by _parse_footer() | |
2390 Purpose : Extracts statistical and other parameters from BLAST 1.x style eports. | |
2391 : Handles NCBI Blast1 and WashU-Blast2 formats. | |
2392 : Stats collected: database release, gapping, | |
2393 : posted date, matrix used, filter used, Karlin-Altschul parameters, | |
2394 : E, S, T, X, W. | |
2395 | |
2396 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> | |
2397 | |
2398 =cut | |
2399 | |
2400 #---------------------- | |
2401 sub _set_blast1_stats { | |
2402 #---------------------- | |
2403 my ($self, $data) = (@_); | |
2404 | |
2405 if(!$self->{'_gapped'} and $self->program_version() =~ /^2[\w\-\.]+WashU/) { | |
2406 $self->_set_gapping_wu($data); | |
2407 } else { | |
2408 $self->{'_gapped'} = 0; | |
2409 } | |
2410 | |
2411 # Other stats are not always essential. | |
2412 return unless $Blast->{'_get_stats'}; | |
2413 | |
2414 if($data =~ /filter=(.+?)$Newline/so) { | |
2415 $self->{'_filter'} = $1; | |
2416 } elsif($data =~ /filter$Newline +(.+?)$Newline/so) { | |
2417 $self->{'_filter'} = $1; | |
2418 } else { | |
2419 $self->{'_filter'} = 'NONE'; | |
2420 } | |
2421 | |
2422 if($data =~ /$Newline\s*E=(\d+)$Newline/so) { $self->{'_expect'} = $1; } | |
2423 | |
2424 if($data =~ /$Newline\s*M=(\w+)$Newline/so) { $self->{'_matrix'} = $1; } | |
2425 | |
2426 if($data =~ /\s*Frame MatID Matrix name .+?$Newline +(.+?)$Newline/so) { | |
2427 ## WU-Blast2. | |
2428 my ($fr, $mid, $mat, $lu, $ku, $hu, $lc, $kc, $hc) = split(/\s+/,$1); | |
2429 $self->{'_matrix'} = $mat || 'UNKNOWN'; | |
2430 $self->{'_lambda'} = $lu || 'UNKNOWN'; | |
2431 $self->{'_k'} = $ku || 'UNKNOWN'; | |
2432 $self->{'_h'} = $hu || 'UNKNOWN'; | |
2433 | |
2434 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) { | |
2435 ## NCBI-Blast1. | |
2436 my ($l, $k, $h) = split(/\s+/, $1); | |
2437 $self->{'_lambda'} = $l || 'UNKNOWN'; | |
2438 $self->{'_k'} = $k || 'UNKNOWN'; | |
2439 $self->{'_h'} = $h || 'UNKNOWN'; | |
2440 } | |
2441 | |
2442 if($data =~ /E +S +W +T +X.+?$Newline +(.+?)$Newline/so) { | |
2443 # WashU-Blast2 | |
2444 my ($fr, $mid, $len, $elen, $e, $s, $w, $t, $x, $e2, $s2) = split(/\s+/,$1); | |
2445 $self->{'_expect'} ||= $e || 'UNKNOWN'; | |
2446 $self->{'_s'} = $s || 'UNKNOWN'; | |
2447 $self->{'_word_size'} = $w || 'UNKNOWN'; | |
2448 $self->{'_t'} = $t || 'UNKNOWN'; | |
2449 $self->{'_x'} = $x || 'UNKNOWN'; | |
2450 | |
2451 } elsif($data =~ /E +S +T1 +T2 +X1 +X2 +W +Gap$Newline +(.+?)$Newline/so) { | |
2452 ## NCBI-Blast1. | |
2453 my ($e, $s, $t1, $t2, $x1, $x2, $w, $gap) = split(/\s+/,$1); | |
2454 $self->{'_expect'} ||= $e || 'UNKNOWN'; | |
2455 $self->{'_s'} = $s || 'UNKNOWN'; | |
2456 $self->{'_word_size'} = $w || 'UNKNOWN'; | |
2457 $self->{'_t1'} = $t1 || 'UNKNOWN'; | |
2458 $self->{'_t2'} = $t2 || 'UNKNOWN'; | |
2459 $self->{'_x1'} = $x1 || 'UNKNOWN'; | |
2460 $self->{'_x2'} = $x2 || 'UNKNOWN'; | |
2461 $self->{'_gap'} = $gap || 'UNKNOWN'; | |
2462 } | |
2463 | |
2464 if(!$self->{'_matrix'}) { | |
2465 $self->{'_matrix'} = $DEFAULT_MATRIX.'?'; | |
2466 if($self->strict > 0) { | |
2467 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX."); | |
2468 } | |
2469 } | |
2470 } | |
2471 | |
2472 =head2 _set_gapping_wu | |
2473 | |
2474 Usage : n/a; internal function called by _set_blast1_stats() | |
2475 Purpose : Determine if gapping_wu was on for WashU Blast reports. | |
2476 Comments : In earlier versions, gapping was always specified | |
2477 : but in the current version (2.0a19MP), gapping is on by default | |
2478 : and there is no positive "gapping" indicator in the Parameters | |
2479 : section. | |
2480 | |
2481 See Also : L<_set_blast1_stats()|_set_blast1_stats> | |
2482 | |
2483 =cut | |
2484 | |
2485 #-------------------- | |
2486 sub _set_gapping_wu { | |
2487 #-------------------- | |
2488 my ($self, $data) = @_; | |
2489 | |
2490 if($data =~ /gaps?$Newline/so) { | |
2491 $self->{'_gapped'} = ($data =~ /nogaps?$Newline/so) ? 0 : 1; | |
2492 } else { | |
2493 $self->{'_gapped'} = 1; | |
2494 } | |
2495 } | |
2496 | |
2497 =head2 _set_date | |
2498 | |
2499 Usage : n/a; internal function called by _parse_footer() | |
2500 Purpose : Determine the date on which the Blast analysis was performed. | |
2501 Comments : Date information is not consistently added to Blast output. | |
2502 : Uses superclass method set_date() to set date from the file, | |
2503 : (if any). | |
2504 | |
2505 See Also : L<_parse_footer()|_parse_footer>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules> | |
2506 | |
2507 =cut | |
2508 | |
2509 #-------------- | |
2510 sub _set_date { | |
2511 #-------------- | |
2512 my $self = shift; | |
2513 my $data = shift; | |
2514 | |
2515 ### Network BLAST reports from NCBI are time stamped as follows: | |
2516 #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77 | |
2517 if($data =~ /Start:\s+(.+?)\s+End:/s) { | |
2518 ## Calling superclass method to set the date. | |
2519 ## If we can't get date from the report, file date is obtained. | |
2520 $self->set_date($1); | |
2521 } elsif($data =~ /Date:\s+(.*?)$Newline/so) { | |
2522 ## E-mailed reports have a Date: field | |
2523 $self->set_date($1); | |
2524 } elsif( $data =~ /done\s+at (.+?)$Newline/so ) { | |
2525 $self->set_date($1); | |
2526 } elsif( $data =~ /$Newline([\w:, ]+), Up \d+/so ) { | |
2527 $self->set_date($1); | |
2528 } else { | |
2529 ## Otherwise, let superclass attempt to get the file creation date. | |
2530 $self->set_date() if $self->file; | |
2531 } | |
2532 } | |
2533 | |
2534 =head2 _set_length | |
2535 | |
2536 Usage : n/a; called automatically during Blast report parsing. | |
2537 Purpose : Sets the length of the query sequence (extracted from report). | |
2538 Returns : integer (length of the query sequence) | |
2539 Throws : Exception if cannot determine the query sequence length from | |
2540 : the BLAST report. | |
2541 : Exception if the length is below the min_length cutoff (if any). | |
2542 Comments : The logic here is a bit different from the other _set_XXX() | |
2543 : methods since the significance of the BLAST report is assessed | |
2544 : if MIN_LENGTH is set. | |
2545 | |
2546 See Also : B<Bio::Tools::SeqAnal::length()>, L<Links to related modules> | |
2547 | |
2548 =cut | |
2549 | |
2550 #--------------- | |
2551 sub _set_length { | |
2552 #--------------- | |
2553 my ($self, $data) = @_; | |
2554 | |
2555 my ($length); | |
2556 if( $data =~ m/$Newline\s+\(([\d|,]+) letters[\);]/so ) { | |
2557 $length = $1; | |
2558 $length =~ s/,//g; | |
2559 # printf "Length = $length in BLAST for %s$Newline",$self->name; <STDIN>; | |
2560 } else { | |
2561 $self->throw("Can't determine sequence length from BLAST report."); | |
2562 } | |
2563 | |
2564 my($sig_len); | |
2565 if(defined($Blast->{'_min_length'})) { | |
2566 local $^W = 0; | |
2567 if($length < $Blast->{'_min_len'}) { | |
2568 $self->throw("Query sequence too short for ${\$self->name} ($length)", | |
2569 "Minimum length is $Blast->{'_min_len'}"); | |
2570 } | |
2571 } | |
2572 | |
2573 $self->length($length); # defined in superclass. | |
2574 } | |
2575 | |
2576 =head2 _set_database | |
2577 | |
2578 Usage : n/a; called automatically during Blast report parsing. | |
2579 Purpose : Sets the name of the database used by the BLAST analysis. | |
2580 : Extracted from raw BLAST report. | |
2581 Throws : Exception if the name of the database cannot be determined. | |
2582 Comments : The database name is used by methods or related objects | |
2583 : for database-specific parsing. | |
2584 | |
2585 See Also : L<parse()|parse>, B<Bio::Tools::SeqAnal::database()>,B<Bio::Tools::SeqAnal::_set_db_stats()>,L<Links to related modules> | |
2586 | |
2587 =cut | |
2588 | |
2589 #------------------ | |
2590 sub _set_database { | |
2591 #------------------ | |
2592 # This now only sets data base information extracted from the report footer. | |
2593 | |
2594 my ($self, $data) = @_; | |
2595 | |
2596 my ($name, $date, $lets, $seqs); | |
2597 | |
2598 my $strict = $self->strict > 0; | |
2599 | |
2600 # This is fail-safe since DB name usually gets set in _parse_header() | |
2601 # In some reports, the database is only listed at bottom (NCBI 2.0.8). | |
2602 if($data =~ m/Database: +(.+?)$Newline/so ) { | |
2603 $name = $1; | |
2604 } elsif(not $self->database) { | |
2605 $self->warn("Can't determine database name from BLAST report."); | |
2606 } | |
2607 | |
2608 if($data =~ m/Posted date: +(.+?)$Newline/so ) { | |
2609 $date = $1; | |
2610 } elsif($data =~ m/Release date: +(.+?)$Newline/so ) { | |
2611 $date = $1; | |
2612 } elsif($strict) { | |
2613 $self->warn("Can't determine database release date."); | |
2614 } | |
2615 | |
2616 if($data =~ m/letters in database: +([\d,]+)/si || | |
2617 $data =~ m/length of database: +([\d,]+)/si ) { | |
2618 $lets = $1; | |
2619 } elsif($strict) { | |
2620 $self->warn("Can't determine number of letters in database.\n$data\n"); | |
2621 } | |
2622 | |
2623 if($data =~ m/sequences in database: +([\d,]+)/si || | |
2624 $data =~ m/number of sequences: +([\d,]+)/si ) { | |
2625 $seqs = $1; | |
2626 } elsif($strict) { | |
2627 $self->warn("Can't determine number of sequences in database.\n$data\n"); | |
2628 } | |
2629 | |
2630 $self->_set_db_stats( -NAME => $name, | |
2631 -RELEASE => $date || '', | |
2632 -LETTERS => $lets || '', | |
2633 -SEQS => $seqs || '' | |
2634 ); | |
2635 } | |
2636 | |
2637 =head2 _set_query | |
2638 | |
2639 Usage : n/a; called automatically during Blast report parsing. | |
2640 Purpose : Set the name of the query and the query description. | |
2641 : Extracted from the raw BLAST report. | |
2642 Returns : String containing name of query extracted from report. | |
2643 Throws : Warning if the name of the query cannont be obtained. | |
2644 | |
2645 See Also : B<Bio::Tools::SeqAnal::query_desc()>,L<Links to related modules> | |
2646 | |
2647 =cut | |
2648 | |
2649 #--------------- | |
2650 sub _set_query { | |
2651 #--------------- | |
2652 my $self = shift; | |
2653 my $data = shift; | |
2654 | |
2655 if($data =~ m/${Newline}Query= *(.+?)$Newline/so ) { | |
2656 my $info = $1; | |
2657 $info =~ s/TITLE //; | |
2658 # Split the query line into two parts. | |
2659 # Using \s instead of ' ' | |
2660 $info =~ /(\S+?)\s(.*)/; | |
2661 $self->query_desc($2 || ''); | |
2662 # set name of Blast object and return. | |
2663 $self->name($1 || 'UNKNOWN'); | |
2664 } else { | |
2665 $self->warn("Can't determine query sequence name from BLAST report."); | |
2666 } | |
2667 # print STDERR "$Newline NAME = ${\$self->name}$Newline"; | |
2668 } | |
2669 | |
2670 =head2 _parse_signif | |
2671 | |
2672 Usage : &_parse_signif(string, layout, gapped); | |
2673 : This is a class function. | |
2674 Purpose : Extracts the P- or Expect value from a single line of a BLAST description section. | |
2675 Example : &_parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1); | |
2676 : &_parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2); | |
2677 Argument : string = line from BLAST description section | |
2678 : layout = integer (1 or 2) | |
2679 : gapped = boolean (true if gapped Blast). | |
2680 Returns : Float (0.001 or 1e-03) | |
2681 Status : Static | |
2682 | |
2683 =cut | |
2684 | |
2685 #------------------ | |
2686 sub _parse_signif { | |
2687 #------------------ | |
2688 my ($line, $layout, $gapped) = @_; | |
2689 | |
2690 local $_ = $line; | |
2691 my @linedat = split(); | |
2692 | |
2693 # When processing both Blast1 and Blast2 reports | |
2694 # in the same run, offset needs to be configured each time. | |
2695 | |
2696 my $offset = 0; | |
2697 $offset = 1 if $layout == 1 or not $gapped; | |
2698 | |
2699 my $signif = $linedat[ $#linedat - $offset ]; | |
2700 | |
2701 # fail-safe check | |
2702 if(not $signif =~ /[.-]/) { | |
2703 $offset = ($offset == 0 ? 1 : 0); | |
2704 $signif = $linedat[ $#linedat - $offset ]; | |
2705 } | |
2706 | |
2707 $signif = "1$signif" if $signif =~ /^e/i; | |
2708 return $signif; | |
2709 } | |
2710 | |
2711 ## | |
2712 ## BEGIN ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT. | |
2713 ## | |
2714 | |
2715 sub program { | |
2716 ## Overridden method to incorporate the BLAST object. | |
2717 my $self = shift; | |
2718 return $self->SUPER::program(@_) if @_; # set | |
2719 $self->SUPER::program || $Blast->SUPER::program; # get | |
2720 } | |
2721 | |
2722 sub program_version { | |
2723 ## Overridden method to incorporate the BLAST object. | |
2724 my $self = shift; | |
2725 return $self->SUPER::program_version(@_) if @_; # set | |
2726 $self->SUPER::program_version || $Blast->SUPER::program_version; # get | |
2727 } | |
2728 | |
2729 sub database { | |
2730 ## Overridden method to incorporate the BLAST object. | |
2731 my $self = shift; | |
2732 return $self->SUPER::database(@_) if @_; # set | |
2733 $self->SUPER::database || $Blast->SUPER::database; # get | |
2734 } | |
2735 | |
2736 sub database_letters { | |
2737 ## Overridden method to incorporate the BLAST object. | |
2738 my $self = shift; | |
2739 return $self->SUPER::database_letters(@_) if @_; # set | |
2740 $self->SUPER::database_letters || $Blast->SUPER::database_letters; # get | |
2741 } | |
2742 | |
2743 sub database_release { | |
2744 ## Overridden method to incorporate the BLAST object. | |
2745 my $self = shift; | |
2746 return $self->SUPER::database_release(@_) if @_; # set | |
2747 $self->SUPER::database_release || $Blast->SUPER::database_release; # get | |
2748 } | |
2749 | |
2750 sub database_seqs { | |
2751 ## Overridden method to incorporate the BLAST object. | |
2752 my $self = shift; | |
2753 return $self->SUPER::database_seqs(@_) if @_; # set | |
2754 $self->SUPER::database_seqs || $Blast->SUPER::database_seqs; # get | |
2755 } | |
2756 | |
2757 sub date { | |
2758 ## Overridden method to incorporate the BLAST object. | |
2759 my $self = shift; | |
2760 return $self->SUPER::date(@_) if @_; # set | |
2761 $self->SUPER::date || $Blast->SUPER::date; # get | |
2762 } | |
2763 | |
2764 sub best { | |
2765 ## Overridden method to incorporate the BLAST object. | |
2766 my $self = shift; | |
2767 return $Blast->SUPER::best(@_) if @_; # set | |
2768 $Blast->SUPER::best; # get | |
2769 } | |
2770 | |
2771 =head2 signif | |
2772 | |
2773 Usage : $blast->signif(); | |
2774 Purpose : Gets the P or Expect value used as significance screening cutoff. | |
2775 Returns : Scientific notation number with this format: 1.0e-05. | |
2776 Argument : n/a | |
2777 Comments : Screening of significant hits uses the data provided on the | |
2778 : description line. For Blast1 and WU-Blast2, this data is P-value. | |
2779 : for Blast2 it is an Expect value. | |
2780 : | |
2781 : Obtains info from the static $Blast object if it has not been set | |
2782 : for the current object. | |
2783 | |
2784 See Also : L<_set_signif()|_set_signif> | |
2785 | |
2786 =cut | |
2787 | |
2788 #----------- | |
2789 sub signif { | |
2790 #----------- | |
2791 my $self = shift; | |
2792 my $sig = $self->{'_significance'} || $Blast->{'_significance'}; | |
2793 sprintf "%.1e", $sig; | |
2794 } | |
2795 | |
2796 =head2 is_signif | |
2797 | |
2798 Usage : $blast->is_signif(); | |
2799 Purpose : Determine if the BLAST report contains significant hits. | |
2800 Returns : Boolean | |
2801 Argument : n/a | |
2802 Comments : BLAST reports without significant hits but with defined | |
2803 : significance criteria will throw exceptions during construction. | |
2804 : This obviates the need to check significant() for | |
2805 : such objects. | |
2806 | |
2807 See Also : L<_set_signif()|_set_signif> | |
2808 | |
2809 =cut | |
2810 | |
2811 #------------ | |
2812 sub is_signif { my $self = shift; return $self->{'_is_significant'}; } | |
2813 #------------ | |
2814 | |
2815 # is_signif() doesn't incorporate the static $Blast object but is included | |
2816 # here to be with the other 'signif' methods. | |
2817 | |
2818 =head2 signif_fmt | |
2819 | |
2820 Usage : $blast->signif_fmt( [FMT] ); | |
2821 Purpose : Allows retrieval of the P/Expect exponent values only | |
2822 : or as a two-element list (mantissa, exponent). | |
2823 Usage : $blast_obj->signif_fmt('exp'); | |
2824 : $blast_obj->signif_fmt('parts'); | |
2825 Returns : String or '' if not set. | |
2826 Argument : String, FMT = 'exp' (return the exponent only) | |
2827 : = 'parts'(return exponent + mantissa in 2-elem list) | |
2828 : = undefined (return the raw value) | |
2829 Comments : P/Expect values are still stored internally as the full, | |
2830 : scientific notation value. | |
2831 : This method uses the static $Blast object since this issue | |
2832 : will pertain to all Blast reports within a given set. | |
2833 : This setting is propagated to Bio::Tools::Blast::Sbjct.pm. | |
2834 | |
2835 =cut | |
2836 | |
2837 #------------- | |
2838 sub signif_fmt { | |
2839 #------------- | |
2840 my $self = shift; | |
2841 if(@_) { $Blast->{'_signif_fmt'} = shift; } | |
2842 $Blast->{'_signif_fmt'} || ''; | |
2843 } | |
2844 | |
2845 =head2 min_length | |
2846 | |
2847 Usage : $blast->min_length(); | |
2848 Purpose : Gets the query sequence length used as significance screening criteria. | |
2849 Returns : Integer | |
2850 Argument : n/a | |
2851 Comments : Obtains info from the static $Blast object if it has not been set | |
2852 : for the current object. | |
2853 | |
2854 See Also : L<_set_signif()|_set_signif>, L<signif()|signif> | |
2855 | |
2856 =cut | |
2857 | |
2858 #-------------- | |
2859 sub min_length { | |
2860 #-------------- | |
2861 my $self = shift; | |
2862 $self->{'_min_length'} || $Blast->{'_min_length'}; | |
2863 } | |
2864 | |
2865 =head2 gapped | |
2866 | |
2867 Usage : $blast->gapped(); | |
2868 Purpose : Set/Get boolean indicator for gapped BLAST. | |
2869 Returns : Boolean | |
2870 Argument : n/a | |
2871 Comments : Obtains info from the static $Blast object if it has not been set | |
2872 : for the current object. | |
2873 | |
2874 =cut | |
2875 | |
2876 #----------- | |
2877 sub gapped { | |
2878 #----------- | |
2879 my $self = shift; | |
2880 if(@_) { $self->{'_gapped'} = shift; } | |
2881 $self->{'_gapped'} || $Blast->{'_gapped'}; | |
2882 } | |
2883 | |
2884 =head2 _get_stats | |
2885 | |
2886 Usage : n/a; internal method. | |
2887 Purpose : Set/Get indicator for collecting full statistics from report. | |
2888 Returns : Boolean (0 | 1) | |
2889 Comments : Obtains info from the static $Blast object which gets set | |
2890 : by _init_parse_params(). | |
2891 | |
2892 =cut | |
2893 | |
2894 #--------------- | |
2895 sub _get_stats { | |
2896 #--------------- | |
2897 my $self = shift; | |
2898 $Blast->{'_get_stats'}; | |
2899 } | |
2900 | |
2901 =head2 _layout | |
2902 | |
2903 Usage : n/a; internal method. | |
2904 Purpose : Set/Get indicator for the layout of the report. | |
2905 Returns : Integer (1 | 2) | |
2906 : Defaults to 2 if not set. | |
2907 Comments : Blast1 and WashU-Blast2 have a layout = 1. | |
2908 : This is intended for internal use by this and closely | |
2909 : allied modules like Sbjct.pm and HSP.pm. | |
2910 : | |
2911 : Obtains info from the static $Blast object if it has not been set | |
2912 : for the current object. | |
2913 | |
2914 =cut | |
2915 | |
2916 #------------ | |
2917 sub _layout { | |
2918 #------------ | |
2919 my $self = shift; | |
2920 if(@_) { | |
2921 # Optimization if we know all reports share the same stats. | |
2922 if($Blast->{'_share'}) { | |
2923 $Blast->{'_layout'} = shift; | |
2924 } else { | |
2925 $self->{'_layout'} = shift; | |
2926 } | |
2927 } | |
2928 $self->{'_layout'} || $Blast->{'_layout'} || 2; | |
2929 } | |
2930 | |
2931 ## | |
2932 ## END ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT. | |
2933 ## | |
2934 | |
2935 =head2 hits | |
2936 | |
2937 Usage : $blast->hits(); | |
2938 Purpose : Get a list containing all BLAST hit (Sbjct) objects. | |
2939 : Get the numbers of significant hits. | |
2940 Examples : @hits = $blast->hits(); | |
2941 : $num_signif = $blast->hits(); | |
2942 Returns : List context : list of Bio::Tools::Blast::Sbjct.pm objects | |
2943 : or an empty list if there are no hits. | |
2944 : Scalar context: integer (number of significant hits) | |
2945 : or zero if there are no hits. | |
2946 : (Equivalent to num_hits()). | |
2947 Argument : n/a. Relies on wantarray. | |
2948 Throws : n/a. | |
2949 : Not throwing exception because the absence of hits may have | |
2950 : resulted from stringent significance criteria, not a failure | |
2951 : set the hits. | |
2952 | |
2953 See Also : L<hit()|hit>, L<num_hits()|num_hits>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif> | |
2954 | |
2955 =cut | |
2956 | |
2957 #---------- | |
2958 sub hits { | |
2959 #---------- | |
2960 my $self = shift; | |
2961 | |
2962 if(wantarray) { | |
2963 my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : (); | |
2964 return @ary; | |
2965 } else { | |
2966 return $self->num_hits(); | |
2967 } | |
2968 | |
2969 # my $num = ref($self->{'_hits'}) ? scalar(@{$self->{'_hits'}}) : 0; | |
2970 # my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : (); | |
2971 # | |
2972 # return wantarray | |
2973 # # returning list containing all hits or empty list. | |
2974 # ? $self->{'_is_significant'} ? @ary : () | |
2975 # # returning number of hits or 0. | |
2976 # : $self->{'_is_significant'} ? $num : 0; | |
2977 } | |
2978 | |
2979 =head2 hit | |
2980 | |
2981 Example : $blast_obj->hit( [class] ) | |
2982 Purpose : Get a specific hit object. | |
2983 : Provides some syntactic sugar for the hits() method. | |
2984 Usage : $hitObj = $blast->hit(); | |
2985 : $hitObj = $blast->hit('best'); | |
2986 : $hitObj = $blast->hit('worst'); | |
2987 : $hitObj = $blast->hit( $name ); | |
2988 Returns : Object reference for a Bio::Tools::Blast::Sbjct.pm object. | |
2989 : undef if there are no hit (Sbjct) objects defined. | |
2990 Argument : Class (or no argument). | |
2991 : No argument (default) = highest scoring hit (same as 'best'). | |
2992 : 'best' or 'first' = highest scoring hit. | |
2993 : 'worst' or 'last' = lowest scoring hit. | |
2994 : $name = retrieve a hit by seq id (case-insensitive). | |
2995 Throws : Exception if the Blast object has no significant hits. | |
2996 : Exception if a hit cannot be found when supplying a specific | |
2997 : hit sequence identifier as an argument. | |
2998 Comments : 'best' = lowest significance value (P or Expect) among significant hits. | |
2999 : 'worst' = highest sigificance value (P or Expect) among significant hits. | |
3000 | |
3001 See Also : L<hits()|hits>, L<num_hits()|num_hits>, L<is_signif()|is_signif> | |
3002 | |
3003 =cut | |
3004 | |
3005 #--------- | |
3006 sub hit { | |
3007 #--------- | |
3008 my( $self, $option) = @_; | |
3009 $option ||= 'best'; | |
3010 | |
3011 if($Blast->{'_no_aligns'} || ! ref($self->{'_hits'})) { | |
3012 return undef; | |
3013 } | |
3014 | |
3015 $self->{'_is_significant'} or | |
3016 $self->throw("There were no significant hits.", | |
3017 "Use num_hits(), hits(), is_signif() to check."); | |
3018 | |
3019 my @hits = @{$self->{'_hits'}}; | |
3020 | |
3021 return $hits[0] if $option =~ /^(best|first|1)$/i; | |
3022 return $hits[$#hits] if $option =~ /^(worst|last)$/i; | |
3023 | |
3024 # Get hit by name. | |
3025 foreach ( @hits ) { | |
3026 return $_ if $_->name() =~ /$option/i; | |
3027 } | |
3028 | |
3029 $self->throw("Can't get hit for: $option"); | |
3030 } | |
3031 | |
3032 =head2 num_hits | |
3033 | |
3034 Usage : $blast->num_hits( ['total'] ); | |
3035 Purpose : Get number of significant hits or number of total hits. | |
3036 Examples : $num_signif = $blast-num_hits; | |
3037 : $num_total = $blast->num_hits('total'); | |
3038 Returns : Integer | |
3039 Argument : String = 'total' (or no argument). | |
3040 : No argument (Default) = return number of significant hits. | |
3041 : 'total' = number of total hits. | |
3042 Throws : n/a. | |
3043 : Not throwing exception because the absence of hits may have | |
3044 : resulted from stringent significance criteria, not a failure | |
3045 : set the hits. | |
3046 Comments : A significant hit is defined as a hit with an expect value | |
3047 : (or P value for WU-Blast) at or below the -signif parameter | |
3048 : used when parsing the report. Additionally, if a filter function | |
3049 : was supplied, the significant hit must also pass that | |
3050 : criteria. | |
3051 | |
3052 See Also : L<hits()|hits>, L<hit()|hit>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif>, L<parse()|parse> | |
3053 | |
3054 =cut | |
3055 | |
3056 #------------- | |
3057 sub num_hits { | |
3058 #------------- | |
3059 my( $self, $option) = @_; | |
3060 $option ||= ''; | |
3061 | |
3062 $option =~ /total/i and return $self->{'_num_hits'} || 0; | |
3063 | |
3064 # Default: returning number of significant hits. | |
3065 # return $self->{'_num_hits_significant'} || 0; | |
3066 # return 0 if not ref $self->{'_hits'}; | |
3067 | |
3068 if(ref $self->{'_hits'}) { | |
3069 return scalar(@{$self->{'_hits'}}); | |
3070 } else { | |
3071 return $self->{'_num_hits_significant'} || 0; | |
3072 } | |
3073 } | |
3074 | |
3075 =head2 lowest_p | |
3076 | |
3077 Usage : $blast->lowest_p() | |
3078 Purpose : Get the lowest P-value among all hits in a BLAST report. | |
3079 : Syntactic sugar for $blast->hit('best')->p(). | |
3080 Returns : Float or scientific notation number. | |
3081 : Returns -1.0 if lowest_p has not been set. | |
3082 Argument : n/a. | |
3083 Throws : Exception if the Blast report does not report P-values | |
3084 : (as is the case for NCBI Blast2). | |
3085 Comments : A value is returned regardless of whether or not there were | |
3086 : significant hits ($DEFAULT_SIGNIF, currently 999). | |
3087 | |
3088 See Also : L<lowest_expect()|lowest_expect>, L<lowest_signif()|lowest_signif>, L<highest_p()|highest_p>, L<signif_fmt()|signif_fmt> | |
3089 | |
3090 =cut | |
3091 | |
3092 #------------ | |
3093 sub lowest_p { | |
3094 #------------ | |
3095 my $self = shift; | |
3096 | |
3097 # Layout 2 = NCBI Blast 2.x does not report P-values. | |
3098 $self->_layout == 2 and | |
3099 $self->throw("Can't get P-value with BLAST2.", | |
3100 "Use lowest_signif() or lowest_expect()"); | |
3101 | |
3102 return $self->{'_lowestSignif'} || -1.0; | |
3103 } | |
3104 | |
3105 =head2 lowest_expect | |
3106 | |
3107 Usage : $blast->lowest_expect() | |
3108 Purpose : Get the lowest Expect value among all hits in a BLAST report. | |
3109 : Syntactic sugar for $blast->hit('best')->expect() | |
3110 Returns : Float or scientific notation number. | |
3111 : Returns -1.0 if lowest_expect has not been set. | |
3112 Argument : n/a. | |
3113 Throws : Exception if there were no significant hits and the report | |
3114 : does not have Expect values on the description lines | |
3115 : (i.e., Blast1, WashU-Blast2). | |
3116 | |
3117 See Also : L<lowest_p()|lowest_p>, L<lowest_signif()|lowest_signif>, L<highest_expect()|highest_expect>, L<signif_fmt()|signif_fmt> | |
3118 | |
3119 =cut | |
3120 | |
3121 #------------------ | |
3122 sub lowest_expect { | |
3123 #------------------ | |
3124 my $self = shift; | |
3125 | |
3126 if ($self->_layout == 2) { | |
3127 return $self->{'_lowestSignif'} || -1.0; | |
3128 } | |
3129 | |
3130 if($self->{'_is_significant'}) { | |
3131 my $bestHit = $self->{'_hits'}->[0]; | |
3132 return $bestHit->expect(); | |
3133 } else { | |
3134 $self->throw("Can't get lowest expect value: no significant hits ", | |
3135 "The format of this report requires expect values to be extracted$Newline". | |
3136 "from the hits themselves."); | |
3137 } | |
3138 } | |
3139 | |
3140 =head2 highest_p | |
3141 | |
3142 Example : $blast->highest_p( ['overall']) | |
3143 Purpose : Get the highest P-value among all hits in a BLAST report. | |
3144 : Syntactic sugar for $blast->hit('worst')->p() | |
3145 : Can also get the highest P-value overall (not just among signif hits). | |
3146 Usage : $p_signif = $blast->highest_p(); | |
3147 : $p_all = $blast->highest_p('overall'); | |
3148 Returns : Float or scientific notation number. | |
3149 : Returns -1.0 if highest_p has not been set. | |
3150 Argument : String 'overall' or no argument. | |
3151 : No argument = get highest P-value among significant hits. | |
3152 Throws : Exception if object is created from a Blast2 report | |
3153 : (which does not report P-values). | |
3154 | |
3155 See Also : L<highest_signif()|highest_signif>, L<lowest_p()|lowest_p>, L<_set_signif()|_set_signif>, L<signif_fmt()|signif_fmt> | |
3156 | |
3157 =cut | |
3158 | |
3159 #--------------- | |
3160 sub highest_p { | |
3161 #--------------- | |
3162 my ($self, $overall) = @_; | |
3163 | |
3164 # Layout 2 = NCBI Blast 2.x does not report P-values. | |
3165 $self->_layout == 2 and | |
3166 $self->throw("Can't get P-value with BLAST2.", | |
3167 "Use highest_signif() or highest_expect()"); | |
3168 | |
3169 $overall and return $self->{'_highestSignif'} || -1.0; | |
3170 $self->hit('worst')->p(); | |
3171 } | |
3172 | |
3173 =head2 highest_expect | |
3174 | |
3175 Usage : $blast_object->highest_expect( ['overall']) | |
3176 Purpose : Get the highest Expect value among all significant hits in a BLAST report. | |
3177 : Syntactic sugar for $blast->hit('worst')->expect() | |
3178 Examples : $e_sig = $blast->highest_expect(); | |
3179 : $e_all = $blast->highest_expect('overall'); | |
3180 Returns : Float or scientific notation number. | |
3181 : Returns -1.0 if highest_exoect has not been set. | |
3182 Argument : String 'overall' or no argument. | |
3183 : No argument = get highest Expect-value among significant hits. | |
3184 Throws : Exception if there were no significant hits and the report | |
3185 : does not have Expect values on the description lines | |
3186 : (i.e., Blast1, WashU-Blast2). | |
3187 | |
3188 See Also : L<lowest_expect()|lowest_expect>, L<highest_signif()|highest_signif>, L<signif_fmt()|signif_fmt> | |
3189 | |
3190 =cut | |
3191 | |
3192 #------------------- | |
3193 sub highest_expect { | |
3194 #------------------- | |
3195 my ($self, $overall) = @_; | |
3196 | |
3197 if ( $overall and $self->_layout == 2) { | |
3198 return $self->{'_highestSignif'} || -1.0; | |
3199 } | |
3200 | |
3201 if($self->{'_is_significant'}) { | |
3202 return $self->hit('worst')->expect; | |
3203 } else { | |
3204 $self->throw("Can't get highest expect value: no significant hits ", | |
3205 "The format of this report requires expect values to be extracted$Newline". | |
3206 "from the hits themselves."); | |
3207 } | |
3208 } | |
3209 | |
3210 =head2 lowest_signif | |
3211 | |
3212 Usage : $blast_obj->lowest_signif(); | |
3213 : Syntactic sugar for $blast->hit('best')->signif() | |
3214 Purpose : Get the lowest P or Expect value among all hits | |
3215 : in a BLAST report. | |
3216 : This method is syntactic sugar for $blast->hit('best')->signif() | |
3217 : The value returned is the one which is reported in the decription | |
3218 : section of the Blast report. | |
3219 : For Blast1 and WU-Blast2, this is a P-value, | |
3220 : for NCBI Blast2, it is an Expect value. | |
3221 Example : $blast->lowest_signif(); | |
3222 Returns : Float or scientific notation number. | |
3223 : Returns -1.0 if lowest_signif has not been set. | |
3224 Argument : n/a. | |
3225 Throws : n/a. | |
3226 Status : Deprecated. Use lowest_expect() or lowest_p(). | |
3227 Comments : The signif() method provides a way to deal with the fact that | |
3228 : Blast1 and Blast2 formats differ in what is reported in the | |
3229 : description lines of each hit in the Blast report. The signif() | |
3230 : method frees any client code from having to know if this is a P-value | |
3231 : or an Expect value, making it easier to write code that can process | |
3232 : both Blast1 and Blast2 reports. This is not necessarily a good thing, since | |
3233 : one should always know when one is working with P-values or | |
3234 : Expect values (hence the deprecated status). | |
3235 : Use of lowest_expect() is recommended since all hits will have an Expect value. | |
3236 | |
3237 See Also : L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif> | |
3238 | |
3239 =cut | |
3240 | |
3241 #------------------ | |
3242 sub lowest_signif { | |
3243 #------------------ | |
3244 my ($self) = @_; | |
3245 | |
3246 return $self->{'_lowestSignif'} || -1.0; | |
3247 } | |
3248 | |
3249 =head2 highest_signif | |
3250 | |
3251 Usage : $blast_obj->highest_signif('overall'); | |
3252 : Syntactic sugar for $blast->hit('worst')->signif() | |
3253 Purpose : Get the highest P or Expect value among all hits | |
3254 : in a BLAST report. | |
3255 : The value returned is the one which is reported in the decription | |
3256 : section of the Blast report. | |
3257 : For Blast1 and WU-Blast2, this is a P-value, | |
3258 : for NCBI Blast2, it is an Expect value. | |
3259 Example : $blast->highest_signif(); | |
3260 Returns : Float or scientific notation number. | |
3261 : Returns -1.0 if highest_signif has not been set. | |
3262 Argument : Optional string 'overall' to get the highest overall significance value. | |
3263 Throws : n/a. | |
3264 Status : Deprecated. Use highest_expect() or highest_p(). | |
3265 Comments : Analogous to lowest_signif(), q.v. | |
3266 | |
3267 See Also : L<lowest_signif()|lowest_signif>, L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif> | |
3268 | |
3269 =cut | |
3270 | |
3271 #--------------------- | |
3272 sub highest_signif { | |
3273 #--------------------- | |
3274 my ($self, $overall) = @_; | |
3275 | |
3276 $overall and return $self->{'_highestSignif'} || -1.0; | |
3277 | |
3278 if($self->{'_is_significant'}) { | |
3279 my $worst_hit = $self->hit('worst'); | |
3280 if(defined $worst_hit) { | |
3281 return $worst_hit->signif; | |
3282 } else { | |
3283 return $self->{'_highestSignif'}; | |
3284 } | |
3285 } | |
3286 } | |
3287 | |
3288 =head2 matrix | |
3289 | |
3290 Usage : $blast_object->matrix(); | |
3291 Purpose : Get the name of the scoring matrix used. | |
3292 : This is extracted from the report. | |
3293 Argument : n/a | |
3294 Returns : string or undef if not defined | |
3295 | |
3296 =cut | |
3297 | |
3298 #------------ | |
3299 sub matrix { my $self = shift; $self->{'_matrix'} || $Blast->{'_matrix'}; } | |
3300 #------------ | |
3301 | |
3302 =head2 filter | |
3303 | |
3304 Usage : $blast_object->filter(); | |
3305 Purpose : Get the name of the low-complexity sequence filter used. | |
3306 : (SEG, SEG+XNU, DUST, NONE). | |
3307 : This is extracted from the report. | |
3308 Argument : n/a | |
3309 Returns : string or undef if not defined | |
3310 | |
3311 =cut | |
3312 | |
3313 #---------- | |
3314 sub filter { my $self = shift; $self->{'_filter'} || $Blast->{'_filter'}; } | |
3315 #---------- | |
3316 | |
3317 =head2 expect | |
3318 | |
3319 Usage : $blast_object->expect(); | |
3320 Purpose : Get the expect parameter (E) used for the Blast analysis. | |
3321 : This is extracted from the report. | |
3322 Argument : n/a | |
3323 Returns : string or undef if not defined. | |
3324 | |
3325 =cut | |
3326 | |
3327 #----------- | |
3328 sub expect { my $self = shift; $self->{'_expect'} || $Blast->{'_expect'}; } | |
3329 #----------- | |
3330 | |
3331 =head2 karlin_altschul | |
3332 | |
3333 Usage : $blast_object->karlin_altschul(); | |
3334 Purpose : Get the Karlin_Altschul sum statistics (Lambda, K, H) | |
3335 : These are extracted from the report. | |
3336 Argument : n/a | |
3337 Returns : list of three floats (Lambda, K, H) | |
3338 : If not defined, returns list of three zeros) | |
3339 | |
3340 =cut | |
3341 | |
3342 #--------------------- | |
3343 sub karlin_altschul { | |
3344 #--------------------- | |
3345 my $self = shift; | |
3346 if(defined($self->{'_lambda'})) { | |
3347 ($self->{'_lambda'}, $self->{'_k'}, $self->{'_h'}); | |
3348 } elsif(defined($Blast->{'_lambda'})) { | |
3349 ($Blast->{'_lambda'}, $Blast->{'_k'}, $Blast->{'_h'}); | |
3350 } else { | |
3351 (0, 0, 0); | |
3352 } | |
3353 } | |
3354 | |
3355 =head2 word_size | |
3356 | |
3357 Usage : $blast_object->word_size(); | |
3358 Purpose : Get the word_size used during the Blast analysis. | |
3359 : This is extracted from the report. | |
3360 Argument : n/a | |
3361 Returns : integer or undef if not defined. | |
3362 | |
3363 =cut | |
3364 | |
3365 #-------------- | |
3366 sub word_size { | |
3367 #-------------- | |
3368 my $self = shift; | |
3369 $self->{'_word_size'} || $Blast->{'_word_size'}; | |
3370 } | |
3371 | |
3372 =head2 s | |
3373 | |
3374 Usage : $blast_object->s(); | |
3375 Purpose : Get the s statistic for the Blast analysis. | |
3376 : This is extracted from the report. | |
3377 Argument : n/a | |
3378 Returns : integer or undef if not defined. | |
3379 | |
3380 =cut | |
3381 | |
3382 #------ | |
3383 sub s { my $self = shift; $self->{'_s'} || $Blast->{'_s'}; } | |
3384 #------ | |
3385 | |
3386 =head2 gap_creation | |
3387 | |
3388 Usage : $blast_object->gap_creation(); | |
3389 Purpose : Get the gap creation penalty used for a gapped Blast analysis. | |
3390 : This is extracted from the report. | |
3391 Argument : n/a | |
3392 Returns : integer or undef if not defined. | |
3393 | |
3394 See Also : L<gap_extension()|gap_extension> | |
3395 | |
3396 =cut | |
3397 | |
3398 #----------------- | |
3399 sub gap_creation { | |
3400 #----------------- | |
3401 my $self = shift; | |
3402 $self->{'_gapCreation'} || $Blast->{'_gapCreation'}; | |
3403 } | |
3404 | |
3405 =head2 gap_extension | |
3406 | |
3407 Usage : $blast_object->gap_extension(); | |
3408 Purpose : Get the gap extension penalty used for a gapped Blast analysis. | |
3409 : This is extracted from the report. | |
3410 Argument : n/a | |
3411 Returns : integer or undef if not defined. | |
3412 | |
3413 See Also : L<gap_extension()|gap_extension> | |
3414 | |
3415 =cut | |
3416 | |
3417 #------------------- | |
3418 sub gap_extension { | |
3419 #------------------- | |
3420 my $self = shift; | |
3421 $self->{'_gapExtension'} || $Blast->{'_gapExtension'}; | |
3422 } | |
3423 | |
3424 =head2 ambiguous_aln | |
3425 | |
3426 Usage : $blast_object->ambiguous_aln(); | |
3427 Purpose : Test all hits and determine if any have an ambiguous alignment. | |
3428 Example : print "ambiguous" if $blast->ambiguous_aln(); | |
3429 Returns : Boolean (true if ANY significant hit has an ambiguous alignment) | |
3430 Argument : n/a | |
3431 Throws : n/a | |
3432 Status : Experimental | |
3433 Comments : An ambiguous BLAST alignment is defined as one where two or more | |
3434 : different HSPs have significantly overlapping sequences such | |
3435 : that it is not possible to create a unique alignment | |
3436 : by simply concatenating HSPs. This may indicate the presence | |
3437 : of multiple domains in one sequence relative to another. | |
3438 : This method only indicates the presence of ambiguity in at | |
3439 : least one significant hit. To determine the nature of the | |
3440 : ambiguity, each hit must be examined. | |
3441 | |
3442 See Also : B<Bio::Tools::Blast::Sbjct::ambiguous_aln()>,L<Links to related modules> | |
3443 | |
3444 =cut | |
3445 | |
3446 #---------------- | |
3447 sub ambiguous_aln { | |
3448 #---------------- | |
3449 my $self = shift; | |
3450 foreach($self->hits()) { | |
3451 return 1 if ($_->ambiguous_aln() ne '-'); | |
3452 } | |
3453 0; | |
3454 } | |
3455 | |
3456 =head2 overlap | |
3457 | |
3458 Usage : $blast_object->overlap([integer]); | |
3459 Purpose : Set/Get the number of overlapping residues allowed when tiling multiple HSPs. | |
3460 : Delegates to Bio::Tools::Blast::Sbjct::overlap(). | |
3461 Throws : Exception if there are no significant hits. | |
3462 Status : Experimental | |
3463 | |
3464 See Also : B<Bio::Tools::Blast::Sbjct::overlap()>,L<Links to related modules> | |
3465 | |
3466 =cut | |
3467 | |
3468 #------------ | |
3469 sub overlap { | |
3470 #------------ | |
3471 my $self = shift; | |
3472 if(not $self->hits) { | |
3473 $self->throw("Can't get overlap data without significant hits."); | |
3474 } | |
3475 $self->hit->overlap(); | |
3476 } | |
3477 | |
3478 =head2 homol_data | |
3479 | |
3480 Usage : @data = $blast_object->homo_data( %named_params ); | |
3481 Purpose : Gets specific similarity data about each significant hit. | |
3482 Returns : Array of strings: | |
3483 : "Homology data" for each HSP is in the format: | |
3484 : "<integer> <start> <stop>" | |
3485 : Data for different HSPs are tab-delimited. | |
3486 Argument : named parameters passed along to the hit objects. | |
3487 Throws : n/a | |
3488 Status : Experimental | |
3489 Comments : This is a very experimental method used for obtaining an | |
3490 : indication of: | |
3491 : 1) how many HSPs are in a Blast alignment | |
3492 : 2) how strong the similarity is between sequences in the HSP | |
3493 : 3) the endpoints of the alignment (sequence monomer numbers) | |
3494 | |
3495 See Also : B<Bio::Tools::Blast::Sbjct::homol_data()>,L<Links to related modules> | |
3496 | |
3497 =cut | |
3498 | |
3499 #---------------- | |
3500 sub homol_data { | |
3501 #---------------- | |
3502 | |
3503 my ($self, %param) = @_; | |
3504 my @hits = $self->hits(); | |
3505 my @data = (); | |
3506 | |
3507 ## Note: Homology data can be either for the query sequence or the hit | |
3508 ## (Sbjct) sequence. Default is for sbjct. This is specifyable via | |
3509 ## $param{-SEQ}='sbjct' || 'query'. | |
3510 | |
3511 foreach ( @hits ) { | |
3512 push @data, $_->homol_data(%param); | |
3513 } | |
3514 @data; | |
3515 } | |
3516 | |
3517 =head1 REPORT GENERATING METHODS | |
3518 | |
3519 =head2 table | |
3520 | |
3521 Usage : $blast_obj->table( [get_desc]); | |
3522 Purpose : Output data for each HSP of each hit in tab-delimited format. | |
3523 Example : print $blast->table; | |
3524 : print $blast->table(0); | |
3525 : # Call table_labels() to print labels. | |
3526 Argument : get_desc = boolean, if false the description of each hit is not included. | |
3527 : Default: true (if not defined, include description column). | |
3528 Returns : String containing tab-delimited set of data for each HSP | |
3529 : of each significant hit. Different HSPs are separated by newlines. | |
3530 : Left-to-Right order of fields: | |
3531 : 1 QUERY_NAME # Sequence identifier of the query. | |
3532 : 2 QUERY_LENGTH # Full length of the query sequence. | |
3533 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit". | |
3534 : 4 SBJCT_LENGTH # Full length of the sbjct sequence. | |
3535 : 5 EXPECT # Expect value for the alignment. | |
3536 : 6 SCORE # Blast score for the alignment. | |
3537 : 7 BITS # Bit score for the alignment. | |
3538 : 8 NUM_HSPS # Number of HSPs (not the "N" value). | |
3539 : 9 HSP_FRAC_IDENTICAL # fraction of identical substitutions. | |
3540 : 10 HSP_FRAC_CONSERVED # fraction of conserved ("positive") substitutions. | |
3541 : 11 HSP_QUERY_ALN_LENGTH # Length of the aligned portion of the query sequence. | |
3542 : 12 HSP_SBJCT_ALN_LENGTH # Length of the aligned portion of the sbjct sequence. | |
3543 : 13 HSP_QUERY_GAPS # Number of gaps in the aligned query sequence. | |
3544 : 14 HSP_SBJCT_GAPS # Number of gaps in the aligned sbjct sequence. | |
3545 : 15 HSP_QUERY_START # Starting coordinate of the query sequence. | |
3546 : 16 HSP_QUERY_END # Ending coordinate of the query sequence. | |
3547 : 17 HSP_SBJCT_START # Starting coordinate of the sbjct sequence. | |
3548 : 18 HSP_SBJCT_END # Ending coordinate of the sbjct sequence. | |
3549 : 19 HSP_QUERY_STRAND # Strand of the query sequence (TBLASTN/X only) | |
3550 : 20 HSP_SBJCT_STRAND # Strand of the sbjct sequence (TBLASTN/X only) | |
3551 : 21 HSP_FRAME # Frame for the sbjct translation (TBLASTN/X only) | |
3552 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from | |
3553 : # the alignment section. | |
3554 Throws : n/a | |
3555 Comments : This method does not collect data based on tiling of the HSPs. | |
3556 : The table will contains redundant information since the hit name, | |
3557 : id, and other info for the hit are listed for each HSP. | |
3558 : If you need more flexibility in the output format than this | |
3559 : method provides, design a custom function. | |
3560 | |
3561 See Also : L<table_tiled()|table_tiled>, L<table_labels()|table_labels>, L<_display_hits()|_display_hits> | |
3562 | |
3563 =cut | |
3564 | |
3565 #----------- | |
3566 sub table { | |
3567 #----------- | |
3568 my ($self, $get_desc) = @_; | |
3569 my $str = ''; | |
3570 | |
3571 $get_desc = defined($get_desc) ? $get_desc : 1; | |
3572 # $str .= $self->_table_labels($get_desc) unless $self->{'_labels'}; | |
3573 | |
3574 my $sigfmt = $self->signif_fmt(); | |
3575 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table. | |
3576 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e'; | |
3577 | |
3578 my ($hit, $hsp); | |
3579 foreach $hit($self->hits) { | |
3580 foreach $hsp($hit->hsps) { | |
3581 # Note: range() returns a 2-element list. | |
3582 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s$Newline", | |
3583 $self->name, $self->length, $hit->name, $hit->length, | |
3584 $hit->expect($sigfmt), $hit->score, $hit->bits, | |
3585 $hit->num_hsps, $hsp->frac_identical, $hsp->frac_conserved, | |
3586 $hsp->length('query'), $hsp->length('sbjct'), | |
3587 $hsp->gaps('list'), | |
3588 $hsp->range('query'), $hsp->range('sbjct'), | |
3589 $hsp->strand('query'), $hsp->strand('sbjct'), $hsp->frame, | |
3590 ($get_desc ? $hit->desc : ''); | |
3591 } | |
3592 } | |
3593 $str =~ s/\t$Newline/$Newline/gs; | |
3594 $str; | |
3595 } | |
3596 | |
3597 =head2 table_labels | |
3598 | |
3599 Usage : print $blast_obj->table_labels( [get_desc] ); | |
3600 Purpose : Get column labels for table(). | |
3601 Returns : String containing column labels. Tab-delimited. | |
3602 Argument : get_desc = boolean, if false the description column is not included. | |
3603 : Default: true (if not defined, include description column). | |
3604 Throws : n/a | |
3605 | |
3606 See Also : L<table()|table> | |
3607 | |
3608 =cut | |
3609 | |
3610 #---------------- | |
3611 sub table_labels { | |
3612 #---------------- | |
3613 my ($self, $get_desc) = @_; | |
3614 $get_desc = defined($get_desc) ? $get_desc : 1; | |
3615 my $descstr = $get_desc ? 'DESC' : ''; | |
3616 my $descln = $get_desc ? '-----' : ''; | |
3617 | |
3618 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
3619 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', 'HSPS', | |
3620 'IDEN', 'CONSV', 'Q_ALN', 'S_ALN', 'Q_GAP', 'S_GAP', | |
3621 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', 'Q_STR', 'S_STR', 'FRAM', $descstr; | |
3622 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
3623 '-----', '-----', '-----', '-----', '-----', '-----', '-----', '-----', | |
3624 '-----', '-----', '-----', '-----', '-----', '-----', | |
3625 '-----', '-----', '-----','-----', '-----', '-----','-----', $descln; | |
3626 | |
3627 $self->{'_labels'} = 1; | |
3628 $str =~ s/\t$Newline/$Newline/gs; | |
3629 $str; | |
3630 } | |
3631 | |
3632 =head2 table_tiled | |
3633 | |
3634 Purpose : Get data from tiled HSPs in tab-delimited format. | |
3635 : Allows only minimal flexibility in the output format. | |
3636 : If you need more flexibility, design a custom function. | |
3637 Usage : $blast_obj->table_tiled( [get_desc]); | |
3638 Example : print $blast->table_tiled; | |
3639 : print $blast->table_tiled(0); | |
3640 : # Call table_labels_tiled() if you want labels. | |
3641 Argument : get_desc = boolean, if false the description of each hit is not included. | |
3642 : Default: true (include description). | |
3643 Returns : String containing tab-delimited set of data for each HSP | |
3644 : of each significant hit. Multiple hits are separated by newlines. | |
3645 : Left-to-Right order of fields: | |
3646 : 1 QUERY_NAME # Sequence identifier of the query. | |
3647 : 2 QUERY_LENGTH # Full length of the query sequence. | |
3648 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit". | |
3649 : 4 SBJCT_LENGTH # Full length of the sbjct sequence. | |
3650 : 5 EXPECT # Expect value for the alignment. | |
3651 : 6 SCORE # Blast score for the alignment. | |
3652 : 7 BITS # Bit score for the alignment. | |
3653 : 8 NUM_HSPS # Number of HSPs (not the "N" value). | |
3654 : 9 FRAC_IDENTICAL* # fraction of identical substitutions. | |
3655 : 10 FRAC_CONSERVED* # fraction of conserved ("positive") substitutions . | |
3656 : 11 FRAC_ALN_QUERY* # fraction of the query sequence that is aligned. | |
3657 : 12 FRAC_ALN_SBJCT* # fraction of the sbjct sequence that is aligned. | |
3658 : 13 QUERY_ALN_LENGTH* # Length of the aligned portion of the query sequence. | |
3659 : 14 SBJCT_ALN_LENGTH* # Length of the aligned portion of the sbjct sequence. | |
3660 : 15 QUERY_GAPS* # Number of gaps in the aligned query sequence. | |
3661 : 16 SBJCT_GAPS* # Number of gaps in the aligned sbjct sequence. | |
3662 : 17 QUERY_START* # Starting coordinate of the query sequence. | |
3663 : 18 QUERY_END* # Ending coordinate of the query sequence. | |
3664 : 19 SBJCT_START* # Starting coordinate of the sbjct sequence. | |
3665 : 20 SBJCT_END* # Ending coordinate of the sbjct sequence. | |
3666 : 21 AMBIGUOUS_ALN # Ambiguous alignment indicator ('qs', 'q', 's'). | |
3667 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from | |
3668 : # the alignment section. | |
3669 : | |
3670 : * Items marked with a "*" report data summed across all HSPs | |
3671 : after tiling them to avoid counting data from overlapping regions | |
3672 : multiple times. | |
3673 Throws : n/a | |
3674 Comments : This function relies on tiling of the HSPs since it calls | |
3675 : frac_identical() etc. on the hit as opposed to each HSP individually. | |
3676 | |
3677 See Also : L<table()|table>, L<table_labels_tiled()|table_labels_tiled>, B<Bio::Tools::Blast::Sbjct::"HSP Tiling and Ambiguous Alignments">, L<Links to related modules> | |
3678 | |
3679 =cut | |
3680 | |
3681 #---------------- | |
3682 sub table_tiled { | |
3683 #---------------- | |
3684 my ($self, $get_desc) = @_; | |
3685 my $str = ''; | |
3686 | |
3687 $get_desc = defined($get_desc) ? $get_desc : 1; | |
3688 | |
3689 my ($hit); | |
3690 my $sigfmt = $self->signif_fmt(); | |
3691 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table. | |
3692 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e'; | |
3693 | |
3694 foreach $hit($self->hits) { | |
3695 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s$Newline", | |
3696 $self->name, $self->length, $hit->name, $hit->length, | |
3697 $hit->expect($sigfmt), $hit->score, $hit->bits, | |
3698 $hit->num_hsps, $hit->frac_identical, $hit->frac_conserved, | |
3699 $hit->frac_aligned_query, $hit->frac_aligned_hit, | |
3700 $hit->length_aln('query'), $hit->length_aln('sbjct'), | |
3701 $hit->gaps('list'), $hit->range('query'), $hit->range('sbjct'), | |
3702 $hit->ambiguous_aln, ($get_desc ? $hit->desc : ''); | |
3703 } | |
3704 $str =~ s/\t$Newline/$Newline/gs; | |
3705 $str; | |
3706 } | |
3707 | |
3708 =head2 table_labels_tiled | |
3709 | |
3710 Usage : print $blast_obj->table_labels_tiled( [get_desc] ); | |
3711 Purpose : Get column labels for table_tiled(). | |
3712 Returns : String containing column labels. Tab-delimited. | |
3713 Argument : get_desc = boolean, if false the description column is not included. | |
3714 : Default: true (include description column). | |
3715 Throws : n/a | |
3716 | |
3717 See Also : L<table_tiled()|table_tiled> | |
3718 | |
3719 =cut | |
3720 | |
3721 #--------------------- | |
3722 sub table_labels_tiled { | |
3723 #--------------------- | |
3724 my ($self, $get_desc) = @_; | |
3725 my $descstr = $get_desc ? 'DESC' : ''; | |
3726 my $descln = $get_desc ? '-----' : ''; | |
3727 | |
3728 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
3729 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', | |
3730 'HSPS', 'FR_ID', 'FR_CN', 'FR_ALQ', 'FR_ALS', 'Q_ALN', | |
3731 'S_ALN', 'Q_GAP', 'S_GAP', 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', | |
3732 'AMBIG', $descstr; | |
3733 $str =~ s/\t$Newline/$Newline/; | |
3734 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline", | |
3735 '-----', '-----', '------', '-----', '-----','-----', '-----', | |
3736 '-----', '-----', '-----', '-----', '-----', '-----', | |
3737 '-----', '-----', '-----','-----','-----', '-----', | |
3738 '-----','-----', $descln; | |
3739 | |
3740 $self->{'_labels_tiled'} = 1; | |
3741 $str =~ s/\t$Newline/$Newline/gs; | |
3742 $str; | |
3743 } | |
3744 | |
3745 =head2 display | |
3746 | |
3747 Usage : $blast_object->display( %named_parameters ); | |
3748 Purpose : Display information about Bio::Tools::Blast.pm data members, | |
3749 : E.g., parameters of the report, data for each hit., etc. | |
3750 : Overrides Bio::Root::Object::display(). | |
3751 Example : $object->display(-SHOW=>'stats'); | |
3752 : $object->display(-SHOW=>'stats,hits'); | |
3753 Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE) | |
3754 : -SHOW => 'file' | 'hits' | 'homol' | |
3755 : -WHERE => filehandle (default = STDOUT) | |
3756 Returns : n/a (print/printf is called) | |
3757 Status : Experimental | |
3758 Comments : For tab-delimited output, see table(). | |
3759 | |
3760 See Also : L<_display_homol()|_display_homol>, L<_display_hits()|_display_hits>, L<_display_stats()|_display_stats>, L<table()|table>, B<Bio::Root::Tools::SeqAnal::display()>,L<Links to related modules>, | |
3761 | |
3762 =cut | |
3763 | |
3764 #-------------- | |
3765 sub display { | |
3766 #-------------- | |
3767 my( $self, %param ) = @_; | |
3768 | |
3769 $self->SUPER::display(%param); | |
3770 my $OUT = $self->fh(); | |
3771 | |
3772 $self->show =~ /homol/i and $self->_display_homol($OUT); | |
3773 $self->show =~ /hits/i and $self->_display_hits( %param ); | |
3774 1; | |
3775 } | |
3776 | |
3777 =head2 _display_homol | |
3778 | |
3779 Usage : n/a; called automatically by display() | |
3780 Purpose : Print homology data for hits in the BLAST report. | |
3781 Example : n/a | |
3782 Argument : one argument = filehandle object. | |
3783 Returns : printf call. | |
3784 Status : Experimental | |
3785 | |
3786 See Also : L<homol_data()|homol_data>, L<display()|display> | |
3787 | |
3788 =cut | |
3789 | |
3790 #------------------- | |
3791 sub _display_homol { | |
3792 #------------------- | |
3793 my( $self, $OUT ) = @_; | |
3794 | |
3795 print $OUT "${Newline}BLAST HOMOLOGY DATA FOR: ${\$self->name()}$Newline"; | |
3796 print $OUT '-'x40,"$Newline"; | |
3797 | |
3798 foreach ( $self->homol_data()) { | |
3799 print $OUT "$_$Newline"; | |
3800 } | |
3801 } | |
3802 | |
3803 =head2 _display_stats | |
3804 | |
3805 Usage : n/a; called automatically by display() | |
3806 Purpose : Display information about the Blast report "meta" data. | |
3807 : Overrides Bio::Tools::SeqAnal::_display_stats() calling it first. | |
3808 Example : n/a | |
3809 Argument : one argument = filehandle object. | |
3810 Returns : printf call. | |
3811 Status : Experimental | |
3812 | |
3813 See Also : L<display()|display>, B<Bio::Tools::SeqAnal::_display_stats()>,L<Links to related modules> | |
3814 | |
3815 =cut | |
3816 | |
3817 #-------------------- | |
3818 sub _display_stats { | |
3819 #-------------------- | |
3820 my( $self, $OUT ) = @_; | |
3821 | |
3822 $self->SUPER::_display_stats($OUT); | |
3823 printf( $OUT "%-15s: %s$Newline", "GAPPED", $self->gapped ? 'YES' : 'NO'); | |
3824 printf( $OUT "%-15s: %d$Newline", "TOTAL HITS", $self->num_hits('total')); | |
3825 printf( $OUT "%-15s: %s$Newline", "CHECKED ALL", $Blast->{'_check_all'} ? 'YES' : 'NO'); | |
3826 printf( $OUT "%-15s: %s$Newline", "FILT FUNC", $Blast->{'_filt_func'} ? 'YES' : 'NO'); | |
3827 if($self->min_length) { | |
3828 printf( $OUT "%-15s: Length >= %s$Newline", "MIN_LENGTH", $self->min_length); | |
3829 } | |
3830 | |
3831 my $num_hits = $self->num_hits; | |
3832 my $signif_str = ($self->_layout == 1) ? 'P' : 'EXPECT'; | |
3833 | |
3834 printf( $OUT "%-15s: %d$Newline", "SIGNIF HITS", $num_hits); | |
3835 # Blast1: signif = P-value, Blast2: signif = Expect value. | |
3836 | |
3837 printf( $OUT "%-15s: %s ($signif_str-VALUE)$Newline", "SIGNIF CUTOFF", $self->signif); | |
3838 printf( $OUT "%-15s: %s$Newline", "LOWEST $signif_str", $self->lowest_signif()); | |
3839 printf( $OUT "%-15s: %s$Newline", "HIGHEST $signif_str", $self->highest_signif()); | |
3840 | |
3841 printf( $OUT "%-15s: %s (OVERALL)$Newline", "HIGHEST $signif_str", $self->highest_signif('overall')); | |
3842 | |
3843 if($Blast->_get_stats) { | |
3844 my $warn = ($Blast->{'_share'}) ? '(SHARED STATS)' : ''; | |
3845 printf( $OUT "%-15s: %s$Newline", "MATRIX", $self->matrix() || 'UNKNOWN'); | |
3846 printf( $OUT "%-15s: %s$Newline", "FILTER", $self->filter() || 'UNKNOWN'); | |
3847 printf( $OUT "%-15s: %s$Newline", "EXPECT", $self->expect() || 'UNKNOWN'); | |
3848 printf( $OUT "%-15s: %s, %s, %s %s$Newline", "LAMBDA, K, H", $self->karlin_altschul(), $warn); | |
3849 printf( $OUT "%-15s: %s$Newline", "WORD SIZE", $self->word_size() || 'UNKNOWN'); | |
3850 printf( $OUT "%-15s: %s %s$Newline", "S", $self->s() || 'UNKNOWN', $warn); | |
3851 if($self->gapped) { | |
3852 printf( $OUT "%-15s: %s$Newline", "GAP CREATION", $self->gap_creation() || 'UNKNOWN'); | |
3853 printf( $OUT "%-15s: %s$Newline", "GAP EXTENSION", $self->gap_extension() || 'UNKNOWN'); | |
3854 } | |
3855 } | |
3856 print $OUT "$Newline"; | |
3857 } | |
3858 | |
3859 =head2 _display_hits | |
3860 | |
3861 Usage : n/a; called automatically by display() | |
3862 Purpose : Display data for each hit. Not tab-delimited. | |
3863 Example : n/a | |
3864 Argument : one argument = filehandle object. | |
3865 Returns : printf call. | |
3866 Status : Experimental | |
3867 Comments : For tab-delimited output, see table(). | |
3868 | |
3869 See Also : L<display()|display>, B<Bio::Tools::Blast::Sbjct::display()>, L<table()|table>, L<Links to related modules> | |
3870 | |
3871 =cut | |
3872 | |
3873 sub _display_hits { | |
3874 | |
3875 my( $self, %param ) = @_; | |
3876 my $OUT = $self->fh(); | |
3877 my @hits = $self->hits(); | |
3878 | |
3879 ## You need a wide screen to see this properly. | |
3880 # Header. | |
3881 print $OUT "${Newline}BLAST HITS FOR: ${\$self->name()} length = ${\$self->length}$Newline"; | |
3882 print "(This table requires a wide display.)$Newline"; | |
3883 print $OUT '-'x80,"$Newline"; | |
3884 | |
3885 print $self->table_labels_tiled(0); | |
3886 print $self->table_tiled(0); | |
3887 | |
3888 ## Doing this interactively since there is potentially a lot of data here. | |
3889 ## Not quite satisfied with this approach. | |
3890 | |
3891 if (not $param{-INTERACTIVE}) { | |
3892 return 1; | |
3893 } else { | |
3894 my ($reply); | |
3895 print "${Newline}DISPLAY FULL HSP DATA? (y/n): [n] "; | |
3896 chomp( $reply = <STDIN> ); | |
3897 $reply =~ /^y.*/i; | |
3898 | |
3899 my $count = 0; | |
3900 foreach ( @hits ) { | |
3901 $count++; | |
3902 print $OUT "$Newline$Newline",'-'x80,"$Newline"; | |
3903 print $OUT "HSP DATA FOR HIT #$count (hit <RETURN>)"; | |
3904 print $OUT "$Newline",'-'x80;<STDIN>; | |
3905 $param{-SHOW} = 'hsp'; | |
3906 $_->display( %param ); | |
3907 } | |
3908 } | |
3909 1; | |
3910 } | |
3911 | |
3912 =head2 to_html | |
3913 | |
3914 Usage : $blast_object->to_html( [%named_parameters] ) | |
3915 Purpose : To produce an HTML-formatted version of a BLAST report | |
3916 : for efficient navigation of the report using a web browser. | |
3917 Example : # Using the static Blast object: | |
3918 : # Can read from STDIN or from a designated file: | |
3919 : $Blast->to_html($file); | |
3920 : $Blast->to_html(-FILE=>$file, -HEADER=>$header); | |
3921 : (if no file is supplied, STDIN will be used). | |
3922 : # saving HTML to an array: | |
3923 : $Blast->to_html(-FILE=>$file, -OUT =>\@out); | |
3924 : # Using a pre-existing blast object (must have been built from | |
3925 : # a file, not STDIN: | |
3926 : $blastObj->to_html(); | |
3927 Returns : n/a, either prints report to STDOUT or saves to a supplied array | |
3928 : if an '-OUT' parameter is defined (see below). | |
3929 Argument : %named_parameters: (TAGS ARE AND CASE INSENSITIVE). | |
3930 : -FILE => string containing name of a file to be processed. | |
3931 : If not a valid file or undefined, STDIN will be used. | |
3932 : Can skip the -FILE tag if supplying a filename | |
3933 : as a single argument. | |
3934 : -HEADER => string | |
3935 : This should be an HTML-formatted string to be used | |
3936 : as a header for the page, typically describing query sequence, | |
3937 : database searched, the date of the analysis, and any | |
3938 : additional links. | |
3939 : If not supplied, no special header is used. | |
3940 : Regardless of whether a header is supplied, the | |
3941 : standard info at the top of the report is highlighted. | |
3942 : This should include the <HEADER></HEADER> section | |
3943 : of the page as well. | |
3944 : | |
3945 : -IN => array reference containing a raw Blast report. | |
3946 : each line in a separate element in the array. | |
3947 : If -IN is not supplied, read() is called | |
3948 : and data is then read either from STDIN or a file. | |
3949 : | |
3950 : -OUT => array reference to hold the HTML output. | |
3951 : If not supplied, output is sent to STDOUT. | |
3952 Throws : Exception is propagated from $HTML::get_html_func() | |
3953 : and Bio::Root::Object::read(). | |
3954 Comments : The code that does the actual work is located in | |
3955 : Bio::Tools::Blast::HTML::get_html_func(). | |
3956 Bugs : Some hypertext links to external databases may not be | |
3957 : correct. This due in part to the dynamic nature of | |
3958 : the web. | |
3959 : Hypertext links are not added to hits without database ids. | |
3960 TODO : Possibly create a function to produce fancy default header | |
3961 : using data extracted from the report (requires some parsing). | |
3962 : For example, it would be nice to always include a date | |
3963 | |
3964 See Also : B<Bio::Tools::Blast::HTML::get_html_func()>, B<Bio::Root::Object::read()>, L<Links to related modules> | |
3965 | |
3966 =cut | |
3967 | |
3968 #------------ | |
3969 sub to_html { | |
3970 #------------ | |
3971 my ($self, @param) = @_; | |
3972 | |
3973 # Permits syntax such as: $blast->to_html($filename); | |
3974 my ($file, $header_html, $in_aref, $out_aref) = | |
3975 $self->_rearrange([qw(FILE HEADER IN OUT)], @param); | |
3976 | |
3977 $self->file($file) if $file; | |
3978 | |
3979 # Only setting the newline character once for efficiency. | |
3980 $Newline ||= $Util->get_newline(-client => $self, @param); | |
3981 | |
3982 $header_html ||= ''; | |
3983 (ref($out_aref) eq 'ARRAY') ? push(@$out_aref, $header_html) : print "$header_html$Newline"; | |
3984 | |
3985 require Bio::Tools::Blast::HTML; | |
3986 Bio::Tools::Blast::HTML->import(qw(&get_html_func)); | |
3987 | |
3988 my ($func); | |
3989 eval{ $func = &get_html_func($out_aref); }; | |
3990 if($@) { | |
3991 my $err = $@; | |
3992 $self->throw($err); | |
3993 } | |
3994 | |
3995 eval { | |
3996 if(!$header_html) { | |
3997 $out_aref ? push(@$out_aref, "<html><body>$Newline") : print "<html><body>$Newline"; | |
3998 } | |
3999 | |
4000 if (ref ($in_aref) =~ /ARRAY/) { | |
4001 # If data is being supplied, process it. | |
4002 foreach(@$in_aref) { | |
4003 &$func($_); | |
4004 } | |
4005 } else { | |
4006 # Otherwise, read it, processing as we go. | |
4007 | |
4008 $self->read(-FUNC => $func, @param); | |
4009 } | |
4010 $out_aref ? push(@$out_aref, "$Newline</pre></body></html>") : print "$Newline</pre></body></html>"; | |
4011 }; | |
4012 | |
4013 if($@) { | |
4014 # Check for trivial error (report already HTML formatted). | |
4015 if($@ =~ /HTML formatted/) { | |
4016 print STDERR "\a${Newline}Blast report appears to be HTML formatted already.$Newline$Newline"; | |
4017 } else { | |
4018 my $err = $@; | |
4019 $self->throw($err); | |
4020 } | |
4021 } | |
4022 } | |
4023 | |
4024 1; | |
4025 __END__ | |
4026 | |
4027 ##################################################################################### | |
4028 # END OF CLASS # | |
4029 ##################################################################################### | |
4030 | |
4031 =head1 FOR DEVELOPERS ONLY | |
4032 | |
4033 =head2 Data Members | |
4034 | |
4035 Information about the various data members of this module is provided for those | |
4036 wishing to modify or understand the code. Two things to bear in mind: | |
4037 | |
4038 =over 4 | |
4039 | |
4040 =item 1 Do NOT rely on these in any code outside of this module. | |
4041 | |
4042 All data members are prefixed with an underscore to signify that they are private. | |
4043 Always use accessor methods. If the accessor doesn't exist or is inadequate, | |
4044 create or modify an accessor (and let me know, too!). (An exception to this might | |
4045 be for Sbjct.pm or HSP.pm which are more tightly coupled to Blast.pm and | |
4046 may access Blast data members directly for efficiency purposes, but probably | |
4047 should not). | |
4048 | |
4049 =item 2 This documentation may be incomplete and out of date. | |
4050 | |
4051 It is easy for these data member descriptions to become obsolete as | |
4052 this module is still evolving. Always double check this info and search | |
4053 for members not described here. | |
4054 | |
4055 =back | |
4056 | |
4057 An instance of Bio::Tools::Blast.pm is a blessed reference to a hash containing | |
4058 all or some of the following fields: | |
4059 | |
4060 FIELD VALUE | |
4061 -------------------------------------------------------------- | |
4062 _significance P-value or Expect value cutoff (depends on Blast version: | |
4063 Blast1/WU-Blast2 = P-value; Blast2 = Expect value). | |
4064 Values GREATER than this are deemed not significant. | |
4065 | |
4066 _significant Boolean. True if the query has one or more significant hit. | |
4067 | |
4068 _min_length Integer. Query sequences less than this will be skipped. | |
4069 | |
4070 _confirm_significance Boolean. True if client has supplied significance criteria. | |
4071 | |
4072 _gapped Boolean. True if BLAST analysis has gapping turned on. | |
4073 | |
4074 _hits List of Sbjct.pm objects. | |
4075 | |
4076 _num_hits Number of hits obtained from the BLAST report. | |
4077 | |
4078 _num_hits_significant Number of significant based on Significant data members. | |
4079 | |
4080 _highestSignif Highest P or Expect value overall (not just what is stored in _hits). | |
4081 | |
4082 _lowestSignif Lowest P or Expect value overall (not just what is stored in _hits). | |
4083 | |
4084 The static $Blast object has a special set of members: | |
4085 | |
4086 _errs | |
4087 _share | |
4088 _stream | |
4089 _get_stats | |
4090 _gapped | |
4091 _filt_func | |
4092 | |
4093 Miscellaneous statistical parameters: | |
4094 ------------------------------------- | |
4095 _filter, _matrix, _word_size, _expect, _gapCreation, _gapExtension, _s, | |
4096 _lambda, _k, _h | |
4097 | |
4098 INHERITED DATA MEMBERS | |
4099 ----------------------- | |
4100 (See Bio::Tools::SeqAnal.pm for inherited data members.) | |
4101 | |
4102 =cut | |
4103 | |
4104 1; |