0
|
1 #----------------------------------------------------------------------------
|
|
2 # PACKAGE : Bio::Tools::Blast
|
|
3 # PURPOSE : To encapsulate code for running, parsing, and analyzing
|
|
4 # BLAST reports.
|
|
5 # AUTHOR : Steve Chervitz (sac@bioperl.org)
|
|
6 # CREATED : March 1996
|
|
7 # REVISION: $Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $
|
|
8 # STATUS : Alpha
|
|
9 #
|
|
10 # For the latest version and documentation, visit:
|
|
11 # http://bio.perl.org/Projects/Blast
|
|
12 #
|
|
13 # To generate documentation, run this module through pod2html
|
|
14 # (preferably from Perl v5.004 or better).
|
|
15 #
|
|
16 # Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved.
|
|
17 # This module is free software; you can redistribute it and/or
|
|
18 # modify it under the same terms as Perl itself.
|
|
19 #----------------------------------------------------------------------------
|
|
20
|
|
21 package Bio::Tools::Blast;
|
|
22 use strict;
|
|
23 use Exporter;
|
|
24
|
|
25 use Bio::Tools::SeqAnal;
|
|
26 use Bio::Root::Global qw(:std);
|
|
27 use Bio::Root::Utilities qw(:obj);
|
|
28
|
|
29 require 5.002;
|
|
30
|
|
31 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS
|
|
32 $ID $VERSION $Blast @Blast_programs $Revision $Newline);
|
|
33
|
|
34 @ISA = qw( Bio::Tools::SeqAnal Exporter);
|
|
35 @EXPORT = qw();
|
|
36 @EXPORT_OK = qw($VERSION $Blast);
|
|
37 %EXPORT_TAGS = ( obj => [qw($Blast)],
|
|
38 std => [qw($Blast)]);
|
|
39
|
|
40 $ID = 'Bio::Tools::Blast';
|
|
41 $VERSION = 0.09;
|
|
42 $Revision = '$Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $'; #'
|
|
43
|
|
44 ## Static Blast object.
|
|
45 $Blast = {};
|
|
46 bless $Blast, $ID;
|
|
47 $Blast->{'_name'} = "Static Blast object";
|
|
48
|
|
49 @Blast_programs = qw(blastp blastn blastx tblastn tblastx);
|
|
50
|
|
51 use vars qw($DEFAULT_MATRIX $DEFAULT_SIGNIF);
|
|
52 my $DEFAULT_MATRIX = 'BLOSUM62';
|
|
53 my $DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied.
|
|
54 my $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
|
|
55
|
|
56 ## POD Documentation:
|
|
57
|
|
58 =head1 NAME
|
|
59
|
|
60 Bio::Tools::Blast - Bioperl BLAST sequence analysis object
|
|
61
|
|
62 =head1 SYNOPSIS
|
|
63
|
|
64 =head2 Parsing Blast reports
|
|
65
|
|
66 Parse an existing Blast report from file:
|
|
67
|
|
68 use Bio::Tools::Blast;
|
|
69
|
|
70 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out',
|
|
71 -parse => 1,
|
|
72 -signif => '1e-10',
|
|
73 );
|
|
74
|
|
75 Parse an existing Blast report from STDIN:
|
|
76
|
|
77 $blastObj = Bio::Tools::Blast->new( -parse => 1,
|
|
78 -signif => '1e-10',
|
|
79 );
|
|
80
|
|
81 Then send a Blast report to your script via STDIN.
|
|
82
|
|
83 Full parameters for parsing Blast reports.
|
|
84
|
|
85 %blastParam = (
|
|
86 -run => \%runParam,
|
|
87 -file => '',
|
|
88 -parse => 1,
|
|
89 -signif => 1e-5,
|
|
90 -filt_func => \&my_filter,
|
|
91 -min_len => 15,
|
|
92 -check_all_hits => 0,
|
|
93 -strict => 0,
|
|
94 -stats => 1,
|
|
95 -best => 0,
|
|
96 -share => 0,
|
|
97 -exec_func => \&process_blast,
|
|
98 -save_array => \@blast_objs, # not used if -exce_func defined.
|
|
99 );
|
|
100
|
|
101 See L<parse()|parse> for a description of parameters and see L<USAGE | USAGE> for
|
|
102 more examples including how to parse streams containing multiple Blast
|
|
103 reports L<Using the Static $Blast Object>.
|
|
104
|
|
105 See L<Memory Usage Issues> for information about how to make Blast
|
|
106 parsing be more memory efficient.
|
|
107
|
|
108 =head2 Running Blast reports
|
|
109
|
|
110 Run a new Blast2 at NCBI and then parse it:
|
|
111
|
|
112 %runParam = (
|
|
113 -method => 'remote',
|
|
114 -prog => 'blastp',
|
|
115 -database => 'swissprot',
|
|
116 -seqs => [ $seq ], # Bio::Seq.pm objects.
|
|
117 );
|
|
118
|
|
119 $blastObj = Bio::Tools::Blast->new( -run => \%runParam,
|
|
120 -parse => 1,
|
|
121 -signif => '1e-10',
|
|
122 -strict => 1,
|
|
123 );
|
|
124
|
|
125 Full parameters for running Blasts at NCBI using Webblast.pm:
|
|
126
|
|
127 %runParam = (
|
|
128 -method => 'remote',
|
|
129 -prog => 'blastp',
|
|
130 -version => 2, # BLAST2
|
|
131 -database =>'swissprot',
|
|
132 -html => 0,
|
|
133 -seqs => [ $seqObject ], # Bio::Seq.pm object(s)
|
|
134 -descr => 250,
|
|
135 -align => 250,
|
|
136 -expect => 10,
|
|
137 -gap => 'on',
|
|
138 -matrix => 'PAM250',
|
|
139 -email => undef, # don't send report via e-mail if parsing.
|
|
140 -filter => undef, # use default
|
|
141 -gap_c => undef, # use default
|
|
142 -gap_e => undef, # use default
|
|
143 -word => undef, # use default
|
|
144 -min_len => undef, # use default
|
|
145 );
|
|
146
|
|
147 See L<run()|run> and L<USAGE | USAGE> for more information about running Blasts.
|
|
148
|
|
149 =head2 HTML-formatting Blast reports
|
|
150
|
|
151 Print an HTML-formatted version of a Blast report:
|
|
152
|
|
153 use Bio::Tools::Blast qw(:obj);
|
|
154
|
|
155 $Blast->to_html($filename);
|
|
156 $Blast->to_html(-file => $filename,
|
|
157 -header => "<H1>Blast Results</H1>");
|
|
158 $Blast->to_html(-file => $filename,
|
|
159 -out => \@array); # store output
|
|
160 $Blast->to_html(); # use STDIN
|
|
161
|
|
162 Results are sent directly to STDOUT unless an C<-out =E<gt> array_ref>
|
|
163 parameter is supplied. See L<to_html()|to_html> for details.
|
|
164
|
|
165 =head1 INSTALLATION
|
|
166
|
|
167 This module is included with the central Bioperl distribution:
|
|
168
|
|
169 http://bio.perl.org/Core/Latest
|
|
170 ftp://bio.perl.org/pub/DIST
|
|
171
|
|
172 Follow the installation instructions included in the README file.
|
|
173
|
|
174 =head1 DESCRIPTION
|
|
175
|
|
176 The Bio::Tools::Blast.pm module encapsulates data and methods for
|
|
177 running, parsing, and analyzing pre-existing BLAST reports. This
|
|
178 module defines an application programming interface (API) for working
|
|
179 with Blast reports. A Blast object is constructed from raw Blast
|
|
180 output and encapsulates the Blast results which can then be accessed
|
|
181 via the interface defined by the Blast object.
|
|
182
|
|
183 The ways in which researchers use Blast data are many and varied. This
|
|
184 module attempts to be general and flexible enough to accommodate
|
|
185 different uses. The Blast module API is still at an early stage of
|
|
186 evolution and I expect it to continue to evolve as new uses for Blast
|
|
187 data are developed. Your L<FEEDBACK | FEEDBACK> is welcome.
|
|
188
|
|
189 B<FEATURES:>
|
|
190
|
|
191 =over 2
|
|
192
|
|
193 =item * Supports NCBI Blast1.x, Blast2.x, and WashU-Blast2.x, gapped
|
|
194 and ungapped.
|
|
195
|
|
196 Can parse HTML-formatted as well as non-HTML-formatted reports.
|
|
197
|
|
198 =item * Launch new Blast analyses remotely or locally.
|
|
199
|
|
200 Blast objects can be constructed directly from the results of the
|
|
201 run. See L<run()|run>.
|
|
202
|
|
203 =item * Construct Blast objects from pre-existing files or from a new run.
|
|
204
|
|
205 Build a Blast object from a single file or build multiple Blast
|
|
206 objects from an input stream containing multiple reports. See
|
|
207 L<parse()|parse>.
|
|
208
|
|
209 =item * Add hypertext links from a BLAST report.
|
|
210
|
|
211 See L<to_html()|to_html>.
|
|
212
|
|
213 =item * Generate sequence and sequence alignment objects from HSP
|
|
214 sequences.
|
|
215
|
|
216 If you have Bio::Seq.pm and Bio::UnivAln.pm installed on your system,
|
|
217 they can be used for working with high-scoring segment pair (HSP)
|
|
218 sequences in the Blast alignment. (A new version of Bio::Seq.pm is
|
|
219 included in the distribution, see L<INSTALLATION | INSTALLATION>). For more
|
|
220 information about them, see:
|
|
221
|
|
222 http://bio.perl.org/Projects/Sequence/
|
|
223 http://bio.perl.org/Projects/SeqAlign/
|
|
224
|
|
225 =back
|
|
226
|
|
227 A variety of different data can be extracted from the Blast report by
|
|
228 querying the Blast.pm object. Some basic examples are given in the
|
|
229 L<USAGE | USAGE> section. For some working scripts, see the links provided in
|
|
230 the L<the DEMO SCRIPTS section | DEMO> section.
|
|
231
|
|
232 As a part of the incipient Bioperl framework, the Bio::Tools::Blast.pm
|
|
233 module inherits from B<Bio::Tools::SeqAnal.pm>, which provides some
|
|
234 generic functionality for biological sequence analysis. See the
|
|
235 documentation for that module for details
|
|
236 (L<Links to related modules>).
|
|
237
|
|
238 =head2 The BLAST Program
|
|
239
|
|
240 BLAST (Basic Local Alignment Search Tool) is a widely used algorithm
|
|
241 for performing rapid sequence similarity searches between a single DNA
|
|
242 or protein sequence and a large dataset of sequences. BLAST analyses
|
|
243 are typically performed by dedicated remote servers, such as the ones
|
|
244 at the NCBI. Individual groups may also run the program on local
|
|
245 machines.
|
|
246
|
|
247 The Blast family includes 5 different programs:
|
|
248
|
|
249 Query Seq Database
|
|
250 ------------ ----------
|
|
251 blastp -- protein protein
|
|
252 blastn -- nucleotide nucleotide
|
|
253 blastx -- nucleotide* protein
|
|
254 tblastn -- protein nucleotide*
|
|
255 tblastx -- nucleotide* nucleotide*
|
|
256
|
|
257 * = dynamically translated in all reading frames, both strands
|
|
258
|
|
259 See L<References & Information about the BLAST program>.
|
|
260
|
|
261 =head2 Versions Supported
|
|
262
|
|
263 BLAST reports generated by different application front ends are similar
|
|
264 but not exactly the same. Blast reports are not intended to be exchange formats,
|
|
265 making parsing software susceptible to obsolescence. This module aims to
|
|
266 support BLAST reports generated by different implementations:
|
|
267
|
|
268 Implementation Latest version tested
|
|
269 -------------- --------------------
|
|
270 NCBI Blast1 1.4.11 [24-Nov-97]
|
|
271 NCBI Blast2 2.0.8 [Jan-5-1999]
|
|
272 WashU-BLAST2 2.0a19MP [05-Feb-1998]
|
|
273 GCG 1.4.8 [1-Feb-95]
|
|
274
|
|
275 Support for both gapped and ungapped versions is included. Currently, there
|
|
276 is only rudimentary support for PSI-BLAST in that these reports can be parsed but
|
|
277 there is no special treatment of separate iteration rounds (they are all
|
|
278 merged together).
|
|
279
|
|
280 =head2 References & Information about the BLAST program
|
|
281
|
|
282 B<WEBSITES:>
|
|
283
|
|
284 http://www.ncbi.nlm.nih.gov/BLAST/ - Homepage at NCBI
|
|
285 http://www.ncbi.nlm.nih.gov/BLAST/blast_help.html - Help manual
|
|
286 http://blast.wustl.edu/ - WashU-Blast2
|
|
287
|
|
288 B<PUBLICATIONS:> (with PubMed links)
|
|
289
|
|
290 Altschul S.F., Gish W., Miller W., Myers E.W., Lipman D.J. (1990).
|
|
291 "Basic local alignment search tool", J Mol Biol 215: 403-410.
|
|
292
|
|
293 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2231712&form=6&db=m&Dopt=r
|
|
294
|
|
295 Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer,
|
|
296 Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997).
|
|
297 "Gapped BLAST and PSI-BLAST: a new generation of protein database
|
|
298 search programs", Nucleic Acids Res. 25:3389-3402.
|
|
299
|
|
300 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r
|
|
301
|
|
302 Karlin, Samuel and Stephen F. Altschul (1990). Methods for
|
|
303 assessing the statistical significance of molecular sequence
|
|
304 features by using general scoring schemes. Proc. Natl. Acad.
|
|
305 Sci. USA 87:2264-68.
|
|
306
|
|
307 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2315319&form=6&db=m&Dopt=b
|
|
308
|
|
309 Karlin, Samuel and Stephen F. Altschul (1993). Applications
|
|
310 and statistics for multiple high-scoring segments in molecu-
|
|
311 lar sequences. Proc. Natl. Acad. Sci. USA 90:5873-7.
|
|
312
|
|
313 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=8390686&form=6&db=m&Dopt=b
|
|
314
|
|
315 =head1 USAGE
|
|
316
|
|
317 =head2 Creating Blast objects
|
|
318
|
|
319 A Blast object can be constructed from the contents of a Blast report
|
|
320 using a set of named parameters that specify significance criteria for
|
|
321 parsing. The report data can be read in from an existing file
|
|
322 specified with the C<-file =E<gt> 'filename'> parameter or from a
|
|
323 STDIN stream containing potentially multiple Blast reports. If the
|
|
324 C<-file> parameter does not contain a valid filename, STDIN will be
|
|
325 used. Separate Blast objects will be created for each report in the
|
|
326 stream.
|
|
327
|
|
328 To parse the report, you must include a C<-parse =E<gt> 1> parameter
|
|
329 in addition to any other parsing parameters
|
|
330 See L<parse()|parse> for a full description of parsing parameters.
|
|
331 To run a new report and then parse it, include a C<-run =E<gt> \%runParams>
|
|
332 parameter containing a reference to a hash
|
|
333 that hold the parameters required by the L<run()|run> method.
|
|
334
|
|
335 The constructor for Blast objects is inherited from Bio::Tools::SeqAnal.pm.
|
|
336 See the B<_initialize>() method of that package for general information
|
|
337 relevant to creating Blast objects. (The B<new>() method, inherited from
|
|
338 B<Bio::Root::Object.pm>, calls B<_initialize>(). See L<Links to related modules>).
|
|
339
|
|
340 The Blast object can read compressed (gzipped) Blast report
|
|
341 files. Compression/decompression uses the gzip or compress programs
|
|
342 that are standard on Unix systems and should not require special
|
|
343 configuration. If you can't or don't want to use gzip as the file
|
|
344 compression tool, either pre-uncompress your files before parsing with
|
|
345 this module or modify B<Bio::Root::Utilities.pm> to your liking.
|
|
346
|
|
347 Blast objects can be generated either by direct instantiation as in:
|
|
348
|
|
349 use Bio::Tools::Blast;
|
|
350 $blast = new Bio::Tools::Blast (%parameters);
|
|
351
|
|
352 =head2 Using the Static $Blast Object
|
|
353
|
|
354 use Bio::Tools::Blast qw(:obj);
|
|
355
|
|
356 This exports the static $Blast object into your namespace. "Static"
|
|
357 refers to the fact that it has class scope and there is one of these
|
|
358 created when you use this module. The static $Blast object is
|
|
359 basically an empty object that is provided for convenience and is also
|
|
360 used for various internal chores.
|
|
361
|
|
362 It is exported by this module and can be used for
|
|
363 parsing and running reports as well as HTML-formatting without having
|
|
364 to first create an empty Blast object.
|
|
365
|
|
366 Using the static $Blast object for parsing a STDIN stream of Blast reports:
|
|
367
|
|
368 use Bio::Tools::Blast qw(:obj);
|
|
369
|
|
370 sub process_blast {
|
|
371 my $blastObj = shift;
|
|
372 print $blastObj->table();
|
|
373 $blastObj->destroy;
|
|
374 }
|
|
375
|
|
376 $Blast->parse( -parse => 1,
|
|
377 -signif => '1e-10',
|
|
378 -exec_func => \&process_blast,
|
|
379 );
|
|
380
|
|
381 Then pipe a stream of Blast reports into your script via STDIN. For
|
|
382 each Blast report extracted from the input stream, the parser will
|
|
383 generate a new Blast object and pass it to the function specified by
|
|
384 C<-exec_func>. The destroy() call tells Perl to free the memory
|
|
385 associated with the object, important if you are crunching through
|
|
386 many reports. This method is inherited from B<Bio::Root::Object.pm>
|
|
387 (see L<Links to related modules>). See L<parse()|parse> for a full
|
|
388 description of parameters and L<the DEMO SCRIPTS section | DEMO> for
|
|
389 additional examples.
|
|
390
|
|
391 =head2 Running Blasts
|
|
392
|
|
393 To run a Blast, create a new Blast object with a C<-run =E<gt>
|
|
394 \%runParams> parameter. Remote Blasts are performed by including a
|
|
395 C<-method =E<gt> 'remote'> parameter; local Blasts are performed by
|
|
396 including a C<-method =E<gt> 'local'> parameter. See
|
|
397 L<Running Blast reports> as well as the
|
|
398 L<the DEMO SCRIPTS section | DEMO> for examples.
|
|
399 Note that running local Blasts is not yet supported, see below.
|
|
400
|
|
401 Note that the C<-seqs =E<gt> [ $seqs ]> run parameter must contain a
|
|
402 reference to an array of B<Bio::Seq.pm> objects
|
|
403 (L<Links to related modules>). Encapsulating the sequence in an
|
|
404 object makes sequence information much easier to handle as it can
|
|
405 be supplied in a variety of formats. Bio::Seq.pm is included with
|
|
406 this distribution (L<INSTALLATION | INSTALLATION>).
|
|
407
|
|
408 Remote Blasts are implemented using the
|
|
409 B<Bio::Tools::Blast::Run::Webblast.pm> module. Local Blasts require
|
|
410 that you customize the B<Bio::Tools::Blast::Run::LocalBlast.pm>
|
|
411 module. The version of LocalBlast.pm included with this distribution
|
|
412 provides the basic framework for running local Blasts.
|
|
413 See L<Links to related modules>.
|
|
414
|
|
415 =head2 Significance screening
|
|
416
|
|
417 A C<-signif> parameter can be used to screen out all hits with
|
|
418 P-values (or Expect values) above a certain cutoff. For example, to
|
|
419 exclude all hits with Expect values above 1.0e-10: C<-signif =E<gt>
|
|
420 1e-10>. Providing a C<-signif> cutoff can speed up processing
|
|
421 tremendously, since only a small fraction of the report need be
|
|
422 parsed. This is because the C<-signif> value is used to screen hits
|
|
423 based on the data in the "Description" section of the Blast report:
|
|
424
|
|
425 For NCBI BLAST2 reports:
|
|
426
|
|
427 Score E
|
|
428 Sequences producing significant alignments: (bits) Value
|
|
429
|
|
430 sp|P31376|YAB1_YEAST HYPOTHETICAL 74.1 KD PROTEIN IN CYS3-MDM10... 957 0.0
|
|
431
|
|
432 For BLAST1 or WashU-BLAST2 reports:
|
|
433
|
|
434 Smallest
|
|
435 Sum
|
|
436 High Probability
|
|
437 Sequences producing High-scoring Segment Pairs: Score P(N) N
|
|
438
|
|
439 PDB:3PRK_E Proteinase K complexed with inhibitor ........... 504 1.8e-50 1
|
|
440
|
|
441 Thus, the C<-signif> parameter will screen based on Expect values for
|
|
442 BLAST2 reports and based on P-values for BLAST1/WashU-BLAST2 reports.
|
|
443
|
|
444 To screen based on other criteria, you can supply a C<-filt_func>
|
|
445 parameter containing a function reference that takes a
|
|
446 B<Bio::Tools::Sbjct.pm> object as an argument and returns a boolean,
|
|
447 true if the hit is to be screened out. See example below for
|
|
448 L<Screening hits using arbitrary criteria>.
|
|
449
|
|
450 =head2 Get the best hit.
|
|
451
|
|
452 $hit = $blastObj->hit;
|
|
453
|
|
454 A "hit" is contained by a B<Bio::Tools::Blast::Sbjct.pm> object.
|
|
455
|
|
456 =head2 Get the P-value or Expect value of the most significant hit.
|
|
457
|
|
458 $p = $blastObj->lowest_p;
|
|
459 $e = $blastObj->lowest_expect;
|
|
460
|
|
461 Alternatively:
|
|
462
|
|
463 $p = $blastObj->hit->p;
|
|
464 $e = $blastObj->hit->expect;
|
|
465
|
|
466 Note that P-values are not reported in NCBI Blast2 reports.
|
|
467
|
|
468 =head2 Iterate through all the hits
|
|
469
|
|
470 foreach $hit ($blastObj->hits) {
|
|
471 printf "%s\t %.1e\t %d\t %.2f\t %d\n",
|
|
472 $hit->name, $hit->expect, $hit->num_hsps,
|
|
473 $hit->frac_identical, $hit->gaps;
|
|
474 }
|
|
475
|
|
476 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm>
|
|
477 for other ways to work with hit objects (L<Links to related modules>).
|
|
478
|
|
479 =head2 Screening hits using arbitrary criteria
|
|
480
|
|
481 sub filter { $hit=shift;
|
|
482 return ($hit->gaps == 0 and
|
|
483 $hit->frac_conserved > 0.5); }
|
|
484
|
|
485 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out',
|
|
486 -parse => 1,
|
|
487 -filt_func => \&filter );
|
|
488
|
|
489 While the Blast object is parsing the report, each hit checked by calling
|
|
490 &filter($hit). All hits that generate false return values from &filter
|
|
491 are screened out and will not be added to the Blast object.
|
|
492 Note that the Blast object will normally stop parsing the report after
|
|
493 the first non-significant hit or the first hit that does not pass the
|
|
494 filter function. To force the Blast object to check all hits,
|
|
495 include a C<-check_all_hits =E<gt> 1> parameter.
|
|
496 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm>
|
|
497 for other ways to work with hit objects.
|
|
498
|
|
499 =over 4
|
|
500
|
|
501 =item Hit start, end coordinates.
|
|
502
|
|
503 print $sbjct->start('query');
|
|
504 print $sbjct->end('sbjct');
|
|
505
|
|
506 In array context, you can get information for both query and sbjct with one call:
|
|
507
|
|
508 ($qstart, $sstart) = $sbjct->start();
|
|
509 ($qend, $send) = $sbjct->end();
|
|
510
|
|
511 For important information regarding coordinate information, see
|
|
512 the L<HSP start, end, and strand> section below.
|
|
513 Also check out documentation for the start and end methods in B<Bio::Tools::Blast::Sbjct.pm>,
|
|
514 which explains what happens if there is more than one HSP.
|
|
515
|
|
516 =back
|
|
517
|
|
518 =head2 Working with HSPs
|
|
519
|
|
520 =over 4
|
|
521
|
|
522 =item Iterate through all the HSPs of every hit
|
|
523
|
|
524 foreach $hit ($blastObj->hits) {
|
|
525 foreach $hsp ($hit->hsps) {
|
|
526 printf "%.1e\t %d\t %.1f\t %.2f\t %.2f\t %d\t %d\n",
|
|
527 $hsp->expect, $hsp->score, $hsp->bits,
|
|
528 $hsp->frac_identical, $hsp->frac_conserved,
|
|
529 $hsp->gaps('query'), $hsp->gaps('sbjct');
|
|
530 }
|
|
531
|
|
532 Refer to the documentation for B<Bio::Tools::Blast::HSP.pm>
|
|
533 for other ways to work with hit objects (L<Links to related modules>).
|
|
534
|
|
535 =back
|
|
536
|
|
537 =over 4
|
|
538
|
|
539 =item Extract HSP sequence data as strings or sequence objects
|
|
540
|
|
541 Get the first HSP of the first hit and the sequences
|
|
542 of the query and sbjct as strings.
|
|
543
|
|
544 $hsp = $blast_obj->hit->hsp;
|
|
545 $query_seq = $hsp->seq_str('query');
|
|
546 $hsp_seq = $hsp->seq_str('sbjct');
|
|
547
|
|
548 Get the indices of identical and conserved positions in the HSP query seq.
|
|
549
|
|
550 @query_iden_indices = $hsp->seq_inds('query', 'identical');
|
|
551 @query_cons_indices = $hsp->seq_inds('query', 'conserved');
|
|
552
|
|
553 Similarly for the sbjct sequence.
|
|
554
|
|
555 @sbjct_iden_indices = $hsp->seq_inds('sbjct', 'identical');
|
|
556 @sbjct_cons_indices = $hsp->seq_inds('sbjct', 'conserved');
|
|
557
|
|
558 print "Query in Fasta format:\n", $hsp->seq('query')->layout('fasta');
|
|
559 print "Sbjct in Fasta format:\n", $hsp->seq('sbjct')->layout('fasta');
|
|
560
|
|
561 See the B<Bio::Seq.pm> package for more information about using these sequence objects
|
|
562 (L<Links to related modules>).
|
|
563
|
|
564 =back
|
|
565
|
|
566 =over 4
|
|
567
|
|
568 =item Create sequence alignment objects using HSP sequences
|
|
569
|
|
570 $aln = $hsp->get_aln;
|
|
571 print " consensus:\n", $aln->consensus();
|
|
572 print $hsp->get_aln->layout('fasta');
|
|
573
|
|
574 $ENV{READSEQ_DIR} = '/home/users/sac/bin/solaris';
|
|
575 $ENV{READSEQ} = 'readseq';
|
|
576 print $hsp->get_aln->layout('msf');
|
|
577
|
|
578 MSF formated layout requires Don Gilbert's ReadSeq program (not included).
|
|
579 See the B<Bio::UnivAln.pm> for more information about using these alignment objects
|
|
580 (L<Links to related modules>)'.
|
|
581
|
|
582 =back
|
|
583
|
|
584 =over 4
|
|
585
|
|
586 =item HSP start, end, and strand
|
|
587
|
|
588 To facilitate HSP processing, endpoint data for each HSP sequence are
|
|
589 normalized so that B<start is always less than end>. This affects TBLASTN
|
|
590 and TBLASTX HSPs on the reverse complement or "Minus" strand.
|
|
591
|
|
592 Some examples of obtaining start, end coordinates for HSP objects:
|
|
593
|
|
594 print $hsp->start('query');
|
|
595 print $hsp->end('sbjct');
|
|
596 ($qstart, $sstart) = $hsp->start();
|
|
597 ($qend, $send) = $hsp->end();
|
|
598
|
|
599 Strandedness of the HSP can be assessed using the strand() method
|
|
600 on the HSP object:
|
|
601
|
|
602 print $hsp->strand('query');
|
|
603 print $hsp->strand('sbjct');
|
|
604
|
|
605 These will return 'Minus' or 'Plus'.
|
|
606 Or, to get strand information for both query and sbjct with a single call:
|
|
607
|
|
608 ($qstrand, $sstrand) = $hsp->strand();
|
|
609
|
|
610 =back
|
|
611
|
|
612 =head2 Report Generation
|
|
613
|
|
614 =over 4
|
|
615
|
|
616 =item Generate a tab-delimited table of all results.
|
|
617
|
|
618 print $blastObj->table;
|
|
619 print $blastObj->table(0); # don't include hit descriptions.
|
|
620 print $blastObj->table_tiled;
|
|
621
|
|
622 The L<table()|table> method returns data for each B<HSP> of each hit listed one per
|
|
623 line. The L<table_tiled()|table_tiled> method returns data for each B<hit, i.e., Sbjct>
|
|
624 listed one per line; data from multiple HSPs are combined after tiling to
|
|
625 reduce overlaps. See B<Bio::Tools::Blast::Sbjct.pm> for more information about
|
|
626 HSP tiling. These methods generate stereotypical, tab-delimited data for each
|
|
627 hit of the Blast report. The output is suitable for importation into
|
|
628 spreadsheets or database tables. Feel free to roll your own table function if
|
|
629 you need a custom table.
|
|
630
|
|
631 For either table method, descriptions of each hit can be included if a
|
|
632 single, true argument is supplied (e.g., $blastObj-E<gt>table(1)). The description
|
|
633 will be added as the last field. This will significantly increase the size of
|
|
634 the table. Labels for the table columns can be obtained with L<table_labels()|table_labels>
|
|
635 and L<table_labels_tiled()|table_labels_tiled>.
|
|
636
|
|
637 =back
|
|
638
|
|
639 =over 4
|
|
640
|
|
641 =item Print a summary of the Blast report
|
|
642
|
|
643 $blastObj->display();
|
|
644 $blastObj->display(-show=>'hits');
|
|
645
|
|
646 L<display()|display> prints various statistics extracted from the Blast report
|
|
647 such as database name, database size, matrix used, etc. The
|
|
648 C<display(-show=E<gt>'hits')> call prints a non-tab-delimited table
|
|
649 attempting to line the data up into more readable columns. The output
|
|
650 generated is similar to L<table_tiled()|table_tiled>.
|
|
651
|
|
652 =back
|
|
653
|
|
654 =over 4
|
|
655
|
|
656 =item HTML-format an existing report
|
|
657
|
|
658 use Bio::Tools::Blast qw(:obj);
|
|
659
|
|
660 # Going straight from a non HTML report file to HTML output using
|
|
661 # the static $Blast object exported by Bio::Tools::Blast.pm
|
|
662 $Blast->to_html(-file => '/usr/people/me/blast.output.txt',
|
|
663 -header => qq|<H1>BLASTP Results</H1><A HREF="home.html">Home</A>|
|
|
664 );
|
|
665
|
|
666 # You can also use a specific Blast object created previously.
|
|
667 $blastObj->to_html;
|
|
668
|
|
669 L<to_html()|to_html> will send HTML output, line-by-line, directly to STDOUT
|
|
670 unless an C<-out =E<gt> array_ref> parameter is supplied (e.g., C<-out
|
|
671 =E<gt> \@array>), in which case the HTML will be stored in @array, one
|
|
672 line per array element. The direct outputting permits faster response
|
|
673 time since Blast reports can be huge. The -header tag can contain a
|
|
674 string containing any HTML that you want to appear at the top of the
|
|
675 Blast report.
|
|
676
|
|
677 =back
|
|
678
|
|
679 =head1 DEMO SCRIPTS
|
|
680
|
|
681 Sample Scripts are included in the central bioperl distribution in the
|
|
682 'examples/blast/' directory (see L<INSTALLATION | INSTALLATION>):
|
|
683
|
|
684 =head2 Handy library for working with Bio::Tools::Blast.pm
|
|
685
|
|
686 examples/blast/blast_config.pl
|
|
687
|
|
688 =head2 Parsing Blast reports one at a time.
|
|
689
|
|
690 examples/blast/parse_blast.pl
|
|
691 examples/blast/parse_blast2.pl
|
|
692 examples/blast/parse_positions.pl
|
|
693
|
|
694 =head2 Parsing sets of Blast reports.
|
|
695
|
|
696 examples/blast/parse_blast.pl
|
|
697 examples/blast/parse_multi.pl
|
|
698
|
|
699 B<Warning:> See note about L<Memory Usage Issues>.
|
|
700
|
|
701 =head2 Running Blast analyses one at a time.
|
|
702
|
|
703 examples/blast/run_blast_remote.pl
|
|
704
|
|
705 =head2 Running Blast analyses given a set of sequences.
|
|
706
|
|
707 examples/blast/blast_seq.pl
|
|
708
|
|
709 =head2 HTML-formatting Blast reports.
|
|
710
|
|
711 examples/blast/html.pl
|
|
712
|
|
713 =head1 TECHNICAL DETAILS
|
|
714
|
|
715 =head2 Blast Modes
|
|
716
|
|
717 A BLAST object may be created using one of three different modes as
|
|
718 defined by the B<Bio::Tools::SeqAnal.pm> package
|
|
719 (See L<Links to related modules>):
|
|
720
|
|
721 -- parse - Load a BLAST report and parse it, storing parsed data in
|
|
722 Blast.pm object.
|
|
723 -- run - Run the BLAST program to generate a new report.
|
|
724 -- read - Load a BLAST report into the Blast object without parsing.
|
|
725
|
|
726 B<Run mode support has recently been added>. The module
|
|
727 B<Bio::Tools::Blast::Run::Webblast.pm> is an modularized adaptation of
|
|
728 the webblast script by Alex Dong Li:
|
|
729
|
|
730 http://www.genet.sickkids.on.ca/bioinfo_resources/software.html#webblast
|
|
731
|
|
732 for running remote Blast analyses and saving the results locally. Run
|
|
733 mode can be combined with a parse mode to generate a Blast report and
|
|
734 then build the Blast object from the parsed results of this report
|
|
735 (see L<run()|run> and L<SYNOPSIS | SYNOPSIS>).
|
|
736
|
|
737 In read mode, the BLAST report is read in by the Blast object but is
|
|
738 not parsed. This could be used to internalize a Blast report but not
|
|
739 parse it for results (e.g., generating HTML formatted output).
|
|
740
|
|
741 =head2 Significant Hits
|
|
742
|
|
743 This module permits the screening of hits on the basis of
|
|
744 user-specified criteria for significance. Currently, Blast reports can
|
|
745 be screened based on:
|
|
746
|
|
747 CRITERIA PARAMETER VALUE
|
|
748 ---------------------------------- --------- ----------------
|
|
749 1) the best Expect (or P) value -signif float or sci-notation
|
|
750 2) the length of the query sequence -min_length integer
|
|
751 3) arbitrary criteria -filt_func function reference
|
|
752
|
|
753 The parameters are used for construction of the BLAST object or when
|
|
754 running the L<parse()|parse> method on the static $Blast object. The
|
|
755 -SIGNIF value represents the number listed in the description section
|
|
756 at the top of the Blast report. For Blast2, this is an Expect value,
|
|
757 for Blast1 and WashU-Blast2, this is a P-value. The idea behind the
|
|
758 C<-filt_func> parameter is that the hit has to pass through a filter
|
|
759 to be considered significant. Refer to the documentation for
|
|
760 B<Bio::Tools::Blast::Sbjct.pm> for ways to work with hit objects.
|
|
761
|
|
762 Using a C<-signif> parameter allows for the following:
|
|
763
|
|
764 =over 2
|
|
765
|
|
766 =item Faster parsing.
|
|
767
|
|
768 Each hit can be screened by examination of the description line alone
|
|
769 without fully parsing the HSP alignment section.
|
|
770
|
|
771 =item Flexibility.
|
|
772
|
|
773 The C<-signif> tag provides a more semantic-free way to specify the
|
|
774 value to be used as a basis for screening hits. Thus, C<-signif> can
|
|
775 be used for screening Blast1 or Blast2 reports. It is up to the user
|
|
776 to understand whether C<-signif> represents a P-value or an Expect
|
|
777 value.
|
|
778
|
|
779 =back
|
|
780
|
|
781 Any hit not meeting the significance criteria will not be added to the
|
|
782 "hit list" of the BLAST object. Also, a BLAST object without any hits
|
|
783 meeting the significance criteria will throw an exception during
|
|
784 object construction (a fatal event).
|
|
785
|
|
786 =head2 Statistical Parameters
|
|
787
|
|
788 There are numerous parameters which define the behavior of the BLAST
|
|
789 program and which are useful for interpreting the search
|
|
790 results. These parameters are extracted from the Blast report:
|
|
791
|
|
792 filter -- for masking out low-complexity sequences or short repeats
|
|
793 matrix -- name of the substitution scoring matrix (e.g., BLOSUM62)
|
|
794 E -- Expect filter (screens out frequent scores)
|
|
795 S -- Cutoff score for segment pairs
|
|
796 W -- Word length
|
|
797 T -- Threshold score for word pairs
|
|
798 Lambda, -- Karlin-Altschul "sum" statistical parameters dependent on
|
|
799 K, H sequence composition.
|
|
800 G -- Gap creation penalty.
|
|
801 E -- Gap extension penalty.
|
|
802
|
|
803 These parameters are not always needed. Extraction may be turned off
|
|
804 explicitly by including a C<-stats =E<gt> 0> parameter during object
|
|
805 construction. Support for all statistical parameters is not complete.
|
|
806
|
|
807 For more about the meaning of parameters, check out the NCBI URLs given above.
|
|
808
|
|
809 =head2 Module Organization
|
|
810
|
|
811 The modules that comprise this Bioperl Blast distribution are location in the
|
|
812 Bio:: hierarchy as shown in the diagram below.
|
|
813
|
|
814 Bio/
|
|
815 |
|
|
816 +--------------------------+
|
|
817 | |
|
|
818 Bio::Tools Bio::Root
|
|
819 | |
|
|
820 +----------------------+ Object.pm
|
|
821 | | |
|
|
822 SeqAnal.pm Blast.pm Blast/
|
|
823 |
|
|
824 +---------+---------+------------+
|
|
825 | | | |
|
|
826 Sbjct.pm HSP.pm HTML.pm Run/
|
|
827 |
|
|
828 +------------+
|
|
829 | |
|
|
830 Webblast.pm LocalBlast.pm
|
|
831
|
|
832 Bio::Tools::Blast.pm is a concrete class that inherits from
|
|
833 B<Bio::Tools::SeqAnal.pm> and relies on other modules for parsing and
|
|
834 managing BLAST data. Worth mentioning about this hierarchy is the
|
|
835 lack of a "Parse.pm" module. Since parsing is considered central to
|
|
836 the purpose of the Bioperl Blast module (and Bioperl in general), it
|
|
837 seems somewhat unnatural to segregate out all parsing code. This
|
|
838 segregation could also lead to inefficiencies and harder to maintain
|
|
839 code. I consider this issue still open for debate.
|
|
840
|
|
841 Bio::Tools::Blast.pm, B<Bio::Tools::Blast::Sbjct.pm>, and
|
|
842 B<Bio::Tools::Blast::HSP.pm> are mostly dedicated to parsing and all
|
|
843 can be used to instantiate objects. Blast.pm is the main "command and
|
|
844 control" module, inheriting some basic behaviors from SeqAnal.pm
|
|
845 (things that are not specific to Blast I<per se>).
|
|
846
|
|
847 B<Bio::Tools::Blast::HTML.pm> contains functions dedicated to
|
|
848 generating HTML-formatted Blast reports and does not generate objects.
|
|
849
|
|
850 =head2 Running Blasts: Details
|
|
851
|
|
852 B<Bio::Tools::Blast::Run::Webblast.pm> contains a set of functions for
|
|
853 running Blast analyses at a remote server and also does not
|
|
854 instantiate objects. It uses a helper script called postclient.pl,
|
|
855 located in the Run directory. The proposed LocalBlast.pm module would
|
|
856 be used for running Blast reports on local machines and thus would be
|
|
857 customizable for different sites. It would operate in a parallel
|
|
858 fashion to Webblast.pm (i.e., being a collection of functions, taking
|
|
859 in sequence objects or files, returning result files).
|
|
860
|
|
861 The Run modules are considered experimental. In particular,
|
|
862 Webblast.pm catures an HTML-formatted version of the Blast report from
|
|
863 the NCBI server and strips out the HTML in preparation for parsing. A
|
|
864 more direct approach would be to capture the Blast results directly
|
|
865 from the server using an interface to the NCBI toolkit. This approach
|
|
866 was recently proposed on the Bioperl mailing list:
|
|
867 http://www.uni-bielefeld.de/mailinglists/BCD/vsns-bcd-perl/9805/0000.html
|
|
868
|
|
869 =head2 Memory Usage Issues
|
|
870
|
|
871 Parsing large numbers of Blast reports (a few thousand or so) with
|
|
872 Bio::Tools::Blast.pm may lead to unacceptable memory usage situations.
|
|
873 This is somewhat dependent of the size and complexity of the reports.
|
|
874
|
|
875 While this problem is under investigation, here are some workarounds
|
|
876 that fix the memory usage problem:
|
|
877
|
|
878 =over 4
|
|
879
|
|
880 =item 1 Don't specify a -signif criterion when calling L<parse()|parse>.
|
|
881
|
|
882 The C<-signif> value is used for imposing a upper limit to the expect- or
|
|
883 P-value for Blast hits to be parsed. For reasons that are still under
|
|
884 investigation, specifying a value for C<-signif> in the L<parse()|parse>
|
|
885 method prevents Blast objects from being fully
|
|
886 garbage collected. When using the B<parse_blast.pl> or B<parse_multi.pl>
|
|
887 scripts in C<examples/blast/> of the bioperl distribution), don't supply
|
|
888 a C<-signif> command-line parameter.
|
|
889
|
|
890 =item 2 If you want to impose a -signif criterion, put it inside a
|
|
891 -filt_func.
|
|
892
|
|
893 For the L<parse()|parse> method, a -signif =E<gt> 1e-5 parameter is equivalent
|
|
894 to using a filter function parameter of
|
|
895
|
|
896 -filt_func => sub { my $hit = shift; return $hit->signif <= 1e-5; }
|
|
897
|
|
898 Using the B<examples/blast/parse_multi.pl> script, you can supply a
|
|
899 command-line argument of
|
|
900
|
|
901 -filt_func '$hit->signif <= 1e-5'
|
|
902
|
|
903 For more information, see L<parse()|parse> and the section
|
|
904 L<Screening hits using arbitrary criteria>.
|
|
905
|
|
906 =back
|
|
907
|
|
908 =head1 TODO
|
|
909
|
|
910 =over 4
|
|
911
|
|
912 =item * Develop a functional, prototype Bio::Tools::Blast::Run::LocalBlast.pm module.
|
|
913
|
|
914 =item * Add support for PSI-BLAST and PHI-BLAST
|
|
915
|
|
916 =item * Parse histogram of expectations and retrieve gif image in
|
|
917 Blast report (if present).
|
|
918
|
|
919 =item * Further investigate memory leak that occurs when parsing Blast
|
|
920 streams whe supplying a -signif parameter to L<parse()|parse>.
|
|
921
|
|
922 =item * Access Blast results directly from the NCBI server using a
|
|
923 Perl interface to the NCBI toolkit or XML formated Blast reports (when
|
|
924 available).
|
|
925
|
|
926 =item * Further exploit Bio::UnivAln.pm and multiple-sequence
|
|
927 alignment programs using HSP sequence data. Some of this may best go
|
|
928 into a separate, dedicated module or script as opposed to burdening
|
|
929 Blast.pm, Sbjct.pm, and HSP.pm with additional functionality that is
|
|
930 not always required.
|
|
931
|
|
932 =item * Add an example script for parsing Blast reports containing
|
|
933 HTML formatting.
|
|
934
|
|
935 =back
|
|
936
|
|
937 =head1 VERSION
|
|
938
|
|
939 Bio::Tools::Blast.pm, 0.09
|
|
940
|
|
941 =head1 FEEDBACK
|
|
942
|
|
943 =head2 Mailing Lists
|
|
944
|
|
945 User feedback is an integral part of the evolution of this and other
|
|
946 Bioperl modules. Send your comments and suggestions preferably to one
|
|
947 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
948
|
|
949 bioperl-l@bioperl.org - General discussion
|
|
950 http://bio.perl.org/MailList.html - About the mailing lists
|
|
951
|
|
952 =head2 Reporting Bugs
|
|
953
|
|
954 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
955 the bugs and their resolution. Bug reports can be submitted via email
|
|
956 or the web:
|
|
957
|
|
958 bioperl-bugs@bio.perl.org
|
|
959 http://bugzilla.bioperl.org/
|
|
960
|
|
961 =head1 AUTHOR
|
|
962
|
|
963 Steve Chervitz, sac@bioperl.org
|
|
964
|
|
965 See the L<FEEDBACK | FEEDBACK> section for where to send bug reports and comments.
|
|
966
|
|
967 =head1 ACKNOWLEDGEMENTS
|
|
968
|
|
969 This module was developed under the auspices of the Saccharomyces Genome
|
|
970 Database:
|
|
971 http://genome-www.stanford.edu/Saccharomyces
|
|
972
|
|
973 Other contributors include: Alex Dong Li (webblast), Chris Dagdigian
|
|
974 (Seq.pm), Steve Brenner (Seq.pm), Georg Fuellen (Seq.pm, UnivAln.pm),
|
|
975 and untold others who have offered comments (noted in the
|
|
976 Bio/Tools/Blast/CHANGES file of the distribution).
|
|
977
|
|
978 =head1 COPYRIGHT
|
|
979
|
|
980 Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. This
|
|
981 module is free software; you can redistribute it and/or modify it
|
|
982 under the same terms as Perl itself.
|
|
983
|
|
984 =head1 SEE ALSO
|
|
985
|
|
986 Bio::Tools::SeqAnal.pm - Sequence analysis object base class.
|
|
987 Bio::Tools::Blast::Sbjct.pm - Blast hit object.
|
|
988 Bio::Tools::Blast::HSP.pm - Blast HSP object.
|
|
989 Bio::Tools::Blast::HTML.pm - Blast HTML-formating utility class.
|
|
990 Bio::Tools::Blast::Run::Webblast.pm - Utility module for running Blasts remotely.
|
|
991 Bio::Tools::Blast::Run::LocalBlast.pm - Utility module for running Blasts locally.
|
|
992 Bio::Seq.pm - Biosequence object
|
|
993 Bio::UnivAln.pm - Biosequence alignment object.
|
|
994 Bio::Root::Object.pm - Proposed base class for all Bioperl objects.
|
|
995
|
|
996 =head2 Links to related modules
|
|
997
|
|
998 Bio::Tools::SeqAnal.pm
|
|
999 http://bio.perl.org/Core/POD/Bio/Tools/SeqAnal.html
|
|
1000
|
|
1001 Bio::Tools::Blast::Sbjct.pm
|
|
1002 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Sbjct.html
|
|
1003
|
|
1004 Bio::Tools::Blast::HSP.pm
|
|
1005 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HSP.html
|
|
1006
|
|
1007 Bio::Tools::Blast::HTML.pm
|
|
1008 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HTML.html
|
|
1009
|
|
1010 Bio::Tools::Blast::Run::Webblast.pm
|
|
1011 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/Webblast.html
|
|
1012
|
|
1013 Bio::Tools::Blast::Run::LocalBlast.pm
|
|
1014 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/LocalBlast.html
|
|
1015
|
|
1016 Bio::Seq.pm
|
|
1017 http://bio.perl.org/Core/POD/Seq.html
|
|
1018
|
|
1019 Bio::UnivAln.pm
|
|
1020 http://bio.perl.org/Projects/SeqAlign/
|
|
1021 Europe: http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/#univaln
|
|
1022
|
|
1023 Bio::Root::Object.pm
|
|
1024 http://bio.perl.org/Core/POD/Root/Object.html
|
|
1025
|
|
1026 http://bio.perl.org/Projects/modules.html - Online module documentation
|
|
1027 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project
|
|
1028 http://bio.perl.org/ - Bioperl Project Homepage
|
|
1029
|
|
1030 L<References & Information about the BLAST program>.
|
|
1031
|
|
1032 =head1 KNOWN BUGS
|
|
1033
|
|
1034 There is a memory leak that occurs when parsing parsing streams
|
|
1035 containing large numbers of Blast reports (a few thousand or so) and
|
|
1036 specifying a -signif parameter to the L<parse()|parse> method. For a
|
|
1037 workaround, see L<Memory Usage Issues>.
|
|
1038
|
|
1039 Not sharing statistical parameters between different Blast objects
|
|
1040 when parsing a multi-report stream has not been completely tested and
|
|
1041 may be a little buggy.
|
|
1042
|
|
1043 Documentation inconsistencies or inaccuracies may exist since this
|
|
1044 module underwend a fair bit of re-working going from 0.75 to 0.80
|
|
1045 (corresponds to versions 0.04.4 to 0.05 of the bioperl distribution).
|
|
1046
|
|
1047 =cut
|
|
1048
|
|
1049 #
|
|
1050 ##
|
|
1051 ###
|
|
1052 #### END of main POD documentation.
|
|
1053 ###
|
|
1054 ##
|
|
1055 #
|
|
1056
|
|
1057 =head1 APPENDIX
|
|
1058
|
|
1059 Methods beginning with a leading underscore are considered private and
|
|
1060 are intended for internal use by this module. They are B<not>
|
|
1061 considered part of the public interface and are described here for
|
|
1062 documentation purposes only.
|
|
1063
|
|
1064 =cut
|
|
1065
|
|
1066 ##############################################################################
|
|
1067 ## CONSTRUCTOR ##
|
|
1068 ##############################################################################
|
|
1069
|
|
1070
|
|
1071 sub new {
|
|
1072 my ($class,@args) = @_;
|
|
1073 $class->warn("Bio::Tools::BLAST is deprecated, use Bio::SearchIO system or Bio::Tools::BPlite");
|
|
1074 return $class->SUPER::new(@args);
|
|
1075 }
|
|
1076
|
|
1077 ## The Blast.pm object relies on the the superclass constructor:
|
|
1078 ## Bio::Tools::SeqAnal::_initialize(). See that module for details.
|
|
1079
|
|
1080 #-------------
|
|
1081 sub destroy {
|
|
1082 #-------------
|
|
1083 my $self=shift;
|
|
1084 $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}";
|
|
1085 if($self->{'_hits'}) {
|
|
1086 foreach($self->hits) {
|
|
1087 $_->destroy;
|
|
1088 undef $_;
|
|
1089 }
|
|
1090 undef $self->{'_hits'};
|
|
1091 #$self->{'_hits'}->remove_all; ## When and if this member becomes a vector.
|
|
1092 }
|
|
1093
|
|
1094 $self->SUPER::destroy;
|
|
1095 }
|
|
1096
|
|
1097 #####################################################################################
|
|
1098 ## ACCESSORS ##
|
|
1099 #####################################################################################
|
|
1100
|
|
1101 =head2 run
|
|
1102
|
|
1103 Usage : $object->run( %named_parameters )
|
|
1104 Purpose : Run a local or remote Blast analysis on one or more sequences.
|
|
1105 Returns : String containing name of Blast output file if a single Blast
|
|
1106 : is run.
|
|
1107 : -- OR --
|
|
1108 : List of Blast objects if multiple Blasts are being run as a group.
|
|
1109 Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE).
|
|
1110 : -METHOD => 'local' or 'remote' (default = remote),
|
|
1111 : -PARSE => boolean, (true if the results are to be parsed after the run)
|
|
1112 : -STRICT => boolean, the strict mode to use for the resulting Blast objects.
|
|
1113 : ADDITIONAL PARAMETERS:
|
|
1114 : See methods _run_remote() and _run_local() for required
|
|
1115 : parameters necessary for running the blast report.
|
|
1116 Throws : Exception if no Blast output file was obtained.
|
|
1117 Comments : This method is called automatically during construction of a
|
|
1118 : Blast.pm object when run parameters are sent to the constructor:
|
|
1119 : $blastObj = new Bio::Tools::Blast (-RUN =>\%runParam,
|
|
1120 : %parseParam );
|
|
1121 :
|
|
1122 : The specific run methods (local or remote) called by run()
|
|
1123 : must return a list containing the file name(s) with the Blast output.
|
|
1124 :
|
|
1125 : The run() method can perform single or multiple Blast runs
|
|
1126 : (analogous to the way parse() works) depending on how many
|
|
1127 : sequences are submitted. However, the running of multiple
|
|
1128 : Blasts is probably better handled at the script level. See notes in
|
|
1129 : the "TODO" section below.
|
|
1130 :
|
|
1131 : As for what to do with the Blast result file, that decision is
|
|
1132 : left for the user who can direct the Blast object to delete, compress,
|
|
1133 : or leave it alone.
|
|
1134 :
|
|
1135 : This method does not worry about load balancing, which
|
|
1136 : is probably best handled at the server level.
|
|
1137 :
|
|
1138 TODO: : Support for running+parsing multiple Blast analyses with a
|
|
1139 : single run() call is incomplete. One can generate multiple
|
|
1140 : reports by placing more than one sequence object in the -seqs
|
|
1141 : reference parameter. This saves some overhead in the code
|
|
1142 : that executes the Blasts since all options are configured once.
|
|
1143 : (This is analogous to parsing using the static $Blast object
|
|
1144 : see parse() and _parse_stream()).
|
|
1145 :
|
|
1146 : The trouble is that Blast objects for all runs are constructed,
|
|
1147 : parsed (if necessary), and then returned as a group
|
|
1148 : This can require lots of memory when run+parsing many Blasts
|
|
1149 : but should be fine if you just want to run a bunch Blasts.
|
|
1150 :
|
|
1151 : For now, when running+parsing Blasts, stick to running one
|
|
1152 : Blast at a time, building the Blast object with the results
|
|
1153 : of that report, and processing as necessary.
|
|
1154 :
|
|
1155 : Support for running PSI-Blast is not complete.
|
|
1156
|
|
1157 See Also: L<_run_remote()|_run_remote>, L<_run_local()|_run_local>, L<parse()|parse>
|
|
1158
|
|
1159 =cut
|
|
1160
|
|
1161 #---------
|
|
1162 sub run {
|
|
1163 #---------
|
|
1164 my ($self, %param) = @_;
|
|
1165 my($method, $parse, $strict) =
|
|
1166 $self->_rearrange([qw(METHOD PARSE STRICT)], %param);
|
|
1167
|
|
1168 $strict = $self->strict($strict) if $strict;
|
|
1169
|
|
1170 my (@files);
|
|
1171 if($method =~ /loc/i) {
|
|
1172 @files = $self->_run_local(%param);
|
|
1173
|
|
1174 } else {
|
|
1175 @files = $self->_run_remote(%param);
|
|
1176 }
|
|
1177
|
|
1178 $self->throw("Run Blast failed: no Blast output created.") if !@files;
|
|
1179
|
|
1180 if(scalar(@files) == 1) {
|
|
1181 # If there was just one Blast output file, prepare to incorporate it
|
|
1182 # into the current Blast object. run() is called before parse() in the
|
|
1183 # SeqAnal.pm constructor.
|
|
1184 if($files[0] ne 'email') {
|
|
1185 $self->file($files[0]);
|
|
1186 } else {
|
|
1187 # Can't do anything with the report.
|
|
1188 $self->throw("Blast report to be sent via e-mail.");
|
|
1189 }
|
|
1190
|
|
1191 } else {
|
|
1192 # If there are multiple report files, build individual Blast objects foreach.
|
|
1193 # In this situation, the static $Blast object is being used to run
|
|
1194 # a set of related Blasts, similar to the way parse() can be used.
|
|
1195 # This strategy is not optimal since all reports are generated first
|
|
1196 # before any are parsed.
|
|
1197 # Untested.
|
|
1198
|
|
1199 my(@objs);
|
|
1200 foreach(@files) {
|
|
1201 push @objs, new Bio::Tools::Blast(-FILE => $_,
|
|
1202 -PARSE => $parse || 0,
|
|
1203 -STRICT => $strict,
|
|
1204 );
|
|
1205 }
|
|
1206 return @objs;
|
|
1207 }
|
|
1208 }
|
|
1209
|
|
1210 =head2 _run_remote
|
|
1211
|
|
1212 Usage : n/a; internal method called by run()
|
|
1213 : $object->_run_remote( %named_parameters )
|
|
1214 Purpose : Run Blast on a remote server.
|
|
1215 Argument : Named parameters:
|
|
1216 : See documentation for function &blast_remote in
|
|
1217 : Bio::Tools::Blast::Run::Webblast.pm for description
|
|
1218 : of parameters.
|
|
1219 Comments : This method requires the Bio::Tools::Blast::Run::Webblast.pm
|
|
1220 : which conforms to this minimal API:
|
|
1221 : * export a method called &blast_remote that accepts a
|
|
1222 : Bio::Tools::Blast.pm object + named parameters
|
|
1223 : (specified in the Webblast.pm module).
|
|
1224 : * return a list of names of files containing the raw Blast reports.
|
|
1225 : (When building a Blast object, this list would contain a
|
|
1226 : single file from which the Blast object is to be constructed).
|
|
1227
|
|
1228 See Also : L<run()|run>, L<_run_local()|_run_local>, B<Bio::Tools::Blast::Run::Webblast.pm::blast_remote()>, L<Links to related modules>
|
|
1229
|
|
1230 =cut
|
|
1231
|
|
1232 #----------------
|
|
1233 sub _run_remote {
|
|
1234 #----------------
|
|
1235 my ($self, %param) = @_;
|
|
1236
|
|
1237 require Bio::Tools::Blast::Run::Webblast;
|
|
1238 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_remote));
|
|
1239
|
|
1240 &blast_remote($self, %param);
|
|
1241 }
|
|
1242
|
|
1243 =head2 _run_local
|
|
1244
|
|
1245 Usage : n/a; internal method called by run()
|
|
1246 : $object->_run_local(%named_parameters)
|
|
1247 Purpose : Run Blast on a local machine.
|
|
1248 Argument : Named parameters:
|
|
1249 : See documentation for function &blast_local in
|
|
1250 : Bio::Tools::Blast::Run::LocalBlast.pm for description
|
|
1251 : of parameters.
|
|
1252 Comments : This method requires the Bio::Tools::Blast::Run::LocalBlast.pm
|
|
1253 : module which should be customized for your site. This module would
|
|
1254 : contain all the commands, paths, environment variables, and other
|
|
1255 : data necessary to run Blast commands on a local machine, but should
|
|
1256 : not contain any semantics for specific query sequences.
|
|
1257 :
|
|
1258 : LocalBlast.pm should also conform to this minimal API:
|
|
1259 : * export a method called &blast_local that accepts a
|
|
1260 : Bio::Tools::Blast.pm object + named parameters
|
|
1261 : (specified in the LocalBlast.pm module).
|
|
1262 : * return a list of names of files containing the raw Blast reports.
|
|
1263 : (When building a Blast object, this list would contain a
|
|
1264 : single file from which the Blast object is to be constructed).
|
|
1265
|
|
1266 See Also : L<run()|run>, L<_run_remote()|_run_remote>, B<Bio::Tools::Blast::Run::LocalBlast::blast_local()>, L<Links to related modules>
|
|
1267
|
|
1268 =cut
|
|
1269
|
|
1270 #--------------
|
|
1271 sub _run_local {
|
|
1272 #--------------
|
|
1273 my ($self, %param) = @_;
|
|
1274
|
|
1275 require Bio::Tools::Blast::Run::Webblast;
|
|
1276 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_local));
|
|
1277
|
|
1278 &blast_local($self, %param);
|
|
1279 }
|
|
1280
|
|
1281 =head2 db_remote
|
|
1282
|
|
1283 Usage : @dbs = $Blast->db_remote( [seq_type] );
|
|
1284 Purpose : Get a list of available sequence databases for remote Blast analysis.
|
|
1285 Returns : Array of strings
|
|
1286 Argument : seq_type = 'p' or 'n'
|
|
1287 : 'p' = Gets databases for peptide searches (default)
|
|
1288 : 'n' = Gets databases for nucleotide searches
|
|
1289 Throws : n/a
|
|
1290 Comments : Peptide databases are a subset of the nucleotide databases.
|
|
1291 : It is convenient to call this method on the static $Blast object
|
|
1292 : as shown in Usage.
|
|
1293
|
|
1294 See Also : L<db_local()|db_local>
|
|
1295
|
|
1296 =cut
|
|
1297
|
|
1298 #----------------
|
|
1299 sub db_remote {
|
|
1300 #----------------
|
|
1301 my ($self, $type) = @_;
|
|
1302 $type ||= 'p';
|
|
1303
|
|
1304 require Bio::Tools::Blast::Run::Webblast;
|
|
1305 Bio::Tools::Blast::Run::Webblast->import(qw(@Blast_dbp_remote
|
|
1306 @Blast_dbn_remote));
|
|
1307
|
|
1308 # We shouldn't have to fully qualify the Blast_dbX_remote arrays. Hm.
|
|
1309
|
|
1310 my(@dbs);
|
|
1311 if( $type =~ /^p|amino/i) {
|
|
1312 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbp_remote;
|
|
1313 } else {
|
|
1314 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbn_remote;
|
|
1315 }
|
|
1316 @dbs;
|
|
1317 }
|
|
1318
|
|
1319 =head2 db_local
|
|
1320
|
|
1321 Usage : @dbs = $Blast->db_local( [seq_type] );
|
|
1322 Purpose : Get a list of available sequence databases for local Blast analysis.
|
|
1323 Returns : Array of strings
|
|
1324 Argument : seq_type = 'p' or 'n'
|
|
1325 : 'p' = Gets databases for peptide searches (default)
|
|
1326 : 'n' = Gets databases for nucleotide searches
|
|
1327 Throws : n/a
|
|
1328 Comments : Peptide databases are a subset of the nucleotide databases.
|
|
1329 : It is convenient to call this method on the static $Blast object.
|
|
1330 as shown in Usage.
|
|
1331
|
|
1332 See Also : L<db_remote()|db_remote>
|
|
1333
|
|
1334 =cut
|
|
1335
|
|
1336 #----------------
|
|
1337 sub db_local {
|
|
1338 #----------------
|
|
1339 my ($self, $type) = @_;
|
|
1340 $type ||= 'p';
|
|
1341
|
|
1342 require Bio::Tools::Blast::Run::LocalBlast;
|
|
1343 Bio::Tools::Blast::Run::LocalBlast->import(qw(@Blast_dbp_local
|
|
1344 @Blast_dbn_local));
|
|
1345
|
|
1346 # We shouldn't have to fully qualify the Blast_dbX_local arrays. Hm.
|
|
1347
|
|
1348 my(@dbs);
|
|
1349 if( $type =~ /^p|amino/i) {
|
|
1350 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbp_local;
|
|
1351 } else {
|
|
1352 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbn_local;
|
|
1353 }
|
|
1354 @dbs;
|
|
1355 }
|
|
1356
|
|
1357 =head2 parse
|
|
1358
|
|
1359 Usage : $blast_object->parse( %named_parameters )
|
|
1360 Purpose : Parse a Blast report from a file or STDIN.
|
|
1361 : * Parses a raw BLAST data, populating Blast object with report data.
|
|
1362 : * Sets the significance cutoff.
|
|
1363 : * Extracts statistical parameters about the BLAST run.
|
|
1364 : * Handles both single files and streams containing multiple reports.
|
|
1365 Returns : integer (number of Blast reports parsed)
|
|
1366 Argument : <named parameters>: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE).
|
|
1367 : -FILE => string (name of file containing raw Blast output.
|
|
1368 : Optional. If a valid file is not supplied,
|
|
1369 : STDIN will be used).
|
|
1370 : -SIGNIF => number (float or scientific notation number to be used
|
|
1371 : as a P- or Expect value cutoff;
|
|
1372 : default = $DEFAULT_SIGNIF (999)).
|
|
1373 : -FILT_FUNC => func_ref (reference to a function to be used for
|
|
1374 : filtering out hits based on arbitrary criteria.
|
|
1375 : This function should take a
|
|
1376 : Bio::Tools::Blast::Sbjct.pm object as its first
|
|
1377 : argument and return a boolean value,
|
|
1378 : true if the hit should be filtered out).
|
|
1379 : Sample filter function:
|
|
1380 : -FILT_FUNC => sub { $hit = shift;
|
|
1381 : $hit->gaps == 0; },
|
|
1382 : -CHECK_ALL_HITS => boolean (check all hits for significance against
|
|
1383 : significance criteria. Default = false.
|
|
1384 : If false, stops processing hits after the first
|
|
1385 : non-significant hit or the first hit that fails
|
|
1386 : the filt_func call. This speeds parsing,
|
|
1387 : taking advantage of the fact that the hits
|
|
1388 : are processed in the order they are ranked.)
|
|
1389 : -MIN_LEN => integer (to be used as a minimum query sequence length
|
|
1390 : sequences below this length will not be processed).
|
|
1391 : default = no minimum length).
|
|
1392 : -STATS => boolean (collect stats for report: matrix, filters, etc.
|
|
1393 : default = false).
|
|
1394 : -BEST => boolean (only process the best hit of each report;
|
|
1395 : default = false).
|
|
1396 : -OVERLAP => integer (the amount of overlap to permit between
|
|
1397 : adjacent HSPs when tiling HSPs,
|
|
1398 : Default = $MAX_HSP_OVERLAP (2))
|
|
1399 :
|
|
1400 : PARAMETERS USED WHEN PARSING MULTI-REPORT STREAMS:
|
|
1401 : --------------------------------------------------
|
|
1402 : -SHARE => boolean (set this to true if all reports in stream
|
|
1403 : share the same stats. Default = true)
|
|
1404 : Must be set to false when parsing both Blast1 and
|
|
1405 : Blast2 reports in the same run or if you need
|
|
1406 : statistical params for each report, Lambda, K, H).
|
|
1407 : -STRICT => boolean (use strict mode for all Blast objects created.
|
|
1408 : Increases sensitivity to errors. For single
|
|
1409 : Blasts, this is parameter is sent to new().)
|
|
1410 : -EXEC_FUNC => func_ref (reference to a function for processing each
|
|
1411 : Blast object after it is parsed. Should accept a
|
|
1412 : Blast object as its sole argument. Return value
|
|
1413 : is ignored. If an -EXEC_FUNC parameter is supplied,
|
|
1414 : the -SAVE_ARRAY parameter will be ignored.)
|
|
1415 : -SAVE_ARRAY =>array_ref, (reference to an array for storing all
|
|
1416 : Blast objects as they are created.
|
|
1417 : Experimental. Not recommended.)
|
|
1418 : -SIGNIF_FMT => boolean String of 'exp' or 'parts'. Sets the format
|
|
1419 : for reporting P/Expect values. 'exp' reports
|
|
1420 : only the exponent portion. 'parts' reports
|
|
1421 : them as a 2 element list. See signif_fmt()..
|
|
1422 :
|
|
1423 Throws : Exception if BLAST report contains a FATAL: error.
|
|
1424 : Propagates any exception thrown by read().
|
|
1425 : Propagates any exception thrown by called parsing methods.
|
|
1426 Comments : This method can be called either directly using the static $Blast object
|
|
1427 : or indirectly (by Bio::Tools::SeqAnal.pm) during constuction of an
|
|
1428 : individual Blast object.
|
|
1429 :
|
|
1430 : HTML-formatted reports can be parsed as well. No special flag is required
|
|
1431 : since it is detected automatically. The presence of HTML-formatting
|
|
1432 : will result in slower performace, however, since it must be removed
|
|
1433 : prior to parsing. Parsing HTML-formatted reports is highly
|
|
1434 : error prone and is generally not recommended.
|
|
1435 :
|
|
1436 : If one has an HTML report, do NOT remove the HTML from it by using the
|
|
1437 : "Save As" option of a web browser to save it as text. This renders the
|
|
1438 : report unparsable.
|
|
1439 : HTML-formatted reports can be parsed after running through the strip_html
|
|
1440 : function of Blast::HTML.pm as in:
|
|
1441 : require Bio::Tools::Blast::HTML;
|
|
1442 : Bio::Tools::Blast::HTML->import(&strip_html);
|
|
1443 : &strip_html(\$data);
|
|
1444 : # where data contains full contents of an HTML-formatted report.
|
|
1445 : TODO: write a demo script that does this.
|
|
1446
|
|
1447 See Also : L<_init_parse_params()|_init_parse_params>, L<_parse_blast_stream()|_parse_blast_stream>, L<overlap()|overlap>, L<signif_fmt()|signif_fmt>, B<Bio::Root::Object::read()>, B<Bio::Tools::Blast::HTML.pm::strip_html()>, L<Links to related modules>
|
|
1448
|
|
1449 =cut
|
|
1450
|
|
1451 #---------
|
|
1452 sub parse {
|
|
1453 #---------
|
|
1454 # $self might be the static $Blast object.
|
|
1455 my ($self, @param) = @_;
|
|
1456
|
|
1457 my($signif, $filt_func, $min_len, $check_all, $overlap, $stats,
|
|
1458 $share, $strict, $best, $signif_fmt, $no_aligns) =
|
|
1459 $self->_rearrange([qw(SIGNIF FILT_FUNC MIN_LEN CHECK_ALL_HITS
|
|
1460 OVERLAP STATS SHARE STRICT
|
|
1461 BEST EXPONENT NO_ALIGNS )], @param);
|
|
1462
|
|
1463 ## Initialize the static Blast object with parameters that
|
|
1464 ## apply to all Blast objects within a parsing session.
|
|
1465
|
|
1466 &_init_parse_params($share, $filt_func, $check_all,
|
|
1467 $signif, $min_len, $strict,
|
|
1468 $best, $signif_fmt, $stats, $no_aligns
|
|
1469 );
|
|
1470
|
|
1471 my $count = $self->_parse_blast_stream(@param);
|
|
1472
|
|
1473 # print STDERR "\nDONE PARSING STREAM.\n";
|
|
1474
|
|
1475 if($Blast->{'_blast_errs'}) {
|
|
1476 my @errs = @{$Blast->{'_blast_errs'}};
|
|
1477 printf STDERR "\n*** %d BLAST REPORTS HAD FATAL ERRORS:\n", scalar(@errs);
|
|
1478 foreach(@errs) { print STDERR "$_\n"; }
|
|
1479 @{$Blast->{'_blast_errs'}} = ();
|
|
1480 }
|
|
1481
|
|
1482 return $count;
|
|
1483 }
|
|
1484
|
|
1485 =head2 _init_parse_params
|
|
1486
|
|
1487 Title : _init_parse_params
|
|
1488 Usage : n/a; called automatically by parse()
|
|
1489 Purpose : Initializes parameters used during parsing of Blast reports.
|
|
1490 : This is a static method used by the $Blast object.
|
|
1491 : Calls _set_signif().
|
|
1492 Example :
|
|
1493 Returns : n/a
|
|
1494 Args : Args extracted by parse().
|
|
1495
|
|
1496 See Also: L<parse()|parse>, L<_set_signif()|_set_signif>
|
|
1497
|
|
1498 =cut
|
|
1499
|
|
1500 #----------------------
|
|
1501 sub _init_parse_params {
|
|
1502 #----------------------
|
|
1503 my ($share, $filt_func, $check_all,
|
|
1504 $signif, $min_len, $strict,
|
|
1505 $best, $signif_fmt, $stats, $no_aligns) = @_;
|
|
1506
|
|
1507 ## Default is to share stats.
|
|
1508 $Blast->{'_share'} = defined($share) ? $share : 1;
|
|
1509 $Blast->{'_filt_func'} = $filt_func || 0;
|
|
1510 $Blast->{'_check_all'} = $check_all || 0;
|
|
1511 $Blast->{'_signif_fmt'} ||= $signif_fmt || '';
|
|
1512 $Blast->{'_no_aligns'} = $no_aligns || 0;
|
|
1513
|
|
1514 &_set_signif($signif, $min_len, $filt_func);
|
|
1515 $Blast->strict($strict) if defined $strict;
|
|
1516 $Blast->best($best) if $best;
|
|
1517 $Blast->{'_blast_count'} = 0;
|
|
1518
|
|
1519 ## If $stats is false, miscellaneous statistical and other parameters
|
|
1520 ## are NOT extracted from the Blast report (e.g., matrix name, filter used, etc.).
|
|
1521 ## This can speed processing when crunching tons of Blast reports.
|
|
1522 ## Default is to NOT get stats.
|
|
1523 $Blast->{'_get_stats'} = defined($stats) ? $stats : 0;
|
|
1524
|
|
1525 # Clear any errors from previous parse.
|
|
1526 undef $Blast->{'_blast_errs'};
|
|
1527 }
|
|
1528
|
|
1529 =head2 _set_signif
|
|
1530
|
|
1531 Usage : n/a; called automatically by _init_parse_params()
|
|
1532 : This is now a "static" method used only by $Blast.
|
|
1533 : _set_signif($signif, $min_len, $filt_func);
|
|
1534 Purpose : Sets significance criteria for the BLAST object.
|
|
1535 Argument : Obligatory three arguments:
|
|
1536 : $signif = float or sci-notation number or undef
|
|
1537 : $min_len = integer or undef
|
|
1538 : $filt_func = function reference or undef
|
|
1539 :
|
|
1540 : If $signif is undefined, a default value is set
|
|
1541 : (see $DEFAULT_SIGNIF; min_length = not set).
|
|
1542 Throws : Exception if significance value is defined but appears
|
|
1543 : out of range or invalid.
|
|
1544 : Exception if $filt_func if defined and is not a func ref.
|
|
1545 Comments : The significance of a BLAST report can be based on
|
|
1546 : the P (or Expect) value and/or the length of the query sequence.
|
|
1547 : P (or Expect) values GREATER than '_significance' are not significant.
|
|
1548 : Query sequence lengths LESS than '_min_length' are not significant.
|
|
1549 :
|
|
1550 : Hits can also be screened using arbitrary significance criteria
|
|
1551 : as discussed in the parse() method.
|
|
1552 :
|
|
1553 : If no $signif is defined, the '_significance' level is set to
|
|
1554 : $Bio::Tools::Blast::DEFAULT_SIGNIF (999).
|
|
1555
|
|
1556 See Also : L<signif()|signif>, L<min_length()|min_length>, L<_init_parse_params()|_init_parse_params>, L<parse()|parse>
|
|
1557
|
|
1558 =cut
|
|
1559
|
|
1560 #-----------------
|
|
1561 sub _set_signif {
|
|
1562 #-----------------
|
|
1563 my( $sig, $len, $func ) = @_;
|
|
1564
|
|
1565 if(defined $sig) {
|
|
1566 $Blast->{'_confirm_significance'} = 1;
|
|
1567 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
|
|
1568 $Blast->throw("Invalid significance value: $sig",
|
|
1569 "Must be greater than zero.");
|
|
1570 }
|
|
1571 $Blast->{'_significance'} = $sig;
|
|
1572 } else {
|
|
1573 $Blast->{'_significance'} = $DEFAULT_SIGNIF;
|
|
1574 $Blast->{'_check_all'} = 1 if not $Blast->{'_filt_func'};
|
|
1575 }
|
|
1576
|
|
1577 if(defined $len) {
|
|
1578 if($len =~ /\D/ or $len <= 0) {
|
|
1579 $Blast->warn("Invalid minimum length value: $len",
|
|
1580 "Value must be an integer > 0. Value not set.");
|
|
1581 } else {
|
|
1582 $Blast->{'_min_length'} = $len;
|
|
1583 }
|
|
1584 }
|
|
1585
|
|
1586 if(defined $func) {
|
|
1587 $Blast->{'_confirm_significance'} = 1;
|
|
1588 if($func and not ref $func eq 'CODE') {
|
|
1589 $Blast->throw("Not a function reference: $func",
|
|
1590 "The -filt_func parameter must be function reference.");
|
|
1591 }
|
|
1592 }
|
|
1593 }
|
|
1594
|
|
1595 =head2 _parse_blast_stream
|
|
1596
|
|
1597 Usage : n/a. Internal method called by parse()
|
|
1598 Purpose : Obtains the function to be used during parsing and calls read().
|
|
1599 Returns : Integer (the number of blast reports read)
|
|
1600 Argument : Named parameters (forwarded from parse())
|
|
1601 Throws : Propagates any exception thrown by _get_parse_blast_func() and read().
|
|
1602
|
|
1603 See Also : L<_get_parse_blast_func()|_get_parse_blast_func>, B<Bio::Root::Object::read()>
|
|
1604
|
|
1605 =cut
|
|
1606
|
|
1607 #----------------------
|
|
1608 sub _parse_blast_stream {
|
|
1609 #----------------------
|
|
1610 my ($self, %param) = @_;
|
|
1611
|
|
1612 my $func = $self->_get_parse_blast_func(%param);
|
|
1613 # my $func = sub { my $data = shift;
|
|
1614 # printf STDERR "Chunk length = %d\n", length($data);
|
|
1615 # sleep(3);
|
|
1616 # };
|
|
1617
|
|
1618 # Only setting the newline character once per session.
|
|
1619 $Newline ||= $Util->get_newline(-client => $self, %param);
|
|
1620
|
|
1621 $self->read(-REC_SEP =>"$Newline>",
|
|
1622 -FUNC => $func,
|
|
1623 %param);
|
|
1624
|
|
1625 return $Blast->{'_blast_count'};
|
|
1626 }
|
|
1627
|
|
1628 =head2 _get_parse_blast_func
|
|
1629
|
|
1630 Usage : n/a; internal method used by _parse_blast_stream()
|
|
1631 : $func_ref = $blast_object->_get_parse_blast_func()
|
|
1632 Purpose : Generates a function ref to be used as a closure for parsing
|
|
1633 : raw data as it is being loaded by Bio::Root::IOManager::read().
|
|
1634 Returns : Function reference (closure).
|
|
1635 Comments : The the function reference contains a fair bit of logic
|
|
1636 : at present. It could perhaps be split up into separate
|
|
1637 : functions to make it more 'digestible'.
|
|
1638
|
|
1639 See Also : L<_parse_blast_stream()|_parse_blast_stream>
|
|
1640
|
|
1641 =cut
|
|
1642
|
|
1643 #--------------------------
|
|
1644 sub _get_parse_blast_func {
|
|
1645 #--------------------------
|
|
1646 my ($self, @param) = @_;
|
|
1647
|
|
1648 my ($save_a, $exec_func) =
|
|
1649 $self->_rearrange([qw(SAVE_ARRAY EXEC_FUNC)], @param);
|
|
1650
|
|
1651 # $MONITOR && print STDERR "\nParsing Blast stream (5/dot, 250/line)\n";
|
|
1652 my $count = 0;
|
|
1653 my $strict = $self->strict();
|
|
1654
|
|
1655 # Some parameter validation.
|
|
1656 # Remember, all Blast parsing will use this function now.
|
|
1657 # You won't need a exec-func or save_array when just creating a Blast object
|
|
1658 # as in: $blast = new Bio::Tools::Blast();
|
|
1659 if($exec_func and not ref($exec_func) eq 'CODE') {
|
|
1660 $self->throw("The -EXEC_FUNC parameter must be function reference.",
|
|
1661 "exec_func = $exec_func");
|
|
1662
|
|
1663 } elsif($save_a and not ref($save_a) eq 'ARRAY') {
|
|
1664 $self->throw("The -SAVE_ARRAY parameter must supply an array reference".
|
|
1665 "when not using an -EXEC_FUNC parameter.");
|
|
1666 }
|
|
1667
|
|
1668 ## Might consider breaking this closure up if possible.
|
|
1669
|
|
1670 return sub {
|
|
1671 my ($data) = @_;
|
|
1672 ## $data should contain one of three possible fragment types
|
|
1673 ## from a Blast report:
|
|
1674 ## 1. Header with description section,
|
|
1675 ## 2. An alignment section for a single hit, or
|
|
1676 ## 3. The final alignment section plus the footer section.
|
|
1677 ## (record separator = "Newline>").
|
|
1678
|
|
1679 # print STDERR "\n(BLAST) DATA CHUNK: $data\n";
|
|
1680
|
|
1681 my ($current_blast, $current_prog, $current_vers, $current_db);
|
|
1682 my $prev_blast;
|
|
1683 my $contains_translation = 0;
|
|
1684
|
|
1685 ### steve --- Wed Mar 15 02:48:07 2000
|
|
1686 ### In the process of addressing bug PR#95. Tricky.
|
|
1687 ### Using the $contains_translation to do so. Not complete
|
|
1688 ### and possibly won't fix. We'll see.
|
|
1689
|
|
1690 # Check for header section. Start a new Blast object and
|
|
1691 # parse the description section.
|
|
1692 # if ($data =~ /\sQuery\s?=/s || ($contains_translation && $data =~ /Database:/s)) {
|
|
1693 if ($data =~ /\sQuery\s?=/s) {
|
|
1694 $Blast->{'_blast_count'}++;
|
|
1695 print STDERR ".", $Blast->{'_blast_count'} % 50 ? '' : "\n" if $MONITOR;
|
|
1696
|
|
1697 if($data =~ /$Newline\s+Translating/so) {
|
|
1698 print STDERR "\nCONTAINS TRANSLATION\n";
|
|
1699 $contains_translation = 1;
|
|
1700 }
|
|
1701
|
|
1702 # If we're parsing a stream containing multiple reports,
|
|
1703 # all subsequent header sections will contain the last hit of
|
|
1704 # the previous report which needs to be parsed and added to that
|
|
1705 # report if signifcant. It also contains the run parameters
|
|
1706 # at the bottom of the Blast report.
|
|
1707 # if($Blast->{'_blast_count'} > 1 || $contains_translation) {
|
|
1708 if($Blast->{'_blast_count'} > 1) {
|
|
1709 # print STDERR "\nMULTI-BLAST STREAM.\n";
|
|
1710 $Blast->{'_multi_stream'} = 1;
|
|
1711
|
|
1712 if($data =~ /(.+?)$Newline(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so) {
|
|
1713 ($current_prog, $current_vers, $data) = ($3, $4, $5);
|
|
1714 # Final chunk containing last hit and last footer.
|
|
1715 $Blast->{'_current_blast'}->_parse_alignment($1);
|
|
1716 $prev_blast = $Blast->{'_current_blast'}; # finalized.
|
|
1717 # } elsif($contains_translation) {
|
|
1718 # $data =~ /(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so;
|
|
1719 # ($current_prog, $current_vers, $data) = ($1, $2, $3);
|
|
1720 } else {
|
|
1721 $Blast->throw("Can't determine program type from BLAST report.",
|
|
1722 "Checked for: @Blast_programs.");
|
|
1723 # This has important implications for how to handle interval
|
|
1724 # information for HSPs. TBLASTN uses nucleotides in query HSP
|
|
1725 # but amino acids in the sbjct HSP sequence.
|
|
1726 }
|
|
1727
|
|
1728 if($data =~ m/Database:\s+(.+?)$Newline/so ) {
|
|
1729 $current_db = $1;
|
|
1730 } else {
|
|
1731 # In some reports, the Database is only listed at end.
|
|
1732 #$Blast->warn("Can't determine database name from BLAST report.");
|
|
1733 }
|
|
1734
|
|
1735 # Incyte_Fix: Nasty Invisible Bug.
|
|
1736 # Records in blast report are delimited by '>', but... when
|
|
1737 # there are no hits for a query, there won't be a '>'. That
|
|
1738 # causes several blast reports to run together in the data
|
|
1739 # passed to this routine. Need to get rid of non-hits in data
|
|
1740 if ($data =~ /.+(No hits? found.+Sequences.+)/so) {
|
|
1741 $data = $1;
|
|
1742 }
|
|
1743 # End Incyte_Fix
|
|
1744
|
|
1745 }
|
|
1746
|
|
1747 # Determine if we need to create a new Blast object
|
|
1748 # or use the $self object for this method.
|
|
1749
|
|
1750 if($Blast->{'_multi_stream'} or $self->name eq 'Static Blast object') {
|
|
1751 # Strict mode is not object-specific but may be someday.
|
|
1752 # print STDERR "\nCreating new Blast object.\n";
|
|
1753 $current_blast = new Bio::Tools::Blast(-STRICT => $strict);
|
|
1754 } else {
|
|
1755 $current_blast = $self;
|
|
1756 }
|
|
1757 $Blast->{'_current_blast'} = $current_blast;
|
|
1758
|
|
1759 # If we're not sharing stats, set data on current blast object.
|
|
1760 if(defined $current_prog and not $Blast->{'_share'}) {
|
|
1761 $current_blast->program($current_prog);
|
|
1762 $current_blast->program_version($current_vers);
|
|
1763 $current_blast->database($current_db);
|
|
1764 }
|
|
1765
|
|
1766 # print STDERR "CURRENT BLAST = ", $current_blast->name, "\n";
|
|
1767 $current_blast->_parse_header($data);
|
|
1768
|
|
1769 # If there were any descriptions in the header,
|
|
1770 # we know if there are any significant hits.
|
|
1771 # No longer throwing exception if there were no significant hits
|
|
1772 # and a -signif parameter was specified. Doing so prevents the
|
|
1773 # construction of a Blast object, which could still be useful.
|
|
1774 # if($current_blast->{'_has_descriptions'} and $Blast->{'_confirm_significance'} and not $current_blast->is_signif) {
|
|
1775 # $current_blast->throw("No significant BLAST hits for ${\$current_blast->name}");
|
|
1776
|
|
1777 # }
|
|
1778
|
|
1779 } # Done parsing header/description section
|
|
1780
|
|
1781 ### For use with $contains_translation - not right - breaks regular report parsing.
|
|
1782 # elsif(ref $Blast->{'_current_blast'} && $data !~ /\s*\w*\s*/s) {
|
|
1783 elsif(ref $Blast->{'_current_blast'} ) {
|
|
1784 # Process an alignment section.
|
|
1785 $current_blast = $Blast->{'_current_blast'};
|
|
1786 # print STDERR "\nCONTINUING PROCESSING ALN WITH ", $current_blast->name, "\n";
|
|
1787 # print STDERR "DATA: $data\n";
|
|
1788 eval {
|
|
1789 $current_blast->_parse_alignment($data);
|
|
1790 };
|
|
1791 if($@) {
|
|
1792 # push @{$self->{'_blast_errs'}}, $@;
|
|
1793 }
|
|
1794 }
|
|
1795
|
|
1796 # If the current Blast object has been completely parsed
|
|
1797 # (occurs with a single Blast stream), or if there is a previous
|
|
1798 # Blast object (occurs with a multi Blast stream),
|
|
1799 # execute a supplied function on it or store it in a supplied array.
|
|
1800
|
|
1801 if( defined $prev_blast or $current_blast->{'_found_params'}) {
|
|
1802 my $finished_blast = defined($prev_blast) ? $prev_blast : $current_blast;
|
|
1803
|
|
1804 $finished_blast->_report_errors();
|
|
1805 # print STDERR "\nNEW BLAST OBJECT: ${\$finished_blast->name}\n";
|
|
1806
|
|
1807 if($exec_func) {
|
|
1808 # print STDERR " RUNNING EXEC_FUNC...\n";
|
|
1809 &$exec_func($finished_blast); # ignoring any return value.
|
|
1810 # Report processed, no longer need object.
|
|
1811 $finished_blast->destroy;
|
|
1812 undef $finished_blast;
|
|
1813 } elsif($save_a) {
|
|
1814 # print STDERR " SAVING IN ARRAY...\n";
|
|
1815 # We've already verified that if there is no exec_func
|
|
1816 # then there must be a $save_array
|
|
1817 push @$save_a, $finished_blast;
|
|
1818 }
|
|
1819 }
|
|
1820 1;
|
|
1821 }
|
|
1822 }
|
|
1823
|
|
1824 =head2 _report_errors
|
|
1825
|
|
1826 Title : _report_errors
|
|
1827 Usage : n/a; Internal method called by _get_parse_blast_func().
|
|
1828 Purpose : Throw or warn about any errors encountered.
|
|
1829 Returns : n/a
|
|
1830 Args : n/a
|
|
1831 Throws : If all hits generated exceptions, raise exception
|
|
1832 : (a fatal event for the Blast object.)
|
|
1833 : If some hits were okay but some were bad, generate a warning
|
|
1834 : (a few bad applies should not spoil the bunch).
|
|
1835 : This usually indicates a limiting B-value.
|
|
1836 : When the parsing code fails, it is either all or nothing.
|
|
1837
|
|
1838 =cut
|
|
1839
|
|
1840 #-------------------
|
|
1841 sub _report_errors {
|
|
1842 #-------------------
|
|
1843 my $self = shift;
|
|
1844
|
|
1845 return unless ref($self->{'_blast_errs'});
|
|
1846 # ref($self->{'_blast_errs'}) || (print STDERR "\nNO ERRORS\n", return );
|
|
1847
|
|
1848 my @errs = @{$self->{'_blast_errs'}};
|
|
1849
|
|
1850 if(scalar @errs) {
|
|
1851 my ($str);
|
|
1852 @{$self->{'_blast_errs'}} = (); # clear the errs on the object.
|
|
1853 # When there are many errors, in most of the cases, they are
|
|
1854 # caused by the same problem. Only need to see full data for
|
|
1855 # the first one.
|
|
1856 if(scalar @errs > 2) {
|
|
1857 $str = "SHOWING FIRST EXCEPTION ONLY:\n$errs[0]";
|
|
1858 $self->clear_err(); # clearing the existing set of errors (conserve memory).
|
|
1859 # Not necessary, unless the -RECORD_ERR =>1
|
|
1860 # constructor option was used for Blast object.
|
|
1861 } else {
|
|
1862 $str = join("\n",@errs);
|
|
1863 }
|
|
1864
|
|
1865 if(not $self->{'_num_hits_significant'}) {
|
|
1866 $self->throw(sprintf("Failed to parse any hit data (n=%d).", scalar(@errs)),
|
|
1867 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n"
|
|
1868 );
|
|
1869 } else {
|
|
1870 $self->warn(sprintf("Some potential hits were not parsed (n=%d).", scalar(@errs)),
|
|
1871 @errs > 2 ? "This may be due to a limiting B value (max alignment listings)." : "",
|
|
1872 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n"
|
|
1873 );
|
|
1874 }
|
|
1875 }
|
|
1876 }
|
|
1877
|
|
1878 =head2 _parse_header
|
|
1879
|
|
1880 Usage : n/a; called automatically by the _get_parse_blast_func().
|
|
1881 Purpose : Parses the header section of a BLAST report.
|
|
1882 Argument : String containing the header+description section of a BLAST report.
|
|
1883 Throws : Exception if description data cannot be parsed properly.
|
|
1884 : Exception if there is a 'FATAL' error in the Blast report.
|
|
1885 : Warning if there is a 'WARNING' in the Blast report.
|
|
1886 : Warning if there are no significant hits.
|
|
1887 Comments : Description section contains a single line for each hit listing
|
|
1888 : the seq id, description, score, Expect or P-value, etc.
|
|
1889
|
|
1890 See Also : L<_get_parse_blast_func()|_get_parse_blast_func>
|
|
1891
|
|
1892 =cut
|
|
1893
|
|
1894 #----------------------
|
|
1895 sub _parse_header {
|
|
1896 #----------------------
|
|
1897 my( $self, $data ) = @_;
|
|
1898
|
|
1899 # print STDERR "\n$ID: PARSING HEADER\n"; #$data\n";
|
|
1900
|
|
1901 $data =~ s/^\s+|\s+>?$//sg;
|
|
1902
|
|
1903 if($data =~ /<HTML/i) {
|
|
1904 $self->throw("Can't parse HTML-formatted BLAST reports.",
|
|
1905 # "Such reports can be parsed with a special parsing \n".
|
|
1906 # "script included in the examples/blast directory \n".
|
|
1907 # "of the Bioperl distribution. (TODO)"
|
|
1908 );
|
|
1909 # This was the old strategy, can't do it with new strategy
|
|
1910 # since we don't have the whole report in one chunk.
|
|
1911 # This could be the basis for the "special parsing script".
|
|
1912 # require Bio::Tools::Blast::HTML;
|
|
1913 # Bio::Tools::Blast::HTML->import(&strip_html);
|
|
1914 # &strip_html(\$data);
|
|
1915 }
|
|
1916
|
|
1917 $data =~ /WARNING: (.+?)$Newline$Newline/so and $self->warn("$1") if $self->strict;
|
|
1918 $data =~ /FATAL: (.+?)$Newline$Newline/so and $self->throw("FATAL BLAST ERROR = $1");
|
|
1919 # No longer throwing exception when no hits were found. Still reporting it.
|
|
1920 $data =~ /No hits? found/i and $self->warn("No hits were found.") if $self->strict;
|
|
1921
|
|
1922 # If this is the first Blast, the program, version, and database info
|
|
1923 # pertain to it. Otherwise, they are for the previous report and have
|
|
1924 # already been parsed out.
|
|
1925 # Data is stored in the static Blast object. Data for subsequent reports
|
|
1926 # will be stored in separate objects if the -share parameter is not set.
|
|
1927 # See _get_parse_blast_func().
|
|
1928
|
|
1929 if($Blast->{'_blast_count'} == 1) {
|
|
1930 if($data =~ /(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline/so) {
|
|
1931 $Blast->program($2);
|
|
1932 $Blast->program_version($3);
|
|
1933 } else {
|
|
1934 $self->throw("Can't determine program type from BLAST report.",
|
|
1935 "Checked for: @Blast_programs.");
|
|
1936 # This has important implications for how to handle interval
|
|
1937 # information for HSPs. TBLASTN uses nucleotides in query HSP
|
|
1938 # but amino acids in the sbjct HSP sequence.
|
|
1939 }
|
|
1940
|
|
1941 if($data =~ m/Database:\s+(.+?)$Newline/so ) {
|
|
1942 $Blast->database($1);
|
|
1943 } else {
|
|
1944 # In some reports, the Database is only listed at end.
|
|
1945 #$self->warn("Can't determine database name from BLAST report (_parse_header)\n$data\n.");
|
|
1946 }
|
|
1947 }
|
|
1948
|
|
1949 my ($header, $descriptions);
|
|
1950
|
|
1951 ## For efficiency reasons, we want to to avoid using $' and $`.
|
|
1952 ## Therefore using single-line mode pattern matching.
|
|
1953
|
|
1954 if($data =~ /(.+?)\nSequences producing.+?\n(.+)/s ) {
|
|
1955 ($header, $descriptions) = ($1, $2);
|
|
1956 $self->{'_has_descriptions'} = 1;
|
|
1957 } else {
|
|
1958 $header = $data;
|
|
1959 $self->{'_has_descriptions'} = 0;
|
|
1960 # Blast reports can legally lack description section. No need to warn.
|
|
1961 #push @{$self->{'_blast_errs'}}, "Can't parse description data.";
|
|
1962 }
|
|
1963
|
|
1964 $self->_set_query($header); # The name of the sequence will appear in error report.
|
|
1965 # print STDERR "\nQUERY = ", $Blast->{'_current_blast'}->query, "\n";
|
|
1966
|
|
1967 $self->_set_date($header) if $Blast->{'_get_stats'};
|
|
1968 $self->_set_length($header);
|
|
1969
|
|
1970 # not $Blast->{'_confirm_significance'} and print STDERR "\nNOT PARSING DESCRIPTIONS.\n";
|
|
1971
|
|
1972 # Setting the absolute max and min significance levels.
|
|
1973 $self->{'_highestSignif'} = 0;
|
|
1974 $self->{'_lowestSignif'} = $DEFAULT_SIGNIF;
|
|
1975
|
|
1976 if ($Blast->{'_confirm_significance'} || $Blast->{'_no_aligns'}) {
|
|
1977 $self->_parse_descriptions($descriptions) if $descriptions;
|
|
1978 } else {
|
|
1979 $self->{'_is_significant'} = 1;
|
|
1980 }
|
|
1981 }
|
|
1982
|
|
1983 #-----------------------
|
|
1984 sub _parse_descriptions {
|
|
1985 #-----------------------
|
|
1986 my ($self, $desc) = @_;
|
|
1987
|
|
1988 # NOTE: This method will not be called if the report lacks
|
|
1989 # a description section.
|
|
1990
|
|
1991 # print STDERR "\nPARSING DESCRIPTION DATA\n";
|
|
1992
|
|
1993 my @descriptions = split( $Newline, $desc);
|
|
1994 my($line);
|
|
1995
|
|
1996 # NOW step through each line parsing out the P/Expect value
|
|
1997 # All we really need to do is check the first one, if it doesn't
|
|
1998 # meet the significance requirement, we can skip the report.
|
|
1999 # BUT: we want to collect data for all hits anyway to get min/max signif.
|
|
2000
|
|
2001 my $my_signif = $self->signif;
|
|
2002 my $layout_set = $Blast->{'_layout'} || 0;
|
|
2003 my $layout;
|
|
2004 my $count = 0;
|
|
2005 my $sig;
|
|
2006
|
|
2007 desc_loop:
|
|
2008 foreach $line (@descriptions) {
|
|
2009 $count++;
|
|
2010 last desc_loop if $line =~ / NONE |End of List/;
|
|
2011 next desc_loop if $line =~ /^\s*$/;
|
|
2012 next desc_loop if $line =~ /^\.\./;
|
|
2013
|
|
2014 ## Checking the significance value (P- or Expect value) of the hit
|
|
2015 ## in the description line.
|
|
2016
|
|
2017 # These regexps need testing on a variety of reports.
|
|
2018 if ( $line =~ /\d+\s{1,5}[\de.-]+\s*$/) {
|
|
2019 $layout = 2;
|
|
2020 } elsif( $line =~ /\d+\s{1,5}[\de.-]+\s{1,}\d+\s*$/) {
|
|
2021 $layout = 1;
|
|
2022 } else {
|
|
2023 $self->warn("Can't parse significance data in description line $line");
|
|
2024 next desc_loop;
|
|
2025 }
|
|
2026 not $layout_set and ($self->_layout($layout), $layout_set = 1);
|
|
2027
|
|
2028 $sig = &_parse_signif( $line, $layout );
|
|
2029
|
|
2030 # print STDERR " Parsed signif ($layout) = $sig\n";
|
|
2031
|
|
2032 last desc_loop if ($sig > $my_signif and not $Blast->{'_check_all'});
|
|
2033 $self->_process_significance($sig, $my_signif);
|
|
2034 }
|
|
2035
|
|
2036 # printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'};
|
|
2037 }
|
|
2038
|
|
2039 sub _process_significance {
|
|
2040 my($self, $sig, $my_signif) = @_;
|
|
2041
|
|
2042 $self->{'_highestSignif'} = ($sig > $self->{'_highestSignif'})
|
|
2043 ? $sig : $self->{'_highestSignif'};
|
|
2044
|
|
2045 $self->{'_lowestSignif'} = ($sig < $self->{'_lowestSignif'})
|
|
2046 ? $sig : $self->{'_lowestSignif'};
|
|
2047
|
|
2048 # Significance value assessment.
|
|
2049 $sig <= $my_signif and $self->{'_num_hits_significant'}++;
|
|
2050 $self->{'_num_hits'}++;
|
|
2051
|
|
2052 $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'};
|
|
2053 }
|
|
2054
|
|
2055 =head2 _parse_alignment
|
|
2056
|
|
2057 Usage : n/a; called automatically by the _get_parse_blast_func().
|
|
2058 Purpose : Parses a single alignment section of a BLAST report.
|
|
2059 Argument : String containing the alignment section.
|
|
2060 Throws : n/a; All errors are trapped while parsing the hit data
|
|
2061 : and are processed as a group when the report is
|
|
2062 : completely processed (See _report_errors()).
|
|
2063 :
|
|
2064 Comments : Alignment section contains all HSPs for a hit.
|
|
2065 : Requires Bio::Tools::Blast::Sbjct.pm.
|
|
2066 : Optionally calls a filter function to screen the hit on arbitrary
|
|
2067 : criteria. If the filter function returns true for a given hit,
|
|
2068 : that hit will be skipped.
|
|
2069 :
|
|
2070 : If the Blast object was created with -check_all_hits set to true,
|
|
2071 : all hits will be checked for significance and processed if necessary.
|
|
2072 : If this field is false, the parsing will stop after the first
|
|
2073 : non-significant hit.
|
|
2074 : See parse() for description of parsing parameters.
|
|
2075
|
|
2076 See Also : L<parse()|parse>, L<_get_parse_blast_func()|_get_parse_blast_func>, L<_report_errors()|_report_errors>, B<Bio::Tools::Blast::Sbjct()>, L<Links to related modules>
|
|
2077
|
|
2078 =cut
|
|
2079
|
|
2080 #----------------------
|
|
2081 sub _parse_alignment {
|
|
2082 #----------------------
|
|
2083 # This method always needs to check detect if the $data argument
|
|
2084 # contains the footer of a Blast report, indicating the last chunk
|
|
2085 # of a single Blast stream.
|
|
2086
|
|
2087 my( $self, $data ) = @_;
|
|
2088
|
|
2089 # printf STDERR "\nPARSING ALIGNMENT DATA for %s $self.\n", $self->name;
|
|
2090
|
|
2091 # NOTE: $self->{'_current_hit'} is an instance variable
|
|
2092 # The $Blast object will not have this member.
|
|
2093
|
|
2094 # If all of the significant hits have been parsed,
|
|
2095 # return if we're not checking all or if we don't need to get
|
|
2096 # the Blast stats (parameters at footer of report).
|
|
2097 if(defined $self->{'_current_hit'} and
|
|
2098 defined $self->{'_num_hits_significant'}) {
|
|
2099 return if $self->{'_current_hit'} >= $self->{'_num_hits_significant'} and
|
|
2100 not ($Blast->{'_check_all'} or $Blast->{'_get_stats'});
|
|
2101 }
|
|
2102
|
|
2103 # Check for the presence of the Blast footer section.
|
|
2104 # _parse_footer returns the alignment section.
|
|
2105 $data = $self->_parse_footer($data);
|
|
2106
|
|
2107 # Return if we're only interested in the best hit.
|
|
2108 # This has to occur after checking for the parameters section
|
|
2109 # in the footer (since we may still be interested in them).
|
|
2110 return if $Blast->best and ( defined $self->{'_current_hit'} and $self->{'_current_hit'} >=1);
|
|
2111
|
|
2112 # print "RETURNED FROM _parse_footer (", $self->to_string, ")";
|
|
2113 # print "\n --> FOUND PARAMS.\n" if $self->{'_found_params'};
|
|
2114 # print "\n --> DID NOT FIND PARAMS.\n" unless $self->{'_found_params'};
|
|
2115
|
|
2116 require Bio::Tools::Blast::Sbjct;
|
|
2117
|
|
2118 $data =~ s/^\s+|\s+>?$//sg;
|
|
2119 $data =~ s/$Newline$Newline/$Newline/sog; # remove blank lines.
|
|
2120 my @data = split($Newline, $data);
|
|
2121 push @data, 'end';
|
|
2122
|
|
2123 # print STDERR "\nALIGNMENT DATA:\n$data\n";
|
|
2124
|
|
2125 my $prog = $self->program;
|
|
2126 my $check_all = $Blast->{'_check_all'};
|
|
2127 my $filt_func = $Blast->{'_filt_func'} || 0;
|
|
2128 my $signif_fmt = $Blast->{'_signif_fmt'};
|
|
2129 my $my_signif = $self->signif;
|
|
2130 my $err;
|
|
2131
|
|
2132 # Now construct the Sbjct objects from the alignment section
|
|
2133
|
|
2134 # debug(1);
|
|
2135
|
|
2136 $self->{'_current_hit'}++;
|
|
2137
|
|
2138 # If not confirming significance, _parse_descriptions will not have been run,
|
|
2139 # so we need to count the total number of hits here.
|
|
2140 if( not $Blast->{'_confirm_significance'}) {
|
|
2141 $self->{'_num_hits'}++;
|
|
2142 }
|
|
2143
|
|
2144 if($Blast->{'_no_aligns'}) {
|
|
2145 # printf STDERR "\nNOT PARSING ALIGNMENT DATA\n";
|
|
2146 return;
|
|
2147 }
|
|
2148
|
|
2149 my $hit; # Must be my'ed within hit_loop.
|
|
2150 eval {
|
|
2151 $hit = new Bio::Tools::Blast::Sbjct (-DATA =>\@data,
|
|
2152 -PARENT =>$self,
|
|
2153 -NAME =>$self->{'_current_hit'},
|
|
2154 -RANK =>$self->{'_current_hit'},
|
|
2155 -RANK_BY =>'order',
|
|
2156 -PROGRAM =>$prog,
|
|
2157 -SIGNIF_FMT=>$signif_fmt,
|
|
2158 -OVERLAP =>$Blast->{'_overlap'} || $MAX_HSP_OVERLAP,
|
|
2159 );
|
|
2160 # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>;
|
|
2161 # The BLAST report may have not had a description section.
|
|
2162 if(not $self->{'_has_descriptions'}) {
|
|
2163 $self->_process_significance($hit->signif, $my_signif);
|
|
2164 }
|
|
2165 };
|
|
2166
|
|
2167 if($@) {
|
|
2168 # Throwing lots of errors can slow down the code substantially.
|
|
2169 # Error handling code is not that efficient.
|
|
2170 #print STDERR "\nERROR _parse_alignment: $@\n";
|
|
2171 push @{$self->{'_blast_errs'}}, $@;
|
|
2172 $hit->destroy if ref $hit;
|
|
2173 undef $hit;
|
|
2174 } else {
|
|
2175 # Collect overall signif data if we don't already have it,
|
|
2176 # (as occurs if no -signif parameter is supplied).
|
|
2177 my $hit_signif = $hit->signif;
|
|
2178
|
|
2179 if (not $Blast->{'_confirm_significance'} ) {
|
|
2180 $self->{'_highestSignif'} = ($hit_signif > $self->{'_highestSignif'})
|
|
2181 ? $hit_signif : $self->{'_highestSignif'};
|
|
2182
|
|
2183 $self->{'_lowestSignif'} = ($hit_signif < $self->{'_lowestSignif'})
|
|
2184 ? $hit_signif : $self->{'_lowestSignif'};
|
|
2185 }
|
|
2186
|
|
2187 # Test significance using custom function (if supplied)
|
|
2188 if($filt_func) {
|
|
2189 if(&$filt_func($hit)) {
|
|
2190 push @{$self->{'_hits'}}, $hit;
|
|
2191 } else {
|
|
2192 $hit->destroy; undef $hit;
|
|
2193 }
|
|
2194 } elsif($hit_signif <= $my_signif) {
|
|
2195 push @{$self->{'_hits'}}, $hit;
|
|
2196 }
|
|
2197 }
|
|
2198
|
|
2199 }
|
|
2200
|
|
2201 =head2 _parse_footer
|
|
2202
|
|
2203 Usage : n/a; internal function. called by _parse_alignment()
|
|
2204 Purpose : Extracts statistical and other parameters from the BLAST report.
|
|
2205 : Sets various key elements such as the program and version,
|
|
2206 : gapping, and the layout for the report (blast1 or blast2).
|
|
2207 Argument : Data to be parsed.
|
|
2208 Returns : String containing an alignment section for processing by
|
|
2209 : _parse_alignment().
|
|
2210 Throws : Exception if cannot find the parameters section of report.
|
|
2211 : Warning if cannot determine if gapping was used.
|
|
2212 : Warning if cannot determine the scoring matrix used.
|
|
2213 Comments : This method must always get called, even if the -STATS
|
|
2214 : parse() parameter is false. The reason is that the layout
|
|
2215 : of the report and the presence of gapping must always be set.
|
|
2216 : The determination whether to set additional stats is made
|
|
2217 : by methods called by _parse_footer().
|
|
2218
|
|
2219 See Also : L<parse()|parse>, L<_parse_alignment()|_parse_alignment>, L<_set_database()|_set_database>
|
|
2220
|
|
2221 =cut
|
|
2222
|
|
2223 #---------------------
|
|
2224 sub _parse_footer {
|
|
2225 #---------------------
|
|
2226 # Basic strategy:
|
|
2227 # 1. figure out if we're supposed to get the stats,
|
|
2228 # 2. figure out if the stats are to be shared. some, not all can be shared
|
|
2229 # (eg., db info and matrix can be shared, karlin altschul params cannot.
|
|
2230 # However, this method assumes they are all sharable.)
|
|
2231 # 3. Parse the stats.
|
|
2232 # 4. return the block before the parameters section if the supplied data
|
|
2233 # contains a footer parameters section.
|
|
2234
|
|
2235 my ($self, $data) = @_;
|
|
2236 my ($client, $last_align, $params);
|
|
2237
|
|
2238 # printf STDERR "\nPARSING PARAMETERS for %s $self.\n", $self->name;
|
|
2239
|
|
2240 # Should the parameters be shared?
|
|
2241 # If so, set $self to be the static $Blast object and return if
|
|
2242 # the parameters were already set.
|
|
2243 # Before returning, we need to extract the last alignment section
|
|
2244 # from the parameter section, if any.
|
|
2245
|
|
2246 if ($Blast->{'_share'}) {
|
|
2247 $client = $self;
|
|
2248 $self = $Blast if $Blast->{'_share'};
|
|
2249 }
|
|
2250
|
|
2251 my $get_stats = $Blast->{'_get_stats'};
|
|
2252
|
|
2253 if( $data =~ /(.+?)${Newline}CPU time: (.*)/so) {
|
|
2254 # NCBI-Blast2 format (v2.04).
|
|
2255 ($last_align, $params) = ($1, $2);
|
|
2256 return $last_align if $client->{'_found_params'};
|
|
2257 $self->_set_blast2_stats($params);
|
|
2258
|
|
2259 } elsif( $data =~ /(.+?)${Newline}Parameters:(.*)/so) {
|
|
2260 # NCBI-Blast1 or WashU-Blast2 format.
|
|
2261 ($last_align, $params) = ($1, $2);
|
|
2262 return $last_align if $client->{'_found_params'};
|
|
2263 $self->_set_blast1_stats($params);
|
|
2264
|
|
2265 } elsif( $data =~ /(.+?)$Newline\s+Database:(.*)/so) {
|
|
2266 # Gotta watch out for confusion with the Database: line in the header
|
|
2267 # which will be present in the last hit of an internal Blast report
|
|
2268 # in a multi-report stream.
|
|
2269
|
|
2270 # NCBI-Blast2 format (v2.05).
|
|
2271 ($last_align, $params) = ($1, $2);
|
|
2272 return $last_align if $client->{'_found_params'};
|
|
2273 $self->_set_blast2_stats($params);
|
|
2274
|
|
2275 } elsif( $data =~ /(.+?)$Newline\s*Searching/so) {
|
|
2276 # trying to detect a Searching at the end of a PSI-blast round.
|
|
2277 # Gotta watch out for confusion with the Searching line in the header
|
|
2278 # which will be present in the last hit of an internal Blast report
|
|
2279 # in a multi-report, non-PSI-blast stream.
|
|
2280
|
|
2281 # PSI-Blast format (v2.08).
|
|
2282 ($last_align) = ($1);
|
|
2283 return $last_align; # if $client->{'_found_params'};
|
|
2284 }
|
|
2285
|
|
2286 # If parameter section was found, set a boolean,
|
|
2287 # otherwise return original data.
|
|
2288
|
|
2289 if (defined($params)) {
|
|
2290 $client->{'_found_params'} = 1;
|
|
2291 } else {
|
|
2292 return $data;
|
|
2293 }
|
|
2294
|
|
2295 $self->_set_database($params) if $get_stats;
|
|
2296
|
|
2297 # The {'_gapped'} member should be set in the _set_blast?_stats() call.
|
|
2298 # This is a last minute attempt to deduce it.
|
|
2299
|
|
2300 if(!defined($self->{'_gapped'})) {
|
|
2301 if($self->program_version() =~ /^1/) {
|
|
2302 $self->{'_gapped'} = 0;
|
|
2303 } else {
|
|
2304 if($self->strict > 0) {
|
|
2305 $self->warn("Can't determine if gapping was used. Assuming gapped.");
|
|
2306 }
|
|
2307 $self->{'_gapped'} = 1;
|
|
2308 }
|
|
2309 }
|
|
2310
|
|
2311 return $last_align;
|
|
2312 }
|
|
2313
|
|
2314 =head2 _set_blast2_stats
|
|
2315
|
|
2316 Usage : n/a; internal function called by _parse_footer()
|
|
2317 Purpose : Extracts statistical and other parameters from BLAST2 report footer.
|
|
2318 : Stats collected: database release, gapping,
|
|
2319 : posted date, matrix used, filter used, Karlin-Altschul parameters,
|
|
2320 : E, S, T, X, W.
|
|
2321 Throws : Exception if cannot get "Parameters" section of Blast report.
|
|
2322
|
|
2323 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules>
|
|
2324
|
|
2325 =cut
|
|
2326
|
|
2327 #---------------------'
|
|
2328 sub _set_blast2_stats {
|
|
2329 #---------------------
|
|
2330 my ($self, $data) = (@_);
|
|
2331
|
|
2332 if($data =~ /$Newline\s*Gapped/so) {
|
|
2333 $self->{'_gapped'} = 1;
|
|
2334 } else {
|
|
2335 $self->{'_gapped'} = 0;
|
|
2336 }
|
|
2337
|
|
2338 # Other stats are not always essential.
|
|
2339 return unless $Blast->{'_get_stats'};
|
|
2340
|
|
2341 # Blast2 Doesn't report what filter was used in the parameters section.
|
|
2342 # It just gives a warning that *some* filter was used in the header.
|
|
2343 # You just have to know the defaults (currently: protein = SEG, nucl = DUST).
|
|
2344 if($data =~ /\bfiltered\b/si) {
|
|
2345 $self->{'_filter'} = 'DEFAULT FILTER';
|
|
2346 } else {
|
|
2347 $self->{'_filter'} = 'NONE';
|
|
2348 }
|
|
2349
|
|
2350 if($data =~ /Gapped$Newline\s*Lambda +K +H$Newline +(.+?)$Newline/so) {
|
|
2351 my ($l, $k, $h) = split(/\s+/, $1);
|
|
2352 $self->{'_lambda'} = $l || 'UNKNOWN';
|
|
2353 $self->{'_k'} = $k || 'UNKNOWN';
|
|
2354 $self->{'_h'} = $h || 'UNKNOWN';
|
|
2355 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) {
|
|
2356 my ($l, $k, $h) = split(/\s+/, $1);
|
|
2357 $self->{'_lambda'} = $l || 'UNKNOWN';
|
|
2358 $self->{'_k'} = $k || 'UNKNOWN';
|
|
2359 $self->{'_h'} = $h || 'UNKNOWN';
|
|
2360 }
|
|
2361
|
|
2362 if($data =~ /$Newline\s*Matrix: (.+?)$Newline/so) {
|
|
2363 $self->{'_matrix'} = $1;
|
|
2364 } else {
|
|
2365 $self->{'_matrix'} = $DEFAULT_MATRIX.'?';
|
|
2366 if($self->strict > 0) {
|
|
2367 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX.");
|
|
2368 }
|
|
2369 }
|
|
2370
|
|
2371 if($data =~ /$Newline\s*Gap Penalties: Existence: +(\d+), +Extension: (\d+)$Newline/so) {
|
|
2372 $self->{'_gapCreation'} = $1;
|
|
2373 $self->{'_gapExtension'} = $2;
|
|
2374 }
|
|
2375 if($data =~ /sequences better than (\d+):/s) {
|
|
2376 $self->{'_expect'} = $1;
|
|
2377 }
|
|
2378
|
|
2379 if($data =~ /$Newline\s*T: (\d+)/o) { $self->{'_word_size'} = $1; }
|
|
2380 if($data =~ /$Newline\s*A: (\d+)/o) { $self->{'_a'} = $1; }
|
|
2381 if($data =~ /$Newline\s*S1: (\d+)/o) { $self->{'_s'} = $1; }
|
|
2382 if($data =~ /$Newline\s*S2: (\d+)/o) { $self->{'_s'} .= ", $1"; }
|
|
2383 if($data =~ /$Newline\s*X1: (\d+)/o) { $self->{'_x1'} = $1; }
|
|
2384 if($data =~ /$Newline\s*X2: (\d+)/o) { $self->{'_x2'} = $1; }
|
|
2385 }
|
|
2386
|
|
2387 =head2 _set_blast1_stats
|
|
2388
|
|
2389 Usage : n/a; internal function called by _parse_footer()
|
|
2390 Purpose : Extracts statistical and other parameters from BLAST 1.x style eports.
|
|
2391 : Handles NCBI Blast1 and WashU-Blast2 formats.
|
|
2392 : Stats collected: database release, gapping,
|
|
2393 : posted date, matrix used, filter used, Karlin-Altschul parameters,
|
|
2394 : E, S, T, X, W.
|
|
2395
|
|
2396 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules>
|
|
2397
|
|
2398 =cut
|
|
2399
|
|
2400 #----------------------
|
|
2401 sub _set_blast1_stats {
|
|
2402 #----------------------
|
|
2403 my ($self, $data) = (@_);
|
|
2404
|
|
2405 if(!$self->{'_gapped'} and $self->program_version() =~ /^2[\w\-\.]+WashU/) {
|
|
2406 $self->_set_gapping_wu($data);
|
|
2407 } else {
|
|
2408 $self->{'_gapped'} = 0;
|
|
2409 }
|
|
2410
|
|
2411 # Other stats are not always essential.
|
|
2412 return unless $Blast->{'_get_stats'};
|
|
2413
|
|
2414 if($data =~ /filter=(.+?)$Newline/so) {
|
|
2415 $self->{'_filter'} = $1;
|
|
2416 } elsif($data =~ /filter$Newline +(.+?)$Newline/so) {
|
|
2417 $self->{'_filter'} = $1;
|
|
2418 } else {
|
|
2419 $self->{'_filter'} = 'NONE';
|
|
2420 }
|
|
2421
|
|
2422 if($data =~ /$Newline\s*E=(\d+)$Newline/so) { $self->{'_expect'} = $1; }
|
|
2423
|
|
2424 if($data =~ /$Newline\s*M=(\w+)$Newline/so) { $self->{'_matrix'} = $1; }
|
|
2425
|
|
2426 if($data =~ /\s*Frame MatID Matrix name .+?$Newline +(.+?)$Newline/so) {
|
|
2427 ## WU-Blast2.
|
|
2428 my ($fr, $mid, $mat, $lu, $ku, $hu, $lc, $kc, $hc) = split(/\s+/,$1);
|
|
2429 $self->{'_matrix'} = $mat || 'UNKNOWN';
|
|
2430 $self->{'_lambda'} = $lu || 'UNKNOWN';
|
|
2431 $self->{'_k'} = $ku || 'UNKNOWN';
|
|
2432 $self->{'_h'} = $hu || 'UNKNOWN';
|
|
2433
|
|
2434 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) {
|
|
2435 ## NCBI-Blast1.
|
|
2436 my ($l, $k, $h) = split(/\s+/, $1);
|
|
2437 $self->{'_lambda'} = $l || 'UNKNOWN';
|
|
2438 $self->{'_k'} = $k || 'UNKNOWN';
|
|
2439 $self->{'_h'} = $h || 'UNKNOWN';
|
|
2440 }
|
|
2441
|
|
2442 if($data =~ /E +S +W +T +X.+?$Newline +(.+?)$Newline/so) {
|
|
2443 # WashU-Blast2
|
|
2444 my ($fr, $mid, $len, $elen, $e, $s, $w, $t, $x, $e2, $s2) = split(/\s+/,$1);
|
|
2445 $self->{'_expect'} ||= $e || 'UNKNOWN';
|
|
2446 $self->{'_s'} = $s || 'UNKNOWN';
|
|
2447 $self->{'_word_size'} = $w || 'UNKNOWN';
|
|
2448 $self->{'_t'} = $t || 'UNKNOWN';
|
|
2449 $self->{'_x'} = $x || 'UNKNOWN';
|
|
2450
|
|
2451 } elsif($data =~ /E +S +T1 +T2 +X1 +X2 +W +Gap$Newline +(.+?)$Newline/so) {
|
|
2452 ## NCBI-Blast1.
|
|
2453 my ($e, $s, $t1, $t2, $x1, $x2, $w, $gap) = split(/\s+/,$1);
|
|
2454 $self->{'_expect'} ||= $e || 'UNKNOWN';
|
|
2455 $self->{'_s'} = $s || 'UNKNOWN';
|
|
2456 $self->{'_word_size'} = $w || 'UNKNOWN';
|
|
2457 $self->{'_t1'} = $t1 || 'UNKNOWN';
|
|
2458 $self->{'_t2'} = $t2 || 'UNKNOWN';
|
|
2459 $self->{'_x1'} = $x1 || 'UNKNOWN';
|
|
2460 $self->{'_x2'} = $x2 || 'UNKNOWN';
|
|
2461 $self->{'_gap'} = $gap || 'UNKNOWN';
|
|
2462 }
|
|
2463
|
|
2464 if(!$self->{'_matrix'}) {
|
|
2465 $self->{'_matrix'} = $DEFAULT_MATRIX.'?';
|
|
2466 if($self->strict > 0) {
|
|
2467 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX.");
|
|
2468 }
|
|
2469 }
|
|
2470 }
|
|
2471
|
|
2472 =head2 _set_gapping_wu
|
|
2473
|
|
2474 Usage : n/a; internal function called by _set_blast1_stats()
|
|
2475 Purpose : Determine if gapping_wu was on for WashU Blast reports.
|
|
2476 Comments : In earlier versions, gapping was always specified
|
|
2477 : but in the current version (2.0a19MP), gapping is on by default
|
|
2478 : and there is no positive "gapping" indicator in the Parameters
|
|
2479 : section.
|
|
2480
|
|
2481 See Also : L<_set_blast1_stats()|_set_blast1_stats>
|
|
2482
|
|
2483 =cut
|
|
2484
|
|
2485 #--------------------
|
|
2486 sub _set_gapping_wu {
|
|
2487 #--------------------
|
|
2488 my ($self, $data) = @_;
|
|
2489
|
|
2490 if($data =~ /gaps?$Newline/so) {
|
|
2491 $self->{'_gapped'} = ($data =~ /nogaps?$Newline/so) ? 0 : 1;
|
|
2492 } else {
|
|
2493 $self->{'_gapped'} = 1;
|
|
2494 }
|
|
2495 }
|
|
2496
|
|
2497 =head2 _set_date
|
|
2498
|
|
2499 Usage : n/a; internal function called by _parse_footer()
|
|
2500 Purpose : Determine the date on which the Blast analysis was performed.
|
|
2501 Comments : Date information is not consistently added to Blast output.
|
|
2502 : Uses superclass method set_date() to set date from the file,
|
|
2503 : (if any).
|
|
2504
|
|
2505 See Also : L<_parse_footer()|_parse_footer>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules>
|
|
2506
|
|
2507 =cut
|
|
2508
|
|
2509 #--------------
|
|
2510 sub _set_date {
|
|
2511 #--------------
|
|
2512 my $self = shift;
|
|
2513 my $data = shift;
|
|
2514
|
|
2515 ### Network BLAST reports from NCBI are time stamped as follows:
|
|
2516 #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77
|
|
2517 if($data =~ /Start:\s+(.+?)\s+End:/s) {
|
|
2518 ## Calling superclass method to set the date.
|
|
2519 ## If we can't get date from the report, file date is obtained.
|
|
2520 $self->set_date($1);
|
|
2521 } elsif($data =~ /Date:\s+(.*?)$Newline/so) {
|
|
2522 ## E-mailed reports have a Date: field
|
|
2523 $self->set_date($1);
|
|
2524 } elsif( $data =~ /done\s+at (.+?)$Newline/so ) {
|
|
2525 $self->set_date($1);
|
|
2526 } elsif( $data =~ /$Newline([\w:, ]+), Up \d+/so ) {
|
|
2527 $self->set_date($1);
|
|
2528 } else {
|
|
2529 ## Otherwise, let superclass attempt to get the file creation date.
|
|
2530 $self->set_date() if $self->file;
|
|
2531 }
|
|
2532 }
|
|
2533
|
|
2534 =head2 _set_length
|
|
2535
|
|
2536 Usage : n/a; called automatically during Blast report parsing.
|
|
2537 Purpose : Sets the length of the query sequence (extracted from report).
|
|
2538 Returns : integer (length of the query sequence)
|
|
2539 Throws : Exception if cannot determine the query sequence length from
|
|
2540 : the BLAST report.
|
|
2541 : Exception if the length is below the min_length cutoff (if any).
|
|
2542 Comments : The logic here is a bit different from the other _set_XXX()
|
|
2543 : methods since the significance of the BLAST report is assessed
|
|
2544 : if MIN_LENGTH is set.
|
|
2545
|
|
2546 See Also : B<Bio::Tools::SeqAnal::length()>, L<Links to related modules>
|
|
2547
|
|
2548 =cut
|
|
2549
|
|
2550 #---------------
|
|
2551 sub _set_length {
|
|
2552 #---------------
|
|
2553 my ($self, $data) = @_;
|
|
2554
|
|
2555 my ($length);
|
|
2556 if( $data =~ m/$Newline\s+\(([\d|,]+) letters[\);]/so ) {
|
|
2557 $length = $1;
|
|
2558 $length =~ s/,//g;
|
|
2559 # printf "Length = $length in BLAST for %s$Newline",$self->name; <STDIN>;
|
|
2560 } else {
|
|
2561 $self->throw("Can't determine sequence length from BLAST report.");
|
|
2562 }
|
|
2563
|
|
2564 my($sig_len);
|
|
2565 if(defined($Blast->{'_min_length'})) {
|
|
2566 local $^W = 0;
|
|
2567 if($length < $Blast->{'_min_len'}) {
|
|
2568 $self->throw("Query sequence too short for ${\$self->name} ($length)",
|
|
2569 "Minimum length is $Blast->{'_min_len'}");
|
|
2570 }
|
|
2571 }
|
|
2572
|
|
2573 $self->length($length); # defined in superclass.
|
|
2574 }
|
|
2575
|
|
2576 =head2 _set_database
|
|
2577
|
|
2578 Usage : n/a; called automatically during Blast report parsing.
|
|
2579 Purpose : Sets the name of the database used by the BLAST analysis.
|
|
2580 : Extracted from raw BLAST report.
|
|
2581 Throws : Exception if the name of the database cannot be determined.
|
|
2582 Comments : The database name is used by methods or related objects
|
|
2583 : for database-specific parsing.
|
|
2584
|
|
2585 See Also : L<parse()|parse>, B<Bio::Tools::SeqAnal::database()>,B<Bio::Tools::SeqAnal::_set_db_stats()>,L<Links to related modules>
|
|
2586
|
|
2587 =cut
|
|
2588
|
|
2589 #------------------
|
|
2590 sub _set_database {
|
|
2591 #------------------
|
|
2592 # This now only sets data base information extracted from the report footer.
|
|
2593
|
|
2594 my ($self, $data) = @_;
|
|
2595
|
|
2596 my ($name, $date, $lets, $seqs);
|
|
2597
|
|
2598 my $strict = $self->strict > 0;
|
|
2599
|
|
2600 # This is fail-safe since DB name usually gets set in _parse_header()
|
|
2601 # In some reports, the database is only listed at bottom (NCBI 2.0.8).
|
|
2602 if($data =~ m/Database: +(.+?)$Newline/so ) {
|
|
2603 $name = $1;
|
|
2604 } elsif(not $self->database) {
|
|
2605 $self->warn("Can't determine database name from BLAST report.");
|
|
2606 }
|
|
2607
|
|
2608 if($data =~ m/Posted date: +(.+?)$Newline/so ) {
|
|
2609 $date = $1;
|
|
2610 } elsif($data =~ m/Release date: +(.+?)$Newline/so ) {
|
|
2611 $date = $1;
|
|
2612 } elsif($strict) {
|
|
2613 $self->warn("Can't determine database release date.");
|
|
2614 }
|
|
2615
|
|
2616 if($data =~ m/letters in database: +([\d,]+)/si ||
|
|
2617 $data =~ m/length of database: +([\d,]+)/si ) {
|
|
2618 $lets = $1;
|
|
2619 } elsif($strict) {
|
|
2620 $self->warn("Can't determine number of letters in database.\n$data\n");
|
|
2621 }
|
|
2622
|
|
2623 if($data =~ m/sequences in database: +([\d,]+)/si ||
|
|
2624 $data =~ m/number of sequences: +([\d,]+)/si ) {
|
|
2625 $seqs = $1;
|
|
2626 } elsif($strict) {
|
|
2627 $self->warn("Can't determine number of sequences in database.\n$data\n");
|
|
2628 }
|
|
2629
|
|
2630 $self->_set_db_stats( -NAME => $name,
|
|
2631 -RELEASE => $date || '',
|
|
2632 -LETTERS => $lets || '',
|
|
2633 -SEQS => $seqs || ''
|
|
2634 );
|
|
2635 }
|
|
2636
|
|
2637 =head2 _set_query
|
|
2638
|
|
2639 Usage : n/a; called automatically during Blast report parsing.
|
|
2640 Purpose : Set the name of the query and the query description.
|
|
2641 : Extracted from the raw BLAST report.
|
|
2642 Returns : String containing name of query extracted from report.
|
|
2643 Throws : Warning if the name of the query cannont be obtained.
|
|
2644
|
|
2645 See Also : B<Bio::Tools::SeqAnal::query_desc()>,L<Links to related modules>
|
|
2646
|
|
2647 =cut
|
|
2648
|
|
2649 #---------------
|
|
2650 sub _set_query {
|
|
2651 #---------------
|
|
2652 my $self = shift;
|
|
2653 my $data = shift;
|
|
2654
|
|
2655 if($data =~ m/${Newline}Query= *(.+?)$Newline/so ) {
|
|
2656 my $info = $1;
|
|
2657 $info =~ s/TITLE //;
|
|
2658 # Split the query line into two parts.
|
|
2659 # Using \s instead of ' '
|
|
2660 $info =~ /(\S+?)\s(.*)/;
|
|
2661 $self->query_desc($2 || '');
|
|
2662 # set name of Blast object and return.
|
|
2663 $self->name($1 || 'UNKNOWN');
|
|
2664 } else {
|
|
2665 $self->warn("Can't determine query sequence name from BLAST report.");
|
|
2666 }
|
|
2667 # print STDERR "$Newline NAME = ${\$self->name}$Newline";
|
|
2668 }
|
|
2669
|
|
2670 =head2 _parse_signif
|
|
2671
|
|
2672 Usage : &_parse_signif(string, layout, gapped);
|
|
2673 : This is a class function.
|
|
2674 Purpose : Extracts the P- or Expect value from a single line of a BLAST description section.
|
|
2675 Example : &_parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1);
|
|
2676 : &_parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2);
|
|
2677 Argument : string = line from BLAST description section
|
|
2678 : layout = integer (1 or 2)
|
|
2679 : gapped = boolean (true if gapped Blast).
|
|
2680 Returns : Float (0.001 or 1e-03)
|
|
2681 Status : Static
|
|
2682
|
|
2683 =cut
|
|
2684
|
|
2685 #------------------
|
|
2686 sub _parse_signif {
|
|
2687 #------------------
|
|
2688 my ($line, $layout, $gapped) = @_;
|
|
2689
|
|
2690 local $_ = $line;
|
|
2691 my @linedat = split();
|
|
2692
|
|
2693 # When processing both Blast1 and Blast2 reports
|
|
2694 # in the same run, offset needs to be configured each time.
|
|
2695
|
|
2696 my $offset = 0;
|
|
2697 $offset = 1 if $layout == 1 or not $gapped;
|
|
2698
|
|
2699 my $signif = $linedat[ $#linedat - $offset ];
|
|
2700
|
|
2701 # fail-safe check
|
|
2702 if(not $signif =~ /[.-]/) {
|
|
2703 $offset = ($offset == 0 ? 1 : 0);
|
|
2704 $signif = $linedat[ $#linedat - $offset ];
|
|
2705 }
|
|
2706
|
|
2707 $signif = "1$signif" if $signif =~ /^e/i;
|
|
2708 return $signif;
|
|
2709 }
|
|
2710
|
|
2711 ##
|
|
2712 ## BEGIN ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT.
|
|
2713 ##
|
|
2714
|
|
2715 sub program {
|
|
2716 ## Overridden method to incorporate the BLAST object.
|
|
2717 my $self = shift;
|
|
2718 return $self->SUPER::program(@_) if @_; # set
|
|
2719 $self->SUPER::program || $Blast->SUPER::program; # get
|
|
2720 }
|
|
2721
|
|
2722 sub program_version {
|
|
2723 ## Overridden method to incorporate the BLAST object.
|
|
2724 my $self = shift;
|
|
2725 return $self->SUPER::program_version(@_) if @_; # set
|
|
2726 $self->SUPER::program_version || $Blast->SUPER::program_version; # get
|
|
2727 }
|
|
2728
|
|
2729 sub database {
|
|
2730 ## Overridden method to incorporate the BLAST object.
|
|
2731 my $self = shift;
|
|
2732 return $self->SUPER::database(@_) if @_; # set
|
|
2733 $self->SUPER::database || $Blast->SUPER::database; # get
|
|
2734 }
|
|
2735
|
|
2736 sub database_letters {
|
|
2737 ## Overridden method to incorporate the BLAST object.
|
|
2738 my $self = shift;
|
|
2739 return $self->SUPER::database_letters(@_) if @_; # set
|
|
2740 $self->SUPER::database_letters || $Blast->SUPER::database_letters; # get
|
|
2741 }
|
|
2742
|
|
2743 sub database_release {
|
|
2744 ## Overridden method to incorporate the BLAST object.
|
|
2745 my $self = shift;
|
|
2746 return $self->SUPER::database_release(@_) if @_; # set
|
|
2747 $self->SUPER::database_release || $Blast->SUPER::database_release; # get
|
|
2748 }
|
|
2749
|
|
2750 sub database_seqs {
|
|
2751 ## Overridden method to incorporate the BLAST object.
|
|
2752 my $self = shift;
|
|
2753 return $self->SUPER::database_seqs(@_) if @_; # set
|
|
2754 $self->SUPER::database_seqs || $Blast->SUPER::database_seqs; # get
|
|
2755 }
|
|
2756
|
|
2757 sub date {
|
|
2758 ## Overridden method to incorporate the BLAST object.
|
|
2759 my $self = shift;
|
|
2760 return $self->SUPER::date(@_) if @_; # set
|
|
2761 $self->SUPER::date || $Blast->SUPER::date; # get
|
|
2762 }
|
|
2763
|
|
2764 sub best {
|
|
2765 ## Overridden method to incorporate the BLAST object.
|
|
2766 my $self = shift;
|
|
2767 return $Blast->SUPER::best(@_) if @_; # set
|
|
2768 $Blast->SUPER::best; # get
|
|
2769 }
|
|
2770
|
|
2771 =head2 signif
|
|
2772
|
|
2773 Usage : $blast->signif();
|
|
2774 Purpose : Gets the P or Expect value used as significance screening cutoff.
|
|
2775 Returns : Scientific notation number with this format: 1.0e-05.
|
|
2776 Argument : n/a
|
|
2777 Comments : Screening of significant hits uses the data provided on the
|
|
2778 : description line. For Blast1 and WU-Blast2, this data is P-value.
|
|
2779 : for Blast2 it is an Expect value.
|
|
2780 :
|
|
2781 : Obtains info from the static $Blast object if it has not been set
|
|
2782 : for the current object.
|
|
2783
|
|
2784 See Also : L<_set_signif()|_set_signif>
|
|
2785
|
|
2786 =cut
|
|
2787
|
|
2788 #-----------
|
|
2789 sub signif {
|
|
2790 #-----------
|
|
2791 my $self = shift;
|
|
2792 my $sig = $self->{'_significance'} || $Blast->{'_significance'};
|
|
2793 sprintf "%.1e", $sig;
|
|
2794 }
|
|
2795
|
|
2796 =head2 is_signif
|
|
2797
|
|
2798 Usage : $blast->is_signif();
|
|
2799 Purpose : Determine if the BLAST report contains significant hits.
|
|
2800 Returns : Boolean
|
|
2801 Argument : n/a
|
|
2802 Comments : BLAST reports without significant hits but with defined
|
|
2803 : significance criteria will throw exceptions during construction.
|
|
2804 : This obviates the need to check significant() for
|
|
2805 : such objects.
|
|
2806
|
|
2807 See Also : L<_set_signif()|_set_signif>
|
|
2808
|
|
2809 =cut
|
|
2810
|
|
2811 #------------
|
|
2812 sub is_signif { my $self = shift; return $self->{'_is_significant'}; }
|
|
2813 #------------
|
|
2814
|
|
2815 # is_signif() doesn't incorporate the static $Blast object but is included
|
|
2816 # here to be with the other 'signif' methods.
|
|
2817
|
|
2818 =head2 signif_fmt
|
|
2819
|
|
2820 Usage : $blast->signif_fmt( [FMT] );
|
|
2821 Purpose : Allows retrieval of the P/Expect exponent values only
|
|
2822 : or as a two-element list (mantissa, exponent).
|
|
2823 Usage : $blast_obj->signif_fmt('exp');
|
|
2824 : $blast_obj->signif_fmt('parts');
|
|
2825 Returns : String or '' if not set.
|
|
2826 Argument : String, FMT = 'exp' (return the exponent only)
|
|
2827 : = 'parts'(return exponent + mantissa in 2-elem list)
|
|
2828 : = undefined (return the raw value)
|
|
2829 Comments : P/Expect values are still stored internally as the full,
|
|
2830 : scientific notation value.
|
|
2831 : This method uses the static $Blast object since this issue
|
|
2832 : will pertain to all Blast reports within a given set.
|
|
2833 : This setting is propagated to Bio::Tools::Blast::Sbjct.pm.
|
|
2834
|
|
2835 =cut
|
|
2836
|
|
2837 #-------------
|
|
2838 sub signif_fmt {
|
|
2839 #-------------
|
|
2840 my $self = shift;
|
|
2841 if(@_) { $Blast->{'_signif_fmt'} = shift; }
|
|
2842 $Blast->{'_signif_fmt'} || '';
|
|
2843 }
|
|
2844
|
|
2845 =head2 min_length
|
|
2846
|
|
2847 Usage : $blast->min_length();
|
|
2848 Purpose : Gets the query sequence length used as significance screening criteria.
|
|
2849 Returns : Integer
|
|
2850 Argument : n/a
|
|
2851 Comments : Obtains info from the static $Blast object if it has not been set
|
|
2852 : for the current object.
|
|
2853
|
|
2854 See Also : L<_set_signif()|_set_signif>, L<signif()|signif>
|
|
2855
|
|
2856 =cut
|
|
2857
|
|
2858 #--------------
|
|
2859 sub min_length {
|
|
2860 #--------------
|
|
2861 my $self = shift;
|
|
2862 $self->{'_min_length'} || $Blast->{'_min_length'};
|
|
2863 }
|
|
2864
|
|
2865 =head2 gapped
|
|
2866
|
|
2867 Usage : $blast->gapped();
|
|
2868 Purpose : Set/Get boolean indicator for gapped BLAST.
|
|
2869 Returns : Boolean
|
|
2870 Argument : n/a
|
|
2871 Comments : Obtains info from the static $Blast object if it has not been set
|
|
2872 : for the current object.
|
|
2873
|
|
2874 =cut
|
|
2875
|
|
2876 #-----------
|
|
2877 sub gapped {
|
|
2878 #-----------
|
|
2879 my $self = shift;
|
|
2880 if(@_) { $self->{'_gapped'} = shift; }
|
|
2881 $self->{'_gapped'} || $Blast->{'_gapped'};
|
|
2882 }
|
|
2883
|
|
2884 =head2 _get_stats
|
|
2885
|
|
2886 Usage : n/a; internal method.
|
|
2887 Purpose : Set/Get indicator for collecting full statistics from report.
|
|
2888 Returns : Boolean (0 | 1)
|
|
2889 Comments : Obtains info from the static $Blast object which gets set
|
|
2890 : by _init_parse_params().
|
|
2891
|
|
2892 =cut
|
|
2893
|
|
2894 #---------------
|
|
2895 sub _get_stats {
|
|
2896 #---------------
|
|
2897 my $self = shift;
|
|
2898 $Blast->{'_get_stats'};
|
|
2899 }
|
|
2900
|
|
2901 =head2 _layout
|
|
2902
|
|
2903 Usage : n/a; internal method.
|
|
2904 Purpose : Set/Get indicator for the layout of the report.
|
|
2905 Returns : Integer (1 | 2)
|
|
2906 : Defaults to 2 if not set.
|
|
2907 Comments : Blast1 and WashU-Blast2 have a layout = 1.
|
|
2908 : This is intended for internal use by this and closely
|
|
2909 : allied modules like Sbjct.pm and HSP.pm.
|
|
2910 :
|
|
2911 : Obtains info from the static $Blast object if it has not been set
|
|
2912 : for the current object.
|
|
2913
|
|
2914 =cut
|
|
2915
|
|
2916 #------------
|
|
2917 sub _layout {
|
|
2918 #------------
|
|
2919 my $self = shift;
|
|
2920 if(@_) {
|
|
2921 # Optimization if we know all reports share the same stats.
|
|
2922 if($Blast->{'_share'}) {
|
|
2923 $Blast->{'_layout'} = shift;
|
|
2924 } else {
|
|
2925 $self->{'_layout'} = shift;
|
|
2926 }
|
|
2927 }
|
|
2928 $self->{'_layout'} || $Blast->{'_layout'} || 2;
|
|
2929 }
|
|
2930
|
|
2931 ##
|
|
2932 ## END ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT.
|
|
2933 ##
|
|
2934
|
|
2935 =head2 hits
|
|
2936
|
|
2937 Usage : $blast->hits();
|
|
2938 Purpose : Get a list containing all BLAST hit (Sbjct) objects.
|
|
2939 : Get the numbers of significant hits.
|
|
2940 Examples : @hits = $blast->hits();
|
|
2941 : $num_signif = $blast->hits();
|
|
2942 Returns : List context : list of Bio::Tools::Blast::Sbjct.pm objects
|
|
2943 : or an empty list if there are no hits.
|
|
2944 : Scalar context: integer (number of significant hits)
|
|
2945 : or zero if there are no hits.
|
|
2946 : (Equivalent to num_hits()).
|
|
2947 Argument : n/a. Relies on wantarray.
|
|
2948 Throws : n/a.
|
|
2949 : Not throwing exception because the absence of hits may have
|
|
2950 : resulted from stringent significance criteria, not a failure
|
|
2951 : set the hits.
|
|
2952
|
|
2953 See Also : L<hit()|hit>, L<num_hits()|num_hits>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif>
|
|
2954
|
|
2955 =cut
|
|
2956
|
|
2957 #----------
|
|
2958 sub hits {
|
|
2959 #----------
|
|
2960 my $self = shift;
|
|
2961
|
|
2962 if(wantarray) {
|
|
2963 my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : ();
|
|
2964 return @ary;
|
|
2965 } else {
|
|
2966 return $self->num_hits();
|
|
2967 }
|
|
2968
|
|
2969 # my $num = ref($self->{'_hits'}) ? scalar(@{$self->{'_hits'}}) : 0;
|
|
2970 # my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : ();
|
|
2971 #
|
|
2972 # return wantarray
|
|
2973 # # returning list containing all hits or empty list.
|
|
2974 # ? $self->{'_is_significant'} ? @ary : ()
|
|
2975 # # returning number of hits or 0.
|
|
2976 # : $self->{'_is_significant'} ? $num : 0;
|
|
2977 }
|
|
2978
|
|
2979 =head2 hit
|
|
2980
|
|
2981 Example : $blast_obj->hit( [class] )
|
|
2982 Purpose : Get a specific hit object.
|
|
2983 : Provides some syntactic sugar for the hits() method.
|
|
2984 Usage : $hitObj = $blast->hit();
|
|
2985 : $hitObj = $blast->hit('best');
|
|
2986 : $hitObj = $blast->hit('worst');
|
|
2987 : $hitObj = $blast->hit( $name );
|
|
2988 Returns : Object reference for a Bio::Tools::Blast::Sbjct.pm object.
|
|
2989 : undef if there are no hit (Sbjct) objects defined.
|
|
2990 Argument : Class (or no argument).
|
|
2991 : No argument (default) = highest scoring hit (same as 'best').
|
|
2992 : 'best' or 'first' = highest scoring hit.
|
|
2993 : 'worst' or 'last' = lowest scoring hit.
|
|
2994 : $name = retrieve a hit by seq id (case-insensitive).
|
|
2995 Throws : Exception if the Blast object has no significant hits.
|
|
2996 : Exception if a hit cannot be found when supplying a specific
|
|
2997 : hit sequence identifier as an argument.
|
|
2998 Comments : 'best' = lowest significance value (P or Expect) among significant hits.
|
|
2999 : 'worst' = highest sigificance value (P or Expect) among significant hits.
|
|
3000
|
|
3001 See Also : L<hits()|hits>, L<num_hits()|num_hits>, L<is_signif()|is_signif>
|
|
3002
|
|
3003 =cut
|
|
3004
|
|
3005 #---------
|
|
3006 sub hit {
|
|
3007 #---------
|
|
3008 my( $self, $option) = @_;
|
|
3009 $option ||= 'best';
|
|
3010
|
|
3011 if($Blast->{'_no_aligns'} || ! ref($self->{'_hits'})) {
|
|
3012 return undef;
|
|
3013 }
|
|
3014
|
|
3015 $self->{'_is_significant'} or
|
|
3016 $self->throw("There were no significant hits.",
|
|
3017 "Use num_hits(), hits(), is_signif() to check.");
|
|
3018
|
|
3019 my @hits = @{$self->{'_hits'}};
|
|
3020
|
|
3021 return $hits[0] if $option =~ /^(best|first|1)$/i;
|
|
3022 return $hits[$#hits] if $option =~ /^(worst|last)$/i;
|
|
3023
|
|
3024 # Get hit by name.
|
|
3025 foreach ( @hits ) {
|
|
3026 return $_ if $_->name() =~ /$option/i;
|
|
3027 }
|
|
3028
|
|
3029 $self->throw("Can't get hit for: $option");
|
|
3030 }
|
|
3031
|
|
3032 =head2 num_hits
|
|
3033
|
|
3034 Usage : $blast->num_hits( ['total'] );
|
|
3035 Purpose : Get number of significant hits or number of total hits.
|
|
3036 Examples : $num_signif = $blast-num_hits;
|
|
3037 : $num_total = $blast->num_hits('total');
|
|
3038 Returns : Integer
|
|
3039 Argument : String = 'total' (or no argument).
|
|
3040 : No argument (Default) = return number of significant hits.
|
|
3041 : 'total' = number of total hits.
|
|
3042 Throws : n/a.
|
|
3043 : Not throwing exception because the absence of hits may have
|
|
3044 : resulted from stringent significance criteria, not a failure
|
|
3045 : set the hits.
|
|
3046 Comments : A significant hit is defined as a hit with an expect value
|
|
3047 : (or P value for WU-Blast) at or below the -signif parameter
|
|
3048 : used when parsing the report. Additionally, if a filter function
|
|
3049 : was supplied, the significant hit must also pass that
|
|
3050 : criteria.
|
|
3051
|
|
3052 See Also : L<hits()|hits>, L<hit()|hit>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif>, L<parse()|parse>
|
|
3053
|
|
3054 =cut
|
|
3055
|
|
3056 #-------------
|
|
3057 sub num_hits {
|
|
3058 #-------------
|
|
3059 my( $self, $option) = @_;
|
|
3060 $option ||= '';
|
|
3061
|
|
3062 $option =~ /total/i and return $self->{'_num_hits'} || 0;
|
|
3063
|
|
3064 # Default: returning number of significant hits.
|
|
3065 # return $self->{'_num_hits_significant'} || 0;
|
|
3066 # return 0 if not ref $self->{'_hits'};
|
|
3067
|
|
3068 if(ref $self->{'_hits'}) {
|
|
3069 return scalar(@{$self->{'_hits'}});
|
|
3070 } else {
|
|
3071 return $self->{'_num_hits_significant'} || 0;
|
|
3072 }
|
|
3073 }
|
|
3074
|
|
3075 =head2 lowest_p
|
|
3076
|
|
3077 Usage : $blast->lowest_p()
|
|
3078 Purpose : Get the lowest P-value among all hits in a BLAST report.
|
|
3079 : Syntactic sugar for $blast->hit('best')->p().
|
|
3080 Returns : Float or scientific notation number.
|
|
3081 : Returns -1.0 if lowest_p has not been set.
|
|
3082 Argument : n/a.
|
|
3083 Throws : Exception if the Blast report does not report P-values
|
|
3084 : (as is the case for NCBI Blast2).
|
|
3085 Comments : A value is returned regardless of whether or not there were
|
|
3086 : significant hits ($DEFAULT_SIGNIF, currently 999).
|
|
3087
|
|
3088 See Also : L<lowest_expect()|lowest_expect>, L<lowest_signif()|lowest_signif>, L<highest_p()|highest_p>, L<signif_fmt()|signif_fmt>
|
|
3089
|
|
3090 =cut
|
|
3091
|
|
3092 #------------
|
|
3093 sub lowest_p {
|
|
3094 #------------
|
|
3095 my $self = shift;
|
|
3096
|
|
3097 # Layout 2 = NCBI Blast 2.x does not report P-values.
|
|
3098 $self->_layout == 2 and
|
|
3099 $self->throw("Can't get P-value with BLAST2.",
|
|
3100 "Use lowest_signif() or lowest_expect()");
|
|
3101
|
|
3102 return $self->{'_lowestSignif'} || -1.0;
|
|
3103 }
|
|
3104
|
|
3105 =head2 lowest_expect
|
|
3106
|
|
3107 Usage : $blast->lowest_expect()
|
|
3108 Purpose : Get the lowest Expect value among all hits in a BLAST report.
|
|
3109 : Syntactic sugar for $blast->hit('best')->expect()
|
|
3110 Returns : Float or scientific notation number.
|
|
3111 : Returns -1.0 if lowest_expect has not been set.
|
|
3112 Argument : n/a.
|
|
3113 Throws : Exception if there were no significant hits and the report
|
|
3114 : does not have Expect values on the description lines
|
|
3115 : (i.e., Blast1, WashU-Blast2).
|
|
3116
|
|
3117 See Also : L<lowest_p()|lowest_p>, L<lowest_signif()|lowest_signif>, L<highest_expect()|highest_expect>, L<signif_fmt()|signif_fmt>
|
|
3118
|
|
3119 =cut
|
|
3120
|
|
3121 #------------------
|
|
3122 sub lowest_expect {
|
|
3123 #------------------
|
|
3124 my $self = shift;
|
|
3125
|
|
3126 if ($self->_layout == 2) {
|
|
3127 return $self->{'_lowestSignif'} || -1.0;
|
|
3128 }
|
|
3129
|
|
3130 if($self->{'_is_significant'}) {
|
|
3131 my $bestHit = $self->{'_hits'}->[0];
|
|
3132 return $bestHit->expect();
|
|
3133 } else {
|
|
3134 $self->throw("Can't get lowest expect value: no significant hits ",
|
|
3135 "The format of this report requires expect values to be extracted$Newline".
|
|
3136 "from the hits themselves.");
|
|
3137 }
|
|
3138 }
|
|
3139
|
|
3140 =head2 highest_p
|
|
3141
|
|
3142 Example : $blast->highest_p( ['overall'])
|
|
3143 Purpose : Get the highest P-value among all hits in a BLAST report.
|
|
3144 : Syntactic sugar for $blast->hit('worst')->p()
|
|
3145 : Can also get the highest P-value overall (not just among signif hits).
|
|
3146 Usage : $p_signif = $blast->highest_p();
|
|
3147 : $p_all = $blast->highest_p('overall');
|
|
3148 Returns : Float or scientific notation number.
|
|
3149 : Returns -1.0 if highest_p has not been set.
|
|
3150 Argument : String 'overall' or no argument.
|
|
3151 : No argument = get highest P-value among significant hits.
|
|
3152 Throws : Exception if object is created from a Blast2 report
|
|
3153 : (which does not report P-values).
|
|
3154
|
|
3155 See Also : L<highest_signif()|highest_signif>, L<lowest_p()|lowest_p>, L<_set_signif()|_set_signif>, L<signif_fmt()|signif_fmt>
|
|
3156
|
|
3157 =cut
|
|
3158
|
|
3159 #---------------
|
|
3160 sub highest_p {
|
|
3161 #---------------
|
|
3162 my ($self, $overall) = @_;
|
|
3163
|
|
3164 # Layout 2 = NCBI Blast 2.x does not report P-values.
|
|
3165 $self->_layout == 2 and
|
|
3166 $self->throw("Can't get P-value with BLAST2.",
|
|
3167 "Use highest_signif() or highest_expect()");
|
|
3168
|
|
3169 $overall and return $self->{'_highestSignif'} || -1.0;
|
|
3170 $self->hit('worst')->p();
|
|
3171 }
|
|
3172
|
|
3173 =head2 highest_expect
|
|
3174
|
|
3175 Usage : $blast_object->highest_expect( ['overall'])
|
|
3176 Purpose : Get the highest Expect value among all significant hits in a BLAST report.
|
|
3177 : Syntactic sugar for $blast->hit('worst')->expect()
|
|
3178 Examples : $e_sig = $blast->highest_expect();
|
|
3179 : $e_all = $blast->highest_expect('overall');
|
|
3180 Returns : Float or scientific notation number.
|
|
3181 : Returns -1.0 if highest_exoect has not been set.
|
|
3182 Argument : String 'overall' or no argument.
|
|
3183 : No argument = get highest Expect-value among significant hits.
|
|
3184 Throws : Exception if there were no significant hits and the report
|
|
3185 : does not have Expect values on the description lines
|
|
3186 : (i.e., Blast1, WashU-Blast2).
|
|
3187
|
|
3188 See Also : L<lowest_expect()|lowest_expect>, L<highest_signif()|highest_signif>, L<signif_fmt()|signif_fmt>
|
|
3189
|
|
3190 =cut
|
|
3191
|
|
3192 #-------------------
|
|
3193 sub highest_expect {
|
|
3194 #-------------------
|
|
3195 my ($self, $overall) = @_;
|
|
3196
|
|
3197 if ( $overall and $self->_layout == 2) {
|
|
3198 return $self->{'_highestSignif'} || -1.0;
|
|
3199 }
|
|
3200
|
|
3201 if($self->{'_is_significant'}) {
|
|
3202 return $self->hit('worst')->expect;
|
|
3203 } else {
|
|
3204 $self->throw("Can't get highest expect value: no significant hits ",
|
|
3205 "The format of this report requires expect values to be extracted$Newline".
|
|
3206 "from the hits themselves.");
|
|
3207 }
|
|
3208 }
|
|
3209
|
|
3210 =head2 lowest_signif
|
|
3211
|
|
3212 Usage : $blast_obj->lowest_signif();
|
|
3213 : Syntactic sugar for $blast->hit('best')->signif()
|
|
3214 Purpose : Get the lowest P or Expect value among all hits
|
|
3215 : in a BLAST report.
|
|
3216 : This method is syntactic sugar for $blast->hit('best')->signif()
|
|
3217 : The value returned is the one which is reported in the decription
|
|
3218 : section of the Blast report.
|
|
3219 : For Blast1 and WU-Blast2, this is a P-value,
|
|
3220 : for NCBI Blast2, it is an Expect value.
|
|
3221 Example : $blast->lowest_signif();
|
|
3222 Returns : Float or scientific notation number.
|
|
3223 : Returns -1.0 if lowest_signif has not been set.
|
|
3224 Argument : n/a.
|
|
3225 Throws : n/a.
|
|
3226 Status : Deprecated. Use lowest_expect() or lowest_p().
|
|
3227 Comments : The signif() method provides a way to deal with the fact that
|
|
3228 : Blast1 and Blast2 formats differ in what is reported in the
|
|
3229 : description lines of each hit in the Blast report. The signif()
|
|
3230 : method frees any client code from having to know if this is a P-value
|
|
3231 : or an Expect value, making it easier to write code that can process
|
|
3232 : both Blast1 and Blast2 reports. This is not necessarily a good thing, since
|
|
3233 : one should always know when one is working with P-values or
|
|
3234 : Expect values (hence the deprecated status).
|
|
3235 : Use of lowest_expect() is recommended since all hits will have an Expect value.
|
|
3236
|
|
3237 See Also : L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif>
|
|
3238
|
|
3239 =cut
|
|
3240
|
|
3241 #------------------
|
|
3242 sub lowest_signif {
|
|
3243 #------------------
|
|
3244 my ($self) = @_;
|
|
3245
|
|
3246 return $self->{'_lowestSignif'} || -1.0;
|
|
3247 }
|
|
3248
|
|
3249 =head2 highest_signif
|
|
3250
|
|
3251 Usage : $blast_obj->highest_signif('overall');
|
|
3252 : Syntactic sugar for $blast->hit('worst')->signif()
|
|
3253 Purpose : Get the highest P or Expect value among all hits
|
|
3254 : in a BLAST report.
|
|
3255 : The value returned is the one which is reported in the decription
|
|
3256 : section of the Blast report.
|
|
3257 : For Blast1 and WU-Blast2, this is a P-value,
|
|
3258 : for NCBI Blast2, it is an Expect value.
|
|
3259 Example : $blast->highest_signif();
|
|
3260 Returns : Float or scientific notation number.
|
|
3261 : Returns -1.0 if highest_signif has not been set.
|
|
3262 Argument : Optional string 'overall' to get the highest overall significance value.
|
|
3263 Throws : n/a.
|
|
3264 Status : Deprecated. Use highest_expect() or highest_p().
|
|
3265 Comments : Analogous to lowest_signif(), q.v.
|
|
3266
|
|
3267 See Also : L<lowest_signif()|lowest_signif>, L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif>
|
|
3268
|
|
3269 =cut
|
|
3270
|
|
3271 #---------------------
|
|
3272 sub highest_signif {
|
|
3273 #---------------------
|
|
3274 my ($self, $overall) = @_;
|
|
3275
|
|
3276 $overall and return $self->{'_highestSignif'} || -1.0;
|
|
3277
|
|
3278 if($self->{'_is_significant'}) {
|
|
3279 my $worst_hit = $self->hit('worst');
|
|
3280 if(defined $worst_hit) {
|
|
3281 return $worst_hit->signif;
|
|
3282 } else {
|
|
3283 return $self->{'_highestSignif'};
|
|
3284 }
|
|
3285 }
|
|
3286 }
|
|
3287
|
|
3288 =head2 matrix
|
|
3289
|
|
3290 Usage : $blast_object->matrix();
|
|
3291 Purpose : Get the name of the scoring matrix used.
|
|
3292 : This is extracted from the report.
|
|
3293 Argument : n/a
|
|
3294 Returns : string or undef if not defined
|
|
3295
|
|
3296 =cut
|
|
3297
|
|
3298 #------------
|
|
3299 sub matrix { my $self = shift; $self->{'_matrix'} || $Blast->{'_matrix'}; }
|
|
3300 #------------
|
|
3301
|
|
3302 =head2 filter
|
|
3303
|
|
3304 Usage : $blast_object->filter();
|
|
3305 Purpose : Get the name of the low-complexity sequence filter used.
|
|
3306 : (SEG, SEG+XNU, DUST, NONE).
|
|
3307 : This is extracted from the report.
|
|
3308 Argument : n/a
|
|
3309 Returns : string or undef if not defined
|
|
3310
|
|
3311 =cut
|
|
3312
|
|
3313 #----------
|
|
3314 sub filter { my $self = shift; $self->{'_filter'} || $Blast->{'_filter'}; }
|
|
3315 #----------
|
|
3316
|
|
3317 =head2 expect
|
|
3318
|
|
3319 Usage : $blast_object->expect();
|
|
3320 Purpose : Get the expect parameter (E) used for the Blast analysis.
|
|
3321 : This is extracted from the report.
|
|
3322 Argument : n/a
|
|
3323 Returns : string or undef if not defined.
|
|
3324
|
|
3325 =cut
|
|
3326
|
|
3327 #-----------
|
|
3328 sub expect { my $self = shift; $self->{'_expect'} || $Blast->{'_expect'}; }
|
|
3329 #-----------
|
|
3330
|
|
3331 =head2 karlin_altschul
|
|
3332
|
|
3333 Usage : $blast_object->karlin_altschul();
|
|
3334 Purpose : Get the Karlin_Altschul sum statistics (Lambda, K, H)
|
|
3335 : These are extracted from the report.
|
|
3336 Argument : n/a
|
|
3337 Returns : list of three floats (Lambda, K, H)
|
|
3338 : If not defined, returns list of three zeros)
|
|
3339
|
|
3340 =cut
|
|
3341
|
|
3342 #---------------------
|
|
3343 sub karlin_altschul {
|
|
3344 #---------------------
|
|
3345 my $self = shift;
|
|
3346 if(defined($self->{'_lambda'})) {
|
|
3347 ($self->{'_lambda'}, $self->{'_k'}, $self->{'_h'});
|
|
3348 } elsif(defined($Blast->{'_lambda'})) {
|
|
3349 ($Blast->{'_lambda'}, $Blast->{'_k'}, $Blast->{'_h'});
|
|
3350 } else {
|
|
3351 (0, 0, 0);
|
|
3352 }
|
|
3353 }
|
|
3354
|
|
3355 =head2 word_size
|
|
3356
|
|
3357 Usage : $blast_object->word_size();
|
|
3358 Purpose : Get the word_size used during the Blast analysis.
|
|
3359 : This is extracted from the report.
|
|
3360 Argument : n/a
|
|
3361 Returns : integer or undef if not defined.
|
|
3362
|
|
3363 =cut
|
|
3364
|
|
3365 #--------------
|
|
3366 sub word_size {
|
|
3367 #--------------
|
|
3368 my $self = shift;
|
|
3369 $self->{'_word_size'} || $Blast->{'_word_size'};
|
|
3370 }
|
|
3371
|
|
3372 =head2 s
|
|
3373
|
|
3374 Usage : $blast_object->s();
|
|
3375 Purpose : Get the s statistic for the Blast analysis.
|
|
3376 : This is extracted from the report.
|
|
3377 Argument : n/a
|
|
3378 Returns : integer or undef if not defined.
|
|
3379
|
|
3380 =cut
|
|
3381
|
|
3382 #------
|
|
3383 sub s { my $self = shift; $self->{'_s'} || $Blast->{'_s'}; }
|
|
3384 #------
|
|
3385
|
|
3386 =head2 gap_creation
|
|
3387
|
|
3388 Usage : $blast_object->gap_creation();
|
|
3389 Purpose : Get the gap creation penalty used for a gapped Blast analysis.
|
|
3390 : This is extracted from the report.
|
|
3391 Argument : n/a
|
|
3392 Returns : integer or undef if not defined.
|
|
3393
|
|
3394 See Also : L<gap_extension()|gap_extension>
|
|
3395
|
|
3396 =cut
|
|
3397
|
|
3398 #-----------------
|
|
3399 sub gap_creation {
|
|
3400 #-----------------
|
|
3401 my $self = shift;
|
|
3402 $self->{'_gapCreation'} || $Blast->{'_gapCreation'};
|
|
3403 }
|
|
3404
|
|
3405 =head2 gap_extension
|
|
3406
|
|
3407 Usage : $blast_object->gap_extension();
|
|
3408 Purpose : Get the gap extension penalty used for a gapped Blast analysis.
|
|
3409 : This is extracted from the report.
|
|
3410 Argument : n/a
|
|
3411 Returns : integer or undef if not defined.
|
|
3412
|
|
3413 See Also : L<gap_extension()|gap_extension>
|
|
3414
|
|
3415 =cut
|
|
3416
|
|
3417 #-------------------
|
|
3418 sub gap_extension {
|
|
3419 #-------------------
|
|
3420 my $self = shift;
|
|
3421 $self->{'_gapExtension'} || $Blast->{'_gapExtension'};
|
|
3422 }
|
|
3423
|
|
3424 =head2 ambiguous_aln
|
|
3425
|
|
3426 Usage : $blast_object->ambiguous_aln();
|
|
3427 Purpose : Test all hits and determine if any have an ambiguous alignment.
|
|
3428 Example : print "ambiguous" if $blast->ambiguous_aln();
|
|
3429 Returns : Boolean (true if ANY significant hit has an ambiguous alignment)
|
|
3430 Argument : n/a
|
|
3431 Throws : n/a
|
|
3432 Status : Experimental
|
|
3433 Comments : An ambiguous BLAST alignment is defined as one where two or more
|
|
3434 : different HSPs have significantly overlapping sequences such
|
|
3435 : that it is not possible to create a unique alignment
|
|
3436 : by simply concatenating HSPs. This may indicate the presence
|
|
3437 : of multiple domains in one sequence relative to another.
|
|
3438 : This method only indicates the presence of ambiguity in at
|
|
3439 : least one significant hit. To determine the nature of the
|
|
3440 : ambiguity, each hit must be examined.
|
|
3441
|
|
3442 See Also : B<Bio::Tools::Blast::Sbjct::ambiguous_aln()>,L<Links to related modules>
|
|
3443
|
|
3444 =cut
|
|
3445
|
|
3446 #----------------
|
|
3447 sub ambiguous_aln {
|
|
3448 #----------------
|
|
3449 my $self = shift;
|
|
3450 foreach($self->hits()) {
|
|
3451 return 1 if ($_->ambiguous_aln() ne '-');
|
|
3452 }
|
|
3453 0;
|
|
3454 }
|
|
3455
|
|
3456 =head2 overlap
|
|
3457
|
|
3458 Usage : $blast_object->overlap([integer]);
|
|
3459 Purpose : Set/Get the number of overlapping residues allowed when tiling multiple HSPs.
|
|
3460 : Delegates to Bio::Tools::Blast::Sbjct::overlap().
|
|
3461 Throws : Exception if there are no significant hits.
|
|
3462 Status : Experimental
|
|
3463
|
|
3464 See Also : B<Bio::Tools::Blast::Sbjct::overlap()>,L<Links to related modules>
|
|
3465
|
|
3466 =cut
|
|
3467
|
|
3468 #------------
|
|
3469 sub overlap {
|
|
3470 #------------
|
|
3471 my $self = shift;
|
|
3472 if(not $self->hits) {
|
|
3473 $self->throw("Can't get overlap data without significant hits.");
|
|
3474 }
|
|
3475 $self->hit->overlap();
|
|
3476 }
|
|
3477
|
|
3478 =head2 homol_data
|
|
3479
|
|
3480 Usage : @data = $blast_object->homo_data( %named_params );
|
|
3481 Purpose : Gets specific similarity data about each significant hit.
|
|
3482 Returns : Array of strings:
|
|
3483 : "Homology data" for each HSP is in the format:
|
|
3484 : "<integer> <start> <stop>"
|
|
3485 : Data for different HSPs are tab-delimited.
|
|
3486 Argument : named parameters passed along to the hit objects.
|
|
3487 Throws : n/a
|
|
3488 Status : Experimental
|
|
3489 Comments : This is a very experimental method used for obtaining an
|
|
3490 : indication of:
|
|
3491 : 1) how many HSPs are in a Blast alignment
|
|
3492 : 2) how strong the similarity is between sequences in the HSP
|
|
3493 : 3) the endpoints of the alignment (sequence monomer numbers)
|
|
3494
|
|
3495 See Also : B<Bio::Tools::Blast::Sbjct::homol_data()>,L<Links to related modules>
|
|
3496
|
|
3497 =cut
|
|
3498
|
|
3499 #----------------
|
|
3500 sub homol_data {
|
|
3501 #----------------
|
|
3502
|
|
3503 my ($self, %param) = @_;
|
|
3504 my @hits = $self->hits();
|
|
3505 my @data = ();
|
|
3506
|
|
3507 ## Note: Homology data can be either for the query sequence or the hit
|
|
3508 ## (Sbjct) sequence. Default is for sbjct. This is specifyable via
|
|
3509 ## $param{-SEQ}='sbjct' || 'query'.
|
|
3510
|
|
3511 foreach ( @hits ) {
|
|
3512 push @data, $_->homol_data(%param);
|
|
3513 }
|
|
3514 @data;
|
|
3515 }
|
|
3516
|
|
3517 =head1 REPORT GENERATING METHODS
|
|
3518
|
|
3519 =head2 table
|
|
3520
|
|
3521 Usage : $blast_obj->table( [get_desc]);
|
|
3522 Purpose : Output data for each HSP of each hit in tab-delimited format.
|
|
3523 Example : print $blast->table;
|
|
3524 : print $blast->table(0);
|
|
3525 : # Call table_labels() to print labels.
|
|
3526 Argument : get_desc = boolean, if false the description of each hit is not included.
|
|
3527 : Default: true (if not defined, include description column).
|
|
3528 Returns : String containing tab-delimited set of data for each HSP
|
|
3529 : of each significant hit. Different HSPs are separated by newlines.
|
|
3530 : Left-to-Right order of fields:
|
|
3531 : 1 QUERY_NAME # Sequence identifier of the query.
|
|
3532 : 2 QUERY_LENGTH # Full length of the query sequence.
|
|
3533 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit".
|
|
3534 : 4 SBJCT_LENGTH # Full length of the sbjct sequence.
|
|
3535 : 5 EXPECT # Expect value for the alignment.
|
|
3536 : 6 SCORE # Blast score for the alignment.
|
|
3537 : 7 BITS # Bit score for the alignment.
|
|
3538 : 8 NUM_HSPS # Number of HSPs (not the "N" value).
|
|
3539 : 9 HSP_FRAC_IDENTICAL # fraction of identical substitutions.
|
|
3540 : 10 HSP_FRAC_CONSERVED # fraction of conserved ("positive") substitutions.
|
|
3541 : 11 HSP_QUERY_ALN_LENGTH # Length of the aligned portion of the query sequence.
|
|
3542 : 12 HSP_SBJCT_ALN_LENGTH # Length of the aligned portion of the sbjct sequence.
|
|
3543 : 13 HSP_QUERY_GAPS # Number of gaps in the aligned query sequence.
|
|
3544 : 14 HSP_SBJCT_GAPS # Number of gaps in the aligned sbjct sequence.
|
|
3545 : 15 HSP_QUERY_START # Starting coordinate of the query sequence.
|
|
3546 : 16 HSP_QUERY_END # Ending coordinate of the query sequence.
|
|
3547 : 17 HSP_SBJCT_START # Starting coordinate of the sbjct sequence.
|
|
3548 : 18 HSP_SBJCT_END # Ending coordinate of the sbjct sequence.
|
|
3549 : 19 HSP_QUERY_STRAND # Strand of the query sequence (TBLASTN/X only)
|
|
3550 : 20 HSP_SBJCT_STRAND # Strand of the sbjct sequence (TBLASTN/X only)
|
|
3551 : 21 HSP_FRAME # Frame for the sbjct translation (TBLASTN/X only)
|
|
3552 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from
|
|
3553 : # the alignment section.
|
|
3554 Throws : n/a
|
|
3555 Comments : This method does not collect data based on tiling of the HSPs.
|
|
3556 : The table will contains redundant information since the hit name,
|
|
3557 : id, and other info for the hit are listed for each HSP.
|
|
3558 : If you need more flexibility in the output format than this
|
|
3559 : method provides, design a custom function.
|
|
3560
|
|
3561 See Also : L<table_tiled()|table_tiled>, L<table_labels()|table_labels>, L<_display_hits()|_display_hits>
|
|
3562
|
|
3563 =cut
|
|
3564
|
|
3565 #-----------
|
|
3566 sub table {
|
|
3567 #-----------
|
|
3568 my ($self, $get_desc) = @_;
|
|
3569 my $str = '';
|
|
3570
|
|
3571 $get_desc = defined($get_desc) ? $get_desc : 1;
|
|
3572 # $str .= $self->_table_labels($get_desc) unless $self->{'_labels'};
|
|
3573
|
|
3574 my $sigfmt = $self->signif_fmt();
|
|
3575 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table.
|
|
3576 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e';
|
|
3577
|
|
3578 my ($hit, $hsp);
|
|
3579 foreach $hit($self->hits) {
|
|
3580 foreach $hsp($hit->hsps) {
|
|
3581 # Note: range() returns a 2-element list.
|
|
3582 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s$Newline",
|
|
3583 $self->name, $self->length, $hit->name, $hit->length,
|
|
3584 $hit->expect($sigfmt), $hit->score, $hit->bits,
|
|
3585 $hit->num_hsps, $hsp->frac_identical, $hsp->frac_conserved,
|
|
3586 $hsp->length('query'), $hsp->length('sbjct'),
|
|
3587 $hsp->gaps('list'),
|
|
3588 $hsp->range('query'), $hsp->range('sbjct'),
|
|
3589 $hsp->strand('query'), $hsp->strand('sbjct'), $hsp->frame,
|
|
3590 ($get_desc ? $hit->desc : '');
|
|
3591 }
|
|
3592 }
|
|
3593 $str =~ s/\t$Newline/$Newline/gs;
|
|
3594 $str;
|
|
3595 }
|
|
3596
|
|
3597 =head2 table_labels
|
|
3598
|
|
3599 Usage : print $blast_obj->table_labels( [get_desc] );
|
|
3600 Purpose : Get column labels for table().
|
|
3601 Returns : String containing column labels. Tab-delimited.
|
|
3602 Argument : get_desc = boolean, if false the description column is not included.
|
|
3603 : Default: true (if not defined, include description column).
|
|
3604 Throws : n/a
|
|
3605
|
|
3606 See Also : L<table()|table>
|
|
3607
|
|
3608 =cut
|
|
3609
|
|
3610 #----------------
|
|
3611 sub table_labels {
|
|
3612 #----------------
|
|
3613 my ($self, $get_desc) = @_;
|
|
3614 $get_desc = defined($get_desc) ? $get_desc : 1;
|
|
3615 my $descstr = $get_desc ? 'DESC' : '';
|
|
3616 my $descln = $get_desc ? '-----' : '';
|
|
3617
|
|
3618 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
|
|
3619 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', 'HSPS',
|
|
3620 'IDEN', 'CONSV', 'Q_ALN', 'S_ALN', 'Q_GAP', 'S_GAP',
|
|
3621 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', 'Q_STR', 'S_STR', 'FRAM', $descstr;
|
|
3622 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
|
|
3623 '-----', '-----', '-----', '-----', '-----', '-----', '-----', '-----',
|
|
3624 '-----', '-----', '-----', '-----', '-----', '-----',
|
|
3625 '-----', '-----', '-----','-----', '-----', '-----','-----', $descln;
|
|
3626
|
|
3627 $self->{'_labels'} = 1;
|
|
3628 $str =~ s/\t$Newline/$Newline/gs;
|
|
3629 $str;
|
|
3630 }
|
|
3631
|
|
3632 =head2 table_tiled
|
|
3633
|
|
3634 Purpose : Get data from tiled HSPs in tab-delimited format.
|
|
3635 : Allows only minimal flexibility in the output format.
|
|
3636 : If you need more flexibility, design a custom function.
|
|
3637 Usage : $blast_obj->table_tiled( [get_desc]);
|
|
3638 Example : print $blast->table_tiled;
|
|
3639 : print $blast->table_tiled(0);
|
|
3640 : # Call table_labels_tiled() if you want labels.
|
|
3641 Argument : get_desc = boolean, if false the description of each hit is not included.
|
|
3642 : Default: true (include description).
|
|
3643 Returns : String containing tab-delimited set of data for each HSP
|
|
3644 : of each significant hit. Multiple hits are separated by newlines.
|
|
3645 : Left-to-Right order of fields:
|
|
3646 : 1 QUERY_NAME # Sequence identifier of the query.
|
|
3647 : 2 QUERY_LENGTH # Full length of the query sequence.
|
|
3648 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit".
|
|
3649 : 4 SBJCT_LENGTH # Full length of the sbjct sequence.
|
|
3650 : 5 EXPECT # Expect value for the alignment.
|
|
3651 : 6 SCORE # Blast score for the alignment.
|
|
3652 : 7 BITS # Bit score for the alignment.
|
|
3653 : 8 NUM_HSPS # Number of HSPs (not the "N" value).
|
|
3654 : 9 FRAC_IDENTICAL* # fraction of identical substitutions.
|
|
3655 : 10 FRAC_CONSERVED* # fraction of conserved ("positive") substitutions .
|
|
3656 : 11 FRAC_ALN_QUERY* # fraction of the query sequence that is aligned.
|
|
3657 : 12 FRAC_ALN_SBJCT* # fraction of the sbjct sequence that is aligned.
|
|
3658 : 13 QUERY_ALN_LENGTH* # Length of the aligned portion of the query sequence.
|
|
3659 : 14 SBJCT_ALN_LENGTH* # Length of the aligned portion of the sbjct sequence.
|
|
3660 : 15 QUERY_GAPS* # Number of gaps in the aligned query sequence.
|
|
3661 : 16 SBJCT_GAPS* # Number of gaps in the aligned sbjct sequence.
|
|
3662 : 17 QUERY_START* # Starting coordinate of the query sequence.
|
|
3663 : 18 QUERY_END* # Ending coordinate of the query sequence.
|
|
3664 : 19 SBJCT_START* # Starting coordinate of the sbjct sequence.
|
|
3665 : 20 SBJCT_END* # Ending coordinate of the sbjct sequence.
|
|
3666 : 21 AMBIGUOUS_ALN # Ambiguous alignment indicator ('qs', 'q', 's').
|
|
3667 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from
|
|
3668 : # the alignment section.
|
|
3669 :
|
|
3670 : * Items marked with a "*" report data summed across all HSPs
|
|
3671 : after tiling them to avoid counting data from overlapping regions
|
|
3672 : multiple times.
|
|
3673 Throws : n/a
|
|
3674 Comments : This function relies on tiling of the HSPs since it calls
|
|
3675 : frac_identical() etc. on the hit as opposed to each HSP individually.
|
|
3676
|
|
3677 See Also : L<table()|table>, L<table_labels_tiled()|table_labels_tiled>, B<Bio::Tools::Blast::Sbjct::"HSP Tiling and Ambiguous Alignments">, L<Links to related modules>
|
|
3678
|
|
3679 =cut
|
|
3680
|
|
3681 #----------------
|
|
3682 sub table_tiled {
|
|
3683 #----------------
|
|
3684 my ($self, $get_desc) = @_;
|
|
3685 my $str = '';
|
|
3686
|
|
3687 $get_desc = defined($get_desc) ? $get_desc : 1;
|
|
3688
|
|
3689 my ($hit);
|
|
3690 my $sigfmt = $self->signif_fmt();
|
|
3691 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table.
|
|
3692 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e';
|
|
3693
|
|
3694 foreach $hit($self->hits) {
|
|
3695 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s$Newline",
|
|
3696 $self->name, $self->length, $hit->name, $hit->length,
|
|
3697 $hit->expect($sigfmt), $hit->score, $hit->bits,
|
|
3698 $hit->num_hsps, $hit->frac_identical, $hit->frac_conserved,
|
|
3699 $hit->frac_aligned_query, $hit->frac_aligned_hit,
|
|
3700 $hit->length_aln('query'), $hit->length_aln('sbjct'),
|
|
3701 $hit->gaps('list'), $hit->range('query'), $hit->range('sbjct'),
|
|
3702 $hit->ambiguous_aln, ($get_desc ? $hit->desc : '');
|
|
3703 }
|
|
3704 $str =~ s/\t$Newline/$Newline/gs;
|
|
3705 $str;
|
|
3706 }
|
|
3707
|
|
3708 =head2 table_labels_tiled
|
|
3709
|
|
3710 Usage : print $blast_obj->table_labels_tiled( [get_desc] );
|
|
3711 Purpose : Get column labels for table_tiled().
|
|
3712 Returns : String containing column labels. Tab-delimited.
|
|
3713 Argument : get_desc = boolean, if false the description column is not included.
|
|
3714 : Default: true (include description column).
|
|
3715 Throws : n/a
|
|
3716
|
|
3717 See Also : L<table_tiled()|table_tiled>
|
|
3718
|
|
3719 =cut
|
|
3720
|
|
3721 #---------------------
|
|
3722 sub table_labels_tiled {
|
|
3723 #---------------------
|
|
3724 my ($self, $get_desc) = @_;
|
|
3725 my $descstr = $get_desc ? 'DESC' : '';
|
|
3726 my $descln = $get_desc ? '-----' : '';
|
|
3727
|
|
3728 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
|
|
3729 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS',
|
|
3730 'HSPS', 'FR_ID', 'FR_CN', 'FR_ALQ', 'FR_ALS', 'Q_ALN',
|
|
3731 'S_ALN', 'Q_GAP', 'S_GAP', 'Q_BEG', 'Q_END', 'S_BEG', 'S_END',
|
|
3732 'AMBIG', $descstr;
|
|
3733 $str =~ s/\t$Newline/$Newline/;
|
|
3734 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
|
|
3735 '-----', '-----', '------', '-----', '-----','-----', '-----',
|
|
3736 '-----', '-----', '-----', '-----', '-----', '-----',
|
|
3737 '-----', '-----', '-----','-----','-----', '-----',
|
|
3738 '-----','-----', $descln;
|
|
3739
|
|
3740 $self->{'_labels_tiled'} = 1;
|
|
3741 $str =~ s/\t$Newline/$Newline/gs;
|
|
3742 $str;
|
|
3743 }
|
|
3744
|
|
3745 =head2 display
|
|
3746
|
|
3747 Usage : $blast_object->display( %named_parameters );
|
|
3748 Purpose : Display information about Bio::Tools::Blast.pm data members,
|
|
3749 : E.g., parameters of the report, data for each hit., etc.
|
|
3750 : Overrides Bio::Root::Object::display().
|
|
3751 Example : $object->display(-SHOW=>'stats');
|
|
3752 : $object->display(-SHOW=>'stats,hits');
|
|
3753 Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE)
|
|
3754 : -SHOW => 'file' | 'hits' | 'homol'
|
|
3755 : -WHERE => filehandle (default = STDOUT)
|
|
3756 Returns : n/a (print/printf is called)
|
|
3757 Status : Experimental
|
|
3758 Comments : For tab-delimited output, see table().
|
|
3759
|
|
3760 See Also : L<_display_homol()|_display_homol>, L<_display_hits()|_display_hits>, L<_display_stats()|_display_stats>, L<table()|table>, B<Bio::Root::Tools::SeqAnal::display()>,L<Links to related modules>,
|
|
3761
|
|
3762 =cut
|
|
3763
|
|
3764 #--------------
|
|
3765 sub display {
|
|
3766 #--------------
|
|
3767 my( $self, %param ) = @_;
|
|
3768
|
|
3769 $self->SUPER::display(%param);
|
|
3770 my $OUT = $self->fh();
|
|
3771
|
|
3772 $self->show =~ /homol/i and $self->_display_homol($OUT);
|
|
3773 $self->show =~ /hits/i and $self->_display_hits( %param );
|
|
3774 1;
|
|
3775 }
|
|
3776
|
|
3777 =head2 _display_homol
|
|
3778
|
|
3779 Usage : n/a; called automatically by display()
|
|
3780 Purpose : Print homology data for hits in the BLAST report.
|
|
3781 Example : n/a
|
|
3782 Argument : one argument = filehandle object.
|
|
3783 Returns : printf call.
|
|
3784 Status : Experimental
|
|
3785
|
|
3786 See Also : L<homol_data()|homol_data>, L<display()|display>
|
|
3787
|
|
3788 =cut
|
|
3789
|
|
3790 #-------------------
|
|
3791 sub _display_homol {
|
|
3792 #-------------------
|
|
3793 my( $self, $OUT ) = @_;
|
|
3794
|
|
3795 print $OUT "${Newline}BLAST HOMOLOGY DATA FOR: ${\$self->name()}$Newline";
|
|
3796 print $OUT '-'x40,"$Newline";
|
|
3797
|
|
3798 foreach ( $self->homol_data()) {
|
|
3799 print $OUT "$_$Newline";
|
|
3800 }
|
|
3801 }
|
|
3802
|
|
3803 =head2 _display_stats
|
|
3804
|
|
3805 Usage : n/a; called automatically by display()
|
|
3806 Purpose : Display information about the Blast report "meta" data.
|
|
3807 : Overrides Bio::Tools::SeqAnal::_display_stats() calling it first.
|
|
3808 Example : n/a
|
|
3809 Argument : one argument = filehandle object.
|
|
3810 Returns : printf call.
|
|
3811 Status : Experimental
|
|
3812
|
|
3813 See Also : L<display()|display>, B<Bio::Tools::SeqAnal::_display_stats()>,L<Links to related modules>
|
|
3814
|
|
3815 =cut
|
|
3816
|
|
3817 #--------------------
|
|
3818 sub _display_stats {
|
|
3819 #--------------------
|
|
3820 my( $self, $OUT ) = @_;
|
|
3821
|
|
3822 $self->SUPER::_display_stats($OUT);
|
|
3823 printf( $OUT "%-15s: %s$Newline", "GAPPED", $self->gapped ? 'YES' : 'NO');
|
|
3824 printf( $OUT "%-15s: %d$Newline", "TOTAL HITS", $self->num_hits('total'));
|
|
3825 printf( $OUT "%-15s: %s$Newline", "CHECKED ALL", $Blast->{'_check_all'} ? 'YES' : 'NO');
|
|
3826 printf( $OUT "%-15s: %s$Newline", "FILT FUNC", $Blast->{'_filt_func'} ? 'YES' : 'NO');
|
|
3827 if($self->min_length) {
|
|
3828 printf( $OUT "%-15s: Length >= %s$Newline", "MIN_LENGTH", $self->min_length);
|
|
3829 }
|
|
3830
|
|
3831 my $num_hits = $self->num_hits;
|
|
3832 my $signif_str = ($self->_layout == 1) ? 'P' : 'EXPECT';
|
|
3833
|
|
3834 printf( $OUT "%-15s: %d$Newline", "SIGNIF HITS", $num_hits);
|
|
3835 # Blast1: signif = P-value, Blast2: signif = Expect value.
|
|
3836
|
|
3837 printf( $OUT "%-15s: %s ($signif_str-VALUE)$Newline", "SIGNIF CUTOFF", $self->signif);
|
|
3838 printf( $OUT "%-15s: %s$Newline", "LOWEST $signif_str", $self->lowest_signif());
|
|
3839 printf( $OUT "%-15s: %s$Newline", "HIGHEST $signif_str", $self->highest_signif());
|
|
3840
|
|
3841 printf( $OUT "%-15s: %s (OVERALL)$Newline", "HIGHEST $signif_str", $self->highest_signif('overall'));
|
|
3842
|
|
3843 if($Blast->_get_stats) {
|
|
3844 my $warn = ($Blast->{'_share'}) ? '(SHARED STATS)' : '';
|
|
3845 printf( $OUT "%-15s: %s$Newline", "MATRIX", $self->matrix() || 'UNKNOWN');
|
|
3846 printf( $OUT "%-15s: %s$Newline", "FILTER", $self->filter() || 'UNKNOWN');
|
|
3847 printf( $OUT "%-15s: %s$Newline", "EXPECT", $self->expect() || 'UNKNOWN');
|
|
3848 printf( $OUT "%-15s: %s, %s, %s %s$Newline", "LAMBDA, K, H", $self->karlin_altschul(), $warn);
|
|
3849 printf( $OUT "%-15s: %s$Newline", "WORD SIZE", $self->word_size() || 'UNKNOWN');
|
|
3850 printf( $OUT "%-15s: %s %s$Newline", "S", $self->s() || 'UNKNOWN', $warn);
|
|
3851 if($self->gapped) {
|
|
3852 printf( $OUT "%-15s: %s$Newline", "GAP CREATION", $self->gap_creation() || 'UNKNOWN');
|
|
3853 printf( $OUT "%-15s: %s$Newline", "GAP EXTENSION", $self->gap_extension() || 'UNKNOWN');
|
|
3854 }
|
|
3855 }
|
|
3856 print $OUT "$Newline";
|
|
3857 }
|
|
3858
|
|
3859 =head2 _display_hits
|
|
3860
|
|
3861 Usage : n/a; called automatically by display()
|
|
3862 Purpose : Display data for each hit. Not tab-delimited.
|
|
3863 Example : n/a
|
|
3864 Argument : one argument = filehandle object.
|
|
3865 Returns : printf call.
|
|
3866 Status : Experimental
|
|
3867 Comments : For tab-delimited output, see table().
|
|
3868
|
|
3869 See Also : L<display()|display>, B<Bio::Tools::Blast::Sbjct::display()>, L<table()|table>, L<Links to related modules>
|
|
3870
|
|
3871 =cut
|
|
3872
|
|
3873 sub _display_hits {
|
|
3874
|
|
3875 my( $self, %param ) = @_;
|
|
3876 my $OUT = $self->fh();
|
|
3877 my @hits = $self->hits();
|
|
3878
|
|
3879 ## You need a wide screen to see this properly.
|
|
3880 # Header.
|
|
3881 print $OUT "${Newline}BLAST HITS FOR: ${\$self->name()} length = ${\$self->length}$Newline";
|
|
3882 print "(This table requires a wide display.)$Newline";
|
|
3883 print $OUT '-'x80,"$Newline";
|
|
3884
|
|
3885 print $self->table_labels_tiled(0);
|
|
3886 print $self->table_tiled(0);
|
|
3887
|
|
3888 ## Doing this interactively since there is potentially a lot of data here.
|
|
3889 ## Not quite satisfied with this approach.
|
|
3890
|
|
3891 if (not $param{-INTERACTIVE}) {
|
|
3892 return 1;
|
|
3893 } else {
|
|
3894 my ($reply);
|
|
3895 print "${Newline}DISPLAY FULL HSP DATA? (y/n): [n] ";
|
|
3896 chomp( $reply = <STDIN> );
|
|
3897 $reply =~ /^y.*/i;
|
|
3898
|
|
3899 my $count = 0;
|
|
3900 foreach ( @hits ) {
|
|
3901 $count++;
|
|
3902 print $OUT "$Newline$Newline",'-'x80,"$Newline";
|
|
3903 print $OUT "HSP DATA FOR HIT #$count (hit <RETURN>)";
|
|
3904 print $OUT "$Newline",'-'x80;<STDIN>;
|
|
3905 $param{-SHOW} = 'hsp';
|
|
3906 $_->display( %param );
|
|
3907 }
|
|
3908 }
|
|
3909 1;
|
|
3910 }
|
|
3911
|
|
3912 =head2 to_html
|
|
3913
|
|
3914 Usage : $blast_object->to_html( [%named_parameters] )
|
|
3915 Purpose : To produce an HTML-formatted version of a BLAST report
|
|
3916 : for efficient navigation of the report using a web browser.
|
|
3917 Example : # Using the static Blast object:
|
|
3918 : # Can read from STDIN or from a designated file:
|
|
3919 : $Blast->to_html($file);
|
|
3920 : $Blast->to_html(-FILE=>$file, -HEADER=>$header);
|
|
3921 : (if no file is supplied, STDIN will be used).
|
|
3922 : # saving HTML to an array:
|
|
3923 : $Blast->to_html(-FILE=>$file, -OUT =>\@out);
|
|
3924 : # Using a pre-existing blast object (must have been built from
|
|
3925 : # a file, not STDIN:
|
|
3926 : $blastObj->to_html();
|
|
3927 Returns : n/a, either prints report to STDOUT or saves to a supplied array
|
|
3928 : if an '-OUT' parameter is defined (see below).
|
|
3929 Argument : %named_parameters: (TAGS ARE AND CASE INSENSITIVE).
|
|
3930 : -FILE => string containing name of a file to be processed.
|
|
3931 : If not a valid file or undefined, STDIN will be used.
|
|
3932 : Can skip the -FILE tag if supplying a filename
|
|
3933 : as a single argument.
|
|
3934 : -HEADER => string
|
|
3935 : This should be an HTML-formatted string to be used
|
|
3936 : as a header for the page, typically describing query sequence,
|
|
3937 : database searched, the date of the analysis, and any
|
|
3938 : additional links.
|
|
3939 : If not supplied, no special header is used.
|
|
3940 : Regardless of whether a header is supplied, the
|
|
3941 : standard info at the top of the report is highlighted.
|
|
3942 : This should include the <HEADER></HEADER> section
|
|
3943 : of the page as well.
|
|
3944 :
|
|
3945 : -IN => array reference containing a raw Blast report.
|
|
3946 : each line in a separate element in the array.
|
|
3947 : If -IN is not supplied, read() is called
|
|
3948 : and data is then read either from STDIN or a file.
|
|
3949 :
|
|
3950 : -OUT => array reference to hold the HTML output.
|
|
3951 : If not supplied, output is sent to STDOUT.
|
|
3952 Throws : Exception is propagated from $HTML::get_html_func()
|
|
3953 : and Bio::Root::Object::read().
|
|
3954 Comments : The code that does the actual work is located in
|
|
3955 : Bio::Tools::Blast::HTML::get_html_func().
|
|
3956 Bugs : Some hypertext links to external databases may not be
|
|
3957 : correct. This due in part to the dynamic nature of
|
|
3958 : the web.
|
|
3959 : Hypertext links are not added to hits without database ids.
|
|
3960 TODO : Possibly create a function to produce fancy default header
|
|
3961 : using data extracted from the report (requires some parsing).
|
|
3962 : For example, it would be nice to always include a date
|
|
3963
|
|
3964 See Also : B<Bio::Tools::Blast::HTML::get_html_func()>, B<Bio::Root::Object::read()>, L<Links to related modules>
|
|
3965
|
|
3966 =cut
|
|
3967
|
|
3968 #------------
|
|
3969 sub to_html {
|
|
3970 #------------
|
|
3971 my ($self, @param) = @_;
|
|
3972
|
|
3973 # Permits syntax such as: $blast->to_html($filename);
|
|
3974 my ($file, $header_html, $in_aref, $out_aref) =
|
|
3975 $self->_rearrange([qw(FILE HEADER IN OUT)], @param);
|
|
3976
|
|
3977 $self->file($file) if $file;
|
|
3978
|
|
3979 # Only setting the newline character once for efficiency.
|
|
3980 $Newline ||= $Util->get_newline(-client => $self, @param);
|
|
3981
|
|
3982 $header_html ||= '';
|
|
3983 (ref($out_aref) eq 'ARRAY') ? push(@$out_aref, $header_html) : print "$header_html$Newline";
|
|
3984
|
|
3985 require Bio::Tools::Blast::HTML;
|
|
3986 Bio::Tools::Blast::HTML->import(qw(&get_html_func));
|
|
3987
|
|
3988 my ($func);
|
|
3989 eval{ $func = &get_html_func($out_aref); };
|
|
3990 if($@) {
|
|
3991 my $err = $@;
|
|
3992 $self->throw($err);
|
|
3993 }
|
|
3994
|
|
3995 eval {
|
|
3996 if(!$header_html) {
|
|
3997 $out_aref ? push(@$out_aref, "<html><body>$Newline") : print "<html><body>$Newline";
|
|
3998 }
|
|
3999
|
|
4000 if (ref ($in_aref) =~ /ARRAY/) {
|
|
4001 # If data is being supplied, process it.
|
|
4002 foreach(@$in_aref) {
|
|
4003 &$func($_);
|
|
4004 }
|
|
4005 } else {
|
|
4006 # Otherwise, read it, processing as we go.
|
|
4007
|
|
4008 $self->read(-FUNC => $func, @param);
|
|
4009 }
|
|
4010 $out_aref ? push(@$out_aref, "$Newline</pre></body></html>") : print "$Newline</pre></body></html>";
|
|
4011 };
|
|
4012
|
|
4013 if($@) {
|
|
4014 # Check for trivial error (report already HTML formatted).
|
|
4015 if($@ =~ /HTML formatted/) {
|
|
4016 print STDERR "\a${Newline}Blast report appears to be HTML formatted already.$Newline$Newline";
|
|
4017 } else {
|
|
4018 my $err = $@;
|
|
4019 $self->throw($err);
|
|
4020 }
|
|
4021 }
|
|
4022 }
|
|
4023
|
|
4024 1;
|
|
4025 __END__
|
|
4026
|
|
4027 #####################################################################################
|
|
4028 # END OF CLASS #
|
|
4029 #####################################################################################
|
|
4030
|
|
4031 =head1 FOR DEVELOPERS ONLY
|
|
4032
|
|
4033 =head2 Data Members
|
|
4034
|
|
4035 Information about the various data members of this module is provided for those
|
|
4036 wishing to modify or understand the code. Two things to bear in mind:
|
|
4037
|
|
4038 =over 4
|
|
4039
|
|
4040 =item 1 Do NOT rely on these in any code outside of this module.
|
|
4041
|
|
4042 All data members are prefixed with an underscore to signify that they are private.
|
|
4043 Always use accessor methods. If the accessor doesn't exist or is inadequate,
|
|
4044 create or modify an accessor (and let me know, too!). (An exception to this might
|
|
4045 be for Sbjct.pm or HSP.pm which are more tightly coupled to Blast.pm and
|
|
4046 may access Blast data members directly for efficiency purposes, but probably
|
|
4047 should not).
|
|
4048
|
|
4049 =item 2 This documentation may be incomplete and out of date.
|
|
4050
|
|
4051 It is easy for these data member descriptions to become obsolete as
|
|
4052 this module is still evolving. Always double check this info and search
|
|
4053 for members not described here.
|
|
4054
|
|
4055 =back
|
|
4056
|
|
4057 An instance of Bio::Tools::Blast.pm is a blessed reference to a hash containing
|
|
4058 all or some of the following fields:
|
|
4059
|
|
4060 FIELD VALUE
|
|
4061 --------------------------------------------------------------
|
|
4062 _significance P-value or Expect value cutoff (depends on Blast version:
|
|
4063 Blast1/WU-Blast2 = P-value; Blast2 = Expect value).
|
|
4064 Values GREATER than this are deemed not significant.
|
|
4065
|
|
4066 _significant Boolean. True if the query has one or more significant hit.
|
|
4067
|
|
4068 _min_length Integer. Query sequences less than this will be skipped.
|
|
4069
|
|
4070 _confirm_significance Boolean. True if client has supplied significance criteria.
|
|
4071
|
|
4072 _gapped Boolean. True if BLAST analysis has gapping turned on.
|
|
4073
|
|
4074 _hits List of Sbjct.pm objects.
|
|
4075
|
|
4076 _num_hits Number of hits obtained from the BLAST report.
|
|
4077
|
|
4078 _num_hits_significant Number of significant based on Significant data members.
|
|
4079
|
|
4080 _highestSignif Highest P or Expect value overall (not just what is stored in _hits).
|
|
4081
|
|
4082 _lowestSignif Lowest P or Expect value overall (not just what is stored in _hits).
|
|
4083
|
|
4084 The static $Blast object has a special set of members:
|
|
4085
|
|
4086 _errs
|
|
4087 _share
|
|
4088 _stream
|
|
4089 _get_stats
|
|
4090 _gapped
|
|
4091 _filt_func
|
|
4092
|
|
4093 Miscellaneous statistical parameters:
|
|
4094 -------------------------------------
|
|
4095 _filter, _matrix, _word_size, _expect, _gapCreation, _gapExtension, _s,
|
|
4096 _lambda, _k, _h
|
|
4097
|
|
4098 INHERITED DATA MEMBERS
|
|
4099 -----------------------
|
|
4100 (See Bio::Tools::SeqAnal.pm for inherited data members.)
|
|
4101
|
|
4102 =cut
|
|
4103
|
|
4104 1;
|