comparison variant_effect_predictor/Bio/Tools/Blast.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 #----------------------------------------------------------------------------
2 # PACKAGE : Bio::Tools::Blast
3 # PURPOSE : To encapsulate code for running, parsing, and analyzing
4 # BLAST reports.
5 # AUTHOR : Steve Chervitz (sac@bioperl.org)
6 # CREATED : March 1996
7 # REVISION: $Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $
8 # STATUS : Alpha
9 #
10 # For the latest version and documentation, visit:
11 # http://bio.perl.org/Projects/Blast
12 #
13 # To generate documentation, run this module through pod2html
14 # (preferably from Perl v5.004 or better).
15 #
16 # Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved.
17 # This module is free software; you can redistribute it and/or
18 # modify it under the same terms as Perl itself.
19 #----------------------------------------------------------------------------
20
21 package Bio::Tools::Blast;
22 use strict;
23 use Exporter;
24
25 use Bio::Tools::SeqAnal;
26 use Bio::Root::Global qw(:std);
27 use Bio::Root::Utilities qw(:obj);
28
29 require 5.002;
30
31 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS
32 $ID $VERSION $Blast @Blast_programs $Revision $Newline);
33
34 @ISA = qw( Bio::Tools::SeqAnal Exporter);
35 @EXPORT = qw();
36 @EXPORT_OK = qw($VERSION $Blast);
37 %EXPORT_TAGS = ( obj => [qw($Blast)],
38 std => [qw($Blast)]);
39
40 $ID = 'Bio::Tools::Blast';
41 $VERSION = 0.09;
42 $Revision = '$Id: Blast.pm,v 1.30 2002/11/04 09:12:50 heikki Exp $'; #'
43
44 ## Static Blast object.
45 $Blast = {};
46 bless $Blast, $ID;
47 $Blast->{'_name'} = "Static Blast object";
48
49 @Blast_programs = qw(blastp blastn blastx tblastn tblastx);
50
51 use vars qw($DEFAULT_MATRIX $DEFAULT_SIGNIF);
52 my $DEFAULT_MATRIX = 'BLOSUM62';
53 my $DEFAULT_SIGNIF = 999;# Value used as significance cutoff if none supplied.
54 my $MAX_HSP_OVERLAP = 2; # Used when tiling multiple HSPs.
55
56 ## POD Documentation:
57
58 =head1 NAME
59
60 Bio::Tools::Blast - Bioperl BLAST sequence analysis object
61
62 =head1 SYNOPSIS
63
64 =head2 Parsing Blast reports
65
66 Parse an existing Blast report from file:
67
68 use Bio::Tools::Blast;
69
70 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out',
71 -parse => 1,
72 -signif => '1e-10',
73 );
74
75 Parse an existing Blast report from STDIN:
76
77 $blastObj = Bio::Tools::Blast->new( -parse => 1,
78 -signif => '1e-10',
79 );
80
81 Then send a Blast report to your script via STDIN.
82
83 Full parameters for parsing Blast reports.
84
85 %blastParam = (
86 -run => \%runParam,
87 -file => '',
88 -parse => 1,
89 -signif => 1e-5,
90 -filt_func => \&my_filter,
91 -min_len => 15,
92 -check_all_hits => 0,
93 -strict => 0,
94 -stats => 1,
95 -best => 0,
96 -share => 0,
97 -exec_func => \&process_blast,
98 -save_array => \@blast_objs, # not used if -exce_func defined.
99 );
100
101 See L<parse()|parse> for a description of parameters and see L<USAGE | USAGE> for
102 more examples including how to parse streams containing multiple Blast
103 reports L<Using the Static $Blast Object>.
104
105 See L<Memory Usage Issues> for information about how to make Blast
106 parsing be more memory efficient.
107
108 =head2 Running Blast reports
109
110 Run a new Blast2 at NCBI and then parse it:
111
112 %runParam = (
113 -method => 'remote',
114 -prog => 'blastp',
115 -database => 'swissprot',
116 -seqs => [ $seq ], # Bio::Seq.pm objects.
117 );
118
119 $blastObj = Bio::Tools::Blast->new( -run => \%runParam,
120 -parse => 1,
121 -signif => '1e-10',
122 -strict => 1,
123 );
124
125 Full parameters for running Blasts at NCBI using Webblast.pm:
126
127 %runParam = (
128 -method => 'remote',
129 -prog => 'blastp',
130 -version => 2, # BLAST2
131 -database =>'swissprot',
132 -html => 0,
133 -seqs => [ $seqObject ], # Bio::Seq.pm object(s)
134 -descr => 250,
135 -align => 250,
136 -expect => 10,
137 -gap => 'on',
138 -matrix => 'PAM250',
139 -email => undef, # don't send report via e-mail if parsing.
140 -filter => undef, # use default
141 -gap_c => undef, # use default
142 -gap_e => undef, # use default
143 -word => undef, # use default
144 -min_len => undef, # use default
145 );
146
147 See L<run()|run> and L<USAGE | USAGE> for more information about running Blasts.
148
149 =head2 HTML-formatting Blast reports
150
151 Print an HTML-formatted version of a Blast report:
152
153 use Bio::Tools::Blast qw(:obj);
154
155 $Blast->to_html($filename);
156 $Blast->to_html(-file => $filename,
157 -header => "<H1>Blast Results</H1>");
158 $Blast->to_html(-file => $filename,
159 -out => \@array); # store output
160 $Blast->to_html(); # use STDIN
161
162 Results are sent directly to STDOUT unless an C<-out =E<gt> array_ref>
163 parameter is supplied. See L<to_html()|to_html> for details.
164
165 =head1 INSTALLATION
166
167 This module is included with the central Bioperl distribution:
168
169 http://bio.perl.org/Core/Latest
170 ftp://bio.perl.org/pub/DIST
171
172 Follow the installation instructions included in the README file.
173
174 =head1 DESCRIPTION
175
176 The Bio::Tools::Blast.pm module encapsulates data and methods for
177 running, parsing, and analyzing pre-existing BLAST reports. This
178 module defines an application programming interface (API) for working
179 with Blast reports. A Blast object is constructed from raw Blast
180 output and encapsulates the Blast results which can then be accessed
181 via the interface defined by the Blast object.
182
183 The ways in which researchers use Blast data are many and varied. This
184 module attempts to be general and flexible enough to accommodate
185 different uses. The Blast module API is still at an early stage of
186 evolution and I expect it to continue to evolve as new uses for Blast
187 data are developed. Your L<FEEDBACK | FEEDBACK> is welcome.
188
189 B<FEATURES:>
190
191 =over 2
192
193 =item * Supports NCBI Blast1.x, Blast2.x, and WashU-Blast2.x, gapped
194 and ungapped.
195
196 Can parse HTML-formatted as well as non-HTML-formatted reports.
197
198 =item * Launch new Blast analyses remotely or locally.
199
200 Blast objects can be constructed directly from the results of the
201 run. See L<run()|run>.
202
203 =item * Construct Blast objects from pre-existing files or from a new run.
204
205 Build a Blast object from a single file or build multiple Blast
206 objects from an input stream containing multiple reports. See
207 L<parse()|parse>.
208
209 =item * Add hypertext links from a BLAST report.
210
211 See L<to_html()|to_html>.
212
213 =item * Generate sequence and sequence alignment objects from HSP
214 sequences.
215
216 If you have Bio::Seq.pm and Bio::UnivAln.pm installed on your system,
217 they can be used for working with high-scoring segment pair (HSP)
218 sequences in the Blast alignment. (A new version of Bio::Seq.pm is
219 included in the distribution, see L<INSTALLATION | INSTALLATION>). For more
220 information about them, see:
221
222 http://bio.perl.org/Projects/Sequence/
223 http://bio.perl.org/Projects/SeqAlign/
224
225 =back
226
227 A variety of different data can be extracted from the Blast report by
228 querying the Blast.pm object. Some basic examples are given in the
229 L<USAGE | USAGE> section. For some working scripts, see the links provided in
230 the L<the DEMO SCRIPTS section | DEMO> section.
231
232 As a part of the incipient Bioperl framework, the Bio::Tools::Blast.pm
233 module inherits from B<Bio::Tools::SeqAnal.pm>, which provides some
234 generic functionality for biological sequence analysis. See the
235 documentation for that module for details
236 (L<Links to related modules>).
237
238 =head2 The BLAST Program
239
240 BLAST (Basic Local Alignment Search Tool) is a widely used algorithm
241 for performing rapid sequence similarity searches between a single DNA
242 or protein sequence and a large dataset of sequences. BLAST analyses
243 are typically performed by dedicated remote servers, such as the ones
244 at the NCBI. Individual groups may also run the program on local
245 machines.
246
247 The Blast family includes 5 different programs:
248
249 Query Seq Database
250 ------------ ----------
251 blastp -- protein protein
252 blastn -- nucleotide nucleotide
253 blastx -- nucleotide* protein
254 tblastn -- protein nucleotide*
255 tblastx -- nucleotide* nucleotide*
256
257 * = dynamically translated in all reading frames, both strands
258
259 See L<References & Information about the BLAST program>.
260
261 =head2 Versions Supported
262
263 BLAST reports generated by different application front ends are similar
264 but not exactly the same. Blast reports are not intended to be exchange formats,
265 making parsing software susceptible to obsolescence. This module aims to
266 support BLAST reports generated by different implementations:
267
268 Implementation Latest version tested
269 -------------- --------------------
270 NCBI Blast1 1.4.11 [24-Nov-97]
271 NCBI Blast2 2.0.8 [Jan-5-1999]
272 WashU-BLAST2 2.0a19MP [05-Feb-1998]
273 GCG 1.4.8 [1-Feb-95]
274
275 Support for both gapped and ungapped versions is included. Currently, there
276 is only rudimentary support for PSI-BLAST in that these reports can be parsed but
277 there is no special treatment of separate iteration rounds (they are all
278 merged together).
279
280 =head2 References & Information about the BLAST program
281
282 B<WEBSITES:>
283
284 http://www.ncbi.nlm.nih.gov/BLAST/ - Homepage at NCBI
285 http://www.ncbi.nlm.nih.gov/BLAST/blast_help.html - Help manual
286 http://blast.wustl.edu/ - WashU-Blast2
287
288 B<PUBLICATIONS:> (with PubMed links)
289
290 Altschul S.F., Gish W., Miller W., Myers E.W., Lipman D.J. (1990).
291 "Basic local alignment search tool", J Mol Biol 215: 403-410.
292
293 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2231712&form=6&db=m&Dopt=r
294
295 Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer,
296 Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997).
297 "Gapped BLAST and PSI-BLAST: a new generation of protein database
298 search programs", Nucleic Acids Res. 25:3389-3402.
299
300 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=9254694&form=6&db=m&Dopt=r
301
302 Karlin, Samuel and Stephen F. Altschul (1990). Methods for
303 assessing the statistical significance of molecular sequence
304 features by using general scoring schemes. Proc. Natl. Acad.
305 Sci. USA 87:2264-68.
306
307 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=2315319&form=6&db=m&Dopt=b
308
309 Karlin, Samuel and Stephen F. Altschul (1993). Applications
310 and statistics for multiple high-scoring segments in molecu-
311 lar sequences. Proc. Natl. Acad. Sci. USA 90:5873-7.
312
313 http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?uid=8390686&form=6&db=m&Dopt=b
314
315 =head1 USAGE
316
317 =head2 Creating Blast objects
318
319 A Blast object can be constructed from the contents of a Blast report
320 using a set of named parameters that specify significance criteria for
321 parsing. The report data can be read in from an existing file
322 specified with the C<-file =E<gt> 'filename'> parameter or from a
323 STDIN stream containing potentially multiple Blast reports. If the
324 C<-file> parameter does not contain a valid filename, STDIN will be
325 used. Separate Blast objects will be created for each report in the
326 stream.
327
328 To parse the report, you must include a C<-parse =E<gt> 1> parameter
329 in addition to any other parsing parameters
330 See L<parse()|parse> for a full description of parsing parameters.
331 To run a new report and then parse it, include a C<-run =E<gt> \%runParams>
332 parameter containing a reference to a hash
333 that hold the parameters required by the L<run()|run> method.
334
335 The constructor for Blast objects is inherited from Bio::Tools::SeqAnal.pm.
336 See the B<_initialize>() method of that package for general information
337 relevant to creating Blast objects. (The B<new>() method, inherited from
338 B<Bio::Root::Object.pm>, calls B<_initialize>(). See L<Links to related modules>).
339
340 The Blast object can read compressed (gzipped) Blast report
341 files. Compression/decompression uses the gzip or compress programs
342 that are standard on Unix systems and should not require special
343 configuration. If you can't or don't want to use gzip as the file
344 compression tool, either pre-uncompress your files before parsing with
345 this module or modify B<Bio::Root::Utilities.pm> to your liking.
346
347 Blast objects can be generated either by direct instantiation as in:
348
349 use Bio::Tools::Blast;
350 $blast = new Bio::Tools::Blast (%parameters);
351
352 =head2 Using the Static $Blast Object
353
354 use Bio::Tools::Blast qw(:obj);
355
356 This exports the static $Blast object into your namespace. "Static"
357 refers to the fact that it has class scope and there is one of these
358 created when you use this module. The static $Blast object is
359 basically an empty object that is provided for convenience and is also
360 used for various internal chores.
361
362 It is exported by this module and can be used for
363 parsing and running reports as well as HTML-formatting without having
364 to first create an empty Blast object.
365
366 Using the static $Blast object for parsing a STDIN stream of Blast reports:
367
368 use Bio::Tools::Blast qw(:obj);
369
370 sub process_blast {
371 my $blastObj = shift;
372 print $blastObj->table();
373 $blastObj->destroy;
374 }
375
376 $Blast->parse( -parse => 1,
377 -signif => '1e-10',
378 -exec_func => \&process_blast,
379 );
380
381 Then pipe a stream of Blast reports into your script via STDIN. For
382 each Blast report extracted from the input stream, the parser will
383 generate a new Blast object and pass it to the function specified by
384 C<-exec_func>. The destroy() call tells Perl to free the memory
385 associated with the object, important if you are crunching through
386 many reports. This method is inherited from B<Bio::Root::Object.pm>
387 (see L<Links to related modules>). See L<parse()|parse> for a full
388 description of parameters and L<the DEMO SCRIPTS section | DEMO> for
389 additional examples.
390
391 =head2 Running Blasts
392
393 To run a Blast, create a new Blast object with a C<-run =E<gt>
394 \%runParams> parameter. Remote Blasts are performed by including a
395 C<-method =E<gt> 'remote'> parameter; local Blasts are performed by
396 including a C<-method =E<gt> 'local'> parameter. See
397 L<Running Blast reports> as well as the
398 L<the DEMO SCRIPTS section | DEMO> for examples.
399 Note that running local Blasts is not yet supported, see below.
400
401 Note that the C<-seqs =E<gt> [ $seqs ]> run parameter must contain a
402 reference to an array of B<Bio::Seq.pm> objects
403 (L<Links to related modules>). Encapsulating the sequence in an
404 object makes sequence information much easier to handle as it can
405 be supplied in a variety of formats. Bio::Seq.pm is included with
406 this distribution (L<INSTALLATION | INSTALLATION>).
407
408 Remote Blasts are implemented using the
409 B<Bio::Tools::Blast::Run::Webblast.pm> module. Local Blasts require
410 that you customize the B<Bio::Tools::Blast::Run::LocalBlast.pm>
411 module. The version of LocalBlast.pm included with this distribution
412 provides the basic framework for running local Blasts.
413 See L<Links to related modules>.
414
415 =head2 Significance screening
416
417 A C<-signif> parameter can be used to screen out all hits with
418 P-values (or Expect values) above a certain cutoff. For example, to
419 exclude all hits with Expect values above 1.0e-10: C<-signif =E<gt>
420 1e-10>. Providing a C<-signif> cutoff can speed up processing
421 tremendously, since only a small fraction of the report need be
422 parsed. This is because the C<-signif> value is used to screen hits
423 based on the data in the "Description" section of the Blast report:
424
425 For NCBI BLAST2 reports:
426
427 Score E
428 Sequences producing significant alignments: (bits) Value
429
430 sp|P31376|YAB1_YEAST HYPOTHETICAL 74.1 KD PROTEIN IN CYS3-MDM10... 957 0.0
431
432 For BLAST1 or WashU-BLAST2 reports:
433
434 Smallest
435 Sum
436 High Probability
437 Sequences producing High-scoring Segment Pairs: Score P(N) N
438
439 PDB:3PRK_E Proteinase K complexed with inhibitor ........... 504 1.8e-50 1
440
441 Thus, the C<-signif> parameter will screen based on Expect values for
442 BLAST2 reports and based on P-values for BLAST1/WashU-BLAST2 reports.
443
444 To screen based on other criteria, you can supply a C<-filt_func>
445 parameter containing a function reference that takes a
446 B<Bio::Tools::Sbjct.pm> object as an argument and returns a boolean,
447 true if the hit is to be screened out. See example below for
448 L<Screening hits using arbitrary criteria>.
449
450 =head2 Get the best hit.
451
452 $hit = $blastObj->hit;
453
454 A "hit" is contained by a B<Bio::Tools::Blast::Sbjct.pm> object.
455
456 =head2 Get the P-value or Expect value of the most significant hit.
457
458 $p = $blastObj->lowest_p;
459 $e = $blastObj->lowest_expect;
460
461 Alternatively:
462
463 $p = $blastObj->hit->p;
464 $e = $blastObj->hit->expect;
465
466 Note that P-values are not reported in NCBI Blast2 reports.
467
468 =head2 Iterate through all the hits
469
470 foreach $hit ($blastObj->hits) {
471 printf "%s\t %.1e\t %d\t %.2f\t %d\n",
472 $hit->name, $hit->expect, $hit->num_hsps,
473 $hit->frac_identical, $hit->gaps;
474 }
475
476 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm>
477 for other ways to work with hit objects (L<Links to related modules>).
478
479 =head2 Screening hits using arbitrary criteria
480
481 sub filter { $hit=shift;
482 return ($hit->gaps == 0 and
483 $hit->frac_conserved > 0.5); }
484
485 $blastObj = Bio::Tools::Blast->new( -file => '/tmp/blast.out',
486 -parse => 1,
487 -filt_func => \&filter );
488
489 While the Blast object is parsing the report, each hit checked by calling
490 &filter($hit). All hits that generate false return values from &filter
491 are screened out and will not be added to the Blast object.
492 Note that the Blast object will normally stop parsing the report after
493 the first non-significant hit or the first hit that does not pass the
494 filter function. To force the Blast object to check all hits,
495 include a C<-check_all_hits =E<gt> 1> parameter.
496 Refer to the documentation for B<Bio::Tools::Blast::Sbjct.pm>
497 for other ways to work with hit objects.
498
499 =over 4
500
501 =item Hit start, end coordinates.
502
503 print $sbjct->start('query');
504 print $sbjct->end('sbjct');
505
506 In array context, you can get information for both query and sbjct with one call:
507
508 ($qstart, $sstart) = $sbjct->start();
509 ($qend, $send) = $sbjct->end();
510
511 For important information regarding coordinate information, see
512 the L<HSP start, end, and strand> section below.
513 Also check out documentation for the start and end methods in B<Bio::Tools::Blast::Sbjct.pm>,
514 which explains what happens if there is more than one HSP.
515
516 =back
517
518 =head2 Working with HSPs
519
520 =over 4
521
522 =item Iterate through all the HSPs of every hit
523
524 foreach $hit ($blastObj->hits) {
525 foreach $hsp ($hit->hsps) {
526 printf "%.1e\t %d\t %.1f\t %.2f\t %.2f\t %d\t %d\n",
527 $hsp->expect, $hsp->score, $hsp->bits,
528 $hsp->frac_identical, $hsp->frac_conserved,
529 $hsp->gaps('query'), $hsp->gaps('sbjct');
530 }
531
532 Refer to the documentation for B<Bio::Tools::Blast::HSP.pm>
533 for other ways to work with hit objects (L<Links to related modules>).
534
535 =back
536
537 =over 4
538
539 =item Extract HSP sequence data as strings or sequence objects
540
541 Get the first HSP of the first hit and the sequences
542 of the query and sbjct as strings.
543
544 $hsp = $blast_obj->hit->hsp;
545 $query_seq = $hsp->seq_str('query');
546 $hsp_seq = $hsp->seq_str('sbjct');
547
548 Get the indices of identical and conserved positions in the HSP query seq.
549
550 @query_iden_indices = $hsp->seq_inds('query', 'identical');
551 @query_cons_indices = $hsp->seq_inds('query', 'conserved');
552
553 Similarly for the sbjct sequence.
554
555 @sbjct_iden_indices = $hsp->seq_inds('sbjct', 'identical');
556 @sbjct_cons_indices = $hsp->seq_inds('sbjct', 'conserved');
557
558 print "Query in Fasta format:\n", $hsp->seq('query')->layout('fasta');
559 print "Sbjct in Fasta format:\n", $hsp->seq('sbjct')->layout('fasta');
560
561 See the B<Bio::Seq.pm> package for more information about using these sequence objects
562 (L<Links to related modules>).
563
564 =back
565
566 =over 4
567
568 =item Create sequence alignment objects using HSP sequences
569
570 $aln = $hsp->get_aln;
571 print " consensus:\n", $aln->consensus();
572 print $hsp->get_aln->layout('fasta');
573
574 $ENV{READSEQ_DIR} = '/home/users/sac/bin/solaris';
575 $ENV{READSEQ} = 'readseq';
576 print $hsp->get_aln->layout('msf');
577
578 MSF formated layout requires Don Gilbert's ReadSeq program (not included).
579 See the B<Bio::UnivAln.pm> for more information about using these alignment objects
580 (L<Links to related modules>)'.
581
582 =back
583
584 =over 4
585
586 =item HSP start, end, and strand
587
588 To facilitate HSP processing, endpoint data for each HSP sequence are
589 normalized so that B<start is always less than end>. This affects TBLASTN
590 and TBLASTX HSPs on the reverse complement or "Minus" strand.
591
592 Some examples of obtaining start, end coordinates for HSP objects:
593
594 print $hsp->start('query');
595 print $hsp->end('sbjct');
596 ($qstart, $sstart) = $hsp->start();
597 ($qend, $send) = $hsp->end();
598
599 Strandedness of the HSP can be assessed using the strand() method
600 on the HSP object:
601
602 print $hsp->strand('query');
603 print $hsp->strand('sbjct');
604
605 These will return 'Minus' or 'Plus'.
606 Or, to get strand information for both query and sbjct with a single call:
607
608 ($qstrand, $sstrand) = $hsp->strand();
609
610 =back
611
612 =head2 Report Generation
613
614 =over 4
615
616 =item Generate a tab-delimited table of all results.
617
618 print $blastObj->table;
619 print $blastObj->table(0); # don't include hit descriptions.
620 print $blastObj->table_tiled;
621
622 The L<table()|table> method returns data for each B<HSP> of each hit listed one per
623 line. The L<table_tiled()|table_tiled> method returns data for each B<hit, i.e., Sbjct>
624 listed one per line; data from multiple HSPs are combined after tiling to
625 reduce overlaps. See B<Bio::Tools::Blast::Sbjct.pm> for more information about
626 HSP tiling. These methods generate stereotypical, tab-delimited data for each
627 hit of the Blast report. The output is suitable for importation into
628 spreadsheets or database tables. Feel free to roll your own table function if
629 you need a custom table.
630
631 For either table method, descriptions of each hit can be included if a
632 single, true argument is supplied (e.g., $blastObj-E<gt>table(1)). The description
633 will be added as the last field. This will significantly increase the size of
634 the table. Labels for the table columns can be obtained with L<table_labels()|table_labels>
635 and L<table_labels_tiled()|table_labels_tiled>.
636
637 =back
638
639 =over 4
640
641 =item Print a summary of the Blast report
642
643 $blastObj->display();
644 $blastObj->display(-show=>'hits');
645
646 L<display()|display> prints various statistics extracted from the Blast report
647 such as database name, database size, matrix used, etc. The
648 C<display(-show=E<gt>'hits')> call prints a non-tab-delimited table
649 attempting to line the data up into more readable columns. The output
650 generated is similar to L<table_tiled()|table_tiled>.
651
652 =back
653
654 =over 4
655
656 =item HTML-format an existing report
657
658 use Bio::Tools::Blast qw(:obj);
659
660 # Going straight from a non HTML report file to HTML output using
661 # the static $Blast object exported by Bio::Tools::Blast.pm
662 $Blast->to_html(-file => '/usr/people/me/blast.output.txt',
663 -header => qq|<H1>BLASTP Results</H1><A HREF="home.html">Home</A>|
664 );
665
666 # You can also use a specific Blast object created previously.
667 $blastObj->to_html;
668
669 L<to_html()|to_html> will send HTML output, line-by-line, directly to STDOUT
670 unless an C<-out =E<gt> array_ref> parameter is supplied (e.g., C<-out
671 =E<gt> \@array>), in which case the HTML will be stored in @array, one
672 line per array element. The direct outputting permits faster response
673 time since Blast reports can be huge. The -header tag can contain a
674 string containing any HTML that you want to appear at the top of the
675 Blast report.
676
677 =back
678
679 =head1 DEMO SCRIPTS
680
681 Sample Scripts are included in the central bioperl distribution in the
682 'examples/blast/' directory (see L<INSTALLATION | INSTALLATION>):
683
684 =head2 Handy library for working with Bio::Tools::Blast.pm
685
686 examples/blast/blast_config.pl
687
688 =head2 Parsing Blast reports one at a time.
689
690 examples/blast/parse_blast.pl
691 examples/blast/parse_blast2.pl
692 examples/blast/parse_positions.pl
693
694 =head2 Parsing sets of Blast reports.
695
696 examples/blast/parse_blast.pl
697 examples/blast/parse_multi.pl
698
699 B<Warning:> See note about L<Memory Usage Issues>.
700
701 =head2 Running Blast analyses one at a time.
702
703 examples/blast/run_blast_remote.pl
704
705 =head2 Running Blast analyses given a set of sequences.
706
707 examples/blast/blast_seq.pl
708
709 =head2 HTML-formatting Blast reports.
710
711 examples/blast/html.pl
712
713 =head1 TECHNICAL DETAILS
714
715 =head2 Blast Modes
716
717 A BLAST object may be created using one of three different modes as
718 defined by the B<Bio::Tools::SeqAnal.pm> package
719 (See L<Links to related modules>):
720
721 -- parse - Load a BLAST report and parse it, storing parsed data in
722 Blast.pm object.
723 -- run - Run the BLAST program to generate a new report.
724 -- read - Load a BLAST report into the Blast object without parsing.
725
726 B<Run mode support has recently been added>. The module
727 B<Bio::Tools::Blast::Run::Webblast.pm> is an modularized adaptation of
728 the webblast script by Alex Dong Li:
729
730 http://www.genet.sickkids.on.ca/bioinfo_resources/software.html#webblast
731
732 for running remote Blast analyses and saving the results locally. Run
733 mode can be combined with a parse mode to generate a Blast report and
734 then build the Blast object from the parsed results of this report
735 (see L<run()|run> and L<SYNOPSIS | SYNOPSIS>).
736
737 In read mode, the BLAST report is read in by the Blast object but is
738 not parsed. This could be used to internalize a Blast report but not
739 parse it for results (e.g., generating HTML formatted output).
740
741 =head2 Significant Hits
742
743 This module permits the screening of hits on the basis of
744 user-specified criteria for significance. Currently, Blast reports can
745 be screened based on:
746
747 CRITERIA PARAMETER VALUE
748 ---------------------------------- --------- ----------------
749 1) the best Expect (or P) value -signif float or sci-notation
750 2) the length of the query sequence -min_length integer
751 3) arbitrary criteria -filt_func function reference
752
753 The parameters are used for construction of the BLAST object or when
754 running the L<parse()|parse> method on the static $Blast object. The
755 -SIGNIF value represents the number listed in the description section
756 at the top of the Blast report. For Blast2, this is an Expect value,
757 for Blast1 and WashU-Blast2, this is a P-value. The idea behind the
758 C<-filt_func> parameter is that the hit has to pass through a filter
759 to be considered significant. Refer to the documentation for
760 B<Bio::Tools::Blast::Sbjct.pm> for ways to work with hit objects.
761
762 Using a C<-signif> parameter allows for the following:
763
764 =over 2
765
766 =item Faster parsing.
767
768 Each hit can be screened by examination of the description line alone
769 without fully parsing the HSP alignment section.
770
771 =item Flexibility.
772
773 The C<-signif> tag provides a more semantic-free way to specify the
774 value to be used as a basis for screening hits. Thus, C<-signif> can
775 be used for screening Blast1 or Blast2 reports. It is up to the user
776 to understand whether C<-signif> represents a P-value or an Expect
777 value.
778
779 =back
780
781 Any hit not meeting the significance criteria will not be added to the
782 "hit list" of the BLAST object. Also, a BLAST object without any hits
783 meeting the significance criteria will throw an exception during
784 object construction (a fatal event).
785
786 =head2 Statistical Parameters
787
788 There are numerous parameters which define the behavior of the BLAST
789 program and which are useful for interpreting the search
790 results. These parameters are extracted from the Blast report:
791
792 filter -- for masking out low-complexity sequences or short repeats
793 matrix -- name of the substitution scoring matrix (e.g., BLOSUM62)
794 E -- Expect filter (screens out frequent scores)
795 S -- Cutoff score for segment pairs
796 W -- Word length
797 T -- Threshold score for word pairs
798 Lambda, -- Karlin-Altschul "sum" statistical parameters dependent on
799 K, H sequence composition.
800 G -- Gap creation penalty.
801 E -- Gap extension penalty.
802
803 These parameters are not always needed. Extraction may be turned off
804 explicitly by including a C<-stats =E<gt> 0> parameter during object
805 construction. Support for all statistical parameters is not complete.
806
807 For more about the meaning of parameters, check out the NCBI URLs given above.
808
809 =head2 Module Organization
810
811 The modules that comprise this Bioperl Blast distribution are location in the
812 Bio:: hierarchy as shown in the diagram below.
813
814 Bio/
815 |
816 +--------------------------+
817 | |
818 Bio::Tools Bio::Root
819 | |
820 +----------------------+ Object.pm
821 | | |
822 SeqAnal.pm Blast.pm Blast/
823 |
824 +---------+---------+------------+
825 | | | |
826 Sbjct.pm HSP.pm HTML.pm Run/
827 |
828 +------------+
829 | |
830 Webblast.pm LocalBlast.pm
831
832 Bio::Tools::Blast.pm is a concrete class that inherits from
833 B<Bio::Tools::SeqAnal.pm> and relies on other modules for parsing and
834 managing BLAST data. Worth mentioning about this hierarchy is the
835 lack of a "Parse.pm" module. Since parsing is considered central to
836 the purpose of the Bioperl Blast module (and Bioperl in general), it
837 seems somewhat unnatural to segregate out all parsing code. This
838 segregation could also lead to inefficiencies and harder to maintain
839 code. I consider this issue still open for debate.
840
841 Bio::Tools::Blast.pm, B<Bio::Tools::Blast::Sbjct.pm>, and
842 B<Bio::Tools::Blast::HSP.pm> are mostly dedicated to parsing and all
843 can be used to instantiate objects. Blast.pm is the main "command and
844 control" module, inheriting some basic behaviors from SeqAnal.pm
845 (things that are not specific to Blast I<per se>).
846
847 B<Bio::Tools::Blast::HTML.pm> contains functions dedicated to
848 generating HTML-formatted Blast reports and does not generate objects.
849
850 =head2 Running Blasts: Details
851
852 B<Bio::Tools::Blast::Run::Webblast.pm> contains a set of functions for
853 running Blast analyses at a remote server and also does not
854 instantiate objects. It uses a helper script called postclient.pl,
855 located in the Run directory. The proposed LocalBlast.pm module would
856 be used for running Blast reports on local machines and thus would be
857 customizable for different sites. It would operate in a parallel
858 fashion to Webblast.pm (i.e., being a collection of functions, taking
859 in sequence objects or files, returning result files).
860
861 The Run modules are considered experimental. In particular,
862 Webblast.pm catures an HTML-formatted version of the Blast report from
863 the NCBI server and strips out the HTML in preparation for parsing. A
864 more direct approach would be to capture the Blast results directly
865 from the server using an interface to the NCBI toolkit. This approach
866 was recently proposed on the Bioperl mailing list:
867 http://www.uni-bielefeld.de/mailinglists/BCD/vsns-bcd-perl/9805/0000.html
868
869 =head2 Memory Usage Issues
870
871 Parsing large numbers of Blast reports (a few thousand or so) with
872 Bio::Tools::Blast.pm may lead to unacceptable memory usage situations.
873 This is somewhat dependent of the size and complexity of the reports.
874
875 While this problem is under investigation, here are some workarounds
876 that fix the memory usage problem:
877
878 =over 4
879
880 =item 1 Don't specify a -signif criterion when calling L<parse()|parse>.
881
882 The C<-signif> value is used for imposing a upper limit to the expect- or
883 P-value for Blast hits to be parsed. For reasons that are still under
884 investigation, specifying a value for C<-signif> in the L<parse()|parse>
885 method prevents Blast objects from being fully
886 garbage collected. When using the B<parse_blast.pl> or B<parse_multi.pl>
887 scripts in C<examples/blast/> of the bioperl distribution), don't supply
888 a C<-signif> command-line parameter.
889
890 =item 2 If you want to impose a -signif criterion, put it inside a
891 -filt_func.
892
893 For the L<parse()|parse> method, a -signif =E<gt> 1e-5 parameter is equivalent
894 to using a filter function parameter of
895
896 -filt_func => sub { my $hit = shift; return $hit->signif <= 1e-5; }
897
898 Using the B<examples/blast/parse_multi.pl> script, you can supply a
899 command-line argument of
900
901 -filt_func '$hit->signif <= 1e-5'
902
903 For more information, see L<parse()|parse> and the section
904 L<Screening hits using arbitrary criteria>.
905
906 =back
907
908 =head1 TODO
909
910 =over 4
911
912 =item * Develop a functional, prototype Bio::Tools::Blast::Run::LocalBlast.pm module.
913
914 =item * Add support for PSI-BLAST and PHI-BLAST
915
916 =item * Parse histogram of expectations and retrieve gif image in
917 Blast report (if present).
918
919 =item * Further investigate memory leak that occurs when parsing Blast
920 streams whe supplying a -signif parameter to L<parse()|parse>.
921
922 =item * Access Blast results directly from the NCBI server using a
923 Perl interface to the NCBI toolkit or XML formated Blast reports (when
924 available).
925
926 =item * Further exploit Bio::UnivAln.pm and multiple-sequence
927 alignment programs using HSP sequence data. Some of this may best go
928 into a separate, dedicated module or script as opposed to burdening
929 Blast.pm, Sbjct.pm, and HSP.pm with additional functionality that is
930 not always required.
931
932 =item * Add an example script for parsing Blast reports containing
933 HTML formatting.
934
935 =back
936
937 =head1 VERSION
938
939 Bio::Tools::Blast.pm, 0.09
940
941 =head1 FEEDBACK
942
943 =head2 Mailing Lists
944
945 User feedback is an integral part of the evolution of this and other
946 Bioperl modules. Send your comments and suggestions preferably to one
947 of the Bioperl mailing lists. Your participation is much appreciated.
948
949 bioperl-l@bioperl.org - General discussion
950 http://bio.perl.org/MailList.html - About the mailing lists
951
952 =head2 Reporting Bugs
953
954 Report bugs to the Bioperl bug tracking system to help us keep track
955 the bugs and their resolution. Bug reports can be submitted via email
956 or the web:
957
958 bioperl-bugs@bio.perl.org
959 http://bugzilla.bioperl.org/
960
961 =head1 AUTHOR
962
963 Steve Chervitz, sac@bioperl.org
964
965 See the L<FEEDBACK | FEEDBACK> section for where to send bug reports and comments.
966
967 =head1 ACKNOWLEDGEMENTS
968
969 This module was developed under the auspices of the Saccharomyces Genome
970 Database:
971 http://genome-www.stanford.edu/Saccharomyces
972
973 Other contributors include: Alex Dong Li (webblast), Chris Dagdigian
974 (Seq.pm), Steve Brenner (Seq.pm), Georg Fuellen (Seq.pm, UnivAln.pm),
975 and untold others who have offered comments (noted in the
976 Bio/Tools/Blast/CHANGES file of the distribution).
977
978 =head1 COPYRIGHT
979
980 Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved. This
981 module is free software; you can redistribute it and/or modify it
982 under the same terms as Perl itself.
983
984 =head1 SEE ALSO
985
986 Bio::Tools::SeqAnal.pm - Sequence analysis object base class.
987 Bio::Tools::Blast::Sbjct.pm - Blast hit object.
988 Bio::Tools::Blast::HSP.pm - Blast HSP object.
989 Bio::Tools::Blast::HTML.pm - Blast HTML-formating utility class.
990 Bio::Tools::Blast::Run::Webblast.pm - Utility module for running Blasts remotely.
991 Bio::Tools::Blast::Run::LocalBlast.pm - Utility module for running Blasts locally.
992 Bio::Seq.pm - Biosequence object
993 Bio::UnivAln.pm - Biosequence alignment object.
994 Bio::Root::Object.pm - Proposed base class for all Bioperl objects.
995
996 =head2 Links to related modules
997
998 Bio::Tools::SeqAnal.pm
999 http://bio.perl.org/Core/POD/Bio/Tools/SeqAnal.html
1000
1001 Bio::Tools::Blast::Sbjct.pm
1002 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Sbjct.html
1003
1004 Bio::Tools::Blast::HSP.pm
1005 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HSP.html
1006
1007 Bio::Tools::Blast::HTML.pm
1008 http://bio.perl.org/Core/POD/Bio/Tools/Blast/HTML.html
1009
1010 Bio::Tools::Blast::Run::Webblast.pm
1011 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/Webblast.html
1012
1013 Bio::Tools::Blast::Run::LocalBlast.pm
1014 http://bio.perl.org/Core/POD/Bio/Tools/Blast/Run/LocalBlast.html
1015
1016 Bio::Seq.pm
1017 http://bio.perl.org/Core/POD/Seq.html
1018
1019 Bio::UnivAln.pm
1020 http://bio.perl.org/Projects/SeqAlign/
1021 Europe: http://www.techfak.uni-bielefeld.de/bcd/Perl/Bio/#univaln
1022
1023 Bio::Root::Object.pm
1024 http://bio.perl.org/Core/POD/Root/Object.html
1025
1026 http://bio.perl.org/Projects/modules.html - Online module documentation
1027 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project
1028 http://bio.perl.org/ - Bioperl Project Homepage
1029
1030 L<References & Information about the BLAST program>.
1031
1032 =head1 KNOWN BUGS
1033
1034 There is a memory leak that occurs when parsing parsing streams
1035 containing large numbers of Blast reports (a few thousand or so) and
1036 specifying a -signif parameter to the L<parse()|parse> method. For a
1037 workaround, see L<Memory Usage Issues>.
1038
1039 Not sharing statistical parameters between different Blast objects
1040 when parsing a multi-report stream has not been completely tested and
1041 may be a little buggy.
1042
1043 Documentation inconsistencies or inaccuracies may exist since this
1044 module underwend a fair bit of re-working going from 0.75 to 0.80
1045 (corresponds to versions 0.04.4 to 0.05 of the bioperl distribution).
1046
1047 =cut
1048
1049 #
1050 ##
1051 ###
1052 #### END of main POD documentation.
1053 ###
1054 ##
1055 #
1056
1057 =head1 APPENDIX
1058
1059 Methods beginning with a leading underscore are considered private and
1060 are intended for internal use by this module. They are B<not>
1061 considered part of the public interface and are described here for
1062 documentation purposes only.
1063
1064 =cut
1065
1066 ##############################################################################
1067 ## CONSTRUCTOR ##
1068 ##############################################################################
1069
1070
1071 sub new {
1072 my ($class,@args) = @_;
1073 $class->warn("Bio::Tools::BLAST is deprecated, use Bio::SearchIO system or Bio::Tools::BPlite");
1074 return $class->SUPER::new(@args);
1075 }
1076
1077 ## The Blast.pm object relies on the the superclass constructor:
1078 ## Bio::Tools::SeqAnal::_initialize(). See that module for details.
1079
1080 #-------------
1081 sub destroy {
1082 #-------------
1083 my $self=shift;
1084 $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}";
1085 if($self->{'_hits'}) {
1086 foreach($self->hits) {
1087 $_->destroy;
1088 undef $_;
1089 }
1090 undef $self->{'_hits'};
1091 #$self->{'_hits'}->remove_all; ## When and if this member becomes a vector.
1092 }
1093
1094 $self->SUPER::destroy;
1095 }
1096
1097 #####################################################################################
1098 ## ACCESSORS ##
1099 #####################################################################################
1100
1101 =head2 run
1102
1103 Usage : $object->run( %named_parameters )
1104 Purpose : Run a local or remote Blast analysis on one or more sequences.
1105 Returns : String containing name of Blast output file if a single Blast
1106 : is run.
1107 : -- OR --
1108 : List of Blast objects if multiple Blasts are being run as a group.
1109 Argument : Named parameters: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE).
1110 : -METHOD => 'local' or 'remote' (default = remote),
1111 : -PARSE => boolean, (true if the results are to be parsed after the run)
1112 : -STRICT => boolean, the strict mode to use for the resulting Blast objects.
1113 : ADDITIONAL PARAMETERS:
1114 : See methods _run_remote() and _run_local() for required
1115 : parameters necessary for running the blast report.
1116 Throws : Exception if no Blast output file was obtained.
1117 Comments : This method is called automatically during construction of a
1118 : Blast.pm object when run parameters are sent to the constructor:
1119 : $blastObj = new Bio::Tools::Blast (-RUN =>\%runParam,
1120 : %parseParam );
1121 :
1122 : The specific run methods (local or remote) called by run()
1123 : must return a list containing the file name(s) with the Blast output.
1124 :
1125 : The run() method can perform single or multiple Blast runs
1126 : (analogous to the way parse() works) depending on how many
1127 : sequences are submitted. However, the running of multiple
1128 : Blasts is probably better handled at the script level. See notes in
1129 : the "TODO" section below.
1130 :
1131 : As for what to do with the Blast result file, that decision is
1132 : left for the user who can direct the Blast object to delete, compress,
1133 : or leave it alone.
1134 :
1135 : This method does not worry about load balancing, which
1136 : is probably best handled at the server level.
1137 :
1138 TODO: : Support for running+parsing multiple Blast analyses with a
1139 : single run() call is incomplete. One can generate multiple
1140 : reports by placing more than one sequence object in the -seqs
1141 : reference parameter. This saves some overhead in the code
1142 : that executes the Blasts since all options are configured once.
1143 : (This is analogous to parsing using the static $Blast object
1144 : see parse() and _parse_stream()).
1145 :
1146 : The trouble is that Blast objects for all runs are constructed,
1147 : parsed (if necessary), and then returned as a group
1148 : This can require lots of memory when run+parsing many Blasts
1149 : but should be fine if you just want to run a bunch Blasts.
1150 :
1151 : For now, when running+parsing Blasts, stick to running one
1152 : Blast at a time, building the Blast object with the results
1153 : of that report, and processing as necessary.
1154 :
1155 : Support for running PSI-Blast is not complete.
1156
1157 See Also: L<_run_remote()|_run_remote>, L<_run_local()|_run_local>, L<parse()|parse>
1158
1159 =cut
1160
1161 #---------
1162 sub run {
1163 #---------
1164 my ($self, %param) = @_;
1165 my($method, $parse, $strict) =
1166 $self->_rearrange([qw(METHOD PARSE STRICT)], %param);
1167
1168 $strict = $self->strict($strict) if $strict;
1169
1170 my (@files);
1171 if($method =~ /loc/i) {
1172 @files = $self->_run_local(%param);
1173
1174 } else {
1175 @files = $self->_run_remote(%param);
1176 }
1177
1178 $self->throw("Run Blast failed: no Blast output created.") if !@files;
1179
1180 if(scalar(@files) == 1) {
1181 # If there was just one Blast output file, prepare to incorporate it
1182 # into the current Blast object. run() is called before parse() in the
1183 # SeqAnal.pm constructor.
1184 if($files[0] ne 'email') {
1185 $self->file($files[0]);
1186 } else {
1187 # Can't do anything with the report.
1188 $self->throw("Blast report to be sent via e-mail.");
1189 }
1190
1191 } else {
1192 # If there are multiple report files, build individual Blast objects foreach.
1193 # In this situation, the static $Blast object is being used to run
1194 # a set of related Blasts, similar to the way parse() can be used.
1195 # This strategy is not optimal since all reports are generated first
1196 # before any are parsed.
1197 # Untested.
1198
1199 my(@objs);
1200 foreach(@files) {
1201 push @objs, new Bio::Tools::Blast(-FILE => $_,
1202 -PARSE => $parse || 0,
1203 -STRICT => $strict,
1204 );
1205 }
1206 return @objs;
1207 }
1208 }
1209
1210 =head2 _run_remote
1211
1212 Usage : n/a; internal method called by run()
1213 : $object->_run_remote( %named_parameters )
1214 Purpose : Run Blast on a remote server.
1215 Argument : Named parameters:
1216 : See documentation for function &blast_remote in
1217 : Bio::Tools::Blast::Run::Webblast.pm for description
1218 : of parameters.
1219 Comments : This method requires the Bio::Tools::Blast::Run::Webblast.pm
1220 : which conforms to this minimal API:
1221 : * export a method called &blast_remote that accepts a
1222 : Bio::Tools::Blast.pm object + named parameters
1223 : (specified in the Webblast.pm module).
1224 : * return a list of names of files containing the raw Blast reports.
1225 : (When building a Blast object, this list would contain a
1226 : single file from which the Blast object is to be constructed).
1227
1228 See Also : L<run()|run>, L<_run_local()|_run_local>, B<Bio::Tools::Blast::Run::Webblast.pm::blast_remote()>, L<Links to related modules>
1229
1230 =cut
1231
1232 #----------------
1233 sub _run_remote {
1234 #----------------
1235 my ($self, %param) = @_;
1236
1237 require Bio::Tools::Blast::Run::Webblast;
1238 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_remote));
1239
1240 &blast_remote($self, %param);
1241 }
1242
1243 =head2 _run_local
1244
1245 Usage : n/a; internal method called by run()
1246 : $object->_run_local(%named_parameters)
1247 Purpose : Run Blast on a local machine.
1248 Argument : Named parameters:
1249 : See documentation for function &blast_local in
1250 : Bio::Tools::Blast::Run::LocalBlast.pm for description
1251 : of parameters.
1252 Comments : This method requires the Bio::Tools::Blast::Run::LocalBlast.pm
1253 : module which should be customized for your site. This module would
1254 : contain all the commands, paths, environment variables, and other
1255 : data necessary to run Blast commands on a local machine, but should
1256 : not contain any semantics for specific query sequences.
1257 :
1258 : LocalBlast.pm should also conform to this minimal API:
1259 : * export a method called &blast_local that accepts a
1260 : Bio::Tools::Blast.pm object + named parameters
1261 : (specified in the LocalBlast.pm module).
1262 : * return a list of names of files containing the raw Blast reports.
1263 : (When building a Blast object, this list would contain a
1264 : single file from which the Blast object is to be constructed).
1265
1266 See Also : L<run()|run>, L<_run_remote()|_run_remote>, B<Bio::Tools::Blast::Run::LocalBlast::blast_local()>, L<Links to related modules>
1267
1268 =cut
1269
1270 #--------------
1271 sub _run_local {
1272 #--------------
1273 my ($self, %param) = @_;
1274
1275 require Bio::Tools::Blast::Run::Webblast;
1276 Bio::Tools::Blast::Run::Webblast->import(qw(&blast_local));
1277
1278 &blast_local($self, %param);
1279 }
1280
1281 =head2 db_remote
1282
1283 Usage : @dbs = $Blast->db_remote( [seq_type] );
1284 Purpose : Get a list of available sequence databases for remote Blast analysis.
1285 Returns : Array of strings
1286 Argument : seq_type = 'p' or 'n'
1287 : 'p' = Gets databases for peptide searches (default)
1288 : 'n' = Gets databases for nucleotide searches
1289 Throws : n/a
1290 Comments : Peptide databases are a subset of the nucleotide databases.
1291 : It is convenient to call this method on the static $Blast object
1292 : as shown in Usage.
1293
1294 See Also : L<db_local()|db_local>
1295
1296 =cut
1297
1298 #----------------
1299 sub db_remote {
1300 #----------------
1301 my ($self, $type) = @_;
1302 $type ||= 'p';
1303
1304 require Bio::Tools::Blast::Run::Webblast;
1305 Bio::Tools::Blast::Run::Webblast->import(qw(@Blast_dbp_remote
1306 @Blast_dbn_remote));
1307
1308 # We shouldn't have to fully qualify the Blast_dbX_remote arrays. Hm.
1309
1310 my(@dbs);
1311 if( $type =~ /^p|amino/i) {
1312 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbp_remote;
1313 } else {
1314 @dbs = @Bio::Tools::Blast::Run::Webblast::Blast_dbn_remote;
1315 }
1316 @dbs;
1317 }
1318
1319 =head2 db_local
1320
1321 Usage : @dbs = $Blast->db_local( [seq_type] );
1322 Purpose : Get a list of available sequence databases for local Blast analysis.
1323 Returns : Array of strings
1324 Argument : seq_type = 'p' or 'n'
1325 : 'p' = Gets databases for peptide searches (default)
1326 : 'n' = Gets databases for nucleotide searches
1327 Throws : n/a
1328 Comments : Peptide databases are a subset of the nucleotide databases.
1329 : It is convenient to call this method on the static $Blast object.
1330 as shown in Usage.
1331
1332 See Also : L<db_remote()|db_remote>
1333
1334 =cut
1335
1336 #----------------
1337 sub db_local {
1338 #----------------
1339 my ($self, $type) = @_;
1340 $type ||= 'p';
1341
1342 require Bio::Tools::Blast::Run::LocalBlast;
1343 Bio::Tools::Blast::Run::LocalBlast->import(qw(@Blast_dbp_local
1344 @Blast_dbn_local));
1345
1346 # We shouldn't have to fully qualify the Blast_dbX_local arrays. Hm.
1347
1348 my(@dbs);
1349 if( $type =~ /^p|amino/i) {
1350 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbp_local;
1351 } else {
1352 @dbs = @Bio::Tools::Blast::Run::LocalBlast::Blast_dbn_local;
1353 }
1354 @dbs;
1355 }
1356
1357 =head2 parse
1358
1359 Usage : $blast_object->parse( %named_parameters )
1360 Purpose : Parse a Blast report from a file or STDIN.
1361 : * Parses a raw BLAST data, populating Blast object with report data.
1362 : * Sets the significance cutoff.
1363 : * Extracts statistical parameters about the BLAST run.
1364 : * Handles both single files and streams containing multiple reports.
1365 Returns : integer (number of Blast reports parsed)
1366 Argument : <named parameters>: (PARAMETER TAGS CAN BE UPPER OR LOWER CASE).
1367 : -FILE => string (name of file containing raw Blast output.
1368 : Optional. If a valid file is not supplied,
1369 : STDIN will be used).
1370 : -SIGNIF => number (float or scientific notation number to be used
1371 : as a P- or Expect value cutoff;
1372 : default = $DEFAULT_SIGNIF (999)).
1373 : -FILT_FUNC => func_ref (reference to a function to be used for
1374 : filtering out hits based on arbitrary criteria.
1375 : This function should take a
1376 : Bio::Tools::Blast::Sbjct.pm object as its first
1377 : argument and return a boolean value,
1378 : true if the hit should be filtered out).
1379 : Sample filter function:
1380 : -FILT_FUNC => sub { $hit = shift;
1381 : $hit->gaps == 0; },
1382 : -CHECK_ALL_HITS => boolean (check all hits for significance against
1383 : significance criteria. Default = false.
1384 : If false, stops processing hits after the first
1385 : non-significant hit or the first hit that fails
1386 : the filt_func call. This speeds parsing,
1387 : taking advantage of the fact that the hits
1388 : are processed in the order they are ranked.)
1389 : -MIN_LEN => integer (to be used as a minimum query sequence length
1390 : sequences below this length will not be processed).
1391 : default = no minimum length).
1392 : -STATS => boolean (collect stats for report: matrix, filters, etc.
1393 : default = false).
1394 : -BEST => boolean (only process the best hit of each report;
1395 : default = false).
1396 : -OVERLAP => integer (the amount of overlap to permit between
1397 : adjacent HSPs when tiling HSPs,
1398 : Default = $MAX_HSP_OVERLAP (2))
1399 :
1400 : PARAMETERS USED WHEN PARSING MULTI-REPORT STREAMS:
1401 : --------------------------------------------------
1402 : -SHARE => boolean (set this to true if all reports in stream
1403 : share the same stats. Default = true)
1404 : Must be set to false when parsing both Blast1 and
1405 : Blast2 reports in the same run or if you need
1406 : statistical params for each report, Lambda, K, H).
1407 : -STRICT => boolean (use strict mode for all Blast objects created.
1408 : Increases sensitivity to errors. For single
1409 : Blasts, this is parameter is sent to new().)
1410 : -EXEC_FUNC => func_ref (reference to a function for processing each
1411 : Blast object after it is parsed. Should accept a
1412 : Blast object as its sole argument. Return value
1413 : is ignored. If an -EXEC_FUNC parameter is supplied,
1414 : the -SAVE_ARRAY parameter will be ignored.)
1415 : -SAVE_ARRAY =>array_ref, (reference to an array for storing all
1416 : Blast objects as they are created.
1417 : Experimental. Not recommended.)
1418 : -SIGNIF_FMT => boolean String of 'exp' or 'parts'. Sets the format
1419 : for reporting P/Expect values. 'exp' reports
1420 : only the exponent portion. 'parts' reports
1421 : them as a 2 element list. See signif_fmt()..
1422 :
1423 Throws : Exception if BLAST report contains a FATAL: error.
1424 : Propagates any exception thrown by read().
1425 : Propagates any exception thrown by called parsing methods.
1426 Comments : This method can be called either directly using the static $Blast object
1427 : or indirectly (by Bio::Tools::SeqAnal.pm) during constuction of an
1428 : individual Blast object.
1429 :
1430 : HTML-formatted reports can be parsed as well. No special flag is required
1431 : since it is detected automatically. The presence of HTML-formatting
1432 : will result in slower performace, however, since it must be removed
1433 : prior to parsing. Parsing HTML-formatted reports is highly
1434 : error prone and is generally not recommended.
1435 :
1436 : If one has an HTML report, do NOT remove the HTML from it by using the
1437 : "Save As" option of a web browser to save it as text. This renders the
1438 : report unparsable.
1439 : HTML-formatted reports can be parsed after running through the strip_html
1440 : function of Blast::HTML.pm as in:
1441 : require Bio::Tools::Blast::HTML;
1442 : Bio::Tools::Blast::HTML->import(&strip_html);
1443 : &strip_html(\$data);
1444 : # where data contains full contents of an HTML-formatted report.
1445 : TODO: write a demo script that does this.
1446
1447 See Also : L<_init_parse_params()|_init_parse_params>, L<_parse_blast_stream()|_parse_blast_stream>, L<overlap()|overlap>, L<signif_fmt()|signif_fmt>, B<Bio::Root::Object::read()>, B<Bio::Tools::Blast::HTML.pm::strip_html()>, L<Links to related modules>
1448
1449 =cut
1450
1451 #---------
1452 sub parse {
1453 #---------
1454 # $self might be the static $Blast object.
1455 my ($self, @param) = @_;
1456
1457 my($signif, $filt_func, $min_len, $check_all, $overlap, $stats,
1458 $share, $strict, $best, $signif_fmt, $no_aligns) =
1459 $self->_rearrange([qw(SIGNIF FILT_FUNC MIN_LEN CHECK_ALL_HITS
1460 OVERLAP STATS SHARE STRICT
1461 BEST EXPONENT NO_ALIGNS )], @param);
1462
1463 ## Initialize the static Blast object with parameters that
1464 ## apply to all Blast objects within a parsing session.
1465
1466 &_init_parse_params($share, $filt_func, $check_all,
1467 $signif, $min_len, $strict,
1468 $best, $signif_fmt, $stats, $no_aligns
1469 );
1470
1471 my $count = $self->_parse_blast_stream(@param);
1472
1473 # print STDERR "\nDONE PARSING STREAM.\n";
1474
1475 if($Blast->{'_blast_errs'}) {
1476 my @errs = @{$Blast->{'_blast_errs'}};
1477 printf STDERR "\n*** %d BLAST REPORTS HAD FATAL ERRORS:\n", scalar(@errs);
1478 foreach(@errs) { print STDERR "$_\n"; }
1479 @{$Blast->{'_blast_errs'}} = ();
1480 }
1481
1482 return $count;
1483 }
1484
1485 =head2 _init_parse_params
1486
1487 Title : _init_parse_params
1488 Usage : n/a; called automatically by parse()
1489 Purpose : Initializes parameters used during parsing of Blast reports.
1490 : This is a static method used by the $Blast object.
1491 : Calls _set_signif().
1492 Example :
1493 Returns : n/a
1494 Args : Args extracted by parse().
1495
1496 See Also: L<parse()|parse>, L<_set_signif()|_set_signif>
1497
1498 =cut
1499
1500 #----------------------
1501 sub _init_parse_params {
1502 #----------------------
1503 my ($share, $filt_func, $check_all,
1504 $signif, $min_len, $strict,
1505 $best, $signif_fmt, $stats, $no_aligns) = @_;
1506
1507 ## Default is to share stats.
1508 $Blast->{'_share'} = defined($share) ? $share : 1;
1509 $Blast->{'_filt_func'} = $filt_func || 0;
1510 $Blast->{'_check_all'} = $check_all || 0;
1511 $Blast->{'_signif_fmt'} ||= $signif_fmt || '';
1512 $Blast->{'_no_aligns'} = $no_aligns || 0;
1513
1514 &_set_signif($signif, $min_len, $filt_func);
1515 $Blast->strict($strict) if defined $strict;
1516 $Blast->best($best) if $best;
1517 $Blast->{'_blast_count'} = 0;
1518
1519 ## If $stats is false, miscellaneous statistical and other parameters
1520 ## are NOT extracted from the Blast report (e.g., matrix name, filter used, etc.).
1521 ## This can speed processing when crunching tons of Blast reports.
1522 ## Default is to NOT get stats.
1523 $Blast->{'_get_stats'} = defined($stats) ? $stats : 0;
1524
1525 # Clear any errors from previous parse.
1526 undef $Blast->{'_blast_errs'};
1527 }
1528
1529 =head2 _set_signif
1530
1531 Usage : n/a; called automatically by _init_parse_params()
1532 : This is now a "static" method used only by $Blast.
1533 : _set_signif($signif, $min_len, $filt_func);
1534 Purpose : Sets significance criteria for the BLAST object.
1535 Argument : Obligatory three arguments:
1536 : $signif = float or sci-notation number or undef
1537 : $min_len = integer or undef
1538 : $filt_func = function reference or undef
1539 :
1540 : If $signif is undefined, a default value is set
1541 : (see $DEFAULT_SIGNIF; min_length = not set).
1542 Throws : Exception if significance value is defined but appears
1543 : out of range or invalid.
1544 : Exception if $filt_func if defined and is not a func ref.
1545 Comments : The significance of a BLAST report can be based on
1546 : the P (or Expect) value and/or the length of the query sequence.
1547 : P (or Expect) values GREATER than '_significance' are not significant.
1548 : Query sequence lengths LESS than '_min_length' are not significant.
1549 :
1550 : Hits can also be screened using arbitrary significance criteria
1551 : as discussed in the parse() method.
1552 :
1553 : If no $signif is defined, the '_significance' level is set to
1554 : $Bio::Tools::Blast::DEFAULT_SIGNIF (999).
1555
1556 See Also : L<signif()|signif>, L<min_length()|min_length>, L<_init_parse_params()|_init_parse_params>, L<parse()|parse>
1557
1558 =cut
1559
1560 #-----------------
1561 sub _set_signif {
1562 #-----------------
1563 my( $sig, $len, $func ) = @_;
1564
1565 if(defined $sig) {
1566 $Blast->{'_confirm_significance'} = 1;
1567 if( $sig =~ /[^\d.e-]/ or $sig <= 0) {
1568 $Blast->throw("Invalid significance value: $sig",
1569 "Must be greater than zero.");
1570 }
1571 $Blast->{'_significance'} = $sig;
1572 } else {
1573 $Blast->{'_significance'} = $DEFAULT_SIGNIF;
1574 $Blast->{'_check_all'} = 1 if not $Blast->{'_filt_func'};
1575 }
1576
1577 if(defined $len) {
1578 if($len =~ /\D/ or $len <= 0) {
1579 $Blast->warn("Invalid minimum length value: $len",
1580 "Value must be an integer > 0. Value not set.");
1581 } else {
1582 $Blast->{'_min_length'} = $len;
1583 }
1584 }
1585
1586 if(defined $func) {
1587 $Blast->{'_confirm_significance'} = 1;
1588 if($func and not ref $func eq 'CODE') {
1589 $Blast->throw("Not a function reference: $func",
1590 "The -filt_func parameter must be function reference.");
1591 }
1592 }
1593 }
1594
1595 =head2 _parse_blast_stream
1596
1597 Usage : n/a. Internal method called by parse()
1598 Purpose : Obtains the function to be used during parsing and calls read().
1599 Returns : Integer (the number of blast reports read)
1600 Argument : Named parameters (forwarded from parse())
1601 Throws : Propagates any exception thrown by _get_parse_blast_func() and read().
1602
1603 See Also : L<_get_parse_blast_func()|_get_parse_blast_func>, B<Bio::Root::Object::read()>
1604
1605 =cut
1606
1607 #----------------------
1608 sub _parse_blast_stream {
1609 #----------------------
1610 my ($self, %param) = @_;
1611
1612 my $func = $self->_get_parse_blast_func(%param);
1613 # my $func = sub { my $data = shift;
1614 # printf STDERR "Chunk length = %d\n", length($data);
1615 # sleep(3);
1616 # };
1617
1618 # Only setting the newline character once per session.
1619 $Newline ||= $Util->get_newline(-client => $self, %param);
1620
1621 $self->read(-REC_SEP =>"$Newline>",
1622 -FUNC => $func,
1623 %param);
1624
1625 return $Blast->{'_blast_count'};
1626 }
1627
1628 =head2 _get_parse_blast_func
1629
1630 Usage : n/a; internal method used by _parse_blast_stream()
1631 : $func_ref = $blast_object->_get_parse_blast_func()
1632 Purpose : Generates a function ref to be used as a closure for parsing
1633 : raw data as it is being loaded by Bio::Root::IOManager::read().
1634 Returns : Function reference (closure).
1635 Comments : The the function reference contains a fair bit of logic
1636 : at present. It could perhaps be split up into separate
1637 : functions to make it more 'digestible'.
1638
1639 See Also : L<_parse_blast_stream()|_parse_blast_stream>
1640
1641 =cut
1642
1643 #--------------------------
1644 sub _get_parse_blast_func {
1645 #--------------------------
1646 my ($self, @param) = @_;
1647
1648 my ($save_a, $exec_func) =
1649 $self->_rearrange([qw(SAVE_ARRAY EXEC_FUNC)], @param);
1650
1651 # $MONITOR && print STDERR "\nParsing Blast stream (5/dot, 250/line)\n";
1652 my $count = 0;
1653 my $strict = $self->strict();
1654
1655 # Some parameter validation.
1656 # Remember, all Blast parsing will use this function now.
1657 # You won't need a exec-func or save_array when just creating a Blast object
1658 # as in: $blast = new Bio::Tools::Blast();
1659 if($exec_func and not ref($exec_func) eq 'CODE') {
1660 $self->throw("The -EXEC_FUNC parameter must be function reference.",
1661 "exec_func = $exec_func");
1662
1663 } elsif($save_a and not ref($save_a) eq 'ARRAY') {
1664 $self->throw("The -SAVE_ARRAY parameter must supply an array reference".
1665 "when not using an -EXEC_FUNC parameter.");
1666 }
1667
1668 ## Might consider breaking this closure up if possible.
1669
1670 return sub {
1671 my ($data) = @_;
1672 ## $data should contain one of three possible fragment types
1673 ## from a Blast report:
1674 ## 1. Header with description section,
1675 ## 2. An alignment section for a single hit, or
1676 ## 3. The final alignment section plus the footer section.
1677 ## (record separator = "Newline>").
1678
1679 # print STDERR "\n(BLAST) DATA CHUNK: $data\n";
1680
1681 my ($current_blast, $current_prog, $current_vers, $current_db);
1682 my $prev_blast;
1683 my $contains_translation = 0;
1684
1685 ### steve --- Wed Mar 15 02:48:07 2000
1686 ### In the process of addressing bug PR#95. Tricky.
1687 ### Using the $contains_translation to do so. Not complete
1688 ### and possibly won't fix. We'll see.
1689
1690 # Check for header section. Start a new Blast object and
1691 # parse the description section.
1692 # if ($data =~ /\sQuery\s?=/s || ($contains_translation && $data =~ /Database:/s)) {
1693 if ($data =~ /\sQuery\s?=/s) {
1694 $Blast->{'_blast_count'}++;
1695 print STDERR ".", $Blast->{'_blast_count'} % 50 ? '' : "\n" if $MONITOR;
1696
1697 if($data =~ /$Newline\s+Translating/so) {
1698 print STDERR "\nCONTAINS TRANSLATION\n";
1699 $contains_translation = 1;
1700 }
1701
1702 # If we're parsing a stream containing multiple reports,
1703 # all subsequent header sections will contain the last hit of
1704 # the previous report which needs to be parsed and added to that
1705 # report if signifcant. It also contains the run parameters
1706 # at the bottom of the Blast report.
1707 # if($Blast->{'_blast_count'} > 1 || $contains_translation) {
1708 if($Blast->{'_blast_count'} > 1) {
1709 # print STDERR "\nMULTI-BLAST STREAM.\n";
1710 $Blast->{'_multi_stream'} = 1;
1711
1712 if($data =~ /(.+?)$Newline(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so) {
1713 ($current_prog, $current_vers, $data) = ($3, $4, $5);
1714 # Final chunk containing last hit and last footer.
1715 $Blast->{'_current_blast'}->_parse_alignment($1);
1716 $prev_blast = $Blast->{'_current_blast'}; # finalized.
1717 # } elsif($contains_translation) {
1718 # $data =~ /(T?BLAST[NPX])\s+(.+?)$Newline(.+)/so;
1719 # ($current_prog, $current_vers, $data) = ($1, $2, $3);
1720 } else {
1721 $Blast->throw("Can't determine program type from BLAST report.",
1722 "Checked for: @Blast_programs.");
1723 # This has important implications for how to handle interval
1724 # information for HSPs. TBLASTN uses nucleotides in query HSP
1725 # but amino acids in the sbjct HSP sequence.
1726 }
1727
1728 if($data =~ m/Database:\s+(.+?)$Newline/so ) {
1729 $current_db = $1;
1730 } else {
1731 # In some reports, the Database is only listed at end.
1732 #$Blast->warn("Can't determine database name from BLAST report.");
1733 }
1734
1735 # Incyte_Fix: Nasty Invisible Bug.
1736 # Records in blast report are delimited by '>', but... when
1737 # there are no hits for a query, there won't be a '>'. That
1738 # causes several blast reports to run together in the data
1739 # passed to this routine. Need to get rid of non-hits in data
1740 if ($data =~ /.+(No hits? found.+Sequences.+)/so) {
1741 $data = $1;
1742 }
1743 # End Incyte_Fix
1744
1745 }
1746
1747 # Determine if we need to create a new Blast object
1748 # or use the $self object for this method.
1749
1750 if($Blast->{'_multi_stream'} or $self->name eq 'Static Blast object') {
1751 # Strict mode is not object-specific but may be someday.
1752 # print STDERR "\nCreating new Blast object.\n";
1753 $current_blast = new Bio::Tools::Blast(-STRICT => $strict);
1754 } else {
1755 $current_blast = $self;
1756 }
1757 $Blast->{'_current_blast'} = $current_blast;
1758
1759 # If we're not sharing stats, set data on current blast object.
1760 if(defined $current_prog and not $Blast->{'_share'}) {
1761 $current_blast->program($current_prog);
1762 $current_blast->program_version($current_vers);
1763 $current_blast->database($current_db);
1764 }
1765
1766 # print STDERR "CURRENT BLAST = ", $current_blast->name, "\n";
1767 $current_blast->_parse_header($data);
1768
1769 # If there were any descriptions in the header,
1770 # we know if there are any significant hits.
1771 # No longer throwing exception if there were no significant hits
1772 # and a -signif parameter was specified. Doing so prevents the
1773 # construction of a Blast object, which could still be useful.
1774 # if($current_blast->{'_has_descriptions'} and $Blast->{'_confirm_significance'} and not $current_blast->is_signif) {
1775 # $current_blast->throw("No significant BLAST hits for ${\$current_blast->name}");
1776
1777 # }
1778
1779 } # Done parsing header/description section
1780
1781 ### For use with $contains_translation - not right - breaks regular report parsing.
1782 # elsif(ref $Blast->{'_current_blast'} && $data !~ /\s*\w*\s*/s) {
1783 elsif(ref $Blast->{'_current_blast'} ) {
1784 # Process an alignment section.
1785 $current_blast = $Blast->{'_current_blast'};
1786 # print STDERR "\nCONTINUING PROCESSING ALN WITH ", $current_blast->name, "\n";
1787 # print STDERR "DATA: $data\n";
1788 eval {
1789 $current_blast->_parse_alignment($data);
1790 };
1791 if($@) {
1792 # push @{$self->{'_blast_errs'}}, $@;
1793 }
1794 }
1795
1796 # If the current Blast object has been completely parsed
1797 # (occurs with a single Blast stream), or if there is a previous
1798 # Blast object (occurs with a multi Blast stream),
1799 # execute a supplied function on it or store it in a supplied array.
1800
1801 if( defined $prev_blast or $current_blast->{'_found_params'}) {
1802 my $finished_blast = defined($prev_blast) ? $prev_blast : $current_blast;
1803
1804 $finished_blast->_report_errors();
1805 # print STDERR "\nNEW BLAST OBJECT: ${\$finished_blast->name}\n";
1806
1807 if($exec_func) {
1808 # print STDERR " RUNNING EXEC_FUNC...\n";
1809 &$exec_func($finished_blast); # ignoring any return value.
1810 # Report processed, no longer need object.
1811 $finished_blast->destroy;
1812 undef $finished_blast;
1813 } elsif($save_a) {
1814 # print STDERR " SAVING IN ARRAY...\n";
1815 # We've already verified that if there is no exec_func
1816 # then there must be a $save_array
1817 push @$save_a, $finished_blast;
1818 }
1819 }
1820 1;
1821 }
1822 }
1823
1824 =head2 _report_errors
1825
1826 Title : _report_errors
1827 Usage : n/a; Internal method called by _get_parse_blast_func().
1828 Purpose : Throw or warn about any errors encountered.
1829 Returns : n/a
1830 Args : n/a
1831 Throws : If all hits generated exceptions, raise exception
1832 : (a fatal event for the Blast object.)
1833 : If some hits were okay but some were bad, generate a warning
1834 : (a few bad applies should not spoil the bunch).
1835 : This usually indicates a limiting B-value.
1836 : When the parsing code fails, it is either all or nothing.
1837
1838 =cut
1839
1840 #-------------------
1841 sub _report_errors {
1842 #-------------------
1843 my $self = shift;
1844
1845 return unless ref($self->{'_blast_errs'});
1846 # ref($self->{'_blast_errs'}) || (print STDERR "\nNO ERRORS\n", return );
1847
1848 my @errs = @{$self->{'_blast_errs'}};
1849
1850 if(scalar @errs) {
1851 my ($str);
1852 @{$self->{'_blast_errs'}} = (); # clear the errs on the object.
1853 # When there are many errors, in most of the cases, they are
1854 # caused by the same problem. Only need to see full data for
1855 # the first one.
1856 if(scalar @errs > 2) {
1857 $str = "SHOWING FIRST EXCEPTION ONLY:\n$errs[0]";
1858 $self->clear_err(); # clearing the existing set of errors (conserve memory).
1859 # Not necessary, unless the -RECORD_ERR =>1
1860 # constructor option was used for Blast object.
1861 } else {
1862 $str = join("\n",@errs);
1863 }
1864
1865 if(not $self->{'_num_hits_significant'}) {
1866 $self->throw(sprintf("Failed to parse any hit data (n=%d).", scalar(@errs)),
1867 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n"
1868 );
1869 } else {
1870 $self->warn(sprintf("Some potential hits were not parsed (n=%d).", scalar(@errs)),
1871 @errs > 2 ? "This may be due to a limiting B value (max alignment listings)." : "",
1872 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n"
1873 );
1874 }
1875 }
1876 }
1877
1878 =head2 _parse_header
1879
1880 Usage : n/a; called automatically by the _get_parse_blast_func().
1881 Purpose : Parses the header section of a BLAST report.
1882 Argument : String containing the header+description section of a BLAST report.
1883 Throws : Exception if description data cannot be parsed properly.
1884 : Exception if there is a 'FATAL' error in the Blast report.
1885 : Warning if there is a 'WARNING' in the Blast report.
1886 : Warning if there are no significant hits.
1887 Comments : Description section contains a single line for each hit listing
1888 : the seq id, description, score, Expect or P-value, etc.
1889
1890 See Also : L<_get_parse_blast_func()|_get_parse_blast_func>
1891
1892 =cut
1893
1894 #----------------------
1895 sub _parse_header {
1896 #----------------------
1897 my( $self, $data ) = @_;
1898
1899 # print STDERR "\n$ID: PARSING HEADER\n"; #$data\n";
1900
1901 $data =~ s/^\s+|\s+>?$//sg;
1902
1903 if($data =~ /<HTML/i) {
1904 $self->throw("Can't parse HTML-formatted BLAST reports.",
1905 # "Such reports can be parsed with a special parsing \n".
1906 # "script included in the examples/blast directory \n".
1907 # "of the Bioperl distribution. (TODO)"
1908 );
1909 # This was the old strategy, can't do it with new strategy
1910 # since we don't have the whole report in one chunk.
1911 # This could be the basis for the "special parsing script".
1912 # require Bio::Tools::Blast::HTML;
1913 # Bio::Tools::Blast::HTML->import(&strip_html);
1914 # &strip_html(\$data);
1915 }
1916
1917 $data =~ /WARNING: (.+?)$Newline$Newline/so and $self->warn("$1") if $self->strict;
1918 $data =~ /FATAL: (.+?)$Newline$Newline/so and $self->throw("FATAL BLAST ERROR = $1");
1919 # No longer throwing exception when no hits were found. Still reporting it.
1920 $data =~ /No hits? found/i and $self->warn("No hits were found.") if $self->strict;
1921
1922 # If this is the first Blast, the program, version, and database info
1923 # pertain to it. Otherwise, they are for the previous report and have
1924 # already been parsed out.
1925 # Data is stored in the static Blast object. Data for subsequent reports
1926 # will be stored in separate objects if the -share parameter is not set.
1927 # See _get_parse_blast_func().
1928
1929 if($Blast->{'_blast_count'} == 1) {
1930 if($data =~ /(<\w+>)?(T?BLAST[NPX])\s+(.+?)$Newline/so) {
1931 $Blast->program($2);
1932 $Blast->program_version($3);
1933 } else {
1934 $self->throw("Can't determine program type from BLAST report.",
1935 "Checked for: @Blast_programs.");
1936 # This has important implications for how to handle interval
1937 # information for HSPs. TBLASTN uses nucleotides in query HSP
1938 # but amino acids in the sbjct HSP sequence.
1939 }
1940
1941 if($data =~ m/Database:\s+(.+?)$Newline/so ) {
1942 $Blast->database($1);
1943 } else {
1944 # In some reports, the Database is only listed at end.
1945 #$self->warn("Can't determine database name from BLAST report (_parse_header)\n$data\n.");
1946 }
1947 }
1948
1949 my ($header, $descriptions);
1950
1951 ## For efficiency reasons, we want to to avoid using $' and $`.
1952 ## Therefore using single-line mode pattern matching.
1953
1954 if($data =~ /(.+?)\nSequences producing.+?\n(.+)/s ) {
1955 ($header, $descriptions) = ($1, $2);
1956 $self->{'_has_descriptions'} = 1;
1957 } else {
1958 $header = $data;
1959 $self->{'_has_descriptions'} = 0;
1960 # Blast reports can legally lack description section. No need to warn.
1961 #push @{$self->{'_blast_errs'}}, "Can't parse description data.";
1962 }
1963
1964 $self->_set_query($header); # The name of the sequence will appear in error report.
1965 # print STDERR "\nQUERY = ", $Blast->{'_current_blast'}->query, "\n";
1966
1967 $self->_set_date($header) if $Blast->{'_get_stats'};
1968 $self->_set_length($header);
1969
1970 # not $Blast->{'_confirm_significance'} and print STDERR "\nNOT PARSING DESCRIPTIONS.\n";
1971
1972 # Setting the absolute max and min significance levels.
1973 $self->{'_highestSignif'} = 0;
1974 $self->{'_lowestSignif'} = $DEFAULT_SIGNIF;
1975
1976 if ($Blast->{'_confirm_significance'} || $Blast->{'_no_aligns'}) {
1977 $self->_parse_descriptions($descriptions) if $descriptions;
1978 } else {
1979 $self->{'_is_significant'} = 1;
1980 }
1981 }
1982
1983 #-----------------------
1984 sub _parse_descriptions {
1985 #-----------------------
1986 my ($self, $desc) = @_;
1987
1988 # NOTE: This method will not be called if the report lacks
1989 # a description section.
1990
1991 # print STDERR "\nPARSING DESCRIPTION DATA\n";
1992
1993 my @descriptions = split( $Newline, $desc);
1994 my($line);
1995
1996 # NOW step through each line parsing out the P/Expect value
1997 # All we really need to do is check the first one, if it doesn't
1998 # meet the significance requirement, we can skip the report.
1999 # BUT: we want to collect data for all hits anyway to get min/max signif.
2000
2001 my $my_signif = $self->signif;
2002 my $layout_set = $Blast->{'_layout'} || 0;
2003 my $layout;
2004 my $count = 0;
2005 my $sig;
2006
2007 desc_loop:
2008 foreach $line (@descriptions) {
2009 $count++;
2010 last desc_loop if $line =~ / NONE |End of List/;
2011 next desc_loop if $line =~ /^\s*$/;
2012 next desc_loop if $line =~ /^\.\./;
2013
2014 ## Checking the significance value (P- or Expect value) of the hit
2015 ## in the description line.
2016
2017 # These regexps need testing on a variety of reports.
2018 if ( $line =~ /\d+\s{1,5}[\de.-]+\s*$/) {
2019 $layout = 2;
2020 } elsif( $line =~ /\d+\s{1,5}[\de.-]+\s{1,}\d+\s*$/) {
2021 $layout = 1;
2022 } else {
2023 $self->warn("Can't parse significance data in description line $line");
2024 next desc_loop;
2025 }
2026 not $layout_set and ($self->_layout($layout), $layout_set = 1);
2027
2028 $sig = &_parse_signif( $line, $layout );
2029
2030 # print STDERR " Parsed signif ($layout) = $sig\n";
2031
2032 last desc_loop if ($sig > $my_signif and not $Blast->{'_check_all'});
2033 $self->_process_significance($sig, $my_signif);
2034 }
2035
2036 # printf "\n%d SIGNIFICANT HITS.\nDONE PARSING DESCRIPTIONS.\n", $self->{'_num_hits_significant'};
2037 }
2038
2039 sub _process_significance {
2040 my($self, $sig, $my_signif) = @_;
2041
2042 $self->{'_highestSignif'} = ($sig > $self->{'_highestSignif'})
2043 ? $sig : $self->{'_highestSignif'};
2044
2045 $self->{'_lowestSignif'} = ($sig < $self->{'_lowestSignif'})
2046 ? $sig : $self->{'_lowestSignif'};
2047
2048 # Significance value assessment.
2049 $sig <= $my_signif and $self->{'_num_hits_significant'}++;
2050 $self->{'_num_hits'}++;
2051
2052 $self->{'_is_significant'} = 1 if $self->{'_num_hits_significant'};
2053 }
2054
2055 =head2 _parse_alignment
2056
2057 Usage : n/a; called automatically by the _get_parse_blast_func().
2058 Purpose : Parses a single alignment section of a BLAST report.
2059 Argument : String containing the alignment section.
2060 Throws : n/a; All errors are trapped while parsing the hit data
2061 : and are processed as a group when the report is
2062 : completely processed (See _report_errors()).
2063 :
2064 Comments : Alignment section contains all HSPs for a hit.
2065 : Requires Bio::Tools::Blast::Sbjct.pm.
2066 : Optionally calls a filter function to screen the hit on arbitrary
2067 : criteria. If the filter function returns true for a given hit,
2068 : that hit will be skipped.
2069 :
2070 : If the Blast object was created with -check_all_hits set to true,
2071 : all hits will be checked for significance and processed if necessary.
2072 : If this field is false, the parsing will stop after the first
2073 : non-significant hit.
2074 : See parse() for description of parsing parameters.
2075
2076 See Also : L<parse()|parse>, L<_get_parse_blast_func()|_get_parse_blast_func>, L<_report_errors()|_report_errors>, B<Bio::Tools::Blast::Sbjct()>, L<Links to related modules>
2077
2078 =cut
2079
2080 #----------------------
2081 sub _parse_alignment {
2082 #----------------------
2083 # This method always needs to check detect if the $data argument
2084 # contains the footer of a Blast report, indicating the last chunk
2085 # of a single Blast stream.
2086
2087 my( $self, $data ) = @_;
2088
2089 # printf STDERR "\nPARSING ALIGNMENT DATA for %s $self.\n", $self->name;
2090
2091 # NOTE: $self->{'_current_hit'} is an instance variable
2092 # The $Blast object will not have this member.
2093
2094 # If all of the significant hits have been parsed,
2095 # return if we're not checking all or if we don't need to get
2096 # the Blast stats (parameters at footer of report).
2097 if(defined $self->{'_current_hit'} and
2098 defined $self->{'_num_hits_significant'}) {
2099 return if $self->{'_current_hit'} >= $self->{'_num_hits_significant'} and
2100 not ($Blast->{'_check_all'} or $Blast->{'_get_stats'});
2101 }
2102
2103 # Check for the presence of the Blast footer section.
2104 # _parse_footer returns the alignment section.
2105 $data = $self->_parse_footer($data);
2106
2107 # Return if we're only interested in the best hit.
2108 # This has to occur after checking for the parameters section
2109 # in the footer (since we may still be interested in them).
2110 return if $Blast->best and ( defined $self->{'_current_hit'} and $self->{'_current_hit'} >=1);
2111
2112 # print "RETURNED FROM _parse_footer (", $self->to_string, ")";
2113 # print "\n --> FOUND PARAMS.\n" if $self->{'_found_params'};
2114 # print "\n --> DID NOT FIND PARAMS.\n" unless $self->{'_found_params'};
2115
2116 require Bio::Tools::Blast::Sbjct;
2117
2118 $data =~ s/^\s+|\s+>?$//sg;
2119 $data =~ s/$Newline$Newline/$Newline/sog; # remove blank lines.
2120 my @data = split($Newline, $data);
2121 push @data, 'end';
2122
2123 # print STDERR "\nALIGNMENT DATA:\n$data\n";
2124
2125 my $prog = $self->program;
2126 my $check_all = $Blast->{'_check_all'};
2127 my $filt_func = $Blast->{'_filt_func'} || 0;
2128 my $signif_fmt = $Blast->{'_signif_fmt'};
2129 my $my_signif = $self->signif;
2130 my $err;
2131
2132 # Now construct the Sbjct objects from the alignment section
2133
2134 # debug(1);
2135
2136 $self->{'_current_hit'}++;
2137
2138 # If not confirming significance, _parse_descriptions will not have been run,
2139 # so we need to count the total number of hits here.
2140 if( not $Blast->{'_confirm_significance'}) {
2141 $self->{'_num_hits'}++;
2142 }
2143
2144 if($Blast->{'_no_aligns'}) {
2145 # printf STDERR "\nNOT PARSING ALIGNMENT DATA\n";
2146 return;
2147 }
2148
2149 my $hit; # Must be my'ed within hit_loop.
2150 eval {
2151 $hit = new Bio::Tools::Blast::Sbjct (-DATA =>\@data,
2152 -PARENT =>$self,
2153 -NAME =>$self->{'_current_hit'},
2154 -RANK =>$self->{'_current_hit'},
2155 -RANK_BY =>'order',
2156 -PROGRAM =>$prog,
2157 -SIGNIF_FMT=>$signif_fmt,
2158 -OVERLAP =>$Blast->{'_overlap'} || $MAX_HSP_OVERLAP,
2159 );
2160 # printf STDERR "NEW HIT: %s, SIGNIFICANCE = %g\n", $hit->name, $hit->expect; <STDIN>;
2161 # The BLAST report may have not had a description section.
2162 if(not $self->{'_has_descriptions'}) {
2163 $self->_process_significance($hit->signif, $my_signif);
2164 }
2165 };
2166
2167 if($@) {
2168 # Throwing lots of errors can slow down the code substantially.
2169 # Error handling code is not that efficient.
2170 #print STDERR "\nERROR _parse_alignment: $@\n";
2171 push @{$self->{'_blast_errs'}}, $@;
2172 $hit->destroy if ref $hit;
2173 undef $hit;
2174 } else {
2175 # Collect overall signif data if we don't already have it,
2176 # (as occurs if no -signif parameter is supplied).
2177 my $hit_signif = $hit->signif;
2178
2179 if (not $Blast->{'_confirm_significance'} ) {
2180 $self->{'_highestSignif'} = ($hit_signif > $self->{'_highestSignif'})
2181 ? $hit_signif : $self->{'_highestSignif'};
2182
2183 $self->{'_lowestSignif'} = ($hit_signif < $self->{'_lowestSignif'})
2184 ? $hit_signif : $self->{'_lowestSignif'};
2185 }
2186
2187 # Test significance using custom function (if supplied)
2188 if($filt_func) {
2189 if(&$filt_func($hit)) {
2190 push @{$self->{'_hits'}}, $hit;
2191 } else {
2192 $hit->destroy; undef $hit;
2193 }
2194 } elsif($hit_signif <= $my_signif) {
2195 push @{$self->{'_hits'}}, $hit;
2196 }
2197 }
2198
2199 }
2200
2201 =head2 _parse_footer
2202
2203 Usage : n/a; internal function. called by _parse_alignment()
2204 Purpose : Extracts statistical and other parameters from the BLAST report.
2205 : Sets various key elements such as the program and version,
2206 : gapping, and the layout for the report (blast1 or blast2).
2207 Argument : Data to be parsed.
2208 Returns : String containing an alignment section for processing by
2209 : _parse_alignment().
2210 Throws : Exception if cannot find the parameters section of report.
2211 : Warning if cannot determine if gapping was used.
2212 : Warning if cannot determine the scoring matrix used.
2213 Comments : This method must always get called, even if the -STATS
2214 : parse() parameter is false. The reason is that the layout
2215 : of the report and the presence of gapping must always be set.
2216 : The determination whether to set additional stats is made
2217 : by methods called by _parse_footer().
2218
2219 See Also : L<parse()|parse>, L<_parse_alignment()|_parse_alignment>, L<_set_database()|_set_database>
2220
2221 =cut
2222
2223 #---------------------
2224 sub _parse_footer {
2225 #---------------------
2226 # Basic strategy:
2227 # 1. figure out if we're supposed to get the stats,
2228 # 2. figure out if the stats are to be shared. some, not all can be shared
2229 # (eg., db info and matrix can be shared, karlin altschul params cannot.
2230 # However, this method assumes they are all sharable.)
2231 # 3. Parse the stats.
2232 # 4. return the block before the parameters section if the supplied data
2233 # contains a footer parameters section.
2234
2235 my ($self, $data) = @_;
2236 my ($client, $last_align, $params);
2237
2238 # printf STDERR "\nPARSING PARAMETERS for %s $self.\n", $self->name;
2239
2240 # Should the parameters be shared?
2241 # If so, set $self to be the static $Blast object and return if
2242 # the parameters were already set.
2243 # Before returning, we need to extract the last alignment section
2244 # from the parameter section, if any.
2245
2246 if ($Blast->{'_share'}) {
2247 $client = $self;
2248 $self = $Blast if $Blast->{'_share'};
2249 }
2250
2251 my $get_stats = $Blast->{'_get_stats'};
2252
2253 if( $data =~ /(.+?)${Newline}CPU time: (.*)/so) {
2254 # NCBI-Blast2 format (v2.04).
2255 ($last_align, $params) = ($1, $2);
2256 return $last_align if $client->{'_found_params'};
2257 $self->_set_blast2_stats($params);
2258
2259 } elsif( $data =~ /(.+?)${Newline}Parameters:(.*)/so) {
2260 # NCBI-Blast1 or WashU-Blast2 format.
2261 ($last_align, $params) = ($1, $2);
2262 return $last_align if $client->{'_found_params'};
2263 $self->_set_blast1_stats($params);
2264
2265 } elsif( $data =~ /(.+?)$Newline\s+Database:(.*)/so) {
2266 # Gotta watch out for confusion with the Database: line in the header
2267 # which will be present in the last hit of an internal Blast report
2268 # in a multi-report stream.
2269
2270 # NCBI-Blast2 format (v2.05).
2271 ($last_align, $params) = ($1, $2);
2272 return $last_align if $client->{'_found_params'};
2273 $self->_set_blast2_stats($params);
2274
2275 } elsif( $data =~ /(.+?)$Newline\s*Searching/so) {
2276 # trying to detect a Searching at the end of a PSI-blast round.
2277 # Gotta watch out for confusion with the Searching line in the header
2278 # which will be present in the last hit of an internal Blast report
2279 # in a multi-report, non-PSI-blast stream.
2280
2281 # PSI-Blast format (v2.08).
2282 ($last_align) = ($1);
2283 return $last_align; # if $client->{'_found_params'};
2284 }
2285
2286 # If parameter section was found, set a boolean,
2287 # otherwise return original data.
2288
2289 if (defined($params)) {
2290 $client->{'_found_params'} = 1;
2291 } else {
2292 return $data;
2293 }
2294
2295 $self->_set_database($params) if $get_stats;
2296
2297 # The {'_gapped'} member should be set in the _set_blast?_stats() call.
2298 # This is a last minute attempt to deduce it.
2299
2300 if(!defined($self->{'_gapped'})) {
2301 if($self->program_version() =~ /^1/) {
2302 $self->{'_gapped'} = 0;
2303 } else {
2304 if($self->strict > 0) {
2305 $self->warn("Can't determine if gapping was used. Assuming gapped.");
2306 }
2307 $self->{'_gapped'} = 1;
2308 }
2309 }
2310
2311 return $last_align;
2312 }
2313
2314 =head2 _set_blast2_stats
2315
2316 Usage : n/a; internal function called by _parse_footer()
2317 Purpose : Extracts statistical and other parameters from BLAST2 report footer.
2318 : Stats collected: database release, gapping,
2319 : posted date, matrix used, filter used, Karlin-Altschul parameters,
2320 : E, S, T, X, W.
2321 Throws : Exception if cannot get "Parameters" section of Blast report.
2322
2323 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules>
2324
2325 =cut
2326
2327 #---------------------'
2328 sub _set_blast2_stats {
2329 #---------------------
2330 my ($self, $data) = (@_);
2331
2332 if($data =~ /$Newline\s*Gapped/so) {
2333 $self->{'_gapped'} = 1;
2334 } else {
2335 $self->{'_gapped'} = 0;
2336 }
2337
2338 # Other stats are not always essential.
2339 return unless $Blast->{'_get_stats'};
2340
2341 # Blast2 Doesn't report what filter was used in the parameters section.
2342 # It just gives a warning that *some* filter was used in the header.
2343 # You just have to know the defaults (currently: protein = SEG, nucl = DUST).
2344 if($data =~ /\bfiltered\b/si) {
2345 $self->{'_filter'} = 'DEFAULT FILTER';
2346 } else {
2347 $self->{'_filter'} = 'NONE';
2348 }
2349
2350 if($data =~ /Gapped$Newline\s*Lambda +K +H$Newline +(.+?)$Newline/so) {
2351 my ($l, $k, $h) = split(/\s+/, $1);
2352 $self->{'_lambda'} = $l || 'UNKNOWN';
2353 $self->{'_k'} = $k || 'UNKNOWN';
2354 $self->{'_h'} = $h || 'UNKNOWN';
2355 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) {
2356 my ($l, $k, $h) = split(/\s+/, $1);
2357 $self->{'_lambda'} = $l || 'UNKNOWN';
2358 $self->{'_k'} = $k || 'UNKNOWN';
2359 $self->{'_h'} = $h || 'UNKNOWN';
2360 }
2361
2362 if($data =~ /$Newline\s*Matrix: (.+?)$Newline/so) {
2363 $self->{'_matrix'} = $1;
2364 } else {
2365 $self->{'_matrix'} = $DEFAULT_MATRIX.'?';
2366 if($self->strict > 0) {
2367 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX.");
2368 }
2369 }
2370
2371 if($data =~ /$Newline\s*Gap Penalties: Existence: +(\d+), +Extension: (\d+)$Newline/so) {
2372 $self->{'_gapCreation'} = $1;
2373 $self->{'_gapExtension'} = $2;
2374 }
2375 if($data =~ /sequences better than (\d+):/s) {
2376 $self->{'_expect'} = $1;
2377 }
2378
2379 if($data =~ /$Newline\s*T: (\d+)/o) { $self->{'_word_size'} = $1; }
2380 if($data =~ /$Newline\s*A: (\d+)/o) { $self->{'_a'} = $1; }
2381 if($data =~ /$Newline\s*S1: (\d+)/o) { $self->{'_s'} = $1; }
2382 if($data =~ /$Newline\s*S2: (\d+)/o) { $self->{'_s'} .= ", $1"; }
2383 if($data =~ /$Newline\s*X1: (\d+)/o) { $self->{'_x1'} = $1; }
2384 if($data =~ /$Newline\s*X2: (\d+)/o) { $self->{'_x2'} = $1; }
2385 }
2386
2387 =head2 _set_blast1_stats
2388
2389 Usage : n/a; internal function called by _parse_footer()
2390 Purpose : Extracts statistical and other parameters from BLAST 1.x style eports.
2391 : Handles NCBI Blast1 and WashU-Blast2 formats.
2392 : Stats collected: database release, gapping,
2393 : posted date, matrix used, filter used, Karlin-Altschul parameters,
2394 : E, S, T, X, W.
2395
2396 See Also : L<parse()|parse>, L<_parse_footer()|_parse_footer>, L<_set_database()|_set_database>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules>
2397
2398 =cut
2399
2400 #----------------------
2401 sub _set_blast1_stats {
2402 #----------------------
2403 my ($self, $data) = (@_);
2404
2405 if(!$self->{'_gapped'} and $self->program_version() =~ /^2[\w\-\.]+WashU/) {
2406 $self->_set_gapping_wu($data);
2407 } else {
2408 $self->{'_gapped'} = 0;
2409 }
2410
2411 # Other stats are not always essential.
2412 return unless $Blast->{'_get_stats'};
2413
2414 if($data =~ /filter=(.+?)$Newline/so) {
2415 $self->{'_filter'} = $1;
2416 } elsif($data =~ /filter$Newline +(.+?)$Newline/so) {
2417 $self->{'_filter'} = $1;
2418 } else {
2419 $self->{'_filter'} = 'NONE';
2420 }
2421
2422 if($data =~ /$Newline\s*E=(\d+)$Newline/so) { $self->{'_expect'} = $1; }
2423
2424 if($data =~ /$Newline\s*M=(\w+)$Newline/so) { $self->{'_matrix'} = $1; }
2425
2426 if($data =~ /\s*Frame MatID Matrix name .+?$Newline +(.+?)$Newline/so) {
2427 ## WU-Blast2.
2428 my ($fr, $mid, $mat, $lu, $ku, $hu, $lc, $kc, $hc) = split(/\s+/,$1);
2429 $self->{'_matrix'} = $mat || 'UNKNOWN';
2430 $self->{'_lambda'} = $lu || 'UNKNOWN';
2431 $self->{'_k'} = $ku || 'UNKNOWN';
2432 $self->{'_h'} = $hu || 'UNKNOWN';
2433
2434 } elsif($data =~ /Lambda +K +H$Newline +(.+?)$Newline/so) {
2435 ## NCBI-Blast1.
2436 my ($l, $k, $h) = split(/\s+/, $1);
2437 $self->{'_lambda'} = $l || 'UNKNOWN';
2438 $self->{'_k'} = $k || 'UNKNOWN';
2439 $self->{'_h'} = $h || 'UNKNOWN';
2440 }
2441
2442 if($data =~ /E +S +W +T +X.+?$Newline +(.+?)$Newline/so) {
2443 # WashU-Blast2
2444 my ($fr, $mid, $len, $elen, $e, $s, $w, $t, $x, $e2, $s2) = split(/\s+/,$1);
2445 $self->{'_expect'} ||= $e || 'UNKNOWN';
2446 $self->{'_s'} = $s || 'UNKNOWN';
2447 $self->{'_word_size'} = $w || 'UNKNOWN';
2448 $self->{'_t'} = $t || 'UNKNOWN';
2449 $self->{'_x'} = $x || 'UNKNOWN';
2450
2451 } elsif($data =~ /E +S +T1 +T2 +X1 +X2 +W +Gap$Newline +(.+?)$Newline/so) {
2452 ## NCBI-Blast1.
2453 my ($e, $s, $t1, $t2, $x1, $x2, $w, $gap) = split(/\s+/,$1);
2454 $self->{'_expect'} ||= $e || 'UNKNOWN';
2455 $self->{'_s'} = $s || 'UNKNOWN';
2456 $self->{'_word_size'} = $w || 'UNKNOWN';
2457 $self->{'_t1'} = $t1 || 'UNKNOWN';
2458 $self->{'_t2'} = $t2 || 'UNKNOWN';
2459 $self->{'_x1'} = $x1 || 'UNKNOWN';
2460 $self->{'_x2'} = $x2 || 'UNKNOWN';
2461 $self->{'_gap'} = $gap || 'UNKNOWN';
2462 }
2463
2464 if(!$self->{'_matrix'}) {
2465 $self->{'_matrix'} = $DEFAULT_MATRIX.'?';
2466 if($self->strict > 0) {
2467 $self->warn("Can't determine scoring matrix. Assuming $DEFAULT_MATRIX.");
2468 }
2469 }
2470 }
2471
2472 =head2 _set_gapping_wu
2473
2474 Usage : n/a; internal function called by _set_blast1_stats()
2475 Purpose : Determine if gapping_wu was on for WashU Blast reports.
2476 Comments : In earlier versions, gapping was always specified
2477 : but in the current version (2.0a19MP), gapping is on by default
2478 : and there is no positive "gapping" indicator in the Parameters
2479 : section.
2480
2481 See Also : L<_set_blast1_stats()|_set_blast1_stats>
2482
2483 =cut
2484
2485 #--------------------
2486 sub _set_gapping_wu {
2487 #--------------------
2488 my ($self, $data) = @_;
2489
2490 if($data =~ /gaps?$Newline/so) {
2491 $self->{'_gapped'} = ($data =~ /nogaps?$Newline/so) ? 0 : 1;
2492 } else {
2493 $self->{'_gapped'} = 1;
2494 }
2495 }
2496
2497 =head2 _set_date
2498
2499 Usage : n/a; internal function called by _parse_footer()
2500 Purpose : Determine the date on which the Blast analysis was performed.
2501 Comments : Date information is not consistently added to Blast output.
2502 : Uses superclass method set_date() to set date from the file,
2503 : (if any).
2504
2505 See Also : L<_parse_footer()|_parse_footer>, B<Bio::Tools::SeqAnal::set_date()>,L<Links to related modules>
2506
2507 =cut
2508
2509 #--------------
2510 sub _set_date {
2511 #--------------
2512 my $self = shift;
2513 my $data = shift;
2514
2515 ### Network BLAST reports from NCBI are time stamped as follows:
2516 #Fri Apr 18 15:55:41 EDT 1997, Up 1 day, 19 mins, 1 user, load: 19.54, 19.13, 17.77
2517 if($data =~ /Start:\s+(.+?)\s+End:/s) {
2518 ## Calling superclass method to set the date.
2519 ## If we can't get date from the report, file date is obtained.
2520 $self->set_date($1);
2521 } elsif($data =~ /Date:\s+(.*?)$Newline/so) {
2522 ## E-mailed reports have a Date: field
2523 $self->set_date($1);
2524 } elsif( $data =~ /done\s+at (.+?)$Newline/so ) {
2525 $self->set_date($1);
2526 } elsif( $data =~ /$Newline([\w:, ]+), Up \d+/so ) {
2527 $self->set_date($1);
2528 } else {
2529 ## Otherwise, let superclass attempt to get the file creation date.
2530 $self->set_date() if $self->file;
2531 }
2532 }
2533
2534 =head2 _set_length
2535
2536 Usage : n/a; called automatically during Blast report parsing.
2537 Purpose : Sets the length of the query sequence (extracted from report).
2538 Returns : integer (length of the query sequence)
2539 Throws : Exception if cannot determine the query sequence length from
2540 : the BLAST report.
2541 : Exception if the length is below the min_length cutoff (if any).
2542 Comments : The logic here is a bit different from the other _set_XXX()
2543 : methods since the significance of the BLAST report is assessed
2544 : if MIN_LENGTH is set.
2545
2546 See Also : B<Bio::Tools::SeqAnal::length()>, L<Links to related modules>
2547
2548 =cut
2549
2550 #---------------
2551 sub _set_length {
2552 #---------------
2553 my ($self, $data) = @_;
2554
2555 my ($length);
2556 if( $data =~ m/$Newline\s+\(([\d|,]+) letters[\);]/so ) {
2557 $length = $1;
2558 $length =~ s/,//g;
2559 # printf "Length = $length in BLAST for %s$Newline",$self->name; <STDIN>;
2560 } else {
2561 $self->throw("Can't determine sequence length from BLAST report.");
2562 }
2563
2564 my($sig_len);
2565 if(defined($Blast->{'_min_length'})) {
2566 local $^W = 0;
2567 if($length < $Blast->{'_min_len'}) {
2568 $self->throw("Query sequence too short for ${\$self->name} ($length)",
2569 "Minimum length is $Blast->{'_min_len'}");
2570 }
2571 }
2572
2573 $self->length($length); # defined in superclass.
2574 }
2575
2576 =head2 _set_database
2577
2578 Usage : n/a; called automatically during Blast report parsing.
2579 Purpose : Sets the name of the database used by the BLAST analysis.
2580 : Extracted from raw BLAST report.
2581 Throws : Exception if the name of the database cannot be determined.
2582 Comments : The database name is used by methods or related objects
2583 : for database-specific parsing.
2584
2585 See Also : L<parse()|parse>, B<Bio::Tools::SeqAnal::database()>,B<Bio::Tools::SeqAnal::_set_db_stats()>,L<Links to related modules>
2586
2587 =cut
2588
2589 #------------------
2590 sub _set_database {
2591 #------------------
2592 # This now only sets data base information extracted from the report footer.
2593
2594 my ($self, $data) = @_;
2595
2596 my ($name, $date, $lets, $seqs);
2597
2598 my $strict = $self->strict > 0;
2599
2600 # This is fail-safe since DB name usually gets set in _parse_header()
2601 # In some reports, the database is only listed at bottom (NCBI 2.0.8).
2602 if($data =~ m/Database: +(.+?)$Newline/so ) {
2603 $name = $1;
2604 } elsif(not $self->database) {
2605 $self->warn("Can't determine database name from BLAST report.");
2606 }
2607
2608 if($data =~ m/Posted date: +(.+?)$Newline/so ) {
2609 $date = $1;
2610 } elsif($data =~ m/Release date: +(.+?)$Newline/so ) {
2611 $date = $1;
2612 } elsif($strict) {
2613 $self->warn("Can't determine database release date.");
2614 }
2615
2616 if($data =~ m/letters in database: +([\d,]+)/si ||
2617 $data =~ m/length of database: +([\d,]+)/si ) {
2618 $lets = $1;
2619 } elsif($strict) {
2620 $self->warn("Can't determine number of letters in database.\n$data\n");
2621 }
2622
2623 if($data =~ m/sequences in database: +([\d,]+)/si ||
2624 $data =~ m/number of sequences: +([\d,]+)/si ) {
2625 $seqs = $1;
2626 } elsif($strict) {
2627 $self->warn("Can't determine number of sequences in database.\n$data\n");
2628 }
2629
2630 $self->_set_db_stats( -NAME => $name,
2631 -RELEASE => $date || '',
2632 -LETTERS => $lets || '',
2633 -SEQS => $seqs || ''
2634 );
2635 }
2636
2637 =head2 _set_query
2638
2639 Usage : n/a; called automatically during Blast report parsing.
2640 Purpose : Set the name of the query and the query description.
2641 : Extracted from the raw BLAST report.
2642 Returns : String containing name of query extracted from report.
2643 Throws : Warning if the name of the query cannont be obtained.
2644
2645 See Also : B<Bio::Tools::SeqAnal::query_desc()>,L<Links to related modules>
2646
2647 =cut
2648
2649 #---------------
2650 sub _set_query {
2651 #---------------
2652 my $self = shift;
2653 my $data = shift;
2654
2655 if($data =~ m/${Newline}Query= *(.+?)$Newline/so ) {
2656 my $info = $1;
2657 $info =~ s/TITLE //;
2658 # Split the query line into two parts.
2659 # Using \s instead of ' '
2660 $info =~ /(\S+?)\s(.*)/;
2661 $self->query_desc($2 || '');
2662 # set name of Blast object and return.
2663 $self->name($1 || 'UNKNOWN');
2664 } else {
2665 $self->warn("Can't determine query sequence name from BLAST report.");
2666 }
2667 # print STDERR "$Newline NAME = ${\$self->name}$Newline";
2668 }
2669
2670 =head2 _parse_signif
2671
2672 Usage : &_parse_signif(string, layout, gapped);
2673 : This is a class function.
2674 Purpose : Extracts the P- or Expect value from a single line of a BLAST description section.
2675 Example : &_parse_signif("PDB_UNIQUEP:3HSC_ heat-shock cognate ... 799 4.0e-206 2", 1);
2676 : &_parse_signif("gi|758803 (U23828) peritrophin-95 precurs 38 0.19", 2);
2677 Argument : string = line from BLAST description section
2678 : layout = integer (1 or 2)
2679 : gapped = boolean (true if gapped Blast).
2680 Returns : Float (0.001 or 1e-03)
2681 Status : Static
2682
2683 =cut
2684
2685 #------------------
2686 sub _parse_signif {
2687 #------------------
2688 my ($line, $layout, $gapped) = @_;
2689
2690 local $_ = $line;
2691 my @linedat = split();
2692
2693 # When processing both Blast1 and Blast2 reports
2694 # in the same run, offset needs to be configured each time.
2695
2696 my $offset = 0;
2697 $offset = 1 if $layout == 1 or not $gapped;
2698
2699 my $signif = $linedat[ $#linedat - $offset ];
2700
2701 # fail-safe check
2702 if(not $signif =~ /[.-]/) {
2703 $offset = ($offset == 0 ? 1 : 0);
2704 $signif = $linedat[ $#linedat - $offset ];
2705 }
2706
2707 $signif = "1$signif" if $signif =~ /^e/i;
2708 return $signif;
2709 }
2710
2711 ##
2712 ## BEGIN ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT.
2713 ##
2714
2715 sub program {
2716 ## Overridden method to incorporate the BLAST object.
2717 my $self = shift;
2718 return $self->SUPER::program(@_) if @_; # set
2719 $self->SUPER::program || $Blast->SUPER::program; # get
2720 }
2721
2722 sub program_version {
2723 ## Overridden method to incorporate the BLAST object.
2724 my $self = shift;
2725 return $self->SUPER::program_version(@_) if @_; # set
2726 $self->SUPER::program_version || $Blast->SUPER::program_version; # get
2727 }
2728
2729 sub database {
2730 ## Overridden method to incorporate the BLAST object.
2731 my $self = shift;
2732 return $self->SUPER::database(@_) if @_; # set
2733 $self->SUPER::database || $Blast->SUPER::database; # get
2734 }
2735
2736 sub database_letters {
2737 ## Overridden method to incorporate the BLAST object.
2738 my $self = shift;
2739 return $self->SUPER::database_letters(@_) if @_; # set
2740 $self->SUPER::database_letters || $Blast->SUPER::database_letters; # get
2741 }
2742
2743 sub database_release {
2744 ## Overridden method to incorporate the BLAST object.
2745 my $self = shift;
2746 return $self->SUPER::database_release(@_) if @_; # set
2747 $self->SUPER::database_release || $Blast->SUPER::database_release; # get
2748 }
2749
2750 sub database_seqs {
2751 ## Overridden method to incorporate the BLAST object.
2752 my $self = shift;
2753 return $self->SUPER::database_seqs(@_) if @_; # set
2754 $self->SUPER::database_seqs || $Blast->SUPER::database_seqs; # get
2755 }
2756
2757 sub date {
2758 ## Overridden method to incorporate the BLAST object.
2759 my $self = shift;
2760 return $self->SUPER::date(@_) if @_; # set
2761 $self->SUPER::date || $Blast->SUPER::date; # get
2762 }
2763
2764 sub best {
2765 ## Overridden method to incorporate the BLAST object.
2766 my $self = shift;
2767 return $Blast->SUPER::best(@_) if @_; # set
2768 $Blast->SUPER::best; # get
2769 }
2770
2771 =head2 signif
2772
2773 Usage : $blast->signif();
2774 Purpose : Gets the P or Expect value used as significance screening cutoff.
2775 Returns : Scientific notation number with this format: 1.0e-05.
2776 Argument : n/a
2777 Comments : Screening of significant hits uses the data provided on the
2778 : description line. For Blast1 and WU-Blast2, this data is P-value.
2779 : for Blast2 it is an Expect value.
2780 :
2781 : Obtains info from the static $Blast object if it has not been set
2782 : for the current object.
2783
2784 See Also : L<_set_signif()|_set_signif>
2785
2786 =cut
2787
2788 #-----------
2789 sub signif {
2790 #-----------
2791 my $self = shift;
2792 my $sig = $self->{'_significance'} || $Blast->{'_significance'};
2793 sprintf "%.1e", $sig;
2794 }
2795
2796 =head2 is_signif
2797
2798 Usage : $blast->is_signif();
2799 Purpose : Determine if the BLAST report contains significant hits.
2800 Returns : Boolean
2801 Argument : n/a
2802 Comments : BLAST reports without significant hits but with defined
2803 : significance criteria will throw exceptions during construction.
2804 : This obviates the need to check significant() for
2805 : such objects.
2806
2807 See Also : L<_set_signif()|_set_signif>
2808
2809 =cut
2810
2811 #------------
2812 sub is_signif { my $self = shift; return $self->{'_is_significant'}; }
2813 #------------
2814
2815 # is_signif() doesn't incorporate the static $Blast object but is included
2816 # here to be with the other 'signif' methods.
2817
2818 =head2 signif_fmt
2819
2820 Usage : $blast->signif_fmt( [FMT] );
2821 Purpose : Allows retrieval of the P/Expect exponent values only
2822 : or as a two-element list (mantissa, exponent).
2823 Usage : $blast_obj->signif_fmt('exp');
2824 : $blast_obj->signif_fmt('parts');
2825 Returns : String or '' if not set.
2826 Argument : String, FMT = 'exp' (return the exponent only)
2827 : = 'parts'(return exponent + mantissa in 2-elem list)
2828 : = undefined (return the raw value)
2829 Comments : P/Expect values are still stored internally as the full,
2830 : scientific notation value.
2831 : This method uses the static $Blast object since this issue
2832 : will pertain to all Blast reports within a given set.
2833 : This setting is propagated to Bio::Tools::Blast::Sbjct.pm.
2834
2835 =cut
2836
2837 #-------------
2838 sub signif_fmt {
2839 #-------------
2840 my $self = shift;
2841 if(@_) { $Blast->{'_signif_fmt'} = shift; }
2842 $Blast->{'_signif_fmt'} || '';
2843 }
2844
2845 =head2 min_length
2846
2847 Usage : $blast->min_length();
2848 Purpose : Gets the query sequence length used as significance screening criteria.
2849 Returns : Integer
2850 Argument : n/a
2851 Comments : Obtains info from the static $Blast object if it has not been set
2852 : for the current object.
2853
2854 See Also : L<_set_signif()|_set_signif>, L<signif()|signif>
2855
2856 =cut
2857
2858 #--------------
2859 sub min_length {
2860 #--------------
2861 my $self = shift;
2862 $self->{'_min_length'} || $Blast->{'_min_length'};
2863 }
2864
2865 =head2 gapped
2866
2867 Usage : $blast->gapped();
2868 Purpose : Set/Get boolean indicator for gapped BLAST.
2869 Returns : Boolean
2870 Argument : n/a
2871 Comments : Obtains info from the static $Blast object if it has not been set
2872 : for the current object.
2873
2874 =cut
2875
2876 #-----------
2877 sub gapped {
2878 #-----------
2879 my $self = shift;
2880 if(@_) { $self->{'_gapped'} = shift; }
2881 $self->{'_gapped'} || $Blast->{'_gapped'};
2882 }
2883
2884 =head2 _get_stats
2885
2886 Usage : n/a; internal method.
2887 Purpose : Set/Get indicator for collecting full statistics from report.
2888 Returns : Boolean (0 | 1)
2889 Comments : Obtains info from the static $Blast object which gets set
2890 : by _init_parse_params().
2891
2892 =cut
2893
2894 #---------------
2895 sub _get_stats {
2896 #---------------
2897 my $self = shift;
2898 $Blast->{'_get_stats'};
2899 }
2900
2901 =head2 _layout
2902
2903 Usage : n/a; internal method.
2904 Purpose : Set/Get indicator for the layout of the report.
2905 Returns : Integer (1 | 2)
2906 : Defaults to 2 if not set.
2907 Comments : Blast1 and WashU-Blast2 have a layout = 1.
2908 : This is intended for internal use by this and closely
2909 : allied modules like Sbjct.pm and HSP.pm.
2910 :
2911 : Obtains info from the static $Blast object if it has not been set
2912 : for the current object.
2913
2914 =cut
2915
2916 #------------
2917 sub _layout {
2918 #------------
2919 my $self = shift;
2920 if(@_) {
2921 # Optimization if we know all reports share the same stats.
2922 if($Blast->{'_share'}) {
2923 $Blast->{'_layout'} = shift;
2924 } else {
2925 $self->{'_layout'} = shift;
2926 }
2927 }
2928 $self->{'_layout'} || $Blast->{'_layout'} || 2;
2929 }
2930
2931 ##
2932 ## END ACCESSOR METHODS THAT INCORPORATE THE STATIC $Blast OBJECT.
2933 ##
2934
2935 =head2 hits
2936
2937 Usage : $blast->hits();
2938 Purpose : Get a list containing all BLAST hit (Sbjct) objects.
2939 : Get the numbers of significant hits.
2940 Examples : @hits = $blast->hits();
2941 : $num_signif = $blast->hits();
2942 Returns : List context : list of Bio::Tools::Blast::Sbjct.pm objects
2943 : or an empty list if there are no hits.
2944 : Scalar context: integer (number of significant hits)
2945 : or zero if there are no hits.
2946 : (Equivalent to num_hits()).
2947 Argument : n/a. Relies on wantarray.
2948 Throws : n/a.
2949 : Not throwing exception because the absence of hits may have
2950 : resulted from stringent significance criteria, not a failure
2951 : set the hits.
2952
2953 See Also : L<hit()|hit>, L<num_hits()|num_hits>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif>
2954
2955 =cut
2956
2957 #----------
2958 sub hits {
2959 #----------
2960 my $self = shift;
2961
2962 if(wantarray) {
2963 my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : ();
2964 return @ary;
2965 } else {
2966 return $self->num_hits();
2967 }
2968
2969 # my $num = ref($self->{'_hits'}) ? scalar(@{$self->{'_hits'}}) : 0;
2970 # my @ary = ref($self->{'_hits'}) ? @{$self->{'_hits'}} : ();
2971 #
2972 # return wantarray
2973 # # returning list containing all hits or empty list.
2974 # ? $self->{'_is_significant'} ? @ary : ()
2975 # # returning number of hits or 0.
2976 # : $self->{'_is_significant'} ? $num : 0;
2977 }
2978
2979 =head2 hit
2980
2981 Example : $blast_obj->hit( [class] )
2982 Purpose : Get a specific hit object.
2983 : Provides some syntactic sugar for the hits() method.
2984 Usage : $hitObj = $blast->hit();
2985 : $hitObj = $blast->hit('best');
2986 : $hitObj = $blast->hit('worst');
2987 : $hitObj = $blast->hit( $name );
2988 Returns : Object reference for a Bio::Tools::Blast::Sbjct.pm object.
2989 : undef if there are no hit (Sbjct) objects defined.
2990 Argument : Class (or no argument).
2991 : No argument (default) = highest scoring hit (same as 'best').
2992 : 'best' or 'first' = highest scoring hit.
2993 : 'worst' or 'last' = lowest scoring hit.
2994 : $name = retrieve a hit by seq id (case-insensitive).
2995 Throws : Exception if the Blast object has no significant hits.
2996 : Exception if a hit cannot be found when supplying a specific
2997 : hit sequence identifier as an argument.
2998 Comments : 'best' = lowest significance value (P or Expect) among significant hits.
2999 : 'worst' = highest sigificance value (P or Expect) among significant hits.
3000
3001 See Also : L<hits()|hits>, L<num_hits()|num_hits>, L<is_signif()|is_signif>
3002
3003 =cut
3004
3005 #---------
3006 sub hit {
3007 #---------
3008 my( $self, $option) = @_;
3009 $option ||= 'best';
3010
3011 if($Blast->{'_no_aligns'} || ! ref($self->{'_hits'})) {
3012 return undef;
3013 }
3014
3015 $self->{'_is_significant'} or
3016 $self->throw("There were no significant hits.",
3017 "Use num_hits(), hits(), is_signif() to check.");
3018
3019 my @hits = @{$self->{'_hits'}};
3020
3021 return $hits[0] if $option =~ /^(best|first|1)$/i;
3022 return $hits[$#hits] if $option =~ /^(worst|last)$/i;
3023
3024 # Get hit by name.
3025 foreach ( @hits ) {
3026 return $_ if $_->name() =~ /$option/i;
3027 }
3028
3029 $self->throw("Can't get hit for: $option");
3030 }
3031
3032 =head2 num_hits
3033
3034 Usage : $blast->num_hits( ['total'] );
3035 Purpose : Get number of significant hits or number of total hits.
3036 Examples : $num_signif = $blast-num_hits;
3037 : $num_total = $blast->num_hits('total');
3038 Returns : Integer
3039 Argument : String = 'total' (or no argument).
3040 : No argument (Default) = return number of significant hits.
3041 : 'total' = number of total hits.
3042 Throws : n/a.
3043 : Not throwing exception because the absence of hits may have
3044 : resulted from stringent significance criteria, not a failure
3045 : set the hits.
3046 Comments : A significant hit is defined as a hit with an expect value
3047 : (or P value for WU-Blast) at or below the -signif parameter
3048 : used when parsing the report. Additionally, if a filter function
3049 : was supplied, the significant hit must also pass that
3050 : criteria.
3051
3052 See Also : L<hits()|hits>, L<hit()|hit>, L<is_signif()|is_signif>, L<_set_signif()|_set_signif>, L<parse()|parse>
3053
3054 =cut
3055
3056 #-------------
3057 sub num_hits {
3058 #-------------
3059 my( $self, $option) = @_;
3060 $option ||= '';
3061
3062 $option =~ /total/i and return $self->{'_num_hits'} || 0;
3063
3064 # Default: returning number of significant hits.
3065 # return $self->{'_num_hits_significant'} || 0;
3066 # return 0 if not ref $self->{'_hits'};
3067
3068 if(ref $self->{'_hits'}) {
3069 return scalar(@{$self->{'_hits'}});
3070 } else {
3071 return $self->{'_num_hits_significant'} || 0;
3072 }
3073 }
3074
3075 =head2 lowest_p
3076
3077 Usage : $blast->lowest_p()
3078 Purpose : Get the lowest P-value among all hits in a BLAST report.
3079 : Syntactic sugar for $blast->hit('best')->p().
3080 Returns : Float or scientific notation number.
3081 : Returns -1.0 if lowest_p has not been set.
3082 Argument : n/a.
3083 Throws : Exception if the Blast report does not report P-values
3084 : (as is the case for NCBI Blast2).
3085 Comments : A value is returned regardless of whether or not there were
3086 : significant hits ($DEFAULT_SIGNIF, currently 999).
3087
3088 See Also : L<lowest_expect()|lowest_expect>, L<lowest_signif()|lowest_signif>, L<highest_p()|highest_p>, L<signif_fmt()|signif_fmt>
3089
3090 =cut
3091
3092 #------------
3093 sub lowest_p {
3094 #------------
3095 my $self = shift;
3096
3097 # Layout 2 = NCBI Blast 2.x does not report P-values.
3098 $self->_layout == 2 and
3099 $self->throw("Can't get P-value with BLAST2.",
3100 "Use lowest_signif() or lowest_expect()");
3101
3102 return $self->{'_lowestSignif'} || -1.0;
3103 }
3104
3105 =head2 lowest_expect
3106
3107 Usage : $blast->lowest_expect()
3108 Purpose : Get the lowest Expect value among all hits in a BLAST report.
3109 : Syntactic sugar for $blast->hit('best')->expect()
3110 Returns : Float or scientific notation number.
3111 : Returns -1.0 if lowest_expect has not been set.
3112 Argument : n/a.
3113 Throws : Exception if there were no significant hits and the report
3114 : does not have Expect values on the description lines
3115 : (i.e., Blast1, WashU-Blast2).
3116
3117 See Also : L<lowest_p()|lowest_p>, L<lowest_signif()|lowest_signif>, L<highest_expect()|highest_expect>, L<signif_fmt()|signif_fmt>
3118
3119 =cut
3120
3121 #------------------
3122 sub lowest_expect {
3123 #------------------
3124 my $self = shift;
3125
3126 if ($self->_layout == 2) {
3127 return $self->{'_lowestSignif'} || -1.0;
3128 }
3129
3130 if($self->{'_is_significant'}) {
3131 my $bestHit = $self->{'_hits'}->[0];
3132 return $bestHit->expect();
3133 } else {
3134 $self->throw("Can't get lowest expect value: no significant hits ",
3135 "The format of this report requires expect values to be extracted$Newline".
3136 "from the hits themselves.");
3137 }
3138 }
3139
3140 =head2 highest_p
3141
3142 Example : $blast->highest_p( ['overall'])
3143 Purpose : Get the highest P-value among all hits in a BLAST report.
3144 : Syntactic sugar for $blast->hit('worst')->p()
3145 : Can also get the highest P-value overall (not just among signif hits).
3146 Usage : $p_signif = $blast->highest_p();
3147 : $p_all = $blast->highest_p('overall');
3148 Returns : Float or scientific notation number.
3149 : Returns -1.0 if highest_p has not been set.
3150 Argument : String 'overall' or no argument.
3151 : No argument = get highest P-value among significant hits.
3152 Throws : Exception if object is created from a Blast2 report
3153 : (which does not report P-values).
3154
3155 See Also : L<highest_signif()|highest_signif>, L<lowest_p()|lowest_p>, L<_set_signif()|_set_signif>, L<signif_fmt()|signif_fmt>
3156
3157 =cut
3158
3159 #---------------
3160 sub highest_p {
3161 #---------------
3162 my ($self, $overall) = @_;
3163
3164 # Layout 2 = NCBI Blast 2.x does not report P-values.
3165 $self->_layout == 2 and
3166 $self->throw("Can't get P-value with BLAST2.",
3167 "Use highest_signif() or highest_expect()");
3168
3169 $overall and return $self->{'_highestSignif'} || -1.0;
3170 $self->hit('worst')->p();
3171 }
3172
3173 =head2 highest_expect
3174
3175 Usage : $blast_object->highest_expect( ['overall'])
3176 Purpose : Get the highest Expect value among all significant hits in a BLAST report.
3177 : Syntactic sugar for $blast->hit('worst')->expect()
3178 Examples : $e_sig = $blast->highest_expect();
3179 : $e_all = $blast->highest_expect('overall');
3180 Returns : Float or scientific notation number.
3181 : Returns -1.0 if highest_exoect has not been set.
3182 Argument : String 'overall' or no argument.
3183 : No argument = get highest Expect-value among significant hits.
3184 Throws : Exception if there were no significant hits and the report
3185 : does not have Expect values on the description lines
3186 : (i.e., Blast1, WashU-Blast2).
3187
3188 See Also : L<lowest_expect()|lowest_expect>, L<highest_signif()|highest_signif>, L<signif_fmt()|signif_fmt>
3189
3190 =cut
3191
3192 #-------------------
3193 sub highest_expect {
3194 #-------------------
3195 my ($self, $overall) = @_;
3196
3197 if ( $overall and $self->_layout == 2) {
3198 return $self->{'_highestSignif'} || -1.0;
3199 }
3200
3201 if($self->{'_is_significant'}) {
3202 return $self->hit('worst')->expect;
3203 } else {
3204 $self->throw("Can't get highest expect value: no significant hits ",
3205 "The format of this report requires expect values to be extracted$Newline".
3206 "from the hits themselves.");
3207 }
3208 }
3209
3210 =head2 lowest_signif
3211
3212 Usage : $blast_obj->lowest_signif();
3213 : Syntactic sugar for $blast->hit('best')->signif()
3214 Purpose : Get the lowest P or Expect value among all hits
3215 : in a BLAST report.
3216 : This method is syntactic sugar for $blast->hit('best')->signif()
3217 : The value returned is the one which is reported in the decription
3218 : section of the Blast report.
3219 : For Blast1 and WU-Blast2, this is a P-value,
3220 : for NCBI Blast2, it is an Expect value.
3221 Example : $blast->lowest_signif();
3222 Returns : Float or scientific notation number.
3223 : Returns -1.0 if lowest_signif has not been set.
3224 Argument : n/a.
3225 Throws : n/a.
3226 Status : Deprecated. Use lowest_expect() or lowest_p().
3227 Comments : The signif() method provides a way to deal with the fact that
3228 : Blast1 and Blast2 formats differ in what is reported in the
3229 : description lines of each hit in the Blast report. The signif()
3230 : method frees any client code from having to know if this is a P-value
3231 : or an Expect value, making it easier to write code that can process
3232 : both Blast1 and Blast2 reports. This is not necessarily a good thing, since
3233 : one should always know when one is working with P-values or
3234 : Expect values (hence the deprecated status).
3235 : Use of lowest_expect() is recommended since all hits will have an Expect value.
3236
3237 See Also : L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif>
3238
3239 =cut
3240
3241 #------------------
3242 sub lowest_signif {
3243 #------------------
3244 my ($self) = @_;
3245
3246 return $self->{'_lowestSignif'} || -1.0;
3247 }
3248
3249 =head2 highest_signif
3250
3251 Usage : $blast_obj->highest_signif('overall');
3252 : Syntactic sugar for $blast->hit('worst')->signif()
3253 Purpose : Get the highest P or Expect value among all hits
3254 : in a BLAST report.
3255 : The value returned is the one which is reported in the decription
3256 : section of the Blast report.
3257 : For Blast1 and WU-Blast2, this is a P-value,
3258 : for NCBI Blast2, it is an Expect value.
3259 Example : $blast->highest_signif();
3260 Returns : Float or scientific notation number.
3261 : Returns -1.0 if highest_signif has not been set.
3262 Argument : Optional string 'overall' to get the highest overall significance value.
3263 Throws : n/a.
3264 Status : Deprecated. Use highest_expect() or highest_p().
3265 Comments : Analogous to lowest_signif(), q.v.
3266
3267 See Also : L<lowest_signif()|lowest_signif>, L<lowest_p()|lowest_p>, L<lowest_expect()|lowest_expect>, L<signif()|signif>, L<signif_fmt()|signif_fmt>, L<_set_signif()|_set_signif>
3268
3269 =cut
3270
3271 #---------------------
3272 sub highest_signif {
3273 #---------------------
3274 my ($self, $overall) = @_;
3275
3276 $overall and return $self->{'_highestSignif'} || -1.0;
3277
3278 if($self->{'_is_significant'}) {
3279 my $worst_hit = $self->hit('worst');
3280 if(defined $worst_hit) {
3281 return $worst_hit->signif;
3282 } else {
3283 return $self->{'_highestSignif'};
3284 }
3285 }
3286 }
3287
3288 =head2 matrix
3289
3290 Usage : $blast_object->matrix();
3291 Purpose : Get the name of the scoring matrix used.
3292 : This is extracted from the report.
3293 Argument : n/a
3294 Returns : string or undef if not defined
3295
3296 =cut
3297
3298 #------------
3299 sub matrix { my $self = shift; $self->{'_matrix'} || $Blast->{'_matrix'}; }
3300 #------------
3301
3302 =head2 filter
3303
3304 Usage : $blast_object->filter();
3305 Purpose : Get the name of the low-complexity sequence filter used.
3306 : (SEG, SEG+XNU, DUST, NONE).
3307 : This is extracted from the report.
3308 Argument : n/a
3309 Returns : string or undef if not defined
3310
3311 =cut
3312
3313 #----------
3314 sub filter { my $self = shift; $self->{'_filter'} || $Blast->{'_filter'}; }
3315 #----------
3316
3317 =head2 expect
3318
3319 Usage : $blast_object->expect();
3320 Purpose : Get the expect parameter (E) used for the Blast analysis.
3321 : This is extracted from the report.
3322 Argument : n/a
3323 Returns : string or undef if not defined.
3324
3325 =cut
3326
3327 #-----------
3328 sub expect { my $self = shift; $self->{'_expect'} || $Blast->{'_expect'}; }
3329 #-----------
3330
3331 =head2 karlin_altschul
3332
3333 Usage : $blast_object->karlin_altschul();
3334 Purpose : Get the Karlin_Altschul sum statistics (Lambda, K, H)
3335 : These are extracted from the report.
3336 Argument : n/a
3337 Returns : list of three floats (Lambda, K, H)
3338 : If not defined, returns list of three zeros)
3339
3340 =cut
3341
3342 #---------------------
3343 sub karlin_altschul {
3344 #---------------------
3345 my $self = shift;
3346 if(defined($self->{'_lambda'})) {
3347 ($self->{'_lambda'}, $self->{'_k'}, $self->{'_h'});
3348 } elsif(defined($Blast->{'_lambda'})) {
3349 ($Blast->{'_lambda'}, $Blast->{'_k'}, $Blast->{'_h'});
3350 } else {
3351 (0, 0, 0);
3352 }
3353 }
3354
3355 =head2 word_size
3356
3357 Usage : $blast_object->word_size();
3358 Purpose : Get the word_size used during the Blast analysis.
3359 : This is extracted from the report.
3360 Argument : n/a
3361 Returns : integer or undef if not defined.
3362
3363 =cut
3364
3365 #--------------
3366 sub word_size {
3367 #--------------
3368 my $self = shift;
3369 $self->{'_word_size'} || $Blast->{'_word_size'};
3370 }
3371
3372 =head2 s
3373
3374 Usage : $blast_object->s();
3375 Purpose : Get the s statistic for the Blast analysis.
3376 : This is extracted from the report.
3377 Argument : n/a
3378 Returns : integer or undef if not defined.
3379
3380 =cut
3381
3382 #------
3383 sub s { my $self = shift; $self->{'_s'} || $Blast->{'_s'}; }
3384 #------
3385
3386 =head2 gap_creation
3387
3388 Usage : $blast_object->gap_creation();
3389 Purpose : Get the gap creation penalty used for a gapped Blast analysis.
3390 : This is extracted from the report.
3391 Argument : n/a
3392 Returns : integer or undef if not defined.
3393
3394 See Also : L<gap_extension()|gap_extension>
3395
3396 =cut
3397
3398 #-----------------
3399 sub gap_creation {
3400 #-----------------
3401 my $self = shift;
3402 $self->{'_gapCreation'} || $Blast->{'_gapCreation'};
3403 }
3404
3405 =head2 gap_extension
3406
3407 Usage : $blast_object->gap_extension();
3408 Purpose : Get the gap extension penalty used for a gapped Blast analysis.
3409 : This is extracted from the report.
3410 Argument : n/a
3411 Returns : integer or undef if not defined.
3412
3413 See Also : L<gap_extension()|gap_extension>
3414
3415 =cut
3416
3417 #-------------------
3418 sub gap_extension {
3419 #-------------------
3420 my $self = shift;
3421 $self->{'_gapExtension'} || $Blast->{'_gapExtension'};
3422 }
3423
3424 =head2 ambiguous_aln
3425
3426 Usage : $blast_object->ambiguous_aln();
3427 Purpose : Test all hits and determine if any have an ambiguous alignment.
3428 Example : print "ambiguous" if $blast->ambiguous_aln();
3429 Returns : Boolean (true if ANY significant hit has an ambiguous alignment)
3430 Argument : n/a
3431 Throws : n/a
3432 Status : Experimental
3433 Comments : An ambiguous BLAST alignment is defined as one where two or more
3434 : different HSPs have significantly overlapping sequences such
3435 : that it is not possible to create a unique alignment
3436 : by simply concatenating HSPs. This may indicate the presence
3437 : of multiple domains in one sequence relative to another.
3438 : This method only indicates the presence of ambiguity in at
3439 : least one significant hit. To determine the nature of the
3440 : ambiguity, each hit must be examined.
3441
3442 See Also : B<Bio::Tools::Blast::Sbjct::ambiguous_aln()>,L<Links to related modules>
3443
3444 =cut
3445
3446 #----------------
3447 sub ambiguous_aln {
3448 #----------------
3449 my $self = shift;
3450 foreach($self->hits()) {
3451 return 1 if ($_->ambiguous_aln() ne '-');
3452 }
3453 0;
3454 }
3455
3456 =head2 overlap
3457
3458 Usage : $blast_object->overlap([integer]);
3459 Purpose : Set/Get the number of overlapping residues allowed when tiling multiple HSPs.
3460 : Delegates to Bio::Tools::Blast::Sbjct::overlap().
3461 Throws : Exception if there are no significant hits.
3462 Status : Experimental
3463
3464 See Also : B<Bio::Tools::Blast::Sbjct::overlap()>,L<Links to related modules>
3465
3466 =cut
3467
3468 #------------
3469 sub overlap {
3470 #------------
3471 my $self = shift;
3472 if(not $self->hits) {
3473 $self->throw("Can't get overlap data without significant hits.");
3474 }
3475 $self->hit->overlap();
3476 }
3477
3478 =head2 homol_data
3479
3480 Usage : @data = $blast_object->homo_data( %named_params );
3481 Purpose : Gets specific similarity data about each significant hit.
3482 Returns : Array of strings:
3483 : "Homology data" for each HSP is in the format:
3484 : "<integer> <start> <stop>"
3485 : Data for different HSPs are tab-delimited.
3486 Argument : named parameters passed along to the hit objects.
3487 Throws : n/a
3488 Status : Experimental
3489 Comments : This is a very experimental method used for obtaining an
3490 : indication of:
3491 : 1) how many HSPs are in a Blast alignment
3492 : 2) how strong the similarity is between sequences in the HSP
3493 : 3) the endpoints of the alignment (sequence monomer numbers)
3494
3495 See Also : B<Bio::Tools::Blast::Sbjct::homol_data()>,L<Links to related modules>
3496
3497 =cut
3498
3499 #----------------
3500 sub homol_data {
3501 #----------------
3502
3503 my ($self, %param) = @_;
3504 my @hits = $self->hits();
3505 my @data = ();
3506
3507 ## Note: Homology data can be either for the query sequence or the hit
3508 ## (Sbjct) sequence. Default is for sbjct. This is specifyable via
3509 ## $param{-SEQ}='sbjct' || 'query'.
3510
3511 foreach ( @hits ) {
3512 push @data, $_->homol_data(%param);
3513 }
3514 @data;
3515 }
3516
3517 =head1 REPORT GENERATING METHODS
3518
3519 =head2 table
3520
3521 Usage : $blast_obj->table( [get_desc]);
3522 Purpose : Output data for each HSP of each hit in tab-delimited format.
3523 Example : print $blast->table;
3524 : print $blast->table(0);
3525 : # Call table_labels() to print labels.
3526 Argument : get_desc = boolean, if false the description of each hit is not included.
3527 : Default: true (if not defined, include description column).
3528 Returns : String containing tab-delimited set of data for each HSP
3529 : of each significant hit. Different HSPs are separated by newlines.
3530 : Left-to-Right order of fields:
3531 : 1 QUERY_NAME # Sequence identifier of the query.
3532 : 2 QUERY_LENGTH # Full length of the query sequence.
3533 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit".
3534 : 4 SBJCT_LENGTH # Full length of the sbjct sequence.
3535 : 5 EXPECT # Expect value for the alignment.
3536 : 6 SCORE # Blast score for the alignment.
3537 : 7 BITS # Bit score for the alignment.
3538 : 8 NUM_HSPS # Number of HSPs (not the "N" value).
3539 : 9 HSP_FRAC_IDENTICAL # fraction of identical substitutions.
3540 : 10 HSP_FRAC_CONSERVED # fraction of conserved ("positive") substitutions.
3541 : 11 HSP_QUERY_ALN_LENGTH # Length of the aligned portion of the query sequence.
3542 : 12 HSP_SBJCT_ALN_LENGTH # Length of the aligned portion of the sbjct sequence.
3543 : 13 HSP_QUERY_GAPS # Number of gaps in the aligned query sequence.
3544 : 14 HSP_SBJCT_GAPS # Number of gaps in the aligned sbjct sequence.
3545 : 15 HSP_QUERY_START # Starting coordinate of the query sequence.
3546 : 16 HSP_QUERY_END # Ending coordinate of the query sequence.
3547 : 17 HSP_SBJCT_START # Starting coordinate of the sbjct sequence.
3548 : 18 HSP_SBJCT_END # Ending coordinate of the sbjct sequence.
3549 : 19 HSP_QUERY_STRAND # Strand of the query sequence (TBLASTN/X only)
3550 : 20 HSP_SBJCT_STRAND # Strand of the sbjct sequence (TBLASTN/X only)
3551 : 21 HSP_FRAME # Frame for the sbjct translation (TBLASTN/X only)
3552 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from
3553 : # the alignment section.
3554 Throws : n/a
3555 Comments : This method does not collect data based on tiling of the HSPs.
3556 : The table will contains redundant information since the hit name,
3557 : id, and other info for the hit are listed for each HSP.
3558 : If you need more flexibility in the output format than this
3559 : method provides, design a custom function.
3560
3561 See Also : L<table_tiled()|table_tiled>, L<table_labels()|table_labels>, L<_display_hits()|_display_hits>
3562
3563 =cut
3564
3565 #-----------
3566 sub table {
3567 #-----------
3568 my ($self, $get_desc) = @_;
3569 my $str = '';
3570
3571 $get_desc = defined($get_desc) ? $get_desc : 1;
3572 # $str .= $self->_table_labels($get_desc) unless $self->{'_labels'};
3573
3574 my $sigfmt = $self->signif_fmt();
3575 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table.
3576 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e';
3577
3578 my ($hit, $hsp);
3579 foreach $hit($self->hits) {
3580 foreach $hsp($hit->hsps) {
3581 # Note: range() returns a 2-element list.
3582 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s$Newline",
3583 $self->name, $self->length, $hit->name, $hit->length,
3584 $hit->expect($sigfmt), $hit->score, $hit->bits,
3585 $hit->num_hsps, $hsp->frac_identical, $hsp->frac_conserved,
3586 $hsp->length('query'), $hsp->length('sbjct'),
3587 $hsp->gaps('list'),
3588 $hsp->range('query'), $hsp->range('sbjct'),
3589 $hsp->strand('query'), $hsp->strand('sbjct'), $hsp->frame,
3590 ($get_desc ? $hit->desc : '');
3591 }
3592 }
3593 $str =~ s/\t$Newline/$Newline/gs;
3594 $str;
3595 }
3596
3597 =head2 table_labels
3598
3599 Usage : print $blast_obj->table_labels( [get_desc] );
3600 Purpose : Get column labels for table().
3601 Returns : String containing column labels. Tab-delimited.
3602 Argument : get_desc = boolean, if false the description column is not included.
3603 : Default: true (if not defined, include description column).
3604 Throws : n/a
3605
3606 See Also : L<table()|table>
3607
3608 =cut
3609
3610 #----------------
3611 sub table_labels {
3612 #----------------
3613 my ($self, $get_desc) = @_;
3614 $get_desc = defined($get_desc) ? $get_desc : 1;
3615 my $descstr = $get_desc ? 'DESC' : '';
3616 my $descln = $get_desc ? '-----' : '';
3617
3618 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
3619 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS', 'HSPS',
3620 'IDEN', 'CONSV', 'Q_ALN', 'S_ALN', 'Q_GAP', 'S_GAP',
3621 'Q_BEG', 'Q_END', 'S_BEG', 'S_END', 'Q_STR', 'S_STR', 'FRAM', $descstr;
3622 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
3623 '-----', '-----', '-----', '-----', '-----', '-----', '-----', '-----',
3624 '-----', '-----', '-----', '-----', '-----', '-----',
3625 '-----', '-----', '-----','-----', '-----', '-----','-----', $descln;
3626
3627 $self->{'_labels'} = 1;
3628 $str =~ s/\t$Newline/$Newline/gs;
3629 $str;
3630 }
3631
3632 =head2 table_tiled
3633
3634 Purpose : Get data from tiled HSPs in tab-delimited format.
3635 : Allows only minimal flexibility in the output format.
3636 : If you need more flexibility, design a custom function.
3637 Usage : $blast_obj->table_tiled( [get_desc]);
3638 Example : print $blast->table_tiled;
3639 : print $blast->table_tiled(0);
3640 : # Call table_labels_tiled() if you want labels.
3641 Argument : get_desc = boolean, if false the description of each hit is not included.
3642 : Default: true (include description).
3643 Returns : String containing tab-delimited set of data for each HSP
3644 : of each significant hit. Multiple hits are separated by newlines.
3645 : Left-to-Right order of fields:
3646 : 1 QUERY_NAME # Sequence identifier of the query.
3647 : 2 QUERY_LENGTH # Full length of the query sequence.
3648 : 3 SBJCT_NAME # Sequence identifier of the sbjct ("hit".
3649 : 4 SBJCT_LENGTH # Full length of the sbjct sequence.
3650 : 5 EXPECT # Expect value for the alignment.
3651 : 6 SCORE # Blast score for the alignment.
3652 : 7 BITS # Bit score for the alignment.
3653 : 8 NUM_HSPS # Number of HSPs (not the "N" value).
3654 : 9 FRAC_IDENTICAL* # fraction of identical substitutions.
3655 : 10 FRAC_CONSERVED* # fraction of conserved ("positive") substitutions .
3656 : 11 FRAC_ALN_QUERY* # fraction of the query sequence that is aligned.
3657 : 12 FRAC_ALN_SBJCT* # fraction of the sbjct sequence that is aligned.
3658 : 13 QUERY_ALN_LENGTH* # Length of the aligned portion of the query sequence.
3659 : 14 SBJCT_ALN_LENGTH* # Length of the aligned portion of the sbjct sequence.
3660 : 15 QUERY_GAPS* # Number of gaps in the aligned query sequence.
3661 : 16 SBJCT_GAPS* # Number of gaps in the aligned sbjct sequence.
3662 : 17 QUERY_START* # Starting coordinate of the query sequence.
3663 : 18 QUERY_END* # Ending coordinate of the query sequence.
3664 : 19 SBJCT_START* # Starting coordinate of the sbjct sequence.
3665 : 20 SBJCT_END* # Ending coordinate of the sbjct sequence.
3666 : 21 AMBIGUOUS_ALN # Ambiguous alignment indicator ('qs', 'q', 's').
3667 : 22 SBJCT_DESCRIPTION (optional) # Full description of the sbjct sequence from
3668 : # the alignment section.
3669 :
3670 : * Items marked with a "*" report data summed across all HSPs
3671 : after tiling them to avoid counting data from overlapping regions
3672 : multiple times.
3673 Throws : n/a
3674 Comments : This function relies on tiling of the HSPs since it calls
3675 : frac_identical() etc. on the hit as opposed to each HSP individually.
3676
3677 See Also : L<table()|table>, L<table_labels_tiled()|table_labels_tiled>, B<Bio::Tools::Blast::Sbjct::"HSP Tiling and Ambiguous Alignments">, L<Links to related modules>
3678
3679 =cut
3680
3681 #----------------
3682 sub table_tiled {
3683 #----------------
3684 my ($self, $get_desc) = @_;
3685 my $str = '';
3686
3687 $get_desc = defined($get_desc) ? $get_desc : 1;
3688
3689 my ($hit);
3690 my $sigfmt = $self->signif_fmt();
3691 $sigfmt eq 'parts' and $sigfmt = 'exp'; # disallow 'parts' format for this table.
3692 my $sigprint = $sigfmt eq 'exp' ? 'd' : '.1e';
3693
3694 foreach $hit($self->hits) {
3695 $str .= sprintf "%s\t%d\t%s\t%d\t%$sigprint\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%s$Newline",
3696 $self->name, $self->length, $hit->name, $hit->length,
3697 $hit->expect($sigfmt), $hit->score, $hit->bits,
3698 $hit->num_hsps, $hit->frac_identical, $hit->frac_conserved,
3699 $hit->frac_aligned_query, $hit->frac_aligned_hit,
3700 $hit->length_aln('query'), $hit->length_aln('sbjct'),
3701 $hit->gaps('list'), $hit->range('query'), $hit->range('sbjct'),
3702 $hit->ambiguous_aln, ($get_desc ? $hit->desc : '');
3703 }
3704 $str =~ s/\t$Newline/$Newline/gs;
3705 $str;
3706 }
3707
3708 =head2 table_labels_tiled
3709
3710 Usage : print $blast_obj->table_labels_tiled( [get_desc] );
3711 Purpose : Get column labels for table_tiled().
3712 Returns : String containing column labels. Tab-delimited.
3713 Argument : get_desc = boolean, if false the description column is not included.
3714 : Default: true (include description column).
3715 Throws : n/a
3716
3717 See Also : L<table_tiled()|table_tiled>
3718
3719 =cut
3720
3721 #---------------------
3722 sub table_labels_tiled {
3723 #---------------------
3724 my ($self, $get_desc) = @_;
3725 my $descstr = $get_desc ? 'DESC' : '';
3726 my $descln = $get_desc ? '-----' : '';
3727
3728 my $str = sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
3729 'QUERY', 'Q_LEN', 'SBJCT', 'S_LEN', 'EXPCT', 'SCORE', 'BITS',
3730 'HSPS', 'FR_ID', 'FR_CN', 'FR_ALQ', 'FR_ALS', 'Q_ALN',
3731 'S_ALN', 'Q_GAP', 'S_GAP', 'Q_BEG', 'Q_END', 'S_BEG', 'S_END',
3732 'AMBIG', $descstr;
3733 $str =~ s/\t$Newline/$Newline/;
3734 $str .= sprintf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s$Newline",
3735 '-----', '-----', '------', '-----', '-----','-----', '-----',
3736 '-----', '-----', '-----', '-----', '-----', '-----',
3737 '-----', '-----', '-----','-----','-----', '-----',
3738 '-----','-----', $descln;
3739
3740 $self->{'_labels_tiled'} = 1;
3741 $str =~ s/\t$Newline/$Newline/gs;
3742 $str;
3743 }
3744
3745 =head2 display
3746
3747 Usage : $blast_object->display( %named_parameters );
3748 Purpose : Display information about Bio::Tools::Blast.pm data members,
3749 : E.g., parameters of the report, data for each hit., etc.
3750 : Overrides Bio::Root::Object::display().
3751 Example : $object->display(-SHOW=>'stats');
3752 : $object->display(-SHOW=>'stats,hits');
3753 Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE)
3754 : -SHOW => 'file' | 'hits' | 'homol'
3755 : -WHERE => filehandle (default = STDOUT)
3756 Returns : n/a (print/printf is called)
3757 Status : Experimental
3758 Comments : For tab-delimited output, see table().
3759
3760 See Also : L<_display_homol()|_display_homol>, L<_display_hits()|_display_hits>, L<_display_stats()|_display_stats>, L<table()|table>, B<Bio::Root::Tools::SeqAnal::display()>,L<Links to related modules>,
3761
3762 =cut
3763
3764 #--------------
3765 sub display {
3766 #--------------
3767 my( $self, %param ) = @_;
3768
3769 $self->SUPER::display(%param);
3770 my $OUT = $self->fh();
3771
3772 $self->show =~ /homol/i and $self->_display_homol($OUT);
3773 $self->show =~ /hits/i and $self->_display_hits( %param );
3774 1;
3775 }
3776
3777 =head2 _display_homol
3778
3779 Usage : n/a; called automatically by display()
3780 Purpose : Print homology data for hits in the BLAST report.
3781 Example : n/a
3782 Argument : one argument = filehandle object.
3783 Returns : printf call.
3784 Status : Experimental
3785
3786 See Also : L<homol_data()|homol_data>, L<display()|display>
3787
3788 =cut
3789
3790 #-------------------
3791 sub _display_homol {
3792 #-------------------
3793 my( $self, $OUT ) = @_;
3794
3795 print $OUT "${Newline}BLAST HOMOLOGY DATA FOR: ${\$self->name()}$Newline";
3796 print $OUT '-'x40,"$Newline";
3797
3798 foreach ( $self->homol_data()) {
3799 print $OUT "$_$Newline";
3800 }
3801 }
3802
3803 =head2 _display_stats
3804
3805 Usage : n/a; called automatically by display()
3806 Purpose : Display information about the Blast report "meta" data.
3807 : Overrides Bio::Tools::SeqAnal::_display_stats() calling it first.
3808 Example : n/a
3809 Argument : one argument = filehandle object.
3810 Returns : printf call.
3811 Status : Experimental
3812
3813 See Also : L<display()|display>, B<Bio::Tools::SeqAnal::_display_stats()>,L<Links to related modules>
3814
3815 =cut
3816
3817 #--------------------
3818 sub _display_stats {
3819 #--------------------
3820 my( $self, $OUT ) = @_;
3821
3822 $self->SUPER::_display_stats($OUT);
3823 printf( $OUT "%-15s: %s$Newline", "GAPPED", $self->gapped ? 'YES' : 'NO');
3824 printf( $OUT "%-15s: %d$Newline", "TOTAL HITS", $self->num_hits('total'));
3825 printf( $OUT "%-15s: %s$Newline", "CHECKED ALL", $Blast->{'_check_all'} ? 'YES' : 'NO');
3826 printf( $OUT "%-15s: %s$Newline", "FILT FUNC", $Blast->{'_filt_func'} ? 'YES' : 'NO');
3827 if($self->min_length) {
3828 printf( $OUT "%-15s: Length >= %s$Newline", "MIN_LENGTH", $self->min_length);
3829 }
3830
3831 my $num_hits = $self->num_hits;
3832 my $signif_str = ($self->_layout == 1) ? 'P' : 'EXPECT';
3833
3834 printf( $OUT "%-15s: %d$Newline", "SIGNIF HITS", $num_hits);
3835 # Blast1: signif = P-value, Blast2: signif = Expect value.
3836
3837 printf( $OUT "%-15s: %s ($signif_str-VALUE)$Newline", "SIGNIF CUTOFF", $self->signif);
3838 printf( $OUT "%-15s: %s$Newline", "LOWEST $signif_str", $self->lowest_signif());
3839 printf( $OUT "%-15s: %s$Newline", "HIGHEST $signif_str", $self->highest_signif());
3840
3841 printf( $OUT "%-15s: %s (OVERALL)$Newline", "HIGHEST $signif_str", $self->highest_signif('overall'));
3842
3843 if($Blast->_get_stats) {
3844 my $warn = ($Blast->{'_share'}) ? '(SHARED STATS)' : '';
3845 printf( $OUT "%-15s: %s$Newline", "MATRIX", $self->matrix() || 'UNKNOWN');
3846 printf( $OUT "%-15s: %s$Newline", "FILTER", $self->filter() || 'UNKNOWN');
3847 printf( $OUT "%-15s: %s$Newline", "EXPECT", $self->expect() || 'UNKNOWN');
3848 printf( $OUT "%-15s: %s, %s, %s %s$Newline", "LAMBDA, K, H", $self->karlin_altschul(), $warn);
3849 printf( $OUT "%-15s: %s$Newline", "WORD SIZE", $self->word_size() || 'UNKNOWN');
3850 printf( $OUT "%-15s: %s %s$Newline", "S", $self->s() || 'UNKNOWN', $warn);
3851 if($self->gapped) {
3852 printf( $OUT "%-15s: %s$Newline", "GAP CREATION", $self->gap_creation() || 'UNKNOWN');
3853 printf( $OUT "%-15s: %s$Newline", "GAP EXTENSION", $self->gap_extension() || 'UNKNOWN');
3854 }
3855 }
3856 print $OUT "$Newline";
3857 }
3858
3859 =head2 _display_hits
3860
3861 Usage : n/a; called automatically by display()
3862 Purpose : Display data for each hit. Not tab-delimited.
3863 Example : n/a
3864 Argument : one argument = filehandle object.
3865 Returns : printf call.
3866 Status : Experimental
3867 Comments : For tab-delimited output, see table().
3868
3869 See Also : L<display()|display>, B<Bio::Tools::Blast::Sbjct::display()>, L<table()|table>, L<Links to related modules>
3870
3871 =cut
3872
3873 sub _display_hits {
3874
3875 my( $self, %param ) = @_;
3876 my $OUT = $self->fh();
3877 my @hits = $self->hits();
3878
3879 ## You need a wide screen to see this properly.
3880 # Header.
3881 print $OUT "${Newline}BLAST HITS FOR: ${\$self->name()} length = ${\$self->length}$Newline";
3882 print "(This table requires a wide display.)$Newline";
3883 print $OUT '-'x80,"$Newline";
3884
3885 print $self->table_labels_tiled(0);
3886 print $self->table_tiled(0);
3887
3888 ## Doing this interactively since there is potentially a lot of data here.
3889 ## Not quite satisfied with this approach.
3890
3891 if (not $param{-INTERACTIVE}) {
3892 return 1;
3893 } else {
3894 my ($reply);
3895 print "${Newline}DISPLAY FULL HSP DATA? (y/n): [n] ";
3896 chomp( $reply = <STDIN> );
3897 $reply =~ /^y.*/i;
3898
3899 my $count = 0;
3900 foreach ( @hits ) {
3901 $count++;
3902 print $OUT "$Newline$Newline",'-'x80,"$Newline";
3903 print $OUT "HSP DATA FOR HIT #$count (hit <RETURN>)";
3904 print $OUT "$Newline",'-'x80;<STDIN>;
3905 $param{-SHOW} = 'hsp';
3906 $_->display( %param );
3907 }
3908 }
3909 1;
3910 }
3911
3912 =head2 to_html
3913
3914 Usage : $blast_object->to_html( [%named_parameters] )
3915 Purpose : To produce an HTML-formatted version of a BLAST report
3916 : for efficient navigation of the report using a web browser.
3917 Example : # Using the static Blast object:
3918 : # Can read from STDIN or from a designated file:
3919 : $Blast->to_html($file);
3920 : $Blast->to_html(-FILE=>$file, -HEADER=>$header);
3921 : (if no file is supplied, STDIN will be used).
3922 : # saving HTML to an array:
3923 : $Blast->to_html(-FILE=>$file, -OUT =>\@out);
3924 : # Using a pre-existing blast object (must have been built from
3925 : # a file, not STDIN:
3926 : $blastObj->to_html();
3927 Returns : n/a, either prints report to STDOUT or saves to a supplied array
3928 : if an '-OUT' parameter is defined (see below).
3929 Argument : %named_parameters: (TAGS ARE AND CASE INSENSITIVE).
3930 : -FILE => string containing name of a file to be processed.
3931 : If not a valid file or undefined, STDIN will be used.
3932 : Can skip the -FILE tag if supplying a filename
3933 : as a single argument.
3934 : -HEADER => string
3935 : This should be an HTML-formatted string to be used
3936 : as a header for the page, typically describing query sequence,
3937 : database searched, the date of the analysis, and any
3938 : additional links.
3939 : If not supplied, no special header is used.
3940 : Regardless of whether a header is supplied, the
3941 : standard info at the top of the report is highlighted.
3942 : This should include the <HEADER></HEADER> section
3943 : of the page as well.
3944 :
3945 : -IN => array reference containing a raw Blast report.
3946 : each line in a separate element in the array.
3947 : If -IN is not supplied, read() is called
3948 : and data is then read either from STDIN or a file.
3949 :
3950 : -OUT => array reference to hold the HTML output.
3951 : If not supplied, output is sent to STDOUT.
3952 Throws : Exception is propagated from $HTML::get_html_func()
3953 : and Bio::Root::Object::read().
3954 Comments : The code that does the actual work is located in
3955 : Bio::Tools::Blast::HTML::get_html_func().
3956 Bugs : Some hypertext links to external databases may not be
3957 : correct. This due in part to the dynamic nature of
3958 : the web.
3959 : Hypertext links are not added to hits without database ids.
3960 TODO : Possibly create a function to produce fancy default header
3961 : using data extracted from the report (requires some parsing).
3962 : For example, it would be nice to always include a date
3963
3964 See Also : B<Bio::Tools::Blast::HTML::get_html_func()>, B<Bio::Root::Object::read()>, L<Links to related modules>
3965
3966 =cut
3967
3968 #------------
3969 sub to_html {
3970 #------------
3971 my ($self, @param) = @_;
3972
3973 # Permits syntax such as: $blast->to_html($filename);
3974 my ($file, $header_html, $in_aref, $out_aref) =
3975 $self->_rearrange([qw(FILE HEADER IN OUT)], @param);
3976
3977 $self->file($file) if $file;
3978
3979 # Only setting the newline character once for efficiency.
3980 $Newline ||= $Util->get_newline(-client => $self, @param);
3981
3982 $header_html ||= '';
3983 (ref($out_aref) eq 'ARRAY') ? push(@$out_aref, $header_html) : print "$header_html$Newline";
3984
3985 require Bio::Tools::Blast::HTML;
3986 Bio::Tools::Blast::HTML->import(qw(&get_html_func));
3987
3988 my ($func);
3989 eval{ $func = &get_html_func($out_aref); };
3990 if($@) {
3991 my $err = $@;
3992 $self->throw($err);
3993 }
3994
3995 eval {
3996 if(!$header_html) {
3997 $out_aref ? push(@$out_aref, "<html><body>$Newline") : print "<html><body>$Newline";
3998 }
3999
4000 if (ref ($in_aref) =~ /ARRAY/) {
4001 # If data is being supplied, process it.
4002 foreach(@$in_aref) {
4003 &$func($_);
4004 }
4005 } else {
4006 # Otherwise, read it, processing as we go.
4007
4008 $self->read(-FUNC => $func, @param);
4009 }
4010 $out_aref ? push(@$out_aref, "$Newline</pre></body></html>") : print "$Newline</pre></body></html>";
4011 };
4012
4013 if($@) {
4014 # Check for trivial error (report already HTML formatted).
4015 if($@ =~ /HTML formatted/) {
4016 print STDERR "\a${Newline}Blast report appears to be HTML formatted already.$Newline$Newline";
4017 } else {
4018 my $err = $@;
4019 $self->throw($err);
4020 }
4021 }
4022 }
4023
4024 1;
4025 __END__
4026
4027 #####################################################################################
4028 # END OF CLASS #
4029 #####################################################################################
4030
4031 =head1 FOR DEVELOPERS ONLY
4032
4033 =head2 Data Members
4034
4035 Information about the various data members of this module is provided for those
4036 wishing to modify or understand the code. Two things to bear in mind:
4037
4038 =over 4
4039
4040 =item 1 Do NOT rely on these in any code outside of this module.
4041
4042 All data members are prefixed with an underscore to signify that they are private.
4043 Always use accessor methods. If the accessor doesn't exist or is inadequate,
4044 create or modify an accessor (and let me know, too!). (An exception to this might
4045 be for Sbjct.pm or HSP.pm which are more tightly coupled to Blast.pm and
4046 may access Blast data members directly for efficiency purposes, but probably
4047 should not).
4048
4049 =item 2 This documentation may be incomplete and out of date.
4050
4051 It is easy for these data member descriptions to become obsolete as
4052 this module is still evolving. Always double check this info and search
4053 for members not described here.
4054
4055 =back
4056
4057 An instance of Bio::Tools::Blast.pm is a blessed reference to a hash containing
4058 all or some of the following fields:
4059
4060 FIELD VALUE
4061 --------------------------------------------------------------
4062 _significance P-value or Expect value cutoff (depends on Blast version:
4063 Blast1/WU-Blast2 = P-value; Blast2 = Expect value).
4064 Values GREATER than this are deemed not significant.
4065
4066 _significant Boolean. True if the query has one or more significant hit.
4067
4068 _min_length Integer. Query sequences less than this will be skipped.
4069
4070 _confirm_significance Boolean. True if client has supplied significance criteria.
4071
4072 _gapped Boolean. True if BLAST analysis has gapping turned on.
4073
4074 _hits List of Sbjct.pm objects.
4075
4076 _num_hits Number of hits obtained from the BLAST report.
4077
4078 _num_hits_significant Number of significant based on Significant data members.
4079
4080 _highestSignif Highest P or Expect value overall (not just what is stored in _hits).
4081
4082 _lowestSignif Lowest P or Expect value overall (not just what is stored in _hits).
4083
4084 The static $Blast object has a special set of members:
4085
4086 _errs
4087 _share
4088 _stream
4089 _get_stats
4090 _gapped
4091 _filt_func
4092
4093 Miscellaneous statistical parameters:
4094 -------------------------------------
4095 _filter, _matrix, _word_size, _expect, _gapCreation, _gapExtension, _s,
4096 _lambda, _k, _h
4097
4098 INHERITED DATA MEMBERS
4099 -----------------------
4100 (See Bio::Tools::SeqAnal.pm for inherited data members.)
4101
4102 =cut
4103
4104 1;