0
|
1 #------------------------------------------------------------------------------
|
|
2 # PACKAGE : Bio::Tools::Blast::Sbjct
|
|
3 # AUTHOR : Steve Chervitz (sac@bioperl.org)
|
|
4 # CREATED : 7 October 1996
|
|
5 # STATUS : Alpha
|
|
6 # REVISION: $Id: Sbjct.pm,v 1.20 2002/10/22 07:38:48 lapp Exp $
|
|
7 #
|
|
8 # For the latest version and documentation, visit the distribution site:
|
|
9 # http://genome-www.stanford.edu/perlOOP/bioperl/blast/
|
|
10 #
|
|
11 # To generate documentation, run this module through pod2html
|
|
12 # (preferably from Perl v5.004 or better).
|
|
13 #
|
|
14 # Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved.
|
|
15 # This module is free software; you can redistribute it and/or
|
|
16 # modify it under the same terms as Perl itself.
|
|
17 #------------------------------------------------------------------------------
|
|
18
|
|
19 package Bio::Tools::Blast::Sbjct;
|
|
20
|
|
21 use Bio::Root::Global qw(:devel);
|
|
22 use Bio::Root::Object ();
|
|
23
|
|
24 @ISA = qw( Bio::Root::Object Exporter );
|
|
25
|
|
26 use strict;
|
|
27 use vars qw($ID %SUMMARY_OFFSET $Revision);
|
|
28 $ID = 'Bio::Tools::Blast::Sbjct';
|
|
29 $Revision = '$Id: Sbjct.pm,v 1.20 2002/10/22 07:38:48 lapp Exp $'; #'
|
|
30
|
|
31 my $_prog = '';
|
|
32 my $_signif_fmt = '';
|
|
33
|
|
34 ## POD Documentation:
|
|
35
|
|
36 =head1 NAME
|
|
37
|
|
38 Bio::Tools::Blast::Sbjct - Bioperl BLAST "Hit" object
|
|
39
|
|
40 =head1 SYNOPSIS
|
|
41
|
|
42 =head2 Object Creation
|
|
43
|
|
44 The construction of HSP objects is handled by B<Bio::Tools::Blast>.
|
|
45 You should not need to use this package directly. See L<_initialize()|_initialize>
|
|
46 for a description of constructor parameters.
|
|
47
|
|
48 require Bio::Tools::Blast::Sbjct;
|
|
49
|
|
50 $hit = new Bio::Tools::Blast::Sbjct (-DATA =>\@hitData,
|
|
51 -PARENT =>$self,
|
|
52 -NAME =>5,
|
|
53 -RANK =>5,
|
|
54 -RANK_BY =>'order',
|
|
55 -MAKE =>'query' (or 'sbjct'),
|
|
56 -OVERLAP =>2,
|
|
57 -PROGRAM =>'TBLASTN'
|
|
58 );
|
|
59
|
|
60 @hitData includes the summary line for the hit as element [0], plus
|
|
61 all lines from the HSP alignment section of the BLAST report for
|
|
62 the present hit.
|
|
63
|
|
64 =head1 INSTALLATION
|
|
65
|
|
66 This module is included with the central Bioperl distribution:
|
|
67
|
|
68 http://bio.perl.org/Core/Latest
|
|
69 ftp://bio.perl.org/pub/DIST
|
|
70
|
|
71 Follow the installation instructions included in the README file.
|
|
72
|
|
73 =head1 DESCRIPTION
|
|
74
|
|
75 The Bio::Tools::Blast::Sbjct.pm module encapsulates data and methods for
|
|
76 parsing and manipulating "hits" from a BLAST report.
|
|
77 This module is a utility module used by the Bio::Tools::Blast.pm
|
|
78 and is not intended for separate use.
|
|
79
|
|
80 In Blast lingo, the "sbjct" sequences are all the sequences
|
|
81 in a target database which were compared against a "query" sequence.
|
|
82 The terms "sbjct" and "hit" will be used interchangeably in this and related modules.
|
|
83
|
|
84 This module supports BLAST versions 1.x and 2.x, gapped and ungapped.
|
|
85
|
|
86
|
|
87 =head2 HSP Tiling and Ambiguous Alignments
|
|
88
|
|
89 If a Blast hit has more than one HSP, the Bio::Tools::Blast::Sbjct.pm
|
|
90 object has the ability to merge overlapping HSPs into contiguous
|
|
91 blocks. This permits the Sbjct object to sum data across all HSPs
|
|
92 without counting data in the overlapping regions multiple times, which
|
|
93 would happen if data from each overlapping HSP are simply summed. HSP
|
|
94 tiling is performed automatically when methods of the Sbjct object
|
|
95 that rely on tiled data are invoked. These include
|
|
96 L<frac_identical()|frac_identical>, L<frac_conserved()|frac_conserved>, L<gaps()|gaps>,
|
|
97 L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>,
|
|
98 L<num_unaligned_query()|num_unaligned_query>, L<num_unaligned_hit()|num_unaligned_hit>.
|
|
99
|
|
100 It also permits the assessment of an "ambiguous alignment" if the
|
|
101 query (or sbjct) sequences from different HSPs overlap. The existence
|
|
102 of an overlap could indicate a biologically interesting region in the
|
|
103 sequence, such as a repeated domain. The Sbjct object uses the
|
|
104 -OVERLAP parameter to determine when two sequences overlap; if this is
|
|
105 set to 2 -- the default -- then any two sbjct or query HSP sequences
|
|
106 must overlap by more than two residues to get merged into the same
|
|
107 contig and counted as an overlap. See the L<BUGS | BUGS> section below for
|
|
108 "issues" with HSP tiling.
|
|
109
|
|
110
|
|
111 The results of the HSP tiling is reported with the following ambiguity codes:
|
|
112
|
|
113 'q' = Query sequence contains multiple sub-sequences matching
|
|
114 a single region in the sbjct sequence.
|
|
115
|
|
116 's' = Sbjct sequence contains multiple sub-sequences matching
|
|
117 a single region in the query sequence.
|
|
118
|
|
119 'qs' = Both query and sbjct sequences contain more than one
|
|
120 sub-sequence with similarity to the other sequence.
|
|
121
|
|
122
|
|
123 For addition information about ambiguous BLAST alignments, see
|
|
124 L<_tile_hsps()|_tile_hsps> and
|
|
125
|
|
126 http://www-genome.stanford.edu/Sacch3D/help/ambig_aln.html
|
|
127
|
|
128 =head1 DEPENDENCIES
|
|
129
|
|
130 Bio::Tools::Blast::Sbjct.pm is a concrete class that inherits from B<Bio::Root::Object>
|
|
131 and relies on two other modules:
|
|
132
|
|
133 =over 4
|
|
134
|
|
135 =item B<Bio::Tools::Blast::HSP>
|
|
136
|
|
137 Encapsulates a single high-scoring segment pair within a hit.
|
|
138
|
|
139 =item B<Bio::Tools::Blast>
|
|
140
|
|
141 Provides a container for Sbjct.pm objects.
|
|
142
|
|
143 =back
|
|
144
|
|
145
|
|
146 Bio::Tools::Blast::Sbjct.pm does not currently inherit from
|
|
147 Bio::Root::Vector.pm since Bio::Root::Vector.pm may be re-designed to
|
|
148 make it usable via delegation. Thus, a Blast.pm object would manage a
|
|
149 vector of Sbjct.pm objects. Stay tuned.
|
|
150
|
|
151
|
|
152 =head1 BUGS
|
|
153
|
|
154 One consequence of the HSP tiling is that methods that rely on HSP
|
|
155 tiling such as L<frac_identical()|frac_identical>, L<frac_conserved()|frac_conserved>, L<gaps()|gaps>
|
|
156 etc. may report misleading numbers when C<-OVERLAP> is set to a large
|
|
157 number. For example, say we have two HSPs and the query sequence tile
|
|
158 as follows:
|
|
159
|
|
160 1 8 22 30 40 60
|
|
161 Full seq: ------------------------------------------------------------
|
|
162 * ** * **
|
|
163 HSP1: --------------- (6 identical matches)
|
|
164 ** ** **
|
|
165 HSP2: ------------- (6 identical matches)
|
|
166
|
|
167
|
|
168 If C<-OVERLAP> is set to some number over 4, HSP1 and HSP2 will not be
|
|
169 tiled into a single contig and their numbers of identical matches will
|
|
170 be added, giving a total of 12, not 10 if they had be combined into
|
|
171 one contig. This can lead to number greater than 1.0 for methods
|
|
172 L<frac_identical()|frac_identical> and L<frac_conserved()|frac_conserved>. This is less of an issue
|
|
173 with gapped Blast since it tends to combine HSPs that would be listed
|
|
174 separately without gapping. (Fractions E<gt>1.0 can be viewed as a
|
|
175 signal for an interesting alignment that warrants further inspection,
|
|
176 thus turning this bug into a feature).
|
|
177
|
|
178 Using large values for C<-OVERLAP> can lead to incorrect numbers
|
|
179 reported by methods that rely on HSP tiling but can be useful if you
|
|
180 care more about detecting ambiguous alignments. Setting C<-OVERLAP>
|
|
181 to zero will lead to the most accurate numbers for the
|
|
182 tiling-dependent methods but will be useless for detecting overlapping
|
|
183 HSPs since all HSPs will appear to overlap.
|
|
184
|
|
185
|
|
186 =head1 SEE ALSO
|
|
187
|
|
188 Bio::Tools::Blast::HSP.pm - Blast HSP object.
|
|
189 Bio::Tools::Blast.pm - Blast object.
|
|
190 Bio::Root::Object.pm - Proposed base class for all Bioperl objects.
|
|
191
|
|
192 Links:
|
|
193
|
|
194 http://bio.perl.org/Core/POD/Tools/Blast/HSP.pm.html
|
|
195
|
|
196 http://bio.perl.org/Projects/modules.html - Online module documentation
|
|
197 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project
|
|
198 http://bio.perl.org/ - Bioperl Project Homepage
|
|
199
|
|
200
|
|
201 =head1 FEEDBACK
|
|
202
|
|
203 =head2 Mailing Lists
|
|
204
|
|
205 User feedback is an integral part of the evolution of this and other
|
|
206 Bioperl modules. Send your comments and suggestions preferably to one
|
|
207 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
208
|
|
209 bioperl-l@bioperl.org - General discussion
|
|
210 http://bio.perl.org/MailList.html - About the mailing lists
|
|
211
|
|
212 =head2 Reporting Bugs
|
|
213
|
|
214 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
215 the bugs and their resolution. Bug reports can be submitted via email
|
|
216 or the web:
|
|
217
|
|
218 bioperl-bugs@bio.perl.org
|
|
219 http://bugzilla.bioperl.org/
|
|
220
|
|
221 =head1 AUTHOR
|
|
222
|
|
223 Steve Chervitz, sac@bioperl.org
|
|
224
|
|
225 See the L<FEEDBACK | FEEDBACK> section for where to send bug reports and comments.
|
|
226
|
|
227 =head1 COPYRIGHT
|
|
228
|
|
229 Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved.
|
|
230 This module is free software; you can redistribute it and/or
|
|
231 modify it under the same terms as Perl itself.
|
|
232
|
|
233 =cut
|
|
234
|
|
235
|
|
236
|
|
237 #
|
|
238 ##
|
|
239 ###
|
|
240 #### END of main POD documentation.
|
|
241 ###
|
|
242 ##
|
|
243 #'
|
|
244
|
|
245
|
|
246 =head1 APPENDIX
|
|
247
|
|
248 Methods beginning with a leading underscore are considered private
|
|
249 and are intended for internal use by this module. They are
|
|
250 B<not> considered part of the public interface and are described here
|
|
251 for documentation purposes only.
|
|
252
|
|
253 =cut
|
|
254
|
|
255 #####################################################################################
|
|
256 ## CONSTRUCTOR ##
|
|
257 #####################################################################################
|
|
258
|
|
259 =head2 _initialize
|
|
260
|
|
261 Usage : n/a; automatically called by Bio::Root::Object::new()
|
|
262 : Bio::Tools::Blast::Sbjct.pm objects are constructed
|
|
263 : automatically by Bio::Tools::Blast.pm, so there is no need
|
|
264 : for direct consumption.
|
|
265 Purpose : Initializes key varaiables and calls methods to parse a single Blast hit.
|
|
266 : Constructs Bio::Tools::Blast::HSP.pm objects for each
|
|
267 : high-scoring segment pair (HSP).
|
|
268 : Calls superclass constructor first (Bio::Root::Object.pm).
|
|
269 Returns : n/a
|
|
270 Argument : Named Parameters passed from new()
|
|
271 : ALL TAGS MUST BE UPPERCASE (does not call _rearrange()).
|
|
272 : -DATA => array reference holding all data for a single hit.
|
|
273 : The first element should hold the description
|
|
274 : line (from the desctiption section at the top of
|
|
275 : the report), remaining lines should hold all lines
|
|
276 : within the HSP alignment listing section of report.
|
|
277 : -PARENT => object reference to a Bio::Tools::Blast.pm object.
|
|
278 : -NAME => string (typically same as -RANK, just a temporary
|
|
279 : name to use until the actual name of hit is parsed),
|
|
280 : -RANK => integer,
|
|
281 : -RANK_BY => 'order',
|
|
282 : -OVERLAP => integer (maximum overlap between adjacent
|
|
283 : HSPs when tiling)
|
|
284 : -PROGRAM => string (type of Blast blastp, blastn, etc).
|
|
285
|
|
286 See Also : L<_set_id()|_set_id>, L<_set_hsps()|_set_hsps>, L<_tile_hsps()|_tile_hsps>, B<Bio::Root::Object::new()>, B<Bio::Tools::Blast::_set_hit_db()>
|
|
287
|
|
288 =cut
|
|
289
|
|
290 #-------------------
|
|
291 sub _initialize {
|
|
292 #-------------------
|
|
293 my( $self, %param ) = @_;
|
|
294
|
|
295 # $make not currently used.
|
|
296 my $make = $self->SUPER::_initialize( %param );
|
|
297
|
|
298 # Set various class data.
|
|
299 $_prog = $param{-PROGRAM} || '';
|
|
300 $_signif_fmt = $param{-SIGNIF_FMT};
|
|
301
|
|
302 $self->{'_rank'} = $param{-RANK} || '';
|
|
303 $self->_set_id( $param{-DATA}->[0]);
|
|
304 $self->_set_hsps( @{$param{-DATA}} );
|
|
305
|
|
306 $self->{'_overlap'} = $param{-OVERLAP} || 0;
|
|
307 }
|
|
308
|
|
309 #--------------
|
|
310 sub destroy {
|
|
311 #--------------
|
|
312 my $self=shift;
|
|
313 if($self->{'_hsps'}) {
|
|
314 foreach($self->hsps) {
|
|
315 $_->destroy;
|
|
316 undef $_;
|
|
317 }
|
|
318 undef $self->{'_hsps'};
|
|
319 }
|
|
320 $DEBUG==2 && print STDERR "DESTROYING $self ${\$self->name}";
|
|
321 $self->SUPER::destroy;
|
|
322 }
|
|
323
|
|
324 #####################################################################################
|
|
325 ## ACCESSORS ##
|
|
326 #####################################################################################
|
|
327
|
|
328
|
|
329 =head2 rank
|
|
330
|
|
331 Usage : $sbjct->rank( integer or string );
|
|
332 Purpose : Sets/Gets the rank of the current Sbjct object relative to
|
|
333 : other Sbjct objects managed by a given Blast object.
|
|
334 Example : $sbjct->rank(1);
|
|
335 Returns : The current rank value.
|
|
336 Argument : Integer or string to be used for ranking the hit
|
|
337 : relative to other hits.
|
|
338 Throws : n/a
|
|
339 Comments : The rank usually corresponds to the order the listing
|
|
340 : of hits in the BLAST report from lowest to highest p-value.
|
|
341 : Rank need not be restricted to this value.
|
|
342 : rank() may be provided by a delegated or inherited
|
|
343 : iterator class in the future (such as Bio::Root::Vector.pm).
|
|
344
|
|
345 =cut
|
|
346
|
|
347 #-----------
|
|
348 sub rank {
|
|
349 #-----------
|
|
350 my $self = shift;
|
|
351 if(@_) {$self->{'_rank'} = shift; }
|
|
352 $self->{'_rank'};
|
|
353 }
|
|
354
|
|
355
|
|
356
|
|
357 =head2 _set_id
|
|
358
|
|
359 Usage : n/a; automatically called by _initialize()
|
|
360 Purpose : Sets the name of the Sbjct sequence from the BLAST summary line.
|
|
361 : The identifier is assumed to be the first
|
|
362 : chunk of non-whitespace characters in the description line
|
|
363 : Does not assume any semantics in the structure of the identifier
|
|
364 : (Formerly, this method attempted to extract database name from
|
|
365 : the seq identifiers, but this was prone to break).
|
|
366 Returns : n/a
|
|
367 Argument : String containing description line of the hit from Blast report
|
|
368 : or first line of an alignment section.
|
|
369 Throws : Warning if cannot locate sequence ID.
|
|
370
|
|
371 See Also : L<_initialize()|_initialize>, B<Bio::Tools::Blast::database()>
|
|
372
|
|
373 =cut
|
|
374
|
|
375 #---------------
|
|
376 sub _set_id {
|
|
377 #---------------
|
|
378 my( $self, $desc ) = @_;
|
|
379 my ($seqID1, $seqID2, $dbID, @seqDat);
|
|
380
|
|
381 local $_ = $desc;
|
|
382 my @linedat = split();
|
|
383 my $data = $linedat[0];
|
|
384
|
|
385 # New strategy: Assume only that the ID is the first white space
|
|
386 # delimited chunk. Not attempting to extract database name.
|
|
387 # Clients will have to interpret it as necessary.
|
|
388 if($data =~ /^(\S+)\s*/) {
|
|
389 $self->name($1);
|
|
390 }
|
|
391 else {
|
|
392 $self->warn("Can't locate sequence identifier in summary line.", "Line = $data");
|
|
393 $data = 'Unknown sequence ID' if not $data;
|
|
394 $self->name($data);
|
|
395 }
|
|
396 $self->{'_db'} = '-';
|
|
397
|
|
398 # Old strategy: assumes semantics in the identifier
|
|
399 # and tries to separate out database and id components.
|
|
400 # Too fancy and fragile! SAC, 2000-02-18
|
|
401
|
|
402 # # Proceeding from more standard (NCBI-like) to less standard.
|
|
403 # if($data =~ /(\S+?)[:\|]+(\S+?)[:\|]+(\S*)/) {
|
|
404 # # matches: database|id1|id2 or database||id1||id2 or database:id1:id2
|
|
405 # $dbID = $1;
|
|
406 # $seqID1 = $2;
|
|
407 # $seqID2 = $3;
|
|
408 # if($seqID2 eq $seqID1) { undef($seqID2); }
|
|
409 #
|
|
410 # } elsif($data =~ /(\S+?)[:\|]+(\S+)/) {
|
|
411 # # matches: database|id1 or database:id1
|
|
412 # $dbID = $1;
|
|
413 # $seqID1 = $2;
|
|
414 #
|
|
415 # } elsif($data =~ /^(\S+)\s+([gb|emb|dbj|sp|pir])\s+(\S+)*/) {
|
|
416 # # matches: id1 database id2
|
|
417 # $seqID1 = $1;
|
|
418 # $dbID = $2;
|
|
419 # $seqID2 = $3;
|
|
420 #
|
|
421 # } elsif($data =~ /^(\S+)\s*/) {
|
|
422 # $seqID1 = $1;
|
|
423 # }
|
|
424 #
|
|
425 # ## Combine the multiple IDs.
|
|
426 # $seqID2 = scalar($seqID2) ? "/$seqID2" : '';
|
|
427 #
|
|
428 # if( !scalar $seqID1) {
|
|
429 # $self->warn("Can't locate sequence identifier in summary line.", "Line = $data");
|
|
430 # $self->name('Unknown sequence ID');
|
|
431 # } else {
|
|
432 # $self->name($seqID1.$seqID2);
|
|
433 # }
|
|
434 # $self->{'_db'} = $dbID || '-';
|
|
435 }
|
|
436
|
|
437
|
|
438 =head2 _set_hsps
|
|
439
|
|
440 Usage : n/a; called automatically during object construction.
|
|
441 Purpose : Creates HSP.pm objects for each HSP in a BLAST hit alignment.
|
|
442 : Also collects the full description of the hit from the
|
|
443 : HSP alignment section.
|
|
444 Returns : n/a
|
|
445 Argument : List of strings containing raw BLAST report data for
|
|
446 : a single hit's HSP alignment data.
|
|
447 Throws : Warnings for each HSP.pm object that fails to be constructed.
|
|
448 : Exception if no HSP.pm objects can be constructed.
|
|
449 : Exception if can't parse length data for hit sequence.
|
|
450 Comments : Requires Bio::Tools::Blast::HSP.pm.
|
|
451 : Sets the description using the full string present in
|
|
452 : the alignment data.
|
|
453 : Also sets Expect and P-values for the Sbjct object by
|
|
454 : copying from the HSP object.
|
|
455 : While this sacrifices some memory efficiency, it
|
|
456 : improves access speed for these critical data.
|
|
457
|
|
458 See Also : L<_initialize()|_initialize>, L<_set_desc()|_set_desc>
|
|
459
|
|
460 =cut
|
|
461
|
|
462 #--------------
|
|
463 sub _set_hsps {
|
|
464 #--------------
|
|
465
|
|
466 my( $self, @data ) = @_;
|
|
467 my $start = 0;
|
|
468 my $hspCount = 0;
|
|
469
|
|
470 require Bio::Tools::Blast::HSP;
|
|
471
|
|
472 # printf STDERR "$ID _set_hsps(). DATA (%d lines) =\n@data\n", scalar(@data); <STDIN>;
|
|
473
|
|
474 my( @hspData, @hspList, @errs, @bad_names );
|
|
475 my($line, $set_desc, @desc);
|
|
476 $set_desc = 0;
|
|
477
|
|
478 hit_loop:
|
|
479 foreach $line( @data ) {
|
|
480
|
|
481 if( $line =~ /^\s*Length = ([\d,]+)/ ) {
|
|
482 $self->_set_desc(@desc);
|
|
483 $set_desc = 1;
|
|
484 ($self->{'_length'} = $1) =~ s/,//g; # get rid of commas
|
|
485 next hit_loop;
|
|
486 } elsif( !$set_desc) {
|
|
487 $line =~ s/^\s+|\s+$//g;
|
|
488 push @desc, $line;
|
|
489 next hit_loop;
|
|
490 } elsif( $line =~ /^\s*Score/ ) {
|
|
491 ## This block is for setting multiple HSPs.
|
|
492
|
|
493 if( not scalar @hspData ) {
|
|
494 $start = 1;
|
|
495 push @hspData, $line;
|
|
496 next hit_loop;
|
|
497
|
|
498 } elsif( scalar @hspData) {
|
|
499 $hspCount++;
|
|
500 $DEBUG and do{ print STDERR +( $hspCount % 10 ? "+" : "+\n" ); };
|
|
501
|
|
502 # print STDERR "\n$ID: setting HSP: ${\$self->name}\n";
|
|
503 my $hspObj = eval { new Bio::Tools::Blast::HSP(-DATA =>\@hspData,
|
|
504 -PARENT =>$self,
|
|
505 -NAME =>$hspCount,
|
|
506 -PROGRAM =>$_prog,
|
|
507 );
|
|
508 };
|
|
509 if($@) {
|
|
510 # print "$ID: ERROR:\n$@";<STDIN>;
|
|
511 push @errs, $@;
|
|
512 push @bad_names, "#$hspCount";
|
|
513 $hspObj->destroy if ref $hspObj;
|
|
514 undef $hspObj;
|
|
515 } else {
|
|
516 push @hspList, $hspObj;
|
|
517 if (!defined($self->{'_expect'}) || $hspObj->expect() < $self->{'_expect'}) {
|
|
518 $self->{'_expect'} = $hspObj->expect();
|
|
519 }
|
|
520 if (!defined($self->{'_p'}) || $hspObj->p() < $self->{'_p'}) {
|
|
521 $self->{'_p'} = $hspObj->p();
|
|
522 }
|
|
523 }
|
|
524 @hspData = ();
|
|
525 push @hspData, $line;
|
|
526 next;
|
|
527 } else {
|
|
528 push @hspData, $line;
|
|
529 }
|
|
530 } elsif( $start ) {
|
|
531 ## This block is for setting the last HSP (which may be the first as well!).
|
|
532 if( $line =~ /^(end|>|Parameters|CPU|Database:)/ ) {
|
|
533 $hspCount++;
|
|
534 $DEBUG and do{ print STDERR +( $hspCount % 10 ? "+" : "+\n" ); };
|
|
535
|
|
536 # print STDERR "\n$ID: setting HSP: ${\$self->name}\n";
|
|
537
|
|
538 my $hspObj = eval { new Bio::Tools::Blast::HSP(-DATA =>\@hspData,
|
|
539 -PARENT =>$self,
|
|
540 -NAME =>$hspCount,
|
|
541 -PROGRAM =>$_prog,
|
|
542 );
|
|
543 };
|
|
544 if($@) {
|
|
545 # print "$ID: ERROR:\n$@";<STDIN>;
|
|
546 push @errs, $@;
|
|
547 push @bad_names, "#$hspCount";
|
|
548 $hspObj->destroy if ref $hspObj;
|
|
549 undef $hspObj;
|
|
550 } else {
|
|
551 push @hspList, $hspObj;
|
|
552 if (!defined($self->{'_expect'}) || $hspObj->expect() < $self->{'_expect'}) {
|
|
553 $self->{'_expect'} = $hspObj->expect();
|
|
554 }
|
|
555 if (!defined($self->{'_p'}) || $hspObj->p() < $self->{'_p'}) {
|
|
556 $self->{'_p'} = $hspObj->p();
|
|
557 }
|
|
558 }
|
|
559 } else {
|
|
560 push @hspData, $line;
|
|
561 }
|
|
562 }
|
|
563 }
|
|
564
|
|
565 # print STDERR "\n--------> Done building HSPs for ${\$self->name}\n";
|
|
566
|
|
567 $self->{'_length'} or $self->throw( "Can't determine hit sequence length.");
|
|
568
|
|
569 # Adjust logical length based on BLAST flavor.
|
|
570 if($_prog =~ /TBLAST[NX]/) {
|
|
571 $self->{'_logical_length'} = $self->{'_length'} / 3;
|
|
572 }
|
|
573
|
|
574 # Handling errors as done in Blast.pm. (as of version 0.073)
|
|
575
|
|
576 if(@errs) {
|
|
577 my ($str);
|
|
578 # When there are many errors, in most of the cases, they are
|
|
579 # caused by the same problem. Only need to see full data for
|
|
580 # the first one.
|
|
581 if(@errs > 2) {
|
|
582 $str = "SHOWING FIRST EXCEPTION ONLY:\n$errs[0]";
|
|
583 $self->clear_err(); # clearing the existing set of errors.
|
|
584 # Not necessary, unless the -RECORD_ERR =>1
|
|
585 # constructor option was used for Blast object.
|
|
586 } else {
|
|
587 $str = join("\n",@errs);
|
|
588 }
|
|
589
|
|
590 if( not scalar @hspList) {
|
|
591 $self->throw("Failed to create any HSP objects for $hspCount potential HSP(s).",
|
|
592 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n"
|
|
593 );
|
|
594 } else {
|
|
595 $self->warn(sprintf("Could not create HSP objects for %d HSP(s): %s", scalar(@errs), join(', ',@bad_names)),
|
|
596 "\n\nTRAPPED EXCEPTION(S):\n$str\nEND TRAPPED EXCEPTION(S)\n"
|
|
597 );
|
|
598 }
|
|
599
|
|
600 } else {
|
|
601 $self->{'_hsps'} = \@hspList;
|
|
602 }
|
|
603 }
|
|
604
|
|
605 =head2 _set_desc
|
|
606
|
|
607 Usage : n/a; called automatically by _set_hsps()
|
|
608 Purpose : Sets the description of the hit sequence.
|
|
609 : For sequence without descriptions, sets description to "-".
|
|
610 Argument : Array containing description (multiple lines).
|
|
611 Comments : _set_hsps() calls this method with the data from the
|
|
612 : HSP alignment listing, which contains the complete description.
|
|
613 : (Formerly, this was called from the _set_desc_data() method initially.)
|
|
614
|
|
615 See Also : _set_hsps()
|
|
616
|
|
617 =cut
|
|
618
|
|
619 #--------------
|
|
620 sub _set_desc {
|
|
621 #--------------
|
|
622 my( $self, @desc ) = @_;
|
|
623 my( $desc);
|
|
624
|
|
625 # print "$ID: RAW DESC:\n@desc";<STDIN>;
|
|
626
|
|
627 $desc = join(" ", @desc);
|
|
628
|
|
629 if($desc) {
|
|
630 $desc =~ s/^\s*\S+\s+//; # remove the sequence ID(s)
|
|
631 $desc =~ s/^[\s!]+//;
|
|
632 $desc =~ s/ \d+$//;
|
|
633 $desc =~ s/\.+$//;
|
|
634 $self->{'_desc'} = $desc || '-';
|
|
635 } else {
|
|
636 $self->{'_desc'} = '-';
|
|
637 }
|
|
638
|
|
639 # print "$ID: _set_desc = $desc";<STDIN>;
|
|
640 }
|
|
641
|
|
642
|
|
643 =head2 _tile_hsps
|
|
644
|
|
645 Usage : n/a; called automatically during object construction or
|
|
646 : as needed by methods that rely on having tiled data.
|
|
647 Purpose : Collect statistics about the aligned sequences in a set of HSPs.
|
|
648 : Calculates the following data across all HSPs:
|
|
649 : -- total alignment length
|
|
650 : -- total identical residues
|
|
651 : -- total conserved residues
|
|
652 Returns : n/a
|
|
653 Argument : n/a
|
|
654 Throws : n/a
|
|
655 Status : Experimental
|
|
656 Comments :
|
|
657 : This method performs more careful summing of data across
|
|
658 : all HSPs in the Sbjct object. Simply summing the data from all HSPs
|
|
659 : will overestimate the actual length of the alignment if there is
|
|
660 : overlap between different HSPs (often the case).
|
|
661 : The strategy is to tile the HSPs and sum over the
|
|
662 : contigs, collecting data separately from overlapping and
|
|
663 : non-overlapping regions of each HSP. To facilitate this, the
|
|
664 : HSP.pm object now permits extraction of data from sub-sections
|
|
665 : of an HSP.
|
|
666 :
|
|
667 : Additional useful information is collected from the results
|
|
668 : of the tiling. It is possible that sub-sequences in
|
|
669 : different HSPs will overlap significantly. In this case, it
|
|
670 : is impossible to create a single unambiguous alignment by
|
|
671 : concatenating the HSPs. The ambiguity may indicate the
|
|
672 : presence of multiple, similar domains in one or both of the
|
|
673 : aligned sequences. This ambiguity is recorded using the
|
|
674 : ambiguous_aln() method.
|
|
675 :
|
|
676 : This method does not attempt to discern biologically
|
|
677 : significant vs. insignificant overlaps. The allowable amount of
|
|
678 : overlap can be set with the overlap() method or with the -OVERLAP
|
|
679 : parameter used when constructing the Blast & Sbjct objects.
|
|
680 :
|
|
681 : For a given hit, both the query and the sbjct sequences are
|
|
682 : tiled independently.
|
|
683 :
|
|
684 : -- If only query sequence HSPs overlap,
|
|
685 : this may suggest multiple domains in the sbjct.
|
|
686 : -- If only sbjct sequence HSPs overlap,
|
|
687 : this may suggest multiple domains in the query.
|
|
688 : -- If both query & sbjct sequence HSPs overlap,
|
|
689 : this suggests multiple domains in both.
|
|
690 : -- If neither query & sbjct sequence HSPs overlap,
|
|
691 : this suggests either no multiple domains in either
|
|
692 : sequence OR that both sequences have the same
|
|
693 : distribution of multiple similar domains.
|
|
694 :
|
|
695 : This method can deal with the special case of when multiple
|
|
696 : HSPs exactly overlap.
|
|
697 :
|
|
698 : Efficiency concerns:
|
|
699 : Speed will be an issue for sequences with numerous HSPs.
|
|
700 :
|
|
701 Bugs : Currently, _tile_hsps() does not properly account for
|
|
702 : the number of non-tiled but overlapping HSPs, which becomes a problem
|
|
703 : as overlap() grows. Large values overlap() may thus lead to
|
|
704 : incorrect statistics for some hits. For best results, keep overlap()
|
|
705 : below 5 (DEFAULT IS 2). For more about this, see the "HSP Tiling and
|
|
706 : Ambiguous Alignments" section.
|
|
707
|
|
708 See Also : L<_adjust_contigs()|_adjust_contigs>, L<ambiguous_aln()|ambiguous_aln>, L<overlap()|overlap>, L<frac_identical()|frac_identical>, L<frac_conserved()|frac_conserved>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>, L<num_unaligned_query()|num_unaligned_query>, L<num_unaligned_hit()|num_unaligned_hit>, L<HSP Tiling and Ambiguous Alignments>
|
|
709
|
|
710 =cut
|
|
711
|
|
712 #--------------
|
|
713 sub _tile_hsps {
|
|
714 #--------------
|
|
715 my $self = shift;
|
|
716 # my $gapped = $self->parent->gapped || 0; # no special treatment
|
|
717
|
|
718 $self->{'_tile_hsps'} = 1;
|
|
719 $self->{'_gaps_query'} = 0;
|
|
720 $self->{'_gaps_sbjct'} = 0;
|
|
721
|
|
722 ## Simple summation scheme. Valid if there is only one HSP.
|
|
723 if((defined($self->{'_n'}) and $self->{'_n'} == 1) or $self->num_hsps == 1) {
|
|
724 my $hsp = $self->hsp;
|
|
725 $self->{'_length_aln_query'} = $hsp->length('query');
|
|
726 $self->{'_length_aln_sbjct'} = $hsp->length('sbjct');
|
|
727 $self->{'_length_aln_total'} = $hsp->length('total');
|
|
728 ($self->{'_totalIdentical'},$self->{'_totalConserved'}) = $hsp->matches();
|
|
729 $self->{'_gaps_query'} = $hsp->gaps('query');
|
|
730 $self->{'_gaps_sbjct'} = $hsp->gaps('sbjct');
|
|
731
|
|
732 # print "_tile_hsps(): single HSP, easy stats.\n";
|
|
733 return;
|
|
734 } else {
|
|
735 # print STDERR "$ID: _tile_hsps: summing multiple HSPs\n";
|
|
736 $self->{'_length_aln_query'} = 0;
|
|
737 $self->{'_length_aln_sbjct'} = 0;
|
|
738 $self->{'_length_aln_total'} = 0;
|
|
739 $self->{'_totalIdentical'} = 0;
|
|
740 $self->{'_totalConserved'} = 0;
|
|
741 }
|
|
742
|
|
743 ## More than one HSP. Must tile HSPs.
|
|
744 # printf "\nTiling HSPs for %s (BLAST: %s)\n",$self->name, $self->parent->name;
|
|
745 my($hsp, $qstart, $qstop, $sstart, $sstop);
|
|
746 my(@qcontigs, @scontigs);
|
|
747 my $qoverlap = 0;
|
|
748 my $soverlap = 0;
|
|
749 my $max_overlap = $self->{'_overlap'};
|
|
750
|
|
751 foreach $hsp ($self->hsps()) {
|
|
752 # printf " HSP: %s\n%s\n",$hsp->name, $hsp->str('query');
|
|
753 # printf " Length = %d; Identical = %d; Conserved = %d; Conserved(1-10): %d",$hsp->length, $hsp->length(-TYPE=>'iden'), $hsp->length(-TYPE=>'cons'), $hsp->length(-TYPE=>'cons',-START=>0,-STOP=>10); <STDIN>;
|
|
754 ($qstart, $qstop) = $hsp->range('query');
|
|
755 ($sstart, $sstop) = $hsp->range('sbjct');
|
|
756
|
|
757 my ($qgaps, $sgaps) = $hsp->gaps();
|
|
758 $self->{'_gaps_query'} += $qgaps;
|
|
759 $self->{'_gaps_sbjct'} += $sgaps;
|
|
760
|
|
761 $self->{'_length_aln_total'} += $hsp->length;
|
|
762 ## Collect contigs in the query sequence.
|
|
763 $qoverlap = &_adjust_contigs('query', $hsp, $qstart, $qstop, \@qcontigs, $max_overlap);
|
|
764
|
|
765 ## Collect contigs in the sbjct sequence (needed for domain data and gapped Blast).
|
|
766 $soverlap = &_adjust_contigs('sbjct', $hsp, $sstart, $sstop, \@scontigs, $max_overlap);
|
|
767
|
|
768 ## Collect overall start and stop data for query and sbjct over all HSPs.
|
|
769 if(not defined $self->{'_queryStart'}) {
|
|
770 $self->{'_queryStart'} = $qstart;
|
|
771 $self->{'_queryStop'} = $qstop;
|
|
772 $self->{'_sbjctStart'} = $sstart;
|
|
773 $self->{'_sbjctStop'} = $sstop;
|
|
774 } else {
|
|
775 $self->{'_queryStart'} = ($qstart < $self->{'_queryStart'} ? $qstart : $self->{'_queryStart'});
|
|
776 $self->{'_queryStop'} = ($qstop > $self->{'_queryStop'} ? $qstop : $self->{'_queryStop'});
|
|
777 $self->{'_sbjctStart'} = ($sstart < $self->{'_sbjctStart'} ? $sstart : $self->{'_sbjctStart'});
|
|
778 $self->{'_sbjctStop'} = ($sstop > $self->{'_sbjctStop'} ? $sstop : $self->{'_sbjctStop'});
|
|
779 }
|
|
780 }
|
|
781
|
|
782 ## Collect data across the collected contigs.
|
|
783
|
|
784 # print "\nQUERY CONTIGS:\n";
|
|
785 # print " gaps = $self->{'_gaps_query'}\n";
|
|
786
|
|
787 foreach(@qcontigs) {
|
|
788 # print " query contig: $_->{'start'} - $_->{'stop'}\n";
|
|
789 # print " iden = $_->{'iden'}; cons = $_->{'cons'}\n";
|
|
790 $self->{'_length_aln_query'} += $_->{'stop'} - $_->{'start'} + 1;
|
|
791 $self->{'_totalIdentical'} += $_->{'iden'};
|
|
792 $self->{'_totalConserved'} += $_->{'cons'};
|
|
793 }
|
|
794
|
|
795 ## Collect data for sbjct contigs. Important for gapped Blast.
|
|
796 ## The totalIdentical and totalConserved numbers will be the same
|
|
797 ## as determined for the query contigs.
|
|
798
|
|
799 # print "\nSBJCT CONTIGS:\n";
|
|
800 # print " gaps = $self->{'_gaps_sbjct'}\n";
|
|
801
|
|
802 foreach(@scontigs) {
|
|
803 # print " sbjct contig: $_->{'start'} - $_->{'stop'}\n";
|
|
804 # print " iden = $_->{'iden'}; cons = $_->{'cons'}\n";
|
|
805 $self->{'_length_aln_sbjct'} += $_->{'stop'} - $_->{'start'} + 1;
|
|
806 }
|
|
807 # <STDIN>;
|
|
808
|
|
809 if($qoverlap) {
|
|
810 if($soverlap) { $self->ambiguous_aln('qs');
|
|
811 # print "\n*** AMBIGUOUS ALIGNMENT: Query and Sbjct\n\n";
|
|
812 }
|
|
813 else { $self->ambiguous_aln('q');
|
|
814 # print "\n*** AMBIGUOUS ALIGNMENT: Query\n\n";
|
|
815 }
|
|
816 } elsif($soverlap) {
|
|
817 $self->ambiguous_aln('s');
|
|
818 # print "\n*** AMBIGUOUS ALIGNMENT: Sbjct\n\n";
|
|
819 }
|
|
820
|
|
821 # Adjust length based on BLAST flavor.
|
|
822 my $prog = $self->parent->program;
|
|
823 if($prog eq 'TBLASTN') {
|
|
824 $self->{'_length_aln_sbjct'} /= 3;
|
|
825 } elsif($prog eq 'BLASTX' ) {
|
|
826 $self->{'_length_aln_query'} /= 3;
|
|
827 } elsif($prog eq 'TBLASTX') {
|
|
828 $self->{'_length_aln_query'} /= 3;
|
|
829 $self->{'_length_aln_sbjct'} /= 3;
|
|
830 }
|
|
831 }
|
|
832
|
|
833
|
|
834
|
|
835 =head2 _adjust_contigs
|
|
836
|
|
837 Usage : n/a; called automatically during object construction.
|
|
838 Purpose : Builds HSP contigs for a given BLAST hit.
|
|
839 : Utility method called by _tile_hsps()
|
|
840 Returns :
|
|
841 Argument :
|
|
842 Throws : Exceptions propagated from Bio::Tools::Blast::HSP::matches()
|
|
843 : for invalid sub-sequence ranges.
|
|
844 Status : Experimental
|
|
845 Comments : This method does not currently support gapped alignments.
|
|
846 : Also, it does not keep track of the number of HSPs that
|
|
847 : overlap within the amount specified by overlap().
|
|
848 : This will lead to significant tracking errors for large
|
|
849 : overlap values.
|
|
850
|
|
851 See Also : L<overlap()|overlap>, L<_tile_hsps()|_tile_hsps>, B<Bio::Tools::Blast::HSP>::matches
|
|
852
|
|
853 =cut
|
|
854
|
|
855 #-------------------
|
|
856 sub _adjust_contigs {
|
|
857 #-------------------
|
|
858 my ($seqType, $hsp, $start, $stop, $contigs_ref, $max_overlap) = @_;
|
|
859
|
|
860 my $overlap = 0;
|
|
861 my ($numID, $numCons);
|
|
862
|
|
863 # print "Testing $seqType data: HSP (${\$hsp->name}); $start, $stop\n";
|
|
864 foreach(@$contigs_ref) {
|
|
865 # print " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n";
|
|
866
|
|
867 ## Test special case of a nested HSP. Skip it.
|
|
868 if($start >= $_->{'start'} and $stop <= $_->{'stop'}) {
|
|
869 # print "----> Nested HSP. Skipping.\n";
|
|
870 $overlap = 1;
|
|
871 next;
|
|
872 }
|
|
873
|
|
874 ## Test for overlap at beginning of contig.
|
|
875 if($start < $_->{'start'} and $stop > ($_->{'start'} + $max_overlap)) {
|
|
876 # print "----> Overlaps beg: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n";<STDIN>;
|
|
877 # Collect stats over the non-overlapping region.
|
|
878 eval {
|
|
879 ($numID, $numCons) = $hsp->matches(-SEQ =>$seqType,
|
|
880 -START =>$start,
|
|
881 -STOP =>$_->{'start'}-1);
|
|
882 };
|
|
883 if($@) { warn "\a\n$@\n"; }
|
|
884 else {
|
|
885 $_->{'start'} = $start; # Assign a new start coordinate to the contig
|
|
886 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
|
|
887 $_->{'cons'} += $numCons;
|
|
888 $overlap = 1;
|
|
889 }
|
|
890 }
|
|
891
|
|
892 ## Test for overlap at end of contig.
|
|
893 if($stop > $_->{'stop'} and $start < ($_->{'stop'} - $max_overlap)) {
|
|
894 # print "----> Overlaps end: existing beg,end: $_->{'start'},$_->{'stop'}, new beg,end: $start,$stop\n";<STDIN>;
|
|
895 # Collect stats over the non-overlapping region.
|
|
896 eval {
|
|
897 ($numID,$numCons) = $hsp->matches(-SEQ =>$seqType,
|
|
898 -START =>$_->{'stop'},
|
|
899 -STOP =>$stop);
|
|
900 };
|
|
901 if($@) { warn "\a\n$@\n"; }
|
|
902 else {
|
|
903 $_->{'stop'} = $stop; # Assign a new stop coordinate to the contig
|
|
904 $_->{'iden'} += $numID; # and add new data to #identical, #conserved.
|
|
905 $_->{'cons'} += $numCons;
|
|
906 $overlap = 1;
|
|
907 }
|
|
908 }
|
|
909 $overlap && do {
|
|
910 # print " New Contig data:\n";
|
|
911 # print " Contig: $_->{'start'} - $_->{'stop'}, iden= $_->{'iden'}, cons= $_->{'cons'}\n";
|
|
912 last;
|
|
913 };
|
|
914 }
|
|
915 ## If there is no overlap, add the complete HSP data.
|
|
916 !$overlap && do {
|
|
917 # print "No overlap. Adding new contig.\n";
|
|
918 ($numID,$numCons) = $hsp->matches(-SEQ=>$seqType);
|
|
919 push @$contigs_ref, {'start'=>$start, 'stop'=>$stop,
|
|
920 'iden'=>$numID, 'cons'=>$numCons };
|
|
921 };
|
|
922 # <STDIN>;
|
|
923 $overlap;
|
|
924 }
|
|
925
|
|
926
|
|
927 =head2 ambiguous_aln
|
|
928
|
|
929 Usage : $ambig_code = $sbjct_object->ambiguous_aln();
|
|
930 Purpose : Sets/Gets ambiguity code data member.
|
|
931 Example : (see usage)
|
|
932 Returns : String = 'q', 's', 'qs', '-'
|
|
933 : 'q' = query sequence contains overlapping sub-sequences
|
|
934 : while sbjct does not.
|
|
935 : 's' = sbjct sequence contains overlapping sub-sequences
|
|
936 : while query does not.
|
|
937 : 'qs' = query and sbjct sequence contains overlapping sub-sequences
|
|
938 : relative to each other.
|
|
939 : '-' = query and sbjct sequence do not contains multiple domains
|
|
940 : relative to each other OR both contain the same distribution
|
|
941 : of similar domains.
|
|
942 Argument : n/a
|
|
943 Throws : n/a
|
|
944 Status : Experimental
|
|
945
|
|
946 See Also : L<_tile_hsps()|_tile_hsps>, L<HSP Tiling and Ambiguous Alignments>
|
|
947
|
|
948 =cut
|
|
949
|
|
950 #--------------------
|
|
951 sub ambiguous_aln {
|
|
952 #--------------------
|
|
953 my $self = shift;
|
|
954 if(@_) { $self->{'_ambiguous_aln'} = shift; }
|
|
955 $self->{'_ambiguous_aln'} || '-';
|
|
956 }
|
|
957
|
|
958
|
|
959
|
|
960 =head2 overlap
|
|
961
|
|
962 Usage : $blast_object->overlap( [integer] );
|
|
963 Purpose : Gets/Sets the allowable amount overlap between different HSP sequences.
|
|
964 Example : $blast_object->overlap(5);
|
|
965 : $overlap = $blast_object->overlap();
|
|
966 Returns : Integer.
|
|
967 Argument : integer.
|
|
968 Throws : n/a
|
|
969 Status : Experimental
|
|
970 Comments : Any two HSPs whose sequences overlap by less than or equal
|
|
971 : to the overlap() number of resides will be considered separate HSPs
|
|
972 : and will not get tiled by _adjust_contigs().
|
|
973
|
|
974 See Also : L<_adjust_contigs()|_adjust_contigs>, L<BUGS | BUGS>
|
|
975
|
|
976 =cut
|
|
977
|
|
978 #-------------
|
|
979 sub overlap {
|
|
980 #-------------
|
|
981 my $self = shift;
|
|
982 if(@_) { $self->{'_overlap'} = shift; }
|
|
983 defined $self->{'_overlap'} ? $self->{'_overlap'} : 0;
|
|
984 }
|
|
985
|
|
986
|
|
987
|
|
988
|
|
989 =head2 score
|
|
990
|
|
991 Usage : $sbjct_object->score();
|
|
992 Purpose : Gets the BLAST score of the best HSP for the current Blast hit.
|
|
993 Example : $score = $sbjct_object->score();
|
|
994 Returns : Integer
|
|
995 Argument : n/a
|
|
996 Throws : n/a
|
|
997
|
|
998 See Also : L<bits()|bits>
|
|
999
|
|
1000 =cut
|
|
1001
|
|
1002 #----------
|
|
1003 sub score {
|
|
1004 #----------
|
|
1005 my $self = shift;
|
|
1006
|
|
1007 # The check for $self->{'_score'} is a remnant from the 'query' mode days
|
|
1008 # in which the sbjct object would collect data from the description line only.
|
|
1009
|
|
1010 my ($score);
|
|
1011 if(not defined($self->{'_score'})) {
|
|
1012 $score = $self->hsp->score;
|
|
1013 } else {
|
|
1014 $score = $self->{'_score'};
|
|
1015 }
|
|
1016 return $score;
|
|
1017 }
|
|
1018
|
|
1019
|
|
1020
|
|
1021 =head2 bits
|
|
1022
|
|
1023 Usage : $sbjct_object->bits();
|
|
1024 Purpose : Gets the BLAST bit score of the best HSP for the current Blast hit.
|
|
1025 Example : $bits = $sbjct_object->bits();
|
|
1026 Returns : Integer
|
|
1027 Argument : n/a
|
|
1028 Throws : Exception if bit score is not set.
|
|
1029 Comments : For BLAST1, the non-bit score is listed in the summary line.
|
|
1030
|
|
1031 See Also : L<score()|score>
|
|
1032
|
|
1033 =cut
|
|
1034
|
|
1035 #---------
|
|
1036 sub bits {
|
|
1037 #---------
|
|
1038 my $self = shift;
|
|
1039
|
|
1040 # The check for $self->{'_bits'} is a remnant from the 'query' mode days
|
|
1041 # in which the sbjct object would collect data from the description line only.
|
|
1042
|
|
1043 my ($bits);
|
|
1044 if(not defined($self->{'_bits'})) {
|
|
1045 $bits = $self->hsp->bits;
|
|
1046 } else {
|
|
1047 $bits = $self->{'_bits'};
|
|
1048 }
|
|
1049 return $bits;
|
|
1050 }
|
|
1051
|
|
1052
|
|
1053
|
|
1054 =head2 n
|
|
1055
|
|
1056 Usage : $sbjct_object->n();
|
|
1057 Purpose : Gets the N number for the current Blast hit.
|
|
1058 : This is the number of HSPs in the set which was ascribed
|
|
1059 : the lowest P-value (listed on the description line).
|
|
1060 : This number is not the same as the total number of HSPs.
|
|
1061 : To get the total number of HSPs, use num_hsps().
|
|
1062 Example : $n = $sbjct_object->n();
|
|
1063 Returns : Integer
|
|
1064 Argument : n/a
|
|
1065 Throws : Exception if HSPs have not been set (BLAST2 reports).
|
|
1066 Comments : Note that the N parameter is not reported in gapped BLAST2.
|
|
1067 : Calling n() on such reports will result in a call to num_hsps().
|
|
1068 : The num_hsps() method will count the actual number of
|
|
1069 : HSPs in the alignment listing, which may exceed N in
|
|
1070 : some cases.
|
|
1071
|
|
1072 See Also : L<num_hsps()|num_hsps>
|
|
1073
|
|
1074 =cut
|
|
1075
|
|
1076 #-----
|
|
1077 sub n {
|
|
1078 #-----
|
|
1079 my $self = shift;
|
|
1080
|
|
1081 # The check for $self->{'_n'} is a remnant from the 'query' mode days
|
|
1082 # in which the sbjct object would collect data from the description line only.
|
|
1083
|
|
1084 my ($n);
|
|
1085 if(not defined($self->{'_n'})) {
|
|
1086 $n = $self->hsp->n;
|
|
1087 } else {
|
|
1088 $n = $self->{'_n'};
|
|
1089 }
|
|
1090 $n ||= $self->num_hsps;
|
|
1091
|
|
1092 return $n;
|
|
1093 }
|
|
1094
|
|
1095
|
|
1096
|
|
1097 =head2 frame
|
|
1098
|
|
1099 Usage : $sbjct_object->frame();
|
|
1100 Purpose : Gets the reading frame for the hit sequence (TBLASTN/X only).
|
|
1101 Example : $frame = $sbjct_object->frame();
|
|
1102 Returns : Integer (-3 .. +3).
|
|
1103 Argument : n/a
|
|
1104 Throws : Exception if HSPs have not been set (BLAST2 reports).
|
|
1105
|
|
1106 See Also : L<hsps()|hsps>
|
|
1107
|
|
1108 =cut
|
|
1109
|
|
1110 #----------
|
|
1111 sub frame {
|
|
1112 #----------
|
|
1113 my $self = shift;
|
|
1114
|
|
1115 # The check for $self->{'_frame'} is a remnant from the 'query' mode days
|
|
1116 # in which the sbjct object would collect data from the description line only.
|
|
1117
|
|
1118 my ($frame);
|
|
1119 if(not defined($self->{'_frame'})) {
|
|
1120 $frame = $self->hsp->frame;
|
|
1121 } else {
|
|
1122 $frame = $self->{'_frame'};
|
|
1123 }
|
|
1124 return $frame;
|
|
1125 }
|
|
1126
|
|
1127
|
|
1128
|
|
1129
|
|
1130
|
|
1131 =head2 p
|
|
1132
|
|
1133 Usage : $sbjct_object->p( [format] );
|
|
1134 Purpose : Get the P-value for the given BLAST hit.
|
|
1135 : (Note that P-values are not provided with NCBI Blast2 reports).
|
|
1136 Example : $p = $sbjct->p;
|
|
1137 : $p = $sbjct->p('exp'); # get exponent only.
|
|
1138 : ($num, $exp) = $sbjct->p('parts'); # split sci notation into parts
|
|
1139 Returns : Float or scientific notation number (the raw P-value, DEFAULT).
|
|
1140 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
|
|
1141 : 2-element list (float, int) if format == 'parts' and P-value
|
|
1142 : is in scientific notation (See Comments).
|
|
1143 Argument : format: string of 'raw' | 'exp' | 'parts'
|
|
1144 : 'raw' returns value given in report. Default. (1.2e-34)
|
|
1145 : 'exp' returns exponent value only (34)
|
|
1146 : 'parts' returns the decimal and exponent as a
|
|
1147 : 2-element list (1.2, -34) (See Comments).
|
|
1148 Throws : Exception if the P-value is not defined, which will occur
|
|
1149 : with any NCBI Blast2 report.
|
|
1150 Comments : Using the 'parts' argument is not recommended since it will not
|
|
1151 : work as expected if the P-value is not in scientific notation.
|
|
1152 : That is, floats are not converted into sci notation before
|
|
1153 : splitting into parts.
|
|
1154
|
|
1155 See Also : L<expect()|expect>, L<signif()|signif>, L<get_exponent()|get_exponent>
|
|
1156
|
|
1157 =cut
|
|
1158
|
|
1159 #--------
|
|
1160 sub p {
|
|
1161 #--------
|
|
1162 # Some duplication of logic for p(), expect() and signif() for the sake of performance.
|
|
1163 my ($self, $fmt) = @_;
|
|
1164
|
|
1165 my ($val);
|
|
1166 $fmt ||= $_signif_fmt;
|
|
1167
|
|
1168 # $val can be zero.
|
|
1169 if(not defined($val = $self->{'_p'})) {
|
|
1170 ## P-value not defined, must be a NCBI Blast2 report.
|
|
1171 my $note = '';
|
|
1172 if($self->parent->_layout() == 2) {
|
|
1173 $note = "Blast2 does not report P-values. Use expect() instead.";
|
|
1174 }
|
|
1175 $self->throw("Can't get P-value: undefined.", $note);
|
|
1176 }
|
|
1177
|
|
1178 return $val if not $fmt or $fmt =~ /^raw/i;
|
|
1179 ## Special formats: exponent-only or as list.
|
|
1180 return &get_exponent($val) if $fmt =~ /^exp/i;
|
|
1181 return (split (/eE/, $val)) if $fmt =~ /^parts/i;
|
|
1182
|
|
1183 ## Default: return the raw P-value.
|
|
1184 return $val;
|
|
1185 }
|
|
1186
|
|
1187
|
|
1188
|
|
1189 =head2 expect
|
|
1190
|
|
1191 Usage : $sbjct_object->expect( [format] );
|
|
1192 Purpose : Get the Expect value for the given BLAST hit.
|
|
1193 Example : $e = $sbjct->expect;
|
|
1194 : $e = $sbjct->expect('exp'); # get exponent only.
|
|
1195 : ($num, $exp) = $sbjct->expect('parts'); # split sci notation into parts
|
|
1196 Returns : Float or scientific notation number (the raw expect value, DEFAULT).
|
|
1197 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
|
|
1198 : 2-element list (float, int) if format == 'parts' and Expect
|
|
1199 : is in scientific notation (see Comments).
|
|
1200 Argument : format: string of 'raw' | 'exp' | 'parts'
|
|
1201 : 'raw' returns value given in report. Default. (1.2e-34)
|
|
1202 : 'exp' returns exponent value only (34)
|
|
1203 : 'parts' returns the decimal and exponent as a
|
|
1204 : 2-element list (1.2, -34) (see Comments).
|
|
1205 Throws : Exception if the Expect value is not defined.
|
|
1206 Comments : Using the 'parts' argument is not recommended since it will not
|
|
1207 : work as expected if the expect value is not in scientific notation.
|
|
1208 : That is, floats are not converted into sci notation before
|
|
1209 : splitting into parts.
|
|
1210
|
|
1211 See Also : L<p()|p>, L<signif()|signif>, L<get_exponent()|get_exponent>
|
|
1212
|
|
1213 =cut
|
|
1214
|
|
1215 #-----------
|
|
1216 sub expect {
|
|
1217 #-----------
|
|
1218 # Some duplication of logic for p(), expect() and signif() for the sake of performance.
|
|
1219 my ($self, $fmt) = @_;
|
|
1220
|
|
1221 my $val = $self->{'_expect'};
|
|
1222 $fmt ||= $_signif_fmt;
|
|
1223
|
|
1224 # $val can be zero.
|
|
1225 defined($val) or $self->throw("Can't get Expect value: HSPs may not have been set.");
|
|
1226
|
|
1227 return $val if not $fmt or $fmt =~ /^raw/i;
|
|
1228 ## Special formats: exponent-only or as list.
|
|
1229 return &get_exponent($val) if $fmt =~ /^exp/i;
|
|
1230 return (split (/eE/, $val)) if $fmt =~ /^parts/i;
|
|
1231
|
|
1232 ## Default: return the raw Expect-value.
|
|
1233 return $val;
|
|
1234 }
|
|
1235
|
|
1236
|
|
1237
|
|
1238 =head2 signif
|
|
1239
|
|
1240 Usage : $sbjct_object->signif( [format] );
|
|
1241 Purpose : Get the P or Expect value for the given BLAST hit.
|
|
1242 : The value returned is the one which is reported in the description
|
|
1243 : section of the Blast report. For Blast1 and WU-Blast2, this
|
|
1244 : is a P-value, for Blast2, it is an Expect value.
|
|
1245 Example : $obj->signif() # returns 1.3e-34
|
|
1246 : $obj->signif('exp') # returns -34
|
|
1247 : $obj->signif('parts') # returns (1.3, -34)
|
|
1248 Returns : Float or scientific notation number (the raw P/Expect value, DEFAULT).
|
|
1249 : Integer if format == 'exp' (the magnitude of the base 10 exponent).
|
|
1250 : 2-element list (float, int) if format == 'parts' and P/Expect value
|
|
1251 : is in scientific notation (see Comments).
|
|
1252 Argument : format: string of 'raw' | 'exp' | 'parts'
|
|
1253 : 'raw' returns value given in report. Default. (1.2e-34)
|
|
1254 : 'exp' returns exponent value only (34)
|
|
1255 : 'parts' returns the decimal and exponent as a
|
|
1256 : 2-element list (1.2, -34) (see Comments).
|
|
1257 Throws : n/a
|
|
1258 Status : Deprecated. Use p() or expect().
|
|
1259 Comments : The signif() method provides a way to deal with the fact that
|
|
1260 : Blast1 and Blast2 formats differ in what is reported in the
|
|
1261 : description lines of each hit in the Blast report. The signif()
|
|
1262 : method frees any client code from having to know if this is a P-value
|
|
1263 : or an Expect value, making it easier to write code that can process
|
|
1264 : both Blast1 and Blast2 reports. This is not necessarily a good thing, since
|
|
1265 : one should always know when one is working with P-values or
|
|
1266 : Expect values (hence the deprecated status).
|
|
1267 : Use of expect() is recommended since all hits will have an Expect value.
|
|
1268 :
|
|
1269 : Using the 'parts' argument is not recommended since it will not
|
|
1270 : work as expected if the expect value is not in scientific notation.
|
|
1271 : That is, floats are not converted into sci notation before
|
|
1272 : splitting into parts.
|
|
1273
|
|
1274 See Also : L<p()|p>, L<expect()|expect>, L<get_exponent()|get_exponent>
|
|
1275
|
|
1276 =cut
|
|
1277
|
|
1278 #-------------
|
|
1279 sub signif {
|
|
1280 #-------------
|
|
1281 # Some duplication of logic for p(), expect() and signif() for the sake of performance.
|
|
1282 my ($self, $fmt) = @_;
|
|
1283
|
|
1284 my $val = defined($self->{'_p'}) ? $self->{'_p'} : $self->{'_expect'};
|
|
1285 $fmt ||= $_signif_fmt;
|
|
1286
|
|
1287 # $val can be zero.
|
|
1288 defined($val) or $self->throw("Can't get P- or Expect value: HSPs may not have been set.");
|
|
1289
|
|
1290 return $val if not $fmt or $fmt =~ /^raw/i;
|
|
1291 ## Special formats: exponent-only or as list.
|
|
1292 return &get_exponent($val) if $fmt =~ /^exp/i;
|
|
1293 return (split (/eE/, $val)) if $fmt =~ /^parts/i;
|
|
1294
|
|
1295 ## Default: return the raw P/Expect-value.
|
|
1296 return $val;
|
|
1297 }
|
|
1298
|
|
1299
|
|
1300
|
|
1301 =head2 desc
|
|
1302
|
|
1303 Usage : $sbjct_object->desc( [integer] );
|
|
1304 Purpose : Get the description for the given BLAST hit.
|
|
1305 Example : (see usage)
|
|
1306 Returns : String
|
|
1307 Argument : Integer (optional) indicating the desired length of the
|
|
1308 : description string to be returned.
|
|
1309 Throws : n/a
|
|
1310
|
|
1311 See Also : L<_set_desc()|_set_desc>
|
|
1312
|
|
1313 =cut
|
|
1314
|
|
1315 #---------
|
|
1316 sub desc {
|
|
1317 #---------
|
|
1318 my( $self, $len ) = @_;
|
|
1319 $len = (defined $len) ? $len : (CORE::length $self->{'_desc'});
|
|
1320 substr( $self->{'_desc'}, 0 ,$len );
|
|
1321 }
|
|
1322
|
|
1323
|
|
1324
|
|
1325 =head2 database
|
|
1326
|
|
1327 Usage : $sbjct_object->database();
|
|
1328 Purpose : Get the name of the database for the hit sequence.
|
|
1329 Example : (see usage)
|
|
1330 Returns : String
|
|
1331 Argument : n/a
|
|
1332 Throws : n/a
|
|
1333 Status : Deprecated. Use Bio::Tools::Blast::database()
|
|
1334 Extracting database name from the seq identifier is error prone.
|
|
1335 Comments : Database id should be the same for all hits in a given
|
|
1336 : BLAST report, however, they do not always have the same
|
|
1337 : name as the database name extraced by the Blast.pm object.
|
|
1338 : The Sbjct.pm database id is obtained from the summary line.
|
|
1339
|
|
1340 =cut
|
|
1341
|
|
1342 #--------------
|
|
1343 sub database {
|
|
1344 my $self = shift;
|
|
1345 $self->warn("Bio::Tools::Sbjct::database() is deprecated.\nNo useful information is provided by this method.\nUse Bio::Tools::Blast::database().\n");
|
|
1346 return $self->{'_db'};
|
|
1347 }
|
|
1348 #--------------
|
|
1349
|
|
1350
|
|
1351
|
|
1352
|
|
1353 =head2 hsps
|
|
1354
|
|
1355 Usage : $sbjct_object->hsps();
|
|
1356 Purpose : Get a list containing all HSP objects.
|
|
1357 : Get the numbers of HSPs for the current hit.
|
|
1358 Example : @hsps = $sbjct_object->hsps();
|
|
1359 : $num = $sbjct_object->hsps(); # alternatively, use num_hsps()
|
|
1360 Returns : Array context : list of Bio::Tools::Blast::HSP.pm objects.
|
|
1361 : Scalar context: integer (number of HSPs).
|
|
1362 : (Equivalent to num_hsps()).
|
|
1363 Argument : n/a. Relies on wantarray
|
|
1364 Throws : Exception if the HSPs have not been collected.
|
|
1365
|
|
1366 See Also : L<hsp()|hsp>, L<num_hsps()|num_hsps>, L<_set_hsps()|_set_hsps>
|
|
1367
|
|
1368 =cut
|
|
1369
|
|
1370 #---------
|
|
1371 sub hsps {
|
|
1372 #---------
|
|
1373 my $self = shift;
|
|
1374
|
|
1375 if (not ref $self->{'_hsps'}) {
|
|
1376 $self->throw("Can't get HSPs: data not collected.");
|
|
1377 }
|
|
1378
|
|
1379 return wantarray
|
|
1380 # returning list containing all HSPs.
|
|
1381 ? @{$self->{'_hsps'}}
|
|
1382 # returning number of HSPs.
|
|
1383 : scalar(@{$self->{'_hsps'}});
|
|
1384 }
|
|
1385
|
|
1386
|
|
1387
|
|
1388 =head2 hsp
|
|
1389
|
|
1390 Usage : $sbjct_object->hsp( [string] );
|
|
1391 Purpose : Get a single HSP.pm object for the present Sbjct.pm object.
|
|
1392 Example : $hspObj = $sbjct_object->hsp; # same as 'best'
|
|
1393 : $hspObj = $sbjct_object->hsp('best');
|
|
1394 : $hspObj = $sbjct_object->hsp('worst');
|
|
1395 Returns : Object reference for a Bio::Tools::Blast::HSP.pm object.
|
|
1396 Argument : String (or no argument).
|
|
1397 : No argument (default) = highest scoring HSP (same as 'best').
|
|
1398 : 'best' or 'first' = highest scoring HSP.
|
|
1399 : 'worst' or 'last' = lowest scoring HSP.
|
|
1400 Throws : Exception if the HSPs have not been collected.
|
|
1401 : Exception if an unrecognized argument is used.
|
|
1402
|
|
1403 See Also : L<hsps()|hsps>, L<num_hsps()|num_hsps>, L<_set_hsps()|_set_hsps>
|
|
1404
|
|
1405 =cut
|
|
1406
|
|
1407 #----------
|
|
1408 sub hsp {
|
|
1409 #----------
|
|
1410 my( $self, $option ) = @_;
|
|
1411 $option ||= 'best';
|
|
1412
|
|
1413 if (not ref $self->{'_hsps'}) {
|
|
1414 $self->throw("Can't get HSPs: data not collected.");
|
|
1415 }
|
|
1416
|
|
1417 my @hsps = @{$self->{'_hsps'}};
|
|
1418
|
|
1419 return $hsps[0] if $option =~ /best|first|1/i;
|
|
1420 return $hsps[$#hsps] if $option =~ /worst|last/i;
|
|
1421
|
|
1422 $self->throw("Can't get HSP for: $option",
|
|
1423 "Valid arguments: 'best', 'worst'");
|
|
1424 }
|
|
1425
|
|
1426
|
|
1427
|
|
1428 =head2 num_hsps
|
|
1429
|
|
1430 Usage : $sbjct_object->num_hsps();
|
|
1431 Purpose : Get the number of HSPs for the present Blast hit.
|
|
1432 Example : $nhsps = $sbjct_object->num_hsps();
|
|
1433 Returns : Integer
|
|
1434 Argument : n/a
|
|
1435 Throws : Exception if the HSPs have not been collected.
|
|
1436
|
|
1437 See Also : L<hsps()|hsps>
|
|
1438
|
|
1439 =cut
|
|
1440
|
|
1441 #-------------
|
|
1442 sub num_hsps {
|
|
1443 #-------------
|
|
1444 my $self = shift;
|
|
1445
|
|
1446 if (not defined $self->{'_hsps'}) {
|
|
1447 $self->throw("Can't get HSPs: data not collected.");
|
|
1448 }
|
|
1449
|
|
1450 return scalar(@{$self->{'_hsps'}});
|
|
1451 }
|
|
1452
|
|
1453
|
|
1454
|
|
1455 =head2 length
|
|
1456
|
|
1457 Usage : $sbjct_object->length();
|
|
1458 Purpose : Get the total length of the hit sequence.
|
|
1459 Example : $len = $sbjct_object->length();
|
|
1460 Returns : Integer
|
|
1461 Argument : n/a
|
|
1462 Throws : n/a
|
|
1463 Comments : Developer note: when using the built-in length function within
|
|
1464 : this module, call it as CORE::length().
|
|
1465
|
|
1466 See Also : L<logical_length()|logical_length>, L<length_aln()|length_aln>
|
|
1467
|
|
1468 =cut
|
|
1469
|
|
1470 #-----------
|
|
1471 sub length {
|
|
1472 #-----------
|
|
1473 my $self = shift;
|
|
1474 $self->{'_length'};
|
|
1475 }
|
|
1476
|
|
1477
|
|
1478 =head2 logical_length
|
|
1479
|
|
1480 Usage : $sbjct_object->logical_length( [seq_type] );
|
|
1481 : (mostly intended for internal use).
|
|
1482 Purpose : Get the logical length of the hit sequence.
|
|
1483 : If the Blast is a TBLASTN or TBLASTX, the returned length
|
|
1484 : is the length of the would-be amino acid sequence (length/3).
|
|
1485 : For all other BLAST flavors, this function is the same as length().
|
|
1486 Example : $len = $sbjct_object->logical_length();
|
|
1487 Returns : Integer
|
|
1488 Argument : seq_type = 'query' or 'sbjct' (default = 'query')
|
|
1489 Throws : n/a
|
|
1490 Comments : This is important for functions like frac_aligned_query()
|
|
1491 : which need to operate in amino acid coordinate space when dealing
|
|
1492 : with [T]BLAST[NX] type reports.
|
|
1493
|
|
1494 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>
|
|
1495
|
|
1496 =cut
|
|
1497
|
|
1498 #--------------------
|
|
1499 sub logical_length {
|
|
1500 #--------------------
|
|
1501 my $self = shift;
|
|
1502 my $seqType = shift || 'query';
|
|
1503
|
|
1504 # Return logical sbjct length
|
|
1505 $seqType eq 'sbjct' and return
|
|
1506 $self->{'_logical_length'} || $self->{'_length'};
|
|
1507
|
|
1508 # Otherwise, return logical query length
|
|
1509 my $qlen = $self->parent->length;
|
|
1510
|
|
1511 # Adjust length based on BLAST flavor.
|
|
1512 my $prog = $self->parent->program;
|
|
1513 if($prog =~ /T?BLASTX/ ) {
|
|
1514 $qlen /= 3;
|
|
1515 }
|
|
1516 return $qlen;
|
|
1517 }
|
|
1518
|
|
1519
|
|
1520
|
|
1521 =head2 length_aln
|
|
1522
|
|
1523 Usage : $sbjct_object->length_aln( [seq_type] );
|
|
1524 Purpose : Get the total length of the aligned region for query or sbjct seq.
|
|
1525 : This number will include all HSPs
|
|
1526 Example : $len = $sbjct_object->length_aln(); # default = query
|
|
1527 : $lenAln = $sbjct_object->length_aln('query');
|
|
1528 Returns : Integer
|
|
1529 Argument : seq_Type = 'query' | 'sbjct' (Default = 'query')
|
|
1530 Throws : Exception if the argument is not recognized.
|
|
1531 Comments : This method will report the logical length of the alignment,
|
|
1532 : meaning that for TBLAST[NX] reports, the length is reported
|
|
1533 : using amino acid coordinate space (i.e., nucleotides / 3).
|
|
1534 :
|
|
1535 : This method requires that all HSPs be tiled. If they have not
|
|
1536 : already been tiled, they will be tiled first.
|
|
1537 : If you don't want the tiled data, iterate through each HSP
|
|
1538 : calling length() on each (use hsps() to get the HSPs).
|
|
1539
|
|
1540 See Also : L<length()|length>, L<frac_aligned_query()|frac_aligned_query>, L<frac_aligned_hit()|frac_aligned_hit>, L<gaps()|gaps>, L<_tile_hsps()|_tile_hsps>, B<Bio::Tools::Blast::HSP::length()>
|
|
1541
|
|
1542 =cut
|
|
1543
|
|
1544 #---------------'
|
|
1545 sub length_aln {
|
|
1546 #---------------
|
|
1547 my( $self, $type ) = @_;
|
|
1548
|
|
1549 $type ||= 'query';
|
|
1550
|
|
1551 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1552
|
|
1553 my $data = $self->{'_length_aln_'.$type};
|
|
1554
|
|
1555 ## If we don't have data, figure out what went wrong.
|
|
1556 if(!$data) {
|
|
1557 $self->throw("Can't get length aln for sequence type \"$type\"",
|
|
1558 "Valid types are 'query', 'sbjct'");
|
|
1559 }
|
|
1560 $data;
|
|
1561 }
|
|
1562
|
|
1563
|
|
1564 =head2 gaps
|
|
1565
|
|
1566 Usage : $sbjct_object->gaps( [seq_type] );
|
|
1567 Purpose : Get the number of gaps in the aligned query, sbjct, or both sequences.
|
|
1568 : Data is summed across all HSPs.
|
|
1569 Example : $qgaps = $sbjct_object->gaps('query');
|
|
1570 : $sgaps = $sbjct_object->gaps('sbjct');
|
|
1571 : $tgaps = $sbjct_object->gaps(); # default = total (query + sbjct)
|
|
1572 Returns : scalar context: integer
|
|
1573 : array context without args: two-element list of integers
|
|
1574 : (queryGaps, sbjctGaps)
|
|
1575 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
1576 Argument : seq_type: 'query' | 'sbjct' | 'total' | 'list' (default = 'total')
|
|
1577 Throws : n/a
|
|
1578 Comments : If you need data for each HSP, use hsps() and then interate
|
|
1579 : through each HSP object.
|
|
1580 : This method requires that all HSPs be tiled. If they have not
|
|
1581 : already been tiled, they will be tiled first.
|
|
1582 : Not relying on wantarray since that will fail in situations
|
|
1583 : such as printf "%d", $hit->gaps() in which you might expect to
|
|
1584 : be printing the total gaps, but evaluates to array context.
|
|
1585
|
|
1586 See Also : L<length_aln()|length_aln>
|
|
1587
|
|
1588 =cut
|
|
1589
|
|
1590 #----------
|
|
1591 sub gaps {
|
|
1592 #----------
|
|
1593 my( $self, $seqType ) = @_;
|
|
1594
|
|
1595 $seqType ||= (wantarray ? 'list' : 'total');
|
|
1596
|
|
1597 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1598
|
|
1599 $seqType = lc($seqType);
|
|
1600
|
|
1601 if($seqType =~ /list|array/i) {
|
|
1602 return ($self->{'_gaps_query'}, $self->{'_gaps_sbjct'});
|
|
1603 }
|
|
1604
|
|
1605 if($seqType eq 'total') {
|
|
1606 return ($self->{'_gaps_query'} + $self->{'_gaps_sbjct'}) || 0;
|
|
1607 } else {
|
|
1608 return $self->{'_gaps_'.$seqType} || 0;
|
|
1609 }
|
|
1610 }
|
|
1611
|
|
1612
|
|
1613
|
|
1614 =head2 matches
|
|
1615
|
|
1616 Usage : $sbjct_object->matches( [class] );
|
|
1617 Purpose : Get the total number of identical or conserved matches
|
|
1618 : (or both) across all HSPs.
|
|
1619 : (Note: 'conservative' matches are indicated as 'positives'
|
|
1620 : in the Blast report.)
|
|
1621 Example : ($id,$cons) = $sbjct_object->matches(); # no argument
|
|
1622 : $id = $sbjct_object->matches('id');
|
|
1623 : $cons = $sbjct_object->matches('cons');
|
|
1624 Returns : Integer or a 2-element array of integers
|
|
1625 Argument : class = 'id' | 'cons' OR none.
|
|
1626 : If no argument is provided, both identical and conservative
|
|
1627 : numbers are returned in a two element list.
|
|
1628 : (Other terms can be used to refer to the conservative
|
|
1629 : matches, e.g., 'positive'. All that is checked is whether or
|
|
1630 : not the supplied string starts with 'id'. If not, the
|
|
1631 : conservative matches are returned.)
|
|
1632 Throws : Exception if the requested data cannot be obtained.
|
|
1633 Comments : If you need data for each HSP, use hsps() and then interate
|
|
1634 : through the HSP objects.
|
|
1635 : Does not rely on wantarray to return a list. Only checks for
|
|
1636 : the presence of an argument (no arg = return list).
|
|
1637
|
|
1638 See Also : B<Bio::Tools::Blast::HSP::matches()>, L<hsps()|hsps>
|
|
1639
|
|
1640 =cut
|
|
1641
|
|
1642 #---------------
|
|
1643 sub matches {
|
|
1644 #---------------
|
|
1645 my( $self, $arg) = @_;
|
|
1646 my(@data,$data);
|
|
1647
|
|
1648 if(!$arg) {
|
|
1649 @data = ($self->{'_totalIdentical'}, $self->{'_totalConserved'});
|
|
1650
|
|
1651 return @data if @data;
|
|
1652
|
|
1653 } else {
|
|
1654
|
|
1655 if($arg =~ /^id/i) {
|
|
1656 $data = $self->{'_totalIdentical'};
|
|
1657 } else {
|
|
1658 $data = $self->{'_totalConserved'};
|
|
1659 }
|
|
1660 return $data if $data;
|
|
1661 }
|
|
1662
|
|
1663 ## Something went wrong if we make it to here.
|
|
1664 $self->throw("Can't get identical or conserved data: no data.");
|
|
1665 }
|
|
1666
|
|
1667
|
|
1668 =head2 start
|
|
1669
|
|
1670 Usage : $sbjct->start( [seq_type] );
|
|
1671 Purpose : Gets the start coordinate for the query, sbjct, or both sequences
|
|
1672 : in the Sbjct object. If there is more than one HSP, the lowest start
|
|
1673 : value of all HSPs is returned.
|
|
1674 Example : $qbeg = $sbjct->start('query');
|
|
1675 : $sbeg = $sbjct->start('sbjct');
|
|
1676 : ($qbeg, $sbeg) = $sbjct->start();
|
|
1677 Returns : scalar context: integer
|
|
1678 : array context without args: list of two integers (queryStart, sbjctStart)
|
|
1679 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
1680 Argument : In scalar context: seq_type = 'query' or 'sbjct'
|
|
1681 : (case insensitive). If not supplied, 'query' is used.
|
|
1682 Throws : n/a
|
|
1683 Comments : This method requires that all HSPs be tiled. If there is more than one
|
|
1684 : HSP and they have not already been tiled, they will be tiled first.
|
|
1685 : Remember that the start and end coordinates of all HSPs are
|
|
1686 : normalized so that start < end. Strand information can only be
|
|
1687 : obtained on an HSP-by-HSP basis by calling $hsp->strand().
|
|
1688
|
|
1689 See Also : L<end()|end>, L<range()|range>, L<HSP Tiling and Ambiguous Alignments>, B<Bio::Tools::Blast::HSP::start>()
|
|
1690
|
|
1691 =cut
|
|
1692
|
|
1693 #----------
|
|
1694 sub start {
|
|
1695 #----------
|
|
1696 my ($self, $seqType) = @_;
|
|
1697
|
|
1698 $seqType ||= (wantarray ? 'list' : 'query');
|
|
1699
|
|
1700 # If there is only one HSP, defer this call to the solitary HSP.
|
|
1701 if($self->num_hsps == 1) {
|
|
1702 return $self->hsp->start($seqType);
|
|
1703 } else {
|
|
1704 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1705 if($seqType =~ /list|array/i) {
|
|
1706 return ($self->{'_queryStart'}, $self->{'_sbjctStart'});
|
|
1707 } else {
|
|
1708 ## Sensitive to member name changes.
|
|
1709 $seqType = "_\L$seqType\E";
|
|
1710 return $self->{$seqType.'Start'};
|
|
1711 }
|
|
1712 }
|
|
1713 }
|
|
1714
|
|
1715
|
|
1716 =head2 end
|
|
1717
|
|
1718 Usage : $sbjct->end( [seq_type] );
|
|
1719 Purpose : Gets the end coordinate for the query, sbjct, or both sequences
|
|
1720 : in the Sbjct object. If there is more than one HSP, the largest end
|
|
1721 : value of all HSPs is returned.
|
|
1722 Example : $qend = $sbjct->end('query');
|
|
1723 : $send = $sbjct->end('sbjct');
|
|
1724 : ($qend, $send) = $sbjct->end();
|
|
1725 Returns : scalar context: integer
|
|
1726 : array context without args: list of two integers (queryEnd, sbjctEnd)
|
|
1727 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
1728 Argument : In scalar context: seq_type = 'query' or 'sbjct'
|
|
1729 : (case insensitive). If not supplied, 'query' is used.
|
|
1730 Throws : n/a
|
|
1731 Comments : This method requires that all HSPs be tiled. If there is more than one
|
|
1732 : HSP and they have not already been tiled, they will be tiled first.
|
|
1733 : Remember that the start and end coordinates of all HSPs are
|
|
1734 : normalized so that start < end. Strand information can only be
|
|
1735 : obtained on an HSP-by-HSP basis by calling $hsp->strand().
|
|
1736
|
|
1737 See Also : L<start()|start>, L<range()|range>, L<HSP Tiling and Ambiguous Alignments>, B<Bio::Tools::Blast::HSP::end>()
|
|
1738
|
|
1739 =cut
|
|
1740
|
|
1741 #----------
|
|
1742 sub end {
|
|
1743 #----------
|
|
1744 my ($self, $seqType) = @_;
|
|
1745
|
|
1746 $seqType ||= (wantarray ? 'list' : 'query');
|
|
1747
|
|
1748 # If there is only one HSP, defer this call to the solitary HSP.
|
|
1749 if($self->num_hsps == 1) {
|
|
1750 return $self->hsp->end($seqType);
|
|
1751 } else {
|
|
1752 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1753 if($seqType =~ /list|array/i) {
|
|
1754 return ($self->{'_queryStop'}, $self->{'_sbjctStop'});
|
|
1755 } else {
|
|
1756 ## Sensitive to member name changes.
|
|
1757 $seqType = "_\L$seqType\E";
|
|
1758 return $self->{$seqType.'Stop'};
|
|
1759 }
|
|
1760 }
|
|
1761 }
|
|
1762
|
|
1763 =head2 range
|
|
1764
|
|
1765 Usage : $sbjct->range( [seq_type] );
|
|
1766 Purpose : Gets the (start, end) coordinates for the query or sbjct sequence
|
|
1767 : in the HSP alignment.
|
|
1768 Example : ($qbeg, $qend) = $sbjct->range('query');
|
|
1769 : ($sbeg, $send) = $sbjct->range('sbjct');
|
|
1770 Returns : Two-element array of integers
|
|
1771 Argument : seq_type = string, 'query' or 'sbjct' (default = 'query')
|
|
1772 : (case insensitive).
|
|
1773 Throws : n/a
|
|
1774
|
|
1775 See Also : L<start()|start>, L<end()|end>
|
|
1776
|
|
1777 =cut
|
|
1778
|
|
1779 #----------
|
|
1780 sub range {
|
|
1781 #----------
|
|
1782 my ($self, $seqType) = @_;
|
|
1783 $seqType ||= 'query';
|
|
1784 return ($self->start($seqType), $self->end($seqType));
|
|
1785 }
|
|
1786
|
|
1787
|
|
1788 =head2 frac_identical
|
|
1789
|
|
1790 Usage : $sbjct_object->frac_identical( [seq_type] );
|
|
1791 Purpose : Get the overall fraction of identical positions across all HSPs.
|
|
1792 : The number refers to only the aligned regions and does not
|
|
1793 : account for unaligned regions in between the HSPs, if any.
|
|
1794 Example : $frac_iden = $sbjct_object->frac_identical('query');
|
|
1795 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1796 Argument : seq_type: 'query' | 'sbjct' | 'total'
|
|
1797 : default = 'total' (but see comments below).
|
|
1798 Throws : n/a
|
|
1799 Comments : Different versions of Blast report different values for the total
|
|
1800 : length of the alignment. This is the number reported in the
|
|
1801 : denominators in the stats section:
|
|
1802 : "Identical = 34/120 Positives = 67/120".
|
|
1803 : BLAST-GP uses the total length of the alignment (with gaps)
|
|
1804 : WU-BLAST uses the length of the query sequence (without gaps).
|
|
1805 : Therefore, when called without an argument or an argument of 'total',
|
|
1806 : this method will report different values depending on the
|
|
1807 : version of BLAST used.
|
|
1808 :
|
|
1809 : To get the fraction identical among only the aligned residues,
|
|
1810 : ignoring the gaps, call this method with an argument of 'query'
|
|
1811 : or 'sbjct'.
|
|
1812 :
|
|
1813 : If you need data for each HSP, use hsps() and then iterate
|
|
1814 : through the HSP objects.
|
|
1815 : This method requires that all HSPs be tiled. If they have not
|
|
1816 : already been tiled, they will be tiled first.
|
|
1817
|
|
1818 See Also : L<frac_conserved()|frac_conserved>, L<frac_aligned_query()|frac_aligned_query>, L<matches()|matches>, L<_tile_hsps()|_tile_hsps>
|
|
1819
|
|
1820 =cut
|
|
1821
|
|
1822 #------------------
|
|
1823 sub frac_identical {
|
|
1824 #------------------
|
|
1825 my ($self, $seqType) = @_;
|
|
1826 $seqType ||= 'total';
|
|
1827
|
|
1828 ## Sensitive to member name format.
|
|
1829 $seqType = lc($seqType);
|
|
1830
|
|
1831 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1832
|
|
1833 sprintf( "%.2f", $self->{'_totalIdentical'}/$self->{'_length_aln_'.$seqType});
|
|
1834 }
|
|
1835
|
|
1836
|
|
1837
|
|
1838 =head2 frac_conserved
|
|
1839
|
|
1840 Usage : $sbjct_object->frac_conserved( [seq_type] );
|
|
1841 Purpose : Get the overall fraction of conserved positions across all HSPs.
|
|
1842 : The number refers to only the aligned regions and does not
|
|
1843 : account for unaligned regions in between the HSPs, if any.
|
|
1844 Example : $frac_cons = $sbjct_object->frac_conserved('sbjct');
|
|
1845 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1846 Argument : seq_type: 'query' | 'sbjct' | 'total'
|
|
1847 : default = 'total' (but see comments below).
|
|
1848 Throws : n/a
|
|
1849 Comments : Different versions of Blast report different values for the total
|
|
1850 : length of the alignment. This is the number reported in the
|
|
1851 : denominators in the stats section:
|
|
1852 : "Identical = 34/120 Positives = 67/120".
|
|
1853 : BLAST-GP uses the total length of the alignment (with gaps)
|
|
1854 : WU-BLAST uses the length of the query sequence (without gaps).
|
|
1855 : Therefore, when called without an argument or an argument of 'total',
|
|
1856 : this method will report different values depending on the
|
|
1857 : version of BLAST used.
|
|
1858 :
|
|
1859 : To get the fraction conserved among only the aligned residues,
|
|
1860 : ignoring the gaps, call this method with an argument of 'query'
|
|
1861 : or 'sbjct'.
|
|
1862 :
|
|
1863 : If you need data for each HSP, use hsps() and then interate
|
|
1864 : through the HSP objects.
|
|
1865 : This method requires that all HSPs be tiled. If they have not
|
|
1866 : already been tiled, they will be tiled first.
|
|
1867
|
|
1868 See Also : L<frac_identical()|frac_identical>, L<matches()|matches>, L<_tile_hsps()|_tile_hsps>
|
|
1869
|
|
1870 =cut
|
|
1871
|
|
1872 #--------------------
|
|
1873 sub frac_conserved {
|
|
1874 #--------------------
|
|
1875 my ($self, $seqType) = @_;
|
|
1876 $seqType ||= 'total';
|
|
1877
|
|
1878 ## Sensitive to member name format.
|
|
1879 $seqType = lc($seqType);
|
|
1880
|
|
1881 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1882
|
|
1883 sprintf( "%.2f", $self->{'_totalConserved'}/$self->{'_length_aln_'.$seqType});
|
|
1884 }
|
|
1885
|
|
1886
|
|
1887
|
|
1888
|
|
1889 =head2 frac_aligned_query
|
|
1890
|
|
1891 Usage : $sbjct_object->frac_aligned_query();
|
|
1892 Purpose : Get the fraction of the query sequence which has been aligned
|
|
1893 : across all HSPs (not including intervals between non-overlapping
|
|
1894 : HSPs).
|
|
1895 Example : $frac_alnq = $sbjct_object->frac_aligned_query();
|
|
1896 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1897 Argument : n/a
|
|
1898 Throws : n/a
|
|
1899 Comments : If you need data for each HSP, use hsps() and then interate
|
|
1900 : through the HSP objects.
|
|
1901 : To compute the fraction aligned, the logical length of the query
|
|
1902 : sequence is used, meaning that for [T]BLASTX reports, the
|
|
1903 : full length of the query sequence is converted into amino acids
|
|
1904 : by dividing by 3. This is necessary because of the way
|
|
1905 : the lengths of aligned sequences are computed.
|
|
1906 : This method requires that all HSPs be tiled. If they have not
|
|
1907 : already been tiled, they will be tiled first.
|
|
1908
|
|
1909 See Also : L<frac_aligned_hit()|frac_aligned_hit>, L<_tile_hsps()|_tile_hsps>, L<logical_length()|logical_length>, L<length_aln()|length_aln>
|
|
1910
|
|
1911 =cut
|
|
1912
|
|
1913 #----------------------
|
|
1914 sub frac_aligned_query {
|
|
1915 #----------------------
|
|
1916 my $self = shift;
|
|
1917
|
|
1918 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1919
|
|
1920 sprintf( "%.2f", $self->{'_length_aln_query'}/$self->logical_length('query'));
|
|
1921 }
|
|
1922
|
|
1923
|
|
1924
|
|
1925 =head2 frac_aligned_hit
|
|
1926
|
|
1927 Usage : $sbjct_object->frac_aligned_hit();
|
|
1928 Purpose : Get the fraction of the hit (sbjct) sequence which has been aligned
|
|
1929 : across all HSPs (not including intervals between non-overlapping
|
|
1930 : HSPs).
|
|
1931 Example : $frac_alnq = $sbjct_object->frac_aligned_hit();
|
|
1932 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1933 Argument : n/a
|
|
1934 Throws : n/a
|
|
1935 Comments : If you need data for each HSP, use hsps() and then interate
|
|
1936 : through the HSP objects.
|
|
1937 : To compute the fraction aligned, the logical length of the sbjct
|
|
1938 : sequence is used, meaning that for TBLAST[NX] reports, the
|
|
1939 : full length of the sbjct sequence is converted into amino acids
|
|
1940 : by dividing by 3. This is necessary because of the way
|
|
1941 : the lengths of aligned sequences are computed.
|
|
1942 : This method requires that all HSPs be tiled. If they have not
|
|
1943 : already been tiled, they will be tiled first.
|
|
1944
|
|
1945 See Also : L<frac_aligned_query()|frac_aligned_query>, L<matches()|matches>, L<_tile_hsps()|_tile_hsps>, L<logical_length()|logical_length>, L<length_aln()|length_aln>
|
|
1946
|
|
1947 =cut
|
|
1948
|
|
1949 #--------------------
|
|
1950 sub frac_aligned_hit {
|
|
1951 #--------------------
|
|
1952 my $self = shift;
|
|
1953
|
|
1954 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1955
|
|
1956 sprintf( "%.2f", $self->{'_length_aln_sbjct'}/$self->logical_length('sbjct'));
|
|
1957 }
|
|
1958
|
|
1959 # Safety-net methods for those who try don't read or remember the API.
|
|
1960 # Redirecting to the proper method. These 'sbjct' versions may be more
|
|
1961 # consistent with the API of this module since there are numerous other
|
|
1962 # instances of using 'sbjct' in arguments. However, 'sbjct' is a bit tech-ee.
|
|
1963
|
|
1964 #-----------------------
|
|
1965 sub frac_aligned_sbjct { my $self=shift; $self->frac_aligned_hit(@_); }
|
|
1966 #-----------------------
|
|
1967 sub num_unaligned_sbjct { my $self=shift; $self->num_unaligned_hit(@_); }
|
|
1968 #-----------------------
|
|
1969
|
|
1970
|
|
1971
|
|
1972 =head2 num_unaligned_hit
|
|
1973
|
|
1974 Usage : $sbjct_object->num_unaligned_hit();
|
|
1975 Purpose : Get the number of the unaligned residues in the hit sequence.
|
|
1976 : Sums across all all HSPs.
|
|
1977 Example : $num_unaln = $sbjct_object->num_unaligned_hit();
|
|
1978 Returns : Integer
|
|
1979 Argument : n/a
|
|
1980 Throws : n/a
|
|
1981 Comments : See notes regarding logical lengths in the comments for frac_aligned_hit().
|
|
1982 : They apply here as well.
|
|
1983 : If you need data for each HSP, use hsps() and then interate
|
|
1984 : through the HSP objects.
|
|
1985 : This method requires that all HSPs be tiled. If they have not
|
|
1986 : already been tiled, they will be tiled first.
|
|
1987
|
|
1988 See Also : L<num_unaligned_query()|num_unaligned_query>, L<_tile_hsps()|_tile_hsps>, L<frac_aligned_hit()|frac_aligned_hit>
|
|
1989
|
|
1990 =cut
|
|
1991
|
|
1992 #---------------------
|
|
1993 sub num_unaligned_hit {
|
|
1994 #---------------------
|
|
1995 my $self = shift;
|
|
1996
|
|
1997 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
1998
|
|
1999 my $num = $self->logical_length('sbjct') - $self->{'_length_aln_sbjct'};
|
|
2000 ($num < 0 ? 0 : $num );
|
|
2001 }
|
|
2002
|
|
2003
|
|
2004 =head2 num_unaligned_query
|
|
2005
|
|
2006 Usage : $sbjct_object->num_unaligned_query();
|
|
2007 Purpose : Get the number of the unaligned residues in the query sequence.
|
|
2008 : Sums across all all HSPs.
|
|
2009 Example : $num_unaln = $sbjct_object->num_unaligned_query();
|
|
2010 Returns : Integer
|
|
2011 Argument : n/a
|
|
2012 Throws : n/a
|
|
2013 Comments : See notes regarding logical lengths in the comments for frac_aligned_query().
|
|
2014 : They apply here as well.
|
|
2015 : If you need data for each HSP, use hsps() and then interate
|
|
2016 : through the HSP objects.
|
|
2017 : This method requires that all HSPs be tiled. If they have not
|
|
2018 : already been tiled, they will be tiled first.
|
|
2019
|
|
2020 See Also : L<num_unaligned_hit()|num_unaligned_hit>, L<_tile_hsps()|_tile_hsps>, L<frac_aligned_query()|frac_aligned_query>
|
|
2021
|
|
2022 =cut
|
|
2023
|
|
2024 #-----------------------
|
|
2025 sub num_unaligned_query {
|
|
2026 #-----------------------
|
|
2027 my $self = shift;
|
|
2028
|
|
2029 $self->_tile_hsps() if not $self->{'_tile_hsps'};
|
|
2030
|
|
2031 my $num = $self->logical_length('query') - $self->{'_length_aln_query'};
|
|
2032 ($num < 0 ? 0 : $num );
|
|
2033 }
|
|
2034
|
|
2035
|
|
2036
|
|
2037 =head2 seq_inds
|
|
2038
|
|
2039 Usage : $hit->seq_inds( seq_type, class, collapse );
|
|
2040 Purpose : Get a list of residue positions (indices) across all HSPs
|
|
2041 : for identical or conserved residues in the query or sbjct sequence.
|
|
2042 Example : @ind = $hit->seq_inds('query', 'identical');
|
|
2043 : @ind = $hit->seq_inds('sbjct', 'conserved');
|
|
2044 : @ind = $hit->seq_inds('sbjct', 'conserved', 1);
|
|
2045 Returns : Array of integers
|
|
2046 : May include ranges if collapse is non-zero.
|
|
2047 Argument : seq_type = 'query' or 'sbjct' (default = query)
|
|
2048 : class = 'identical' or 'conserved' (default = identical)
|
|
2049 : (can be shortened to 'id' or 'cons')
|
|
2050 : (actually, anything not 'id' will evaluate to 'conserved').
|
|
2051 : collapse = boolean, if non-zero, consecutive positions are merged
|
|
2052 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
|
|
2053 : collapses to "1-5 7 9-11". This is useful for
|
|
2054 : consolidating long lists. Default = no collapse.
|
|
2055 Throws : n/a.
|
|
2056
|
|
2057 See Also : B<Bio::Tools::Blast::HSP::seq_inds()>
|
|
2058
|
|
2059 =cut
|
|
2060
|
|
2061 #-------------
|
|
2062 sub seq_inds {
|
|
2063 #-------------
|
|
2064 my ($self, $seq, $class, $collapse) = @_;
|
|
2065
|
|
2066 $seq ||= 'query';
|
|
2067 $class ||= 'identical';
|
|
2068 $collapse ||= 0;
|
|
2069
|
|
2070 my (@inds, $hsp);
|
|
2071 foreach $hsp ($self->hsps) {
|
|
2072 # This will merge data for all HSPs together.
|
|
2073 push @inds, $hsp->seq_inds($seq, $class);
|
|
2074 }
|
|
2075
|
|
2076 # Need to remove duplicates and sort the merged positions.
|
|
2077 if(@inds) {
|
|
2078 my %tmp = map { $_, 1 } @inds;
|
|
2079 @inds = sort {$a <=> $b} keys %tmp;
|
|
2080 }
|
|
2081
|
|
2082 require Bio::Tools::Blast::HSP;
|
|
2083
|
|
2084 $collapse ? &Bio::Tools::Blast::HSP::collapse_nums(@inds) : @inds;
|
|
2085 }
|
|
2086
|
|
2087
|
|
2088 #####################################################################################
|
|
2089 ## INSTANCE METHODS ##
|
|
2090 #####################################################################################
|
|
2091
|
|
2092
|
|
2093 =head2 display
|
|
2094
|
|
2095 Usage : $sbjct_object->display( %named_parameters );
|
|
2096 Purpose : Display information about Bio::Tools::Blast::Sbjct.pm data members
|
|
2097 Example : $object->display(-SHOW=>'stats');
|
|
2098 Argument : Named parameters: -SHOW => 'hsp',
|
|
2099 : -WHERE => filehandle (default = STDOUT)
|
|
2100 Returns : n/a
|
|
2101 Status : Deprecated, Buggy.
|
|
2102 : Use Blast::table() or Blast::table_tiled() instead.
|
|
2103
|
|
2104 See Also : L<_display_stats()|_display_stats>, L<_display_hsps()|_display_hsps>, B<Bio::Root::Object>::display
|
|
2105
|
|
2106 =cut
|
|
2107
|
|
2108 #------------
|
|
2109 sub display {
|
|
2110 #------------
|
|
2111 my( $self, %param) = @_;
|
|
2112
|
|
2113 $param{-HEADER} = 0;
|
|
2114 $self->SUPER::display(%param);
|
|
2115
|
|
2116 $self->show =~ /hsp/i and $self->_display_hsps( %param);
|
|
2117 }
|
|
2118
|
|
2119
|
|
2120 =head2 _display_stats
|
|
2121
|
|
2122 Usage : n/a; called automatically by display()
|
|
2123 Purpose : Display information about Bio::Tools::Blast.pm data members.
|
|
2124 : Not tab-delimited.
|
|
2125 : Prints the rank, name, database, score, p, n, length
|
|
2126 : of the hit sequence, length of the aligned region,
|
|
2127 : fraction identical, fraction conserved, and the fraction aligned
|
|
2128 : for both the query and hit sequences.
|
|
2129 Example : n/a
|
|
2130 Argument : one argument = filehandle object.
|
|
2131 Returns : printf call.
|
|
2132 Status : Deprecated, Buggy.
|
|
2133 : Use Blast::table() or Blast::table_tiled() instead.
|
|
2134
|
|
2135 See Also : L<display()|display>
|
|
2136
|
|
2137 =cut
|
|
2138
|
|
2139 #-------------------
|
|
2140 sub _display_stats {
|
|
2141 #-------------------
|
|
2142 my( $self, $OUT) = @_;
|
|
2143 my $layout = $self->parent->_layout();
|
|
2144
|
|
2145 if($layout == 1) {
|
|
2146 printf( $OUT "%-3d %-20s %-11s %-5d %-5d %-9.1e %-9.1e %-4d %-3d %-5d %-5d %-5s %-6.2f %-6.2f %-4d(%.2f) %-4d(%.2f)\n",
|
|
2147 $self->rank(), $self->name(),
|
|
2148 ($self->database() || 'UNKNOWN DB') ,
|
|
2149 $self->score(),$self->bits(),$self->p(),$self->expect(),
|
|
2150 $self->gaps(), $self->n(),
|
|
2151 $self->length(), $self->length_aln('query'),
|
|
2152 $self->ambiguous_aln(),
|
|
2153 $self->frac_aligned_query, $self->frac_aligned_hit,
|
|
2154 $self->matches('iden'), $self->frac_identical('query'),
|
|
2155 $self->matches('cons'), $self->frac_conserved('query'));
|
|
2156 } else {
|
|
2157 printf( $OUT "%-3d %-20s %-11s %-5d %-5d %-9.1e %-4d %-3d %-5d %-5d %-5s %-6.2f %-6.2f %-4d(%.2f) %-4d(%.2f)\n",
|
|
2158 $self->rank(), $self->name(),
|
|
2159 ($self->database() || 'UNKNOWN DB'),
|
|
2160 $self->score(),$self->bits(),$self->expect(),
|
|
2161 $self->gaps(), $self->num_hsps,
|
|
2162 $self->length(), $self->length_aln('query'),
|
|
2163 $self->ambiguous_aln(),
|
|
2164 $self->frac_aligned_query, $self->frac_aligned_hit,
|
|
2165 $self->matches('iden'), $self->frac_identical('query'),
|
|
2166 $self->matches('cons'), $self->frac_conserved('query') );
|
|
2167 }
|
|
2168
|
|
2169 }
|
|
2170
|
|
2171
|
|
2172 =head2 _display_hsps
|
|
2173
|
|
2174 Usage : n/a; called automatically by display()
|
|
2175 Purpose : Display information about each HSP in the current BLAST hit.
|
|
2176 Example : n/a
|
|
2177 Argument : one argument = filehandle object.
|
|
2178 Returns : printf call.
|
|
2179 Status : Experimental
|
|
2180
|
|
2181 See Also : L<display()|display>, B<Bio::Tools::Blast::HSP::display()>
|
|
2182
|
|
2183 =cut
|
|
2184
|
|
2185 #----------------
|
|
2186 sub _display_hsps {
|
|
2187 #----------------
|
|
2188 my( $self, %param) = @_;
|
|
2189 my $OUT = $self->fh();
|
|
2190 my $hspCount = 0;
|
|
2191 my $reply = undef;
|
|
2192
|
|
2193 not defined $self->{'_hsps'} and do{ print $OUT "\nHSP data not loaded.\n\n"; return; };
|
|
2194
|
|
2195 # print $OUT "\n",$self->num_hsps, " HSPs\n\n";
|
|
2196
|
|
2197 my($hsp);
|
|
2198 foreach $hsp ( $self->hsps() ) {
|
|
2199 $hspCount++;
|
|
2200 print $OUT "\n ", '-'x25, "< HSP #$hspCount >", '-'x25, "\n";
|
|
2201
|
|
2202 $hsp->display( %param );
|
|
2203
|
|
2204 if( $hspCount < $self->num_hsps ) {
|
|
2205 print "\n\n\t--------> <RET> FOR NEXT HSP, q TO QUIT <--------\n";
|
|
2206 chop( $reply = <STDIN>);
|
|
2207 $reply =~ /^q/i and return;
|
|
2208 }
|
|
2209 }
|
|
2210 }
|
|
2211
|
|
2212
|
|
2213
|
|
2214 =head2 homol_data
|
|
2215
|
|
2216 Usage : $data = $sbjct_object->homo_data( %named_params );
|
|
2217 Purpose : Gets specific similarity data about all HSPs.
|
|
2218 Returns : String
|
|
2219 Argument : named parameters forwarded to Bio::Tools::Blast::HSP::homol_data().
|
|
2220 Throws : n/a
|
|
2221 Status : Experimental
|
|
2222 Comments : This is an experimental method used for obtaining an
|
|
2223 : indication of:
|
|
2224 : 1) how many HSPs are in a Blast alignment
|
|
2225 : 2) how strong the similarity is between sequences in the HSP
|
|
2226 : 3) the endpoints of the alignment (sequence monomer numbers)
|
|
2227 : "Homology data" for each HSP is in the format:
|
|
2228 : "<integer> <start> <stop>"
|
|
2229 : Data for different HSPs are tab-delimited.
|
|
2230
|
|
2231 See Also : B<Bio::Tools::Blast::homol_data()>, B<Bio::Tools::Blast::HSP::homol_data()>
|
|
2232
|
|
2233 =cut
|
|
2234
|
|
2235 #---------------
|
|
2236 sub homol_data {
|
|
2237 #---------------
|
|
2238 my ($self,%param) = @_;
|
|
2239 my $data = $self->name();
|
|
2240
|
|
2241 foreach ($self->hsps) {
|
|
2242 $data .="\t".$_->homol_data(%param);
|
|
2243 }
|
|
2244 ## Record ambiguous alignment status.
|
|
2245 $data .= "\t".$self->ambiguous_aln();
|
|
2246 $data;
|
|
2247 }
|
|
2248
|
|
2249
|
|
2250 =head2 is_signif
|
|
2251
|
|
2252 Usage : $sbjct_object->is_signif();
|
|
2253 Purpose : Determine if the given BLAST hit is significant.
|
|
2254 Example :
|
|
2255 Returns : Boolean
|
|
2256 Argument : n/a
|
|
2257 Throws : n/a
|
|
2258 Comments : Uses criteria defined in the parent Blast.pm object
|
|
2259 : to assess significance. Currently, only relies on
|
|
2260 : P-value and length criteria.
|
|
2261 : This mehtod is largely obsolete since are hits are now by
|
|
2262 : definition significant.
|
|
2263
|
|
2264 =cut
|
|
2265
|
|
2266 #---------------
|
|
2267 sub is_signif {
|
|
2268 #---------------
|
|
2269 my $self = shift;
|
|
2270 return ($self->{'_significance'} <= $self->parent->signif and
|
|
2271 $self->length > $self->parent->signif_len);
|
|
2272 }
|
|
2273
|
|
2274 #####################################################################################
|
|
2275 ## CLASS METHODS ##
|
|
2276 #####################################################################################
|
|
2277
|
|
2278 =head1 CLASS METHODS
|
|
2279
|
|
2280 =head2 get_exponent
|
|
2281
|
|
2282 Usage : &get_exponent( number );
|
|
2283 Purpose : Determines the power of 10 exponent of an integer, float,
|
|
2284 : or scientific notation number.
|
|
2285 Example : &get_exponent("4.0e-206");
|
|
2286 : &get_exponent("0.00032");
|
|
2287 : &get_exponent("10.");
|
|
2288 : &get_exponent("1000.0");
|
|
2289 : &get_exponent("e+83");
|
|
2290 Argument : Float, Integer, or scientific notation number
|
|
2291 Returns : Integer representing the exponent part of the number (+ or -).
|
|
2292 : If argument == 0 (zero), return value is "-999".
|
|
2293 Comments : Exponents are rounded up (less negative) if the mantissa is >= 5.
|
|
2294 : Exponents are rounded down (more negative) if the mantissa is <= -5.
|
|
2295 : This method probably belongs in a more general utility class.
|
|
2296
|
|
2297 =cut
|
|
2298
|
|
2299 #------------------
|
|
2300 sub get_exponent {
|
|
2301 #------------------
|
|
2302 my $data = shift;
|
|
2303
|
|
2304 my($num, $exp) = split /[eE]/, $data;
|
|
2305
|
|
2306 if( defined $exp) {
|
|
2307 $num = 1 if not $num;
|
|
2308 $num >= 5 and $exp++;
|
|
2309 $num <= -5 and $exp--;
|
|
2310 } elsif( $num == 0) {
|
|
2311 $exp = -999;
|
|
2312 } elsif( not $num =~ /\./) {
|
|
2313 $exp = CORE::length($num) -1;
|
|
2314 } else {
|
|
2315 $exp = 0;
|
|
2316 $num .= '0' if $num =~ /\.$/;
|
|
2317 my ($c);
|
|
2318 my $rev = 0;
|
|
2319 if($num !~ /^0/) {
|
|
2320 $num = reverse($num);
|
|
2321 $rev = 1;
|
|
2322 }
|
|
2323 do { $c = chop($num);
|
|
2324 $c == 0 && $exp++;
|
|
2325 } while( $c ne '.');
|
|
2326
|
|
2327 $exp = -$exp if $num == 0 and not $rev;
|
|
2328 $exp -= 1 if $rev;
|
|
2329 }
|
|
2330 return $exp;
|
|
2331 }
|
|
2332
|
|
2333 1;
|
|
2334 __END__
|
|
2335
|
|
2336 #####################################################################################
|
|
2337 # END OF CLASS #
|
|
2338 #####################################################################################
|
|
2339
|
|
2340
|
|
2341 =head1 FOR DEVELOPERS ONLY
|
|
2342
|
|
2343 =head2 Data Members
|
|
2344
|
|
2345 Information about the various data members of this module is provided for those
|
|
2346 wishing to modify or understand the code. Two things to bear in mind:
|
|
2347
|
|
2348 =over 4
|
|
2349
|
|
2350 =item 1 Do NOT rely on these in any code outside of this module.
|
|
2351
|
|
2352 All data members are prefixed with an underscore to signify that they are private.
|
|
2353 Always use accessor methods. If the accessor doesn't exist or is inadequate,
|
|
2354 create or modify an accessor (and let me know, too!). (An exception to this might
|
|
2355 be for HSP.pm which is more tightly coupled to Sbjct.pm and
|
|
2356 may access Sbjct data members directly for efficiency purposes, but probably
|
|
2357 should not).
|
|
2358
|
|
2359 =item 2 This documentation may be incomplete and out of date.
|
|
2360
|
|
2361 It is easy for these data member descriptions to become obsolete as
|
|
2362 this module is still evolving. Always double check this info and search
|
|
2363 for members not described here.
|
|
2364
|
|
2365 =back
|
|
2366
|
|
2367 An instance of Bio::Tools::Blast::Sbjct.pm is a blessed reference to a hash containing
|
|
2368 all or some of the following fields:
|
|
2369
|
|
2370 FIELD VALUE
|
|
2371 --------------------------------------------------------------
|
|
2372 _hsps : Array ref for a list of Bio::Tools::Blast::HSP.pm objects.
|
|
2373 :
|
|
2374 _db : Database identifier from the summary line.
|
|
2375 :
|
|
2376 _desc : Description data for the hit from the summary line.
|
|
2377 :
|
|
2378 _length : Total length of the hit sequence.
|
|
2379 :
|
|
2380 _score : BLAST score.
|
|
2381 :
|
|
2382 _bits : BLAST score (in bits). Matrix-independent.
|
|
2383 :
|
|
2384 _p : BLAST P value. Obtained from summary section. (Blast1/WU-Blast only)
|
|
2385 :
|
|
2386 _expect : BLAST Expect value. Obtained from summary section.
|
|
2387 :
|
|
2388 _n : BLAST N value (number of HSPs) (Blast1/WU-Blast2 only)
|
|
2389 :
|
|
2390 _frame : Reading frame for TBLASTN and TBLASTX analyses.
|
|
2391 :
|
|
2392 _totalIdentical: Total number of identical aligned monomers.
|
|
2393 :
|
|
2394 _totalConserved: Total number of conserved aligned monomers (a.k.a. "positives").
|
|
2395 :
|
|
2396 _overlap : Maximum number of overlapping residues between adjacent HSPs
|
|
2397 : before considering the alignment to be ambiguous.
|
|
2398 :
|
|
2399 _ambiguous_aln : Boolean. True if the alignment of all HSPs is ambiguous.
|
|
2400 :
|
|
2401 _length_aln_query : Length of the aligned region of the query sequence.
|
|
2402 :
|
|
2403 _length_aln_sbjct : Length of the aligned region of the sbjct sequence.
|
|
2404
|
|
2405
|
|
2406 INHERITED DATA MEMBERS
|
|
2407 ----------------------
|
|
2408 _name : From Bio::Root::Object.pm. String representing the name of the
|
|
2409 : sbjct sequence obtained from the BLAST report.
|
|
2410 :
|
|
2411 _parent : From Bio::Root::Object.pm. This member contains a reference to the
|
|
2412 : Bio::Tools::Blast.pm object to which this hit belongs.
|
|
2413
|
|
2414
|
|
2415 =cut
|
|
2416
|
|
2417 1;
|