0
|
1 #----------------------------------------------------------------------------
|
|
2 # PACKAGE : Bio::Tools::Blast::HSP
|
|
3 # AUTHOR : Steve Chervitz (sac@bioperl.org)
|
|
4 # CREATED : March 1996
|
|
5 # STATUS : Alpha
|
|
6 # REVISION: $Id: HSP.pm,v 1.18 2002/10/22 07:38:48 lapp Exp $
|
|
7 #
|
|
8 # For the latest version and documentation, visit the distribution site:
|
|
9 # http://genome-www.stanford.edu/perlOOP/bioperl/blast/
|
|
10 #
|
|
11 # To generate documentation, run this module through pod2html
|
|
12 # (preferably from Perl v5.004 or better).
|
|
13 #
|
|
14 # Copyright (c) 1996-2000 Steve Chervitz. All Rights Reserved.
|
|
15 # This module is free software; you can redistribute it and/or
|
|
16 # modify it under the same terms as Perl itself.
|
|
17 #----------------------------------------------------------------------------
|
|
18
|
|
19 package Bio::Tools::Blast::HSP;
|
|
20
|
|
21 use Bio::Root::Global qw(:devel);
|
|
22 use Bio::Root::Object ();
|
|
23 #use Bio::Root::Err qw(:std);
|
|
24
|
|
25 @ISA = qw( Bio::Root::Object);
|
|
26
|
|
27 use strict;
|
|
28 use vars qw($ID $GAP_SYMBOL @SCORE_CUTOFFS $Revision %STRAND_SYMBOL);
|
|
29 $ID = 'Bio::Tools::Blast::HSP';
|
|
30 $Revision = '$Id: HSP.pm,v 1.18 2002/10/22 07:38:48 lapp Exp $'; #'
|
|
31
|
|
32 $GAP_SYMBOL = '-'; # Need a more general way to handle gap symbols.
|
|
33 @SCORE_CUTOFFS = ( 100, 30 ); # Bit score cutoffs (see homol_score()).
|
|
34 %STRAND_SYMBOL = ('Plus' => 1, 'Minus' => -1);
|
|
35
|
|
36 ## POD Documentation:
|
|
37
|
|
38 =head1 NAME
|
|
39
|
|
40 Bio::Tools::Blast::HSP - Bioperl BLAST High-Scoring Segment Pair object
|
|
41
|
|
42 =head1 SYNOPSIS
|
|
43
|
|
44 =head2 Object Creation
|
|
45
|
|
46 The construction of HSP objects is handled by Bio::Tools::Blast:: Sbjct.pm.
|
|
47 You should not need to use this package directly. See L<_initialize()|_initialize>
|
|
48 for a description of constructor parameters.
|
|
49
|
|
50 require Bio::Tools::Blast::HSP;
|
|
51
|
|
52 $hspObj = eval{ new Bio::Tools::Blast::HSP(-DATA =>\@hspData,
|
|
53 -PARENT =>$sbjct_object,
|
|
54 -NAME =>$hspCount,
|
|
55 -PROGRAM =>'TBLASTN',
|
|
56 );
|
|
57 };
|
|
58
|
|
59 @hspData includes the raw BLAST report data for a specific HSP,
|
|
60 and is prepared by Bio::Tools::Blast::Sbjct.pm.
|
|
61
|
|
62 =head1 INSTALLATION
|
|
63
|
|
64 This module is included with the central Bioperl distribution:
|
|
65
|
|
66 http://bio.perl.org/Core/Latest
|
|
67 ftp://bio.perl.org/pub/DIST
|
|
68
|
|
69 Follow the installation instructions included in the README file.
|
|
70
|
|
71 =head1 DESCRIPTION
|
|
72
|
|
73 The Bio::Tools::Blast::HSP.pm module encapsulates data and methods for
|
|
74 manipulating, parsing, and analyzing HSPs ("High-scoring Segment Pairs")
|
|
75 derived from BLAST sequence analysis.
|
|
76
|
|
77 This module is a utility module used by the B<Bio::Tools::Blast::Sbjct.pm>
|
|
78 and is not intended for separate use. Please see documentation for
|
|
79 B<Bio::Tools::Blast.pm> for some basic information about using
|
|
80 HSP objects (L<Links:>).
|
|
81
|
|
82 =over 0
|
|
83
|
|
84 =item * Supports BLAST versions 1.x and 2.x, gapped and ungapped.
|
|
85
|
|
86 =back
|
|
87
|
|
88 Bio::Tools::Blast::HSP.pm has the ability to extract a list of all
|
|
89 residue indices for identical and conservative matches along both
|
|
90 query and sbjct sequences. Since this degree of detail is not always
|
|
91 needed, this behavior does not occur during construction of the HSP
|
|
92 object. These data will automatically be collected as necessary as
|
|
93 the HSP.pm object is used.
|
|
94
|
|
95 =head1 DEPENDENCIES
|
|
96
|
|
97 Bio::Tools::Blast::HSP.pm is a concrete class that inherits from
|
|
98 B<Bio::Root::Object.pm> and relies on B<Bio::Tools::Sbjct.pm> as a
|
|
99 container for HSP.pm objects. B<Bio::Seq.pm> and B<Bio::UnivAln.pm>
|
|
100 are employed for creating sequence and alignment objects,
|
|
101 respectively.
|
|
102
|
|
103
|
|
104 =head2 Relationship to UnivAln.pm & Seq.pm
|
|
105
|
|
106 HSP.pm can provide the query or sbjct sequence as a B<Bio::Seq.pm>
|
|
107 object via the L<seq()|seq> method. The HSP.pm object can also create a
|
|
108 two-sequence B<Bio::UnivAln.pm> alignment object using the the query
|
|
109 and sbjct sequences via the L<get_aln()|get_aln> method. Creation of alignment
|
|
110 objects is not automatic when constructing the HSP.pm object since
|
|
111 this level of functionality is not always required and would generate
|
|
112 a lot of extra overhead when crunching many reports.
|
|
113
|
|
114
|
|
115 =head1 FEEDBACK
|
|
116
|
|
117 =head2 Mailing Lists
|
|
118
|
|
119 User feedback is an integral part of the evolution of this and other
|
|
120 Bioperl modules. Send your comments and suggestions preferably to one
|
|
121 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
122
|
|
123 bioperl-l@bioperl.org - General discussion
|
|
124 http://bio.perl.org/MailList.html - About the mailing lists
|
|
125
|
|
126 =head2 Reporting Bugs
|
|
127
|
|
128 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
129 the bugs and their resolution. Bug reports can be submitted via email
|
|
130 or the web:
|
|
131
|
|
132 bioperl-bugs@bio.perl.org
|
|
133 http://bugzilla.bioperl.org/
|
|
134
|
|
135 =head1 AUTHOR
|
|
136
|
|
137 Steve Chervitz, E<lt>sac@bioperl.orgE<gt>
|
|
138
|
|
139 =head1 SEE ALSO
|
|
140
|
|
141 Bio::Tools::Blast::Sbjct.pm - Blast hit object.
|
|
142 Bio::Tools::Blast.pm - Blast object.
|
|
143 Bio::Seq.pm - Biosequence object
|
|
144 Bio::UnivAln.pm - Biosequence alignment object.
|
|
145 Bio::Root::Object.pm - Proposed base class for all Bioperl objects.
|
|
146
|
|
147 =head2 Links:
|
|
148
|
|
149 http://bio.perl.org/Core/POD/Tools/Blast/Sbjct.pm.html
|
|
150
|
|
151 http://bio.perl.org/Projects/modules.html - Online module documentation
|
|
152 http://bio.perl.org/Projects/Blast/ - Bioperl Blast Project
|
|
153 http://bio.perl.org/ - Bioperl Project Homepage
|
|
154
|
|
155 =head1 COPYRIGHT
|
|
156
|
|
157 Copyright (c) 1996-98 Steve Chervitz. All Rights Reserved.
|
|
158 This module is free software; you can redistribute it and/or
|
|
159 modify it under the same terms as Perl itself.
|
|
160
|
|
161 =cut
|
|
162
|
|
163
|
|
164
|
|
165 #
|
|
166 ##
|
|
167 ###
|
|
168 #### END of main POD documentation.
|
|
169 ###
|
|
170 ##
|
|
171 #
|
|
172
|
|
173 =head1 APPENDIX
|
|
174
|
|
175 Methods beginning with a leading underscore are considered private
|
|
176 and are intended for internal use by this module. They are
|
|
177 B<not> considered part of the public interface and are described here
|
|
178 for documentation purposes only.
|
|
179
|
|
180 =cut
|
|
181
|
|
182 #####################################################################################
|
|
183 ## CONSTRUCTOR ##
|
|
184 #####################################################################################
|
|
185
|
|
186 =head2 _initialize
|
|
187
|
|
188 Usage : n/a; automatically called by Bio::Root::Object::new()
|
|
189 : Bio::Tools::Blast::HSP.pm objects are constructed
|
|
190 : automatically by Bio::Tools::Sbjct.pm, so there is no need
|
|
191 : for direct consumption.
|
|
192 Purpose : Initializes HSP data and calls private methods to extract
|
|
193 : the data for a given HSP.
|
|
194 : Calls superclass constructor first (Bio::Root::Object.pm).
|
|
195 Returns : n/a
|
|
196 Argument : Named parameters passed from new():
|
|
197 : All tags must be uppercase (does not call _rearrange()).
|
|
198 : -DATA => array ref containing raw data for one HSP.
|
|
199 : -PARENT => Sbjct.pm object ref.
|
|
200 : -NAME => integer (1..n).
|
|
201 : -PROGRAM => string ('TBLASTN', 'BLASTP', etc.).
|
|
202
|
|
203 See Also : L<_set_data()|_set_data>, B<Bio::Root::Object::new()>, B<Bio::Tools::Blast::Sbjct::_set_hsps()>
|
|
204
|
|
205 =cut
|
|
206
|
|
207 #----------------
|
|
208 sub _initialize {
|
|
209 #----------------
|
|
210 my( $self, %param ) = @_;
|
|
211
|
|
212 $self->SUPER::_initialize( %param );
|
|
213
|
|
214 # The gapped and program booleans may be needed after the HSP object
|
|
215 # is built.
|
|
216 # $self->{'_gapped'} = $param{-GAPPED} || 0;
|
|
217 $self->{'_prog'} = $param{-PROGRAM} || 0;
|
|
218 $self->_set_data( @{$param{-DATA}} );
|
|
219 }
|
|
220
|
|
221 #####################################################################################
|
|
222 ## ACCESSORS ##
|
|
223 #####################################################################################
|
|
224
|
|
225
|
|
226 =head2 _set_data
|
|
227
|
|
228 Usage : n/a; called automatically during object construction.
|
|
229 Purpose : Sets the query sequence, sbjct sequence, and the "match" data
|
|
230 : which consists of the symbols between the query and sbjct lines
|
|
231 : in the alignment.
|
|
232 Argument : Array (all lines from a single, complete HSP, one line per element)
|
|
233 Throws : Propagates any exceptions from the methods called ("See Also")
|
|
234
|
|
235 See Also : L<_set_seq()|_set_seq>, L<_set_residues()|_set_residues>, L<_set_score_stats()|_set_score_stats>, L<_set_match_stats()|_set_match_stats>, L<_initialize()|_initialize>
|
|
236
|
|
237 =cut
|
|
238
|
|
239 #--------------
|
|
240 sub _set_data {
|
|
241 #--------------
|
|
242 my $self = shift;
|
|
243 my @data = @_;
|
|
244 my @queryList = (); # 'Query' = SEQUENCE USED TO QUERY THE DATABASE.
|
|
245 my @sbjctList = (); # 'Sbjct' = HOMOLOGOUS SEQUENCE FOUND IN THE DATABASE.
|
|
246 my @matchList = ();
|
|
247 my $matchLine = 0; # Alternating boolean: when true, load 'match' data.
|
|
248 my @linedat = ();
|
|
249
|
|
250 $DEBUG and print STDERR "$ID: set_data()\n";
|
|
251
|
|
252 my($line, $aln_row_len, $length_diff);
|
|
253 $length_diff = 0;
|
|
254
|
|
255 # Collecting data for all lines in the alignment
|
|
256 # and then storing the collections for possible processing later.
|
|
257 #
|
|
258 # Note that "match" lines may not be properly padded with spaces.
|
|
259 # This loop now properly handles such cases:
|
|
260 # Query: 1141 PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVIXXXXX 1200
|
|
261 # PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVI
|
|
262 # Sbjct: 1141 PSLVELTIRDCPRLEVGPMIRSLPKFPMLKKLDLAVANIIEEDLDVIGSLEELVILSLKL 1200
|
|
263
|
|
264 foreach $line( @data ) {
|
|
265 next if $line =~ /^\s*$/;
|
|
266
|
|
267 if( $line =~ /^ ?Score/ ) {
|
|
268 $self->_set_score_stats( $line );
|
|
269 } elsif( $line =~ /^ ?(Identities|Positives|Strand)/ ) {
|
|
270 $self->_set_match_stats( $line );
|
|
271 } elsif( $line =~ /^ ?Frame = ([\d+-]+)/ ) {
|
|
272 # Version 2.0.8 has Frame information on a separate line.
|
|
273 $self->{'_frame'} = $1;
|
|
274 } elsif( $line =~ /^(Query:?[\s\d]+)([^\s\d]+)/ ) {
|
|
275 push @queryList, $line;
|
|
276 $self->{'_match_indent'} = CORE::length $1;
|
|
277 $aln_row_len = (CORE::length $1) + (CORE::length $2);
|
|
278 $matchLine = 1;
|
|
279 } elsif( $matchLine ) {
|
|
280 # Pad the match line with spaces if necessary.
|
|
281 $length_diff = $aln_row_len - CORE::length $line;
|
|
282 $length_diff and $line .= ' 'x $length_diff;
|
|
283 push @matchList, $line;
|
|
284 $matchLine = 0;
|
|
285 } elsif( $line =~ /^Sbjct/ ) {
|
|
286 push @sbjctList, $line;
|
|
287 }
|
|
288 }
|
|
289
|
|
290 # Storing the query and sbjct lists in case they are needed later.
|
|
291 # We could make this conditional to save memory.
|
|
292 $self->{'_queryList'} = \@queryList;
|
|
293 $self->{'_sbjctList'} = \@sbjctList;
|
|
294
|
|
295 # Storing the match list in case it is needed later.
|
|
296 $self->{'_matchList'} = \@matchList;
|
|
297
|
|
298 if(not defined ($self->{'_numIdentical'})) {
|
|
299 $self->throw("Can't parse match statistics.",
|
|
300 "Possibly a new or unrecognized Blast format.");
|
|
301 }
|
|
302
|
|
303 if(!scalar @queryList or !scalar @sbjctList) {
|
|
304 $self->throw("Can't find query or sbjct alignment lines.",
|
|
305 "Possibly unrecognized Blast format.");
|
|
306 }
|
|
307 }
|
|
308
|
|
309
|
|
310
|
|
311 =head2 _set_score_stats
|
|
312
|
|
313 Usage : n/a; called automatically by _set_data()
|
|
314 Purpose : Sets various score statistics obtained from the HSP listing.
|
|
315 Argument : String with any of the following formats:
|
|
316 : blast2: Score = 30.1 bits (66), Expect = 9.2
|
|
317 : blast2: Score = 158.2 bits (544), Expect(2) = e-110
|
|
318 : blast1: Score = 410 (144.3 bits), Expect = 1.7e-40, P = 1.7e-40
|
|
319 : blast1: Score = 55 (19.4 bits), Expect = 5.3, Sum P(3) = 0.99
|
|
320 Throws : Exception if the stats cannot be parsed, probably due to a change
|
|
321 : in the Blast report format.
|
|
322
|
|
323 See Also : L<_set_data()|_set_data>
|
|
324
|
|
325 =cut
|
|
326
|
|
327 #--------------------
|
|
328 sub _set_score_stats {
|
|
329 #--------------------
|
|
330 my ($self, $data) = @_;
|
|
331
|
|
332 my ($expect, $p);
|
|
333
|
|
334 if($data =~ /Score = +([\d.e+-]+) bits \(([\d.e+-]+)\), +Expect = +([\d.e+-]+)/) {
|
|
335 # blast2 format n = 1
|
|
336 $self->{'_bits'} = $1;
|
|
337 $self->{'_score'} = $2;
|
|
338 $expect = $3;
|
|
339 } elsif($data =~ /Score = +([\d.e+-]+) bits \(([\d.e+-]+)\), +Expect\((\d+)\) = +([\d.e+-]+)/) {
|
|
340 # blast2 format n > 1
|
|
341 $self->{'_bits'} = $1;
|
|
342 $self->{'_score'} = $2;
|
|
343 $self->{'_n'} = $3;
|
|
344 $expect = $4;
|
|
345
|
|
346 } elsif($data =~ /Score = +([\d.e+-]+) \(([\d.e+-]+) bits\), +Expect = +([\d.e+-]+), P = +([\d.e-]+)/) {
|
|
347 # blast1 format, n = 1
|
|
348 $self->{'_score'} = $1;
|
|
349 $self->{'_bits'} = $2;
|
|
350 $expect = $3;
|
|
351 $p = $4;
|
|
352
|
|
353 } elsif($data =~ /Score = +([\d.e+-]+) \(([\d.e+-]+) bits\), +Expect = +([\d.e+-]+), +Sum P\((\d+)\) = +([\d.e-]+)/) {
|
|
354 # blast1 format, n > 1
|
|
355 $self->{'_score'} = $1;
|
|
356 $self->{'_bits'} = $2;
|
|
357 $expect = $3;
|
|
358 $self->{'_n'} = $4;
|
|
359 $p = $5;
|
|
360
|
|
361 } else {
|
|
362 $self->throw("Can't parse score statistics: unrecognized format.", "$data");
|
|
363 }
|
|
364
|
|
365 $expect = "1$expect" if $expect =~ /^e/i;
|
|
366 $p = "1$p" if defined $p and $p=~ /^e/i;
|
|
367
|
|
368 $self->{'_expect'} = $expect;
|
|
369 $self->{'_p'} = $p || undef;
|
|
370
|
|
371 }
|
|
372
|
|
373
|
|
374
|
|
375 =head2 _set_match_stats
|
|
376
|
|
377 Usage : n/a; called automatically by _set_data()
|
|
378 Purpose : Sets various matching statistics obtained from the HSP listing.
|
|
379 Argument : blast2: Identities = 23/74 (31%), Positives = 29/74 (39%), Gaps = 17/74 (22%)
|
|
380 : blast2: Identities = 57/98 (58%), Positives = 74/98 (75%)
|
|
381 : blast1: Identities = 87/204 (42%), Positives = 126/204 (61%)
|
|
382 : blast1: Identities = 87/204 (42%), Positives = 126/204 (61%), Frame = -3
|
|
383 : WU-blast: Identities = 310/553 (56%), Positives = 310/553 (56%), Strand = Minus / Plus
|
|
384 Throws : Exception if the stats cannot be parsed, probably due to a change
|
|
385 : in the Blast report format.
|
|
386 Comments : The "Gaps = " data in the HSP header has a different meaning depending
|
|
387 : on the type of Blast: for BLASTP, this number is the total number of
|
|
388 : gaps in query+sbjct; for TBLASTN, it is the number of gaps in the
|
|
389 : query sequence only. Thus, it is safer to collect the data
|
|
390 : separately by examining the actual sequence strings as is done
|
|
391 : in _set_seq().
|
|
392
|
|
393 See Also : L<_set_data()|_set_data>, L<_set_seq()|_set_seq>
|
|
394
|
|
395 =cut
|
|
396
|
|
397 #--------------------
|
|
398 sub _set_match_stats {
|
|
399 #--------------------
|
|
400 my ($self, $data) = @_;
|
|
401
|
|
402 if($data =~ m!Identities = (\d+)/(\d+)!) {
|
|
403 # blast1 or 2 format
|
|
404 $self->{'_numIdentical'} = $1;
|
|
405 $self->{'_totalLength'} = $2;
|
|
406 }
|
|
407
|
|
408 if($data =~ m!Positives = (\d+)/(\d+)!) {
|
|
409 # blast1 or 2 format
|
|
410 $self->{'_numConserved'} = $1;
|
|
411 $self->{'_totalLength'} = $2;
|
|
412 }
|
|
413
|
|
414 if($data =~ m!Frame = ([\d+-]+)!) {
|
|
415 $self->{'_frame'} = $1;
|
|
416 }
|
|
417
|
|
418 # Strand data is not always present in this line.
|
|
419 # _set_seq() will also set strand information.
|
|
420 if($data =~ m!Strand = (\w+) / (\w+)!) {
|
|
421 $self->{'_queryStrand'} = $1;
|
|
422 $self->{'_sbjctStrand'} = $2;
|
|
423 }
|
|
424
|
|
425 # if($data =~ m!Gaps = (\d+)/(\d+)!) {
|
|
426 # $self->{'_totalGaps'} = $1;
|
|
427 # } else {
|
|
428 # $self->{'_totalGaps'} = 0;
|
|
429 # }
|
|
430 }
|
|
431
|
|
432
|
|
433
|
|
434 =head2 _set_seq_data
|
|
435
|
|
436 Usage : n/a; called automatically when sequence data is requested.
|
|
437 Purpose : Sets the HSP sequence data for both query and sbjct sequences.
|
|
438 : Includes: start, stop, length, gaps, and raw sequence.
|
|
439 Argument : n/a
|
|
440 Throws : Propagates any exception thrown by _set_match_seq()
|
|
441 Comments : Uses raw data stored by _set_data() during object construction.
|
|
442 : These data are not always needed, so it is conditionally
|
|
443 : executed only upon demand by methods such as gaps(), _set_residues(),
|
|
444 : etc. _set_seq() does the dirty work.
|
|
445
|
|
446 See Also : L<_set_seq()|_set_seq>
|
|
447
|
|
448 =cut
|
|
449
|
|
450 sub _set_seq_data {
|
|
451 my $self = shift;
|
|
452
|
|
453 $self->_set_seq('query', @{$self->{'_queryList'}});
|
|
454 $self->_set_seq('sbjct', @{$self->{'_sbjctList'}});
|
|
455
|
|
456 # Liberate some memory.
|
|
457 @{$self->{'_queryList'}} = @{$self->{'_sbjctList'}} = ();
|
|
458 undef $self->{'_queryList'};
|
|
459 undef $self->{'_sbjctList'};
|
|
460
|
|
461 $self->{'_set_seq_data'} = 1;
|
|
462 }
|
|
463
|
|
464
|
|
465
|
|
466 =head2 _set_seq
|
|
467
|
|
468 Usage : n/a; called automatically by _set_seq_data()
|
|
469 : $hsp_obj->($seq_type, @data);
|
|
470 Purpose : Sets sequence information for both the query and sbjct sequences.
|
|
471 : Directly counts the number of gaps in each sequence (if gapped Blast).
|
|
472 Argument : $seq_type = 'query' or 'sbjct'
|
|
473 : @data = all seq lines with the form:
|
|
474 : Query: 61 SPHNVKDRKEQNGSINNAISPTATANTSGSQQINIDSALRDRSSNVAAQPSLSDASSGSN 120
|
|
475 Throws : Exception if data strings cannot be parsed, probably due to a change
|
|
476 : in the Blast report format.
|
|
477 Comments : Uses first argument to determine which data members to set
|
|
478 : making this method sensitive data member name changes.
|
|
479 : Behavior is dependent on the type of BLAST analysis (TBLASTN, BLASTP, etc).
|
|
480 Warning : Sequence endpoints are normalized so that start < end. This affects HSPs
|
|
481 : for TBLASTN/X hits on the minus strand. Normalization facilitates use
|
|
482 : of range information by methods such as match().
|
|
483
|
|
484 See Also : L<_set_seq_data()|_set_seq_data>, L<matches()|matches>, L<range()|range>, L<start()|start>, L<end()|end>
|
|
485
|
|
486 =cut
|
|
487
|
|
488 #-------------
|
|
489 sub _set_seq {
|
|
490 #-------------
|
|
491 my $self = shift;
|
|
492 my $seqType = shift;
|
|
493 my @data = @_;
|
|
494 my @ranges = ();
|
|
495 my @sequence = ();
|
|
496 my $numGaps = 0;
|
|
497
|
|
498 foreach( @data ) {
|
|
499 if( m/(\d+) *(\D+) *(\d+)/) {
|
|
500 push @ranges, ( $1, $3 ) ;
|
|
501 push @sequence, $2;
|
|
502 } else {
|
|
503 $self->warn("Bad sequence data: $_");
|
|
504 }
|
|
505 }
|
|
506
|
|
507 (scalar(@sequence) and scalar(@ranges)) || $self->throw("Can't set sequence: missing data",
|
|
508 "Possibly unrecognized Blast format.");
|
|
509
|
|
510 # Sensitive to member name changes.
|
|
511 $seqType = "_\L$seqType\E";
|
|
512 $self->{$seqType.'Start'} = $ranges[0];
|
|
513 $self->{$seqType.'Stop'} = $ranges[ $#ranges ];
|
|
514 $self->{$seqType.'Seq'} = \@sequence;
|
|
515
|
|
516 $self->{$seqType.'Length'} = abs($ranges[ $#ranges ] - $ranges[0]) + 1;
|
|
517
|
|
518 # Adjust lengths for BLASTX, TBLASTN, TBLASTX sequences
|
|
519 # Converting nucl coords to amino acid coords.
|
|
520
|
|
521 my $prog = $self->{'_prog'};
|
|
522 if($prog eq 'TBLASTN' and $seqType eq '_sbjct') {
|
|
523 $self->{$seqType.'Length'} /= 3;
|
|
524 } elsif($prog eq 'BLASTX' and $seqType eq '_query') {
|
|
525 $self->{$seqType.'Length'} /= 3;
|
|
526 } elsif($prog eq 'TBLASTX') {
|
|
527 $self->{$seqType.'Length'} /= 3;
|
|
528 }
|
|
529
|
|
530 $self->{$seqType.'Strand'} = 'Plus' if $prog =~ /BLAST[NX]/;
|
|
531
|
|
532 # Normalize sequence endpoints so that start < end.
|
|
533 # Reverse complement or 'minus strand' HSPs get flipped here.
|
|
534 if($self->{$seqType.'Start'} > $self->{$seqType.'Stop'}) {
|
|
535 ($self->{$seqType.'Start'}, $self->{$seqType.'Stop'}) =
|
|
536 ($self->{$seqType.'Stop'}, $self->{$seqType.'Start'});
|
|
537 $self->{$seqType.'Strand'} = 'Minus';
|
|
538 }
|
|
539
|
|
540 ## Count number of gaps in each seq. Only need to do this for gapped Blasts.
|
|
541 # if($self->{'_gapped'}) {
|
|
542 my $seqstr = join('', @sequence);
|
|
543 $seqstr =~ s/\s//g;
|
|
544 my $num_gaps = CORE::length($seqstr) - $self->{$seqType.'Length'};
|
|
545 $self->{$seqType.'Gaps'} = $num_gaps if $num_gaps > 0;
|
|
546 # }
|
|
547 }
|
|
548
|
|
549
|
|
550 =head2 _set_residues
|
|
551
|
|
552 Usage : n/a; called automatically when residue data is requested.
|
|
553 Purpose : Sets the residue numbers representing the identical and
|
|
554 : conserved positions. These data are obtained by analyzing the
|
|
555 : symbols between query and sbjct lines of the alignments.
|
|
556 Argument : n/a
|
|
557 Throws : Propagates any exception thrown by _set_seq_data() and _set_match_seq().
|
|
558 Comments : These data are not always needed, so it is conditionally
|
|
559 : executed only upon demand by methods such as seq_inds().
|
|
560 : Behavior is dependent on the type of BLAST analysis (TBLASTN, BLASTP, etc).
|
|
561
|
|
562 See Also : L<_set_seq_data()|_set_seq_data>, L<_set_match_seq()|_set_match_seq>, L<seq_inds()|seq_inds>
|
|
563
|
|
564 =cut
|
|
565
|
|
566 #------------------
|
|
567 sub _set_residues {
|
|
568 #------------------
|
|
569 my $self = shift;
|
|
570 my @sequence = ();
|
|
571
|
|
572 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
573
|
|
574 # Using hashes to avoid saving duplicate residue numbers.
|
|
575 my %identicalList_query = ();
|
|
576 my %identicalList_sbjct = ();
|
|
577 my %conservedList_query = ();
|
|
578 my %conservedList_sbjct = ();
|
|
579
|
|
580 my $aref = $self->_set_match_seq() if not ref $self->{'_matchSeq'};
|
|
581 $aref ||= $self->{'_matchSeq'};
|
|
582 my $seqString = join('', @$aref );
|
|
583
|
|
584 my $qseq = join('',@{$self->{'_querySeq'}});
|
|
585 my $sseq = join('',@{$self->{'_sbjctSeq'}});
|
|
586 my $resCount_query = $self->{'_queryStop'} || 0;
|
|
587 my $resCount_sbjct = $self->{'_sbjctStop'} || 0;
|
|
588
|
|
589 my $prog = $self->{'_prog'};
|
|
590 if($prog !~ /^BLASTP|^BLASTN/) {
|
|
591 if($prog eq 'TBLASTN') {
|
|
592 $resCount_sbjct /= 3;
|
|
593 } elsif($prog eq 'BLASTX') {
|
|
594 $resCount_query /= 3;
|
|
595 } elsif($prog eq 'TBLASTX') {
|
|
596 $resCount_query /= 3;
|
|
597 $resCount_sbjct /= 3;
|
|
598 }
|
|
599 }
|
|
600
|
|
601 my ($mchar, $schar, $qchar);
|
|
602 while( $mchar = chop($seqString) ) {
|
|
603 ($qchar, $schar) = (chop($qseq), chop($sseq));
|
|
604 if( $mchar eq '+' ) {
|
|
605 $conservedList_query{ $resCount_query } = 1;
|
|
606 $conservedList_sbjct{ $resCount_sbjct } = 1;
|
|
607 } elsif( $mchar ne ' ' ) {
|
|
608 $identicalList_query{ $resCount_query } = 1;
|
|
609 $identicalList_sbjct{ $resCount_sbjct } = 1;
|
|
610 }
|
|
611 $resCount_query-- if $qchar ne $GAP_SYMBOL;
|
|
612 $resCount_sbjct-- if $schar ne $GAP_SYMBOL;
|
|
613 }
|
|
614 $self->{'_identicalRes_query'} = \%identicalList_query;
|
|
615 $self->{'_conservedRes_query'} = \%conservedList_query;
|
|
616 $self->{'_identicalRes_sbjct'} = \%identicalList_sbjct;
|
|
617 $self->{'_conservedRes_sbjct'} = \%conservedList_sbjct;
|
|
618
|
|
619 }
|
|
620
|
|
621
|
|
622
|
|
623
|
|
624 =head2 _set_match_seq
|
|
625
|
|
626 Usage : n/a. Internal method.
|
|
627 : $hsp_obj->_set_match_seq()
|
|
628 Purpose : Set the 'match' sequence for the current HSP (symbols in between
|
|
629 : the query and sbjct lines.)
|
|
630 Returns : Array reference holding the match sequences lines.
|
|
631 Argument : n/a
|
|
632 Throws : Exception if the _matchList field is not set.
|
|
633 Comments : The match information is not always necessary. This method
|
|
634 : allows it to be conditionally prepared.
|
|
635 : Called by _set_residues>() and seq_str().
|
|
636
|
|
637 See Also : L<_set_residues()|_set_residues>, L<seq_str()|seq_str>
|
|
638
|
|
639 =cut
|
|
640
|
|
641 #-------------------
|
|
642 sub _set_match_seq {
|
|
643 #-------------------
|
|
644 my $self = shift;
|
|
645
|
|
646 ## DEBUGGING CODE:
|
|
647 # if($self->parent->name eq '1AK5_' and $self->parent->parent->name eq 'YAR073W') {
|
|
648 # print "\n_set_match_seq() called for HSP ", $self->name, " of hit ${\$self->parent->name} in query ${\$self->parent->parent->name}"; <STDIN>;
|
|
649 # }
|
|
650
|
|
651 ref($self->{'_matchList'}) || $self->throw("Can't set HSP match sequence: No data");
|
|
652
|
|
653 my @data = @{$self->{'_matchList'}};
|
|
654
|
|
655 my(@sequence);
|
|
656 foreach( @data ) {
|
|
657 chomp($_);
|
|
658 ## Remove leading spaces; (note: aln may begin with a space
|
|
659 ## which is why we can't use s/^ +//).
|
|
660 s/^ {$self->{'_match_indent'}}//;
|
|
661 push @sequence, $_;
|
|
662 }
|
|
663 # Liberate some memory.
|
|
664 @{$self->{'_matchList'}} = undef;
|
|
665 $self->{'_matchList'} = undef;
|
|
666
|
|
667 $self->{'_matchSeq'} = \@sequence;
|
|
668
|
|
669 ## DEBUGGING CODE:
|
|
670 # if($self->parent->name eq '1AK5_' and $self->parent->parent->name eq 'YAR073W') {
|
|
671 # print "RETURNING: $self->{'_matchSeq'}:\n @{$self->{'_matchSeq'}}";<STDIN>;
|
|
672 # }
|
|
673
|
|
674 $self->{'_matchSeq'};
|
|
675 }
|
|
676
|
|
677
|
|
678
|
|
679 =head2 score
|
|
680
|
|
681 Usage : $hsp_obj->score()
|
|
682 Purpose : Get the Blast score for the HSP.
|
|
683 Returns : Integer
|
|
684 Argument : n/a
|
|
685 Throws : n/a
|
|
686
|
|
687 See Also : L<bits()|bits>
|
|
688
|
|
689 =cut
|
|
690
|
|
691 #---------
|
|
692 sub score { my $self = shift; $self->{'_score'}; }
|
|
693 #---------
|
|
694
|
|
695
|
|
696
|
|
697 =head2 bits
|
|
698
|
|
699 Usage : $hsp_obj->bits()
|
|
700 Purpose : Get the Blast score in bits for the HSP.
|
|
701 Returns : Float
|
|
702 Argument : n/a
|
|
703 Throws : n/a
|
|
704
|
|
705
|
|
706 See Also : L<score()|score>
|
|
707
|
|
708 =cut
|
|
709
|
|
710 #--------
|
|
711 sub bits { my $self = shift; $self->{'_bits'}; }
|
|
712 #--------
|
|
713
|
|
714
|
|
715
|
|
716 =head2 n
|
|
717
|
|
718 Usage : $hsp_obj->n()
|
|
719 Purpose : Get the N value (num HSPs on which P/Expect is based).
|
|
720 : This value is not defined with NCBI Blast2 with gapping.
|
|
721 Returns : Integer or null string if not defined.
|
|
722 Argument : n/a
|
|
723 Throws : n/a
|
|
724 Comments : The 'N' value is listed in parenthesis with P/Expect value:
|
|
725 : e.g., P(3) = 1.2e-30 ---> (N = 3).
|
|
726 : Not defined in NCBI Blast2 with gaps.
|
|
727 : This typically is equal to the number of HSPs but not always.
|
|
728 : To obtain the number of HSPs, use Bio::Tools::Blast::Sbjct::num_hsps().
|
|
729
|
|
730 See Also : L<score()|score>
|
|
731
|
|
732 =cut
|
|
733
|
|
734 #-----
|
|
735 sub n { my $self = shift; $self->{'_n'} || ''; }
|
|
736 #-----
|
|
737
|
|
738
|
|
739
|
|
740 =head2 frame
|
|
741
|
|
742 Usage : $hsp_obj->frame()
|
|
743 Purpose : Get the reading frame number (-/+ 1, 2, 3) (TBLASTN/X only).
|
|
744 Returns : Integer or null string if not defined.
|
|
745 Argument : n/a
|
|
746 Throws : n/a
|
|
747
|
|
748 =cut
|
|
749
|
|
750 #---------
|
|
751 sub frame { my $self = shift; $self->{'_frame'} || ''; }
|
|
752 #---------
|
|
753
|
|
754
|
|
755
|
|
756 =head2 signif()
|
|
757
|
|
758 Usage : $hsp_obj->signif()
|
|
759 Purpose : Get the P-value or Expect value for the HSP.
|
|
760 Returns : Float (0.001 or 1.3e-43)
|
|
761 : Returns P-value if it is defined, otherwise, Expect value.
|
|
762 Argument : n/a
|
|
763 Throws : n/a
|
|
764 Comments : Provided for consistency with Sbjct::signif()
|
|
765 : Support for returning the significance data in different
|
|
766 : formats (e.g., exponent only), is not provided for HSP objects.
|
|
767 : This is only available for the Sbjct or Blast object.
|
|
768
|
|
769 See Also : L<p()|p>, L<expect()|expect>, B<Bio::Tools::Blast::Sbjct::signif()>
|
|
770
|
|
771 =cut
|
|
772
|
|
773 #-----------
|
|
774 sub signif {
|
|
775 #-----------
|
|
776 my $self = shift;
|
|
777 my $val ||= defined($self->{'_p'}) ? $self->{'_p'} : $self->{'_expect'};
|
|
778 $val;
|
|
779 }
|
|
780
|
|
781
|
|
782
|
|
783 =head2 expect
|
|
784
|
|
785 Usage : $hsp_obj->expect()
|
|
786 Purpose : Get the Expect value for the HSP.
|
|
787 Returns : Float (0.001 or 1.3e-43)
|
|
788 Argument : n/a
|
|
789 Throws : n/a
|
|
790 Comments : Support for returning the expectation data in different
|
|
791 : formats (e.g., exponent only), is not provided for HSP objects.
|
|
792 : This is only available for the Sbjct or Blast object.
|
|
793
|
|
794 See Also : L<p()|p>
|
|
795
|
|
796 =cut
|
|
797
|
|
798 #----------
|
|
799 sub expect { my $self = shift; $self->{'_expect'}; }
|
|
800 #----------
|
|
801
|
|
802
|
|
803
|
|
804 =head2 p
|
|
805
|
|
806 Usage : $hsp_obj->p()
|
|
807 Purpose : Get the P-value for the HSP.
|
|
808 Returns : Float (0.001 or 1.3e-43) or undef if not defined.
|
|
809 Argument : n/a
|
|
810 Throws : n/a
|
|
811 Comments : P-value is not defined with NCBI Blast2 reports.
|
|
812 : Support for returning the expectation data in different
|
|
813 : formats (e.g., exponent only) is not provided for HSP objects.
|
|
814 : This is only available for the Sbjct or Blast object.
|
|
815
|
|
816 See Also : L<expect()|expect>
|
|
817
|
|
818 =cut
|
|
819
|
|
820 #-----
|
|
821 sub p { my $self = shift; $self->{'_p'}; }
|
|
822 #-----
|
|
823
|
|
824
|
|
825 =head2 length
|
|
826
|
|
827 Usage : $hsp->length( [seq_type] )
|
|
828 Purpose : Get the length of the aligned portion of the query or sbjct.
|
|
829 Example : $hsp->length('query')
|
|
830 Returns : integer
|
|
831 Argument : seq_type: 'query' | 'sbjct' | 'total' (default = 'total')
|
|
832 Throws : n/a
|
|
833 Comments : 'total' length is the full length of the alignment
|
|
834 : as reported in the denominators in the alignment section:
|
|
835 : "Identical = 34/120 Positives = 67/120".
|
|
836 : Developer note: when using the built-in length function within
|
|
837 : this module, call it as CORE::length().
|
|
838
|
|
839 See Also : L<gaps()|gaps>
|
|
840
|
|
841 =cut
|
|
842
|
|
843 #-----------
|
|
844 sub length {
|
|
845 #-----------
|
|
846 my( $self, $type ) = @_;
|
|
847 $type ||= 'total';
|
|
848
|
|
849 $type ne 'total' and $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
850
|
|
851 ## Sensitive to member name format.
|
|
852 $type = "_\L$type\E";
|
|
853 $self->{$type.'Length'};
|
|
854 }
|
|
855
|
|
856
|
|
857
|
|
858 =head2 gaps
|
|
859
|
|
860 Usage : $hsp->gaps( [seq_type] )
|
|
861 Purpose : Get the number of gaps in the query, sbjct, or total alignment.
|
|
862 : Also can return query gaps and sbjct gaps as a two-element list
|
|
863 : when in array context.
|
|
864 Example : $total_gaps = $hsp->gaps();
|
|
865 : ($qgaps, $sgaps) = $hsp->gaps();
|
|
866 : $qgaps = $hsp->gaps('query');
|
|
867 Returns : scalar context: integer
|
|
868 : array context without args: (int, int) = ('queryGaps', 'sbjctGaps')
|
|
869 Argument : seq_type: 'query' | 'sbjct' | 'total'
|
|
870 : (default = 'total', scalar context)
|
|
871 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
872 Throws : n/a
|
|
873
|
|
874 See Also : L<length()|length>, L<matches()|matches>
|
|
875
|
|
876 =cut
|
|
877
|
|
878 #---------
|
|
879 sub gaps {
|
|
880 #---------
|
|
881 my( $self, $seqType ) = @_;
|
|
882
|
|
883 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
884
|
|
885 $seqType ||= (wantarray ? 'list' : 'total');
|
|
886
|
|
887 if($seqType =~ /list|array/i) {
|
|
888 return (($self->{'_queryGaps'} || 0), ($self->{'_sbjctGaps'} || 0));
|
|
889 }
|
|
890
|
|
891 if($seqType eq 'total') {
|
|
892 return ($self->{'_queryGaps'} + $self->{'_sbjctGaps'}) || 0;
|
|
893 } else {
|
|
894 ## Sensitive to member name format.
|
|
895 $seqType = "_\L$seqType\E";
|
|
896 return $self->{$seqType.'Gaps'} || 0;
|
|
897 }
|
|
898 }
|
|
899
|
|
900
|
|
901
|
|
902 =head2 matches
|
|
903
|
|
904 Usage : $hsp->matches([seq_type], [start], [stop]);
|
|
905 Purpose : Get the total number of identical and conservative matches
|
|
906 : in the query or sbjct sequence for the given HSP. Optionally can
|
|
907 : report data within a defined interval along the seq.
|
|
908 : (Note: 'conservative' matches are called 'positives' in the
|
|
909 : Blast report.)
|
|
910 Example : ($id,$cons) = $hsp_object->matches('sbjct');
|
|
911 : ($id,$cons) = $hsp_object->matches('query',300,400);
|
|
912 Returns : 2-element array of integers
|
|
913 Argument : (1) seq_type = 'query' | 'sbjct' (default = query)
|
|
914 : (2) start = Starting coordinate (optional)
|
|
915 : (3) stop = Ending coordinate (optional)
|
|
916 Throws : Exception if the supplied coordinates are out of range.
|
|
917 Comments : Relies on seq_str('match') to get the string of alignment symbols
|
|
918 : between the query and sbjct lines which are used for determining
|
|
919 : the number of identical and conservative matches.
|
|
920
|
|
921 See Also : L<length()|length>, L<gaps()|gaps>, L<seq_str()|seq_str>, B<Bio::Tools::Blast::Sbjct::_adjust_contigs()>
|
|
922
|
|
923 =cut
|
|
924
|
|
925 #-----------
|
|
926 sub matches {
|
|
927 #-----------
|
|
928 my( $self, %param ) = @_;
|
|
929 my(@data);
|
|
930 my($seqType, $beg, $end) = ($param{-SEQ}, $param{-START}, $param{-STOP});
|
|
931 $seqType ||= 'query';
|
|
932
|
|
933 if(!defined $beg && !defined $end) {
|
|
934 ## Get data for the whole alignment.
|
|
935 push @data, ($self->{'_numIdentical'}, $self->{'_numConserved'});
|
|
936 } else {
|
|
937 ## Get the substring representing the desired sub-section of aln.
|
|
938 $beg ||= 0;
|
|
939 $end ||= 0;
|
|
940 my($start,$stop) = $self->range($seqType);
|
|
941 if($beg == 0) { $beg = $start; $end = $beg+$end; }
|
|
942 elsif($end == 0) { $end = $stop; $beg = $end-$beg; }
|
|
943
|
|
944 if($end >= $stop) { $end = $stop; } ##ML changed from if (end >stop)
|
|
945 else { $end += 1;} ##ML moved from commented position below, makes
|
|
946 ##more sense here
|
|
947 # if($end > $stop) { $end = $stop; }
|
|
948 if($beg < $start) { $beg = $start; }
|
|
949 # else { $end += 1;}
|
|
950
|
|
951 # my $seq = substr($self->seq_str('match'), $beg-$start, ($end-$beg));
|
|
952
|
|
953 ## ML: START fix for substr out of range error ------------------
|
|
954 my $seq = "";
|
|
955 if (($self->{'_prog'} eq 'TBLASTN') and ($seqType eq 'sbjct'))
|
|
956 {
|
|
957 $seq = substr($self->seq_str('match'),
|
|
958 int(($beg-$start)/3), int(($end-$beg+1)/3));
|
|
959
|
|
960 } elsif (($self->{'_prog'} eq 'BLASTX') and ($seqType eq 'query'))
|
|
961 {
|
|
962 $seq = substr($self->seq_str('match'),
|
|
963 int(($beg-$start)/3), int(($end-$beg+1)/3));
|
|
964 } else {
|
|
965 $seq = substr($self->seq_str('match'),
|
|
966 $beg-$start, ($end-$beg));
|
|
967 }
|
|
968 ## ML: End of fix for substr out of range error -----------------
|
|
969
|
|
970
|
|
971 ## ML: debugging code
|
|
972 ## This is where we get our exception. Try printing out the values going
|
|
973 ## into this:
|
|
974 ##
|
|
975 # print STDERR
|
|
976 # qq(*------------MY EXCEPTION --------------------\nSeq: ") ,
|
|
977 # $self->seq_str("$seqType"), qq("\n),$self->name,",( index:";
|
|
978 # print STDERR $beg-$start, ", len: ", $end-$beg," ), (HSPRealLen:",
|
|
979 # CORE::length $self->seq_str("$seqType");
|
|
980 # print STDERR ", HSPCalcLen: ", $stop - $start +1 ," ),
|
|
981 # ( beg: $beg, end: $end ), ( start: $start, stop: stop )\n";
|
|
982 ## ML: END DEBUGGING CODE----------
|
|
983
|
|
984 if(!CORE::length $seq) {
|
|
985 $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
|
|
986 }
|
|
987 ## Get data for a substring.
|
|
988 # printf "Collecting HSP subsection data: beg,end = %d,%d; start,stop = %d,%d\n%s<---\n", $beg, $end, $start, $stop, $seq;
|
|
989 # printf "Original match seq:\n%s\n",$self->seq_str('match');
|
|
990 $seq =~ s/ //g; # remove space (no info).
|
|
991 my $len_cons = CORE::length $seq;
|
|
992 $seq =~ s/\+//g; # remove '+' characters (conservative substitutions)
|
|
993 my $len_id = CORE::length $seq;
|
|
994 push @data, ($len_id, $len_cons);
|
|
995 # printf " HSP = %s\n id = %d; cons = %d\n", $self->name, $len_id, $len_cons; <STDIN>;
|
|
996 }
|
|
997 @data;
|
|
998 }
|
|
999
|
|
1000
|
|
1001
|
|
1002 =head2 frac_identical
|
|
1003
|
|
1004 Usage : $hsp_object->frac_identical( [seq_type] );
|
|
1005 Purpose : Get the fraction of identical positions within the given HSP.
|
|
1006 Example : $frac_iden = $hsp_object->frac_identical('query');
|
|
1007 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1008 Argument : seq_type: 'query' | 'sbjct' | 'total'
|
|
1009 : default = 'total' (but see comments below).
|
|
1010 Throws : n/a
|
|
1011 Comments : Different versions of Blast report different values for the total
|
|
1012 : length of the alignment. This is the number reported in the
|
|
1013 : denominators in the stats section:
|
|
1014 : "Identical = 34/120 Positives = 67/120".
|
|
1015 : BLAST-GP uses the total length of the alignment (with gaps)
|
|
1016 : WU-BLAST uses the length of the query sequence (without gaps).
|
|
1017 : Therefore, when called without an argument or an argument of 'total',
|
|
1018 : this method will report different values depending on the
|
|
1019 : version of BLAST used.
|
|
1020 :
|
|
1021 : To get the fraction identical among only the aligned residues,
|
|
1022 : ignoring the gaps, call this method with an argument of 'query'
|
|
1023 : or 'sbjct'.
|
|
1024
|
|
1025 See Also : L<frac_conserved()|frac_conserved>, L<num_identical()|num_identical>, L<matches()|matches>
|
|
1026
|
|
1027 =cut
|
|
1028
|
|
1029 #-------------------
|
|
1030 sub frac_identical {
|
|
1031 #-------------------
|
|
1032 # The value is calculated as opposed to storing it from the parsed results.
|
|
1033 # This saves storage and also permits flexibility in determining for which
|
|
1034 # sequence (query or sbjct) the figure is to be calculated.
|
|
1035
|
|
1036 my( $self, $seqType ) = @_;
|
|
1037 $seqType ||= 'total';
|
|
1038
|
|
1039 if($seqType ne 'total') {
|
|
1040 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1041 }
|
|
1042 ## Sensitive to member name format.
|
|
1043 $seqType = "_\L$seqType\E";
|
|
1044
|
|
1045 sprintf( "%.2f", $self->{'_numIdentical'}/$self->{$seqType.'Length'});
|
|
1046 }
|
|
1047
|
|
1048
|
|
1049 =head2 frac_conserved
|
|
1050
|
|
1051 Usage : $hsp_object->frac_conserved( [seq_type] );
|
|
1052 Purpose : Get the fraction of conserved positions within the given HSP.
|
|
1053 : (Note: 'conservative' positions are called 'positives' in the
|
|
1054 : Blast report.)
|
|
1055 Example : $frac_cons = $hsp_object->frac_conserved('query');
|
|
1056 Returns : Float (2-decimal precision, e.g., 0.75).
|
|
1057 Argument : seq_type: 'query' | 'sbjct'
|
|
1058 : default = 'total' (but see comments below).
|
|
1059 Throws : n/a
|
|
1060 Comments : Different versions of Blast report different values for the total
|
|
1061 : length of the alignment. This is the number reported in the
|
|
1062 : denominators in the stats section:
|
|
1063 : "Identical = 34/120 Positives = 67/120".
|
|
1064 : BLAST-GP uses the total length of the alignment (with gaps)
|
|
1065 : WU-BLAST uses the length of the query sequence (without gaps).
|
|
1066 : Therefore, when called without an argument or an argument of 'total',
|
|
1067 : this method will report different values depending on the
|
|
1068 : version of BLAST used.
|
|
1069 :
|
|
1070 : To get the fraction conserved among only the aligned residues,
|
|
1071 : ignoring the gaps, call this method with an argument of 'query'
|
|
1072 : or 'sbjct'.
|
|
1073
|
|
1074 See Also : L<frac_conserved()|frac_conserved>, L<num_conserved()|num_conserved>, L<matches()|matches>
|
|
1075
|
|
1076 =cut
|
|
1077
|
|
1078 #--------------------
|
|
1079 sub frac_conserved {
|
|
1080 #--------------------
|
|
1081 # The value is calculated as opposed to storing it from the parsed results.
|
|
1082 # This saves storage and also permits flexibility in determining for which
|
|
1083 # sequence (query or sbjct) the figure is to be calculated.
|
|
1084
|
|
1085 my( $self, $seqType ) = @_;
|
|
1086 $seqType ||= 'total';
|
|
1087
|
|
1088 if($seqType ne 'total') {
|
|
1089 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1090 }
|
|
1091
|
|
1092 ## Sensitive to member name format.
|
|
1093 $seqType = "_\L$seqType\E";
|
|
1094
|
|
1095 sprintf( "%.2f", $self->{'_numConserved'}/$self->{$seqType.'Length'});
|
|
1096 }
|
|
1097
|
|
1098
|
|
1099 =head2 num_identical
|
|
1100
|
|
1101 Usage : $hsp_object->num_identical();
|
|
1102 Purpose : Get the number of identical positions within the given HSP.
|
|
1103 Example : $num_iden = $hsp_object->num_identical();
|
|
1104 Returns : integer
|
|
1105 Argument : n/a
|
|
1106 Throws : n/a
|
|
1107
|
|
1108 See Also : L<num_conserved()|num_conserved>, L<frac_identical()|frac_identical>
|
|
1109
|
|
1110 =cut
|
|
1111
|
|
1112 #-------------------
|
|
1113 sub num_identical {
|
|
1114 #-------------------
|
|
1115 my( $self) = shift;
|
|
1116
|
|
1117 $self->{'_numIdentical'};
|
|
1118 }
|
|
1119
|
|
1120
|
|
1121 =head2 num_conserved
|
|
1122
|
|
1123 Usage : $hsp_object->num_conserved();
|
|
1124 Purpose : Get the number of conserved positions within the given HSP.
|
|
1125 Example : $num_iden = $hsp_object->num_conserved();
|
|
1126 Returns : integer
|
|
1127 Argument : n/a
|
|
1128 Throws : n/a
|
|
1129
|
|
1130 See Also : L<num_identical()|num_identical>, L<frac_conserved()|frac_conserved>
|
|
1131
|
|
1132 =cut
|
|
1133
|
|
1134 #-------------------
|
|
1135 sub num_conserved {
|
|
1136 #-------------------
|
|
1137 my( $self) = shift;
|
|
1138
|
|
1139 $self->{'_numConserved'};
|
|
1140 }
|
|
1141
|
|
1142
|
|
1143
|
|
1144 =head2 range
|
|
1145
|
|
1146 Usage : $hsp->range( [seq_type] );
|
|
1147 Purpose : Gets the (start, end) coordinates for the query or sbjct sequence
|
|
1148 : in the HSP alignment.
|
|
1149 Example : ($qbeg, $qend) = $hsp->range('query');
|
|
1150 : ($sbeg, $send) = $hsp->range('sbjct');
|
|
1151 Returns : Two-element array of integers
|
|
1152 Argument : seq_type = string, 'query' or 'sbjct' (default = 'query')
|
|
1153 : (case insensitive).
|
|
1154 Throws : n/a
|
|
1155
|
|
1156 See Also : L<start()|start>, L<end()|end>
|
|
1157
|
|
1158 =cut
|
|
1159
|
|
1160 #----------
|
|
1161 sub range {
|
|
1162 #----------
|
|
1163 my ($self, $seqType) = @_;
|
|
1164
|
|
1165 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1166
|
|
1167 $seqType ||= 'query';
|
|
1168 ## Sensitive to member name changes.
|
|
1169 $seqType = "_\L$seqType\E";
|
|
1170
|
|
1171 return ($self->{$seqType.'Start'},$self->{$seqType.'Stop'});
|
|
1172 }
|
|
1173
|
|
1174 =head2 start
|
|
1175
|
|
1176 Usage : $hsp->start( [seq_type] );
|
|
1177 Purpose : Gets the start coordinate for the query, sbjct, or both sequences
|
|
1178 : in the HSP alignment.
|
|
1179 Example : $qbeg = $hsp->start('query');
|
|
1180 : $sbeg = $hsp->start('sbjct');
|
|
1181 : ($qbeg, $sbeg) = $hsp->start();
|
|
1182 Returns : scalar context: integer
|
|
1183 : array context without args: list of two integers
|
|
1184 Argument : In scalar context: seq_type = 'query' or 'sbjct'
|
|
1185 : (case insensitive). If not supplied, 'query' is used.
|
|
1186 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
1187 Throws : n/a
|
|
1188
|
|
1189 See Also : L<end()|end>, L<range()|range>
|
|
1190
|
|
1191 =cut
|
|
1192
|
|
1193 #----------
|
|
1194 sub start {
|
|
1195 #----------
|
|
1196 my ($self, $seqType) = @_;
|
|
1197
|
|
1198 $seqType ||= (wantarray ? 'list' : 'query');
|
|
1199
|
|
1200 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1201
|
|
1202 if($seqType =~ /list|array/i) {
|
|
1203 return ($self->{'_queryStart'}, $self->{'_sbjctStart'});
|
|
1204 } else {
|
|
1205 ## Sensitive to member name changes.
|
|
1206 $seqType = "_\L$seqType\E";
|
|
1207 return $self->{$seqType.'Start'};
|
|
1208 }
|
|
1209 }
|
|
1210
|
|
1211 =head2 end
|
|
1212
|
|
1213 Usage : $hsp->end( [seq_type] );
|
|
1214 Purpose : Gets the end coordinate for the query, sbjct, or both sequences
|
|
1215 : in the HSP alignment.
|
|
1216 Example : $qbeg = $hsp->end('query');
|
|
1217 : $sbeg = $hsp->end('sbjct');
|
|
1218 : ($qbeg, $sbeg) = $hsp->end();
|
|
1219 Returns : scalar context: integer
|
|
1220 : array context without args: list of two integers
|
|
1221 Argument : In scalar context: seq_type = 'query' or 'sbjct'
|
|
1222 : (case insensitive). If not supplied, 'query' is used.
|
|
1223 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
1224 Throws : n/a
|
|
1225
|
|
1226 See Also : L<start()|start>, L<range()|range>
|
|
1227
|
|
1228 =cut
|
|
1229
|
|
1230 #----------
|
|
1231 sub end {
|
|
1232 #----------
|
|
1233 my ($self, $seqType) = @_;
|
|
1234
|
|
1235 $seqType ||= (wantarray ? 'list' : 'query');
|
|
1236
|
|
1237 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1238
|
|
1239 if($seqType =~ /list|array/i) {
|
|
1240 return ($self->{'_queryStop'}, $self->{'_sbjctStop'});
|
|
1241 } else {
|
|
1242 ## Sensitive to member name changes.
|
|
1243 $seqType = "_\L$seqType\E";
|
|
1244 return $self->{$seqType.'Stop'};
|
|
1245 }
|
|
1246 }
|
|
1247
|
|
1248
|
|
1249
|
|
1250 =head2 strand
|
|
1251
|
|
1252 Usage : $hsp_object->strand( [seq_type] )
|
|
1253 Purpose : Get the strand of the query or sbjct sequence.
|
|
1254 Example : print $hsp->strand('query');
|
|
1255 : ($qstrand, $sstrand) = $hsp->strand();
|
|
1256 Returns : -1, 0, or 1
|
|
1257 : -1 = Minus strand, +1 = Plus strand
|
|
1258 : Returns 0 if strand is not defined, which occurs
|
|
1259 : for non-TBLASTN/X reports.
|
|
1260 : In scalar context without arguments, returns queryStrand value.
|
|
1261 : In array context without arguments, returns a two-element list
|
|
1262 : of strings (queryStrand, sbjctStrand).
|
|
1263 : Array context can be "induced" by providing an argument of 'list' or 'array'.
|
|
1264 Argument : seq_type: 'query' | 'sbjct' or undef
|
|
1265 Throws : n/a
|
|
1266
|
|
1267 See Also : L<_set_seq()|_set_seq>, L<_set_match_stats()|_set_match_stats>
|
|
1268
|
|
1269 =cut
|
|
1270
|
|
1271 #-----------
|
|
1272 sub strand {
|
|
1273 #-----------
|
|
1274 my( $self, $seqType ) = @_;
|
|
1275 $seqType ||= (wantarray ? 'list' : 'query');
|
|
1276
|
|
1277 return '' if $seqType eq 'query' and $self->{'_prog'} eq 'TBLASTN';
|
|
1278
|
|
1279 ## Sensitive to member name format.
|
|
1280 $seqType = "_\L$seqType\E";
|
|
1281
|
|
1282 # $seqType could be '_list'.
|
|
1283 $self->{'_queryStrand'} or $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1284
|
|
1285 if($seqType =~ /list|array/i) {
|
|
1286 return ('','') unless defined $self->{'_queryStrand'};
|
|
1287 return ($self->{'_queryStrand'}, $self->{'_sbjctStrand'});
|
|
1288 }
|
|
1289 local $^W = 0;
|
|
1290 $STRAND_SYMBOL{$self->{$seqType.'Strand'}} || 0;
|
|
1291 }
|
|
1292
|
|
1293
|
|
1294 #####################################################################################
|
|
1295 ## INSTANCE METHODS ##
|
|
1296 #####################################################################################
|
|
1297
|
|
1298
|
|
1299 =head2 seq
|
|
1300
|
|
1301 Usage : $hsp->seq( [seq_type] );
|
|
1302 Purpose : Get the query or sbjct sequence as a Bio::Seq.pm object.
|
|
1303 Example : $seqObj = $hsp->seq('query');
|
|
1304 Returns : Object reference for a Bio::Seq.pm object.
|
|
1305 Argument : seq_type = 'query' or 'sbjct' (default = 'query').
|
|
1306 Throws : Propagates any exception that occurs during construction
|
|
1307 : of the Bio::Seq.pm object.
|
|
1308 Comments : The sequence is returned in an array of strings corresponding
|
|
1309 : to the strings in the original format of the Blast alignment.
|
|
1310 : (i.e., same spacing).
|
|
1311
|
|
1312 See Also : L<seq_str()|seq_str>, L<seq_inds()|seq_inds>, B<Bio::Seq.pm>
|
|
1313
|
|
1314 =cut
|
|
1315
|
|
1316 #-------
|
|
1317 sub seq {
|
|
1318 #-------
|
|
1319 my($self,$seqType) = @_;
|
|
1320 $seqType ||= 'query';
|
|
1321 my $str = $self->seq_str($seqType);
|
|
1322 my $num = $self->name;
|
|
1323 my $name = $seqType =~ /query/i
|
|
1324 ? $self->parent->parent->name
|
|
1325 : $self->parent->name;
|
|
1326
|
|
1327 require Bio::Seq;
|
|
1328
|
|
1329 new Bio::Seq (-ID => $name,
|
|
1330 -SEQ => $str,
|
|
1331 -DESC => "Blast HSP #$num, $seqType sequence",
|
|
1332 );
|
|
1333 }
|
|
1334
|
|
1335
|
|
1336
|
|
1337 =head2 seq_str
|
|
1338
|
|
1339 Usage : $hsp->seq_str( seq_type );
|
|
1340 Purpose : Get the full query, sbjct, or 'match' sequence as a string.
|
|
1341 : The 'match' sequence is the string of symbols in between the
|
|
1342 : query and sbjct sequences.
|
|
1343 Example : $str = $hsp->seq_str('query');
|
|
1344 Returns : String
|
|
1345 Argument : seq_Type = 'query' or 'sbjct' or 'match'
|
|
1346 Throws : Exception if the argument does not match an accepted seq_type.
|
|
1347 Comments : Calls _set_residues() to set the 'match' sequence if it has
|
|
1348 : not been set already.
|
|
1349
|
|
1350 See Also : L<seq()|seq>, L<seq_inds()|seq_inds>, L<_set_match_seq()|_set_match_seq>
|
|
1351
|
|
1352 =cut
|
|
1353
|
|
1354 #------------
|
|
1355 sub seq_str {
|
|
1356 #------------
|
|
1357 my($self,$seqType) = @_;
|
|
1358
|
|
1359 ## Sensitive to member name changes.
|
|
1360 $seqType = "_\L$seqType\E";
|
|
1361
|
|
1362 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1363
|
|
1364 if($seqType =~ /sbjct|query/) {
|
|
1365 my $seq = join('',@{$self->{$seqType.'Seq'}});
|
|
1366 $seq =~ s/\s+//g;
|
|
1367 return $seq;
|
|
1368
|
|
1369 } elsif( $seqType =~ /match/i) {
|
|
1370 # Only need to call _set_match_seq() if the match seq is requested.
|
|
1371 my $aref = $self->_set_match_seq() unless ref $self->{'_matchSeq'};
|
|
1372 $aref = $self->{'_matchSeq'};
|
|
1373
|
|
1374 ## DEBUGGING CODE:
|
|
1375 # if($self->parent->name eq '1AK5_' and $self->parent->parent->name eq 'YAR073W') {
|
|
1376 # print "seq_str():\n @$aref";<STDIN>;
|
|
1377 # }
|
|
1378
|
|
1379 return join('',@$aref);
|
|
1380
|
|
1381 } else {
|
|
1382 $self->throw("Invalid or undefined sequence type: $seqType",
|
|
1383 "Valid types: query, sbjct, match");
|
|
1384 }
|
|
1385 }
|
|
1386
|
|
1387
|
|
1388
|
|
1389
|
|
1390 =head2 seq_inds
|
|
1391
|
|
1392 Usage : $hsp->seq_inds( seq_type, class, collapse );
|
|
1393 Purpose : Get a list of residue positions (indices) for all identical
|
|
1394 : or conserved residues in the query or sbjct sequence.
|
|
1395 Example : @ind = $hsp->seq_inds('query', 'identical');
|
|
1396 : @ind = $hsp->seq_inds('sbjct', 'conserved');
|
|
1397 : @ind = $hsp->seq_inds('sbjct', 'conserved', 1);
|
|
1398 Returns : List of integers
|
|
1399 : May include ranges if collapse is true.
|
|
1400 Argument : seq_type = 'query' or 'sbjct' (default = query)
|
|
1401 : class = 'identical' or 'conserved' (default = identical)
|
|
1402 : (can be shortened to 'id' or 'cons')
|
|
1403 : (actually, anything not 'id' will evaluate to 'conserved').
|
|
1404 : collapse = boolean, if true, consecutive positions are merged
|
|
1405 : using a range notation, e.g., "1 2 3 4 5 7 9 10 11"
|
|
1406 : collapses to "1-5 7 9-11". This is useful for
|
|
1407 : consolidating long lists. Default = no collapse.
|
|
1408 Throws : n/a.
|
|
1409 Comments : Calls _set_residues() to set the 'match' sequence if it has
|
|
1410 : not been set already.
|
|
1411
|
|
1412 See Also : L<seq()|seq>, L<_set_residues()|_set_residues>, L<collapse_nums()|collapse_nums>, B<Bio::Tools::Blast::Sbjct::seq_inds()>
|
|
1413
|
|
1414 =cut
|
|
1415
|
|
1416 #---------------
|
|
1417 sub seq_inds {
|
|
1418 #---------------
|
|
1419 my ($self, $seq, $class, $collapse) = @_;
|
|
1420
|
|
1421 $seq ||= 'query';
|
|
1422 $class ||= 'identical';
|
|
1423 $collapse ||= 0;
|
|
1424
|
|
1425 $self->_set_residues() unless defined $self->{'_identicalRes_query'};
|
|
1426
|
|
1427 $seq = ($seq !~ /^q/i ? 'sbjct' : 'query');
|
|
1428 $class = ($class !~ /^id/i ? 'conserved' : 'identical');
|
|
1429
|
|
1430 ## Sensitive to member name changes.
|
|
1431 $seq = "_\L$seq\E";
|
|
1432 $class = "_\L$class\E";
|
|
1433
|
|
1434 my @ary = sort { $a <=> $b } keys %{ $self->{"${class}Res$seq"}};
|
|
1435
|
|
1436 return $collapse ? &collapse_nums(@ary) : @ary;
|
|
1437 }
|
|
1438
|
|
1439
|
|
1440
|
|
1441
|
|
1442 =head2 get_aln
|
|
1443
|
|
1444 Usage : $hsp->get_aln()
|
|
1445 Purpose : Get a Bio::UnivAln.pm object constructed from the query + sbjct
|
|
1446 : sequences of the present HSP object.
|
|
1447 Example : $aln_obj = $hsp->get_aln();
|
|
1448 Returns : Object reference for a Bio::UnivAln.pm object.
|
|
1449 Argument : n/a.
|
|
1450 Throws : Propagates any exception ocurring during the construction of
|
|
1451 : the Bio::UnivAln object.
|
|
1452 Comments : Requires Bio::UnivAln.pm.
|
|
1453 : The Bio::UnivAln.pm object is constructed from the query + sbjct
|
|
1454 : sequence objects obtained by calling seq().
|
|
1455 : Gap residues are included (see $GAP_SYMBOL). It is important that
|
|
1456 : Bio::UnivAln.pm recognizes the gaps correctly. A strategy for doing
|
|
1457 : this is being considered. Currently it is hard-wired.
|
|
1458
|
|
1459 See Also : L<seq()|seq>, B<Bio::UnivAln.pm>
|
|
1460
|
|
1461 =cut
|
|
1462
|
|
1463 #------------
|
|
1464 sub get_aln {
|
|
1465 #------------
|
|
1466 my $self = shift;
|
|
1467
|
|
1468 require Bio::UnivAln;
|
|
1469
|
|
1470 my $qseq = $self->seq('query');
|
|
1471 my $sseq = $self->seq('sbjct');
|
|
1472
|
|
1473 my $desc = sprintf "HSP #%s of query %s vs. sbjct %s",
|
|
1474 $self->name, $self->parent->parent->name, $self->parent->name;
|
|
1475
|
|
1476 my $type = $self->{'_prog'} =~ /P$|^T/ ? 'amino' : 'dna';
|
|
1477
|
|
1478 Bio::UnivAln->new( -seqs => [$qseq, $sseq],
|
|
1479 -desc => $desc,
|
|
1480 -type => $type,
|
|
1481 );
|
|
1482 }
|
|
1483
|
|
1484
|
|
1485 =head2 display
|
|
1486
|
|
1487 Usage : $sbjct_object->display( %named_parameters );
|
|
1488 Purpose : Display information about Bio::Tools::Blast::Sbjct.pm data members
|
|
1489 : including: length, gaps, score, significance value,
|
|
1490 : sequences and sequence indices.
|
|
1491 Example : $object->display(-SHOW=>'stats');
|
|
1492 Argument : Named parameters: (TAGS CAN BE UPPER OR LOWER CASE)
|
|
1493 : -SHOW => 'hsp',
|
|
1494 : -WHERE => filehandle (default = STDOUT)
|
|
1495 Returns : n/a
|
|
1496 Status : Experimental
|
|
1497 Comments : For more control over the display of sequence data,
|
|
1498 : use seq(), seq_str(), seq_inds().
|
|
1499
|
|
1500 See Also : L<_display_seq()|_display_seq>, L<seq()|seq>, L<seq_str()|seq_str>, L<seq_inds()|seq_inds>, L<_display_matches()|_display_matches>, B<Bio::Root::Object::display()>
|
|
1501
|
|
1502 =cut
|
|
1503
|
|
1504 #-----------
|
|
1505 sub display {
|
|
1506 #-----------
|
|
1507 my( $self, %param ) = @_;
|
|
1508
|
|
1509 my $sbjctName = $self->parent->name();
|
|
1510 my $queryName = $self->parent->parent->name();
|
|
1511 my $layout = $self->parent->parent->_layout();
|
|
1512
|
|
1513 my $OUT = $self->set_display(%param);
|
|
1514
|
|
1515 printf( $OUT "%-15s: %d\n", "LENGTH TOTAL", $self->length('total') );
|
|
1516 printf( $OUT "%-15s: %d\n", "LENGTH QUERY", $self->length('query') );
|
|
1517 printf( $OUT "%-15s: %d\n", "LENGTH SBJCT", $self->length('sbjct') );
|
|
1518 printf( $OUT "%-15s: %d\n", "GAPS QUERY", $self->gaps('query') );
|
|
1519 printf( $OUT "%-15s: %d\n", "GAPS SBJCT", $self->gaps('sbjct') );
|
|
1520 printf( $OUT "%-15s: %d\n", "SCORE", $self->{'_score'} );
|
|
1521 printf( $OUT "%-15s: %0.1f\n", "BITS", $self->{'_bits'} );
|
|
1522 if($layout == 1) {
|
|
1523 printf( $OUT "%-15s: %.1e\n", "P-VAL", $self->{'_p'} );
|
|
1524 printf( $OUT "%-15s: %.1e\n", "EXPECT", $self->{'_expect'} );
|
|
1525 } else {
|
|
1526 printf( $OUT "%-15s: %.1e\n", "EXPECT", $self->{'_expect'} );
|
|
1527 }
|
|
1528
|
|
1529 my $queryLength = $self->length('query');
|
|
1530
|
|
1531 printf( $OUT "%-15s: %d (%0.0f%%)\n", "IDENTICAL", $self->{'_numIdentical'},
|
|
1532 $self->{'_numIdentical'}/$queryLength * 100 );
|
|
1533 printf( $OUT "%-15s: %d (%0.0f%%) %s \n", "CONSERVED", $self->{'_numConserved'},
|
|
1534 $self->{'_numConserved'}/$queryLength * 100,
|
|
1535 "includes identical" );
|
|
1536
|
|
1537 $self->_display_seq('query', $queryName, $OUT);
|
|
1538 $self->_display_seq('sbjct', $sbjctName, $OUT);
|
|
1539 $self->_display_matches($queryName, $sbjctName, $OUT);
|
|
1540 }
|
|
1541
|
|
1542
|
|
1543
|
|
1544
|
|
1545 =head2 _display_seq
|
|
1546
|
|
1547 Usage : n/a; called automatically by display()
|
|
1548 Purpose : Display information about query and sbjct HSP sequences.
|
|
1549 : Prints the start, stop coordinates and the actual sequence.
|
|
1550 Example : n/a
|
|
1551 Argument :
|
|
1552 Returns : printf call.
|
|
1553 Status : Experimental
|
|
1554 Comments : For more control, use seq(), seq_str(), or seq_inds().
|
|
1555
|
|
1556 See Also : L<display()|display>, L<seq()|seq>, L<seq_str()|seq_str>, L<seq_inds()|seq_inds>, L<_display_matches()|_display_matches>
|
|
1557
|
|
1558 =cut
|
|
1559
|
|
1560 #------------------
|
|
1561 sub _display_seq {
|
|
1562 #------------------
|
|
1563 my( $self, $seqType, $name, $OUT ) = @_;
|
|
1564
|
|
1565 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1566
|
|
1567 # Sensitive to member name changes.
|
|
1568 my $mem = "_\L$seqType\E";
|
|
1569 printf( $OUT "\n%10s: %s\n%10s %s\n", "\U$seqType\E", "$name", "-----",
|
|
1570 ('-'x ((CORE::length $name) + 2)) );
|
|
1571 printf( $OUT "%13s: %d\n", "START", $self->{$mem.'Start'} );
|
|
1572 printf( $OUT "%13s: %d\n", "STOP", $self->{$mem.'Stop'} );
|
|
1573 printf( $OUT "%13s: \n", "SEQ" );
|
|
1574 foreach( @{ $self->{$mem.'Seq'}} ) {
|
|
1575 printf( $OUT "%15s%s\n", "", $_);
|
|
1576 }
|
|
1577 }
|
|
1578
|
|
1579
|
|
1580 =head2 _display_matches
|
|
1581
|
|
1582 Usage : n/a; called automatically by display()
|
|
1583 Purpose : Display information about identical and conserved positions
|
|
1584 : within both the query and sbjct sequences.
|
|
1585 Example : n/a
|
|
1586 Argument :
|
|
1587 Returns : printf call.
|
|
1588 Status : Experimental
|
|
1589 Comments : For more control, use seq_inds().
|
|
1590
|
|
1591 See Also : L<display()|display>, L<seq_inds()|seq_inds>, L<_display_seq()|_display_seq>,
|
|
1592
|
|
1593 =cut
|
|
1594
|
|
1595 #--------------------
|
|
1596 sub _display_matches {
|
|
1597 #--------------------
|
|
1598 my( $self, $queryName, $sbjctName, $OUT) = @_;
|
|
1599 my($resNum, $count);
|
|
1600
|
|
1601 $self->_set_residues() unless defined $self->{'_identicalRes_query'};
|
|
1602
|
|
1603 printf( $OUT "\n%10s: \n%10s\n", "HITS", "-----" );
|
|
1604 foreach( @{ $self->{'_matchSeq'}} ) {
|
|
1605 printf( $OUT "%15s%s\n", "", $_ );
|
|
1606 }
|
|
1607
|
|
1608 print $OUT "\n\U$queryName\E\n------------\n";
|
|
1609 printf( $OUT "\n%5s%s:\n%5s%s\n\t", "", "IDENTICAL RESIDUES IN $queryName (n=$self->{'_numIdentical'})",
|
|
1610 "", "--------------------------------------------" );
|
|
1611 $count = 0;
|
|
1612 foreach $resNum ( sort keys %{ $self->{'_identicalRes_query' }} ) {
|
|
1613 $count++;
|
|
1614 print $OUT "$resNum";
|
|
1615 $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t");
|
|
1616 }
|
|
1617
|
|
1618 print $OUT "\n";
|
|
1619
|
|
1620 my $justConserved = ($self->{'_numConserved'})-($self->{'_numIdentical'});
|
|
1621 printf( $OUT "\n%5s%s:\n%5s%s\n\t", "","CONSERVED RESIDUES IN $queryName (n=$justConserved)",
|
|
1622 "", "--------------------------------------------" );
|
|
1623 $count = 0;
|
|
1624 foreach $resNum ( sort keys %{ $self->{'_conservedRes_query' }} ) {
|
|
1625 $count++;
|
|
1626 print $OUT "$resNum";
|
|
1627 $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t");
|
|
1628 }
|
|
1629
|
|
1630
|
|
1631 print $OUT "\n\n\U$sbjctName\E\n------------\n";
|
|
1632 printf( $OUT "\n%5s%s:\n%5s%s\n\t", "", "IDENTICAL RESIDUES IN $sbjctName (n=$self->{'_numIdentical'})",
|
|
1633 "", "--------------------------------------------" );
|
|
1634 $count = 0;
|
|
1635 foreach $resNum ( sort keys %{ $self->{'_identicalRes_sbjct' }} ) {
|
|
1636 $count++;
|
|
1637 print $OUT "$resNum";
|
|
1638 $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t");
|
|
1639 }
|
|
1640
|
|
1641 print $OUT "\n";
|
|
1642 $justConserved = ($self->{'_numConserved'})-($self->{'_numIdentical'});
|
|
1643 printf( $OUT "\n%5s%s:\n%5s%s\n\t", "","CONSERVED RESIDUES IN $sbjctName (n=$justConserved)",
|
|
1644 "", "--------------------------------------------" );
|
|
1645 $count = 0;
|
|
1646 foreach $resNum ( sort keys %{ $self->{'_conservedRes_sbjct' }} ) {
|
|
1647 $count++;
|
|
1648 print $OUT "$resNum";
|
|
1649 $count > 0 and print $OUT +( $count % 15 ? ", " : "\n\t");
|
|
1650 }
|
|
1651 }
|
|
1652
|
|
1653
|
|
1654
|
|
1655
|
|
1656 =head2 homol_data
|
|
1657
|
|
1658 Usage : $data = $hsp_object->homo_data( %named_params );
|
|
1659 Purpose : Gets similarity data for a single HSP.
|
|
1660 Returns : String:
|
|
1661 : "Homology data" for each HSP is in the format:
|
|
1662 : "<integer> <start> <stop>"
|
|
1663 : where integer is the value returned by homol_score().
|
|
1664 Argument : Named params: (UPPER OR LOWERCASE TAGS)
|
|
1665 : currently just one param is used:
|
|
1666 : -SEQ =>'query' or 'sbjct'
|
|
1667 Throws : n/a
|
|
1668 Status : Experimental
|
|
1669 Comments : This is a very experimental method used for obtaining a
|
|
1670 : coarse indication of:
|
|
1671 : 1) how strong the similarity is between the sequences in the HSP,
|
|
1672 : 3) the endpoints of the alignment (sequence monomer numbers)
|
|
1673
|
|
1674 See Also : L<homol_score()|homol_score>, B<Bio::Tools::Blast.::homol_data()>, B<Bio::Tools::Blast::Sbjct::homol_data()>
|
|
1675
|
|
1676 =cut
|
|
1677
|
|
1678 #---------------
|
|
1679 sub homol_data {
|
|
1680 #---------------
|
|
1681 my ($self, %param) = @_;
|
|
1682 my $seq = $param{-SEQ} || $param{'-seq'} || 'sbjct'; # 'query' or 'sbjct'
|
|
1683 my $homolScore = $self->homol_score();
|
|
1684 # Sensitive to member name changes.
|
|
1685 $seq = "_\L$seq\E";
|
|
1686
|
|
1687 $self->_set_seq_data() unless $self->{'_set_seq_data'};
|
|
1688 return ( $homolScore.' '.$self->{$seq.'Start'}.' '.$self->{$seq.'Stop'});
|
|
1689 }
|
|
1690
|
|
1691
|
|
1692 =head2 homol_score
|
|
1693
|
|
1694 Usage : $self->homol_score();
|
|
1695 Purpose : Get a homology score (integer 1 - 3) as a coarse representation of
|
|
1696 : the strength of the similarity independent of sequence composition.
|
|
1697 : Based on the Blast bit score.
|
|
1698 Example : $hscore = $hsp->homol_score();
|
|
1699 Returns : Integer
|
|
1700 Argument : n/a
|
|
1701 Throws : n/a
|
|
1702 Status : Experimental
|
|
1703 Comments : See @Bio::Tools::Blast::HSP::SCORE_CUTOFFS for the specific values.
|
|
1704 : Currently, BIT_SCORE HOMOL_SCORE
|
|
1705 : --------- -----------
|
|
1706 : >=100 --> 3
|
|
1707 : 30-100 --> 2
|
|
1708 : < 30 --> 1
|
|
1709
|
|
1710 See Also : L<homol_data()|homol_data>
|
|
1711
|
|
1712 =cut
|
|
1713
|
|
1714 #----------------
|
|
1715 sub homol_score {
|
|
1716 #----------------
|
|
1717 my $self = shift;
|
|
1718
|
|
1719 if( $self->{'_bits'} >= $SCORE_CUTOFFS[0] ) { 1 }
|
|
1720 elsif($self->{'_bits'} < $SCORE_CUTOFFS[0] and
|
|
1721 $self->{'_bits'} >= $SCORE_CUTOFFS[1] ) { 2 }
|
|
1722 else { 3 }
|
|
1723 }
|
|
1724
|
|
1725
|
|
1726 #####################################################################################
|
|
1727 ## CLASS METHODS ##
|
|
1728 #####################################################################################
|
|
1729
|
|
1730 =head1 CLASS METHODS
|
|
1731
|
|
1732 =head2 collapse_nums
|
|
1733
|
|
1734 Usage : @cnums = collapse_nums( @numbers );
|
|
1735 Purpose : Collapses a list of numbers into a set of ranges of consecutive terms:
|
|
1736 : Useful for condensing long lists of consecutive numbers.
|
|
1737 : EXPANDED:
|
|
1738 : 1 2 3 4 5 6 10 12 13 14 15 17 18 20 21 22 24 26 30 31 32
|
|
1739 : COLLAPSED:
|
|
1740 : 1-6 10 12-15 17 18 20-22 24 26 30-32
|
|
1741 Argument : List of numbers and sorted numerically.
|
|
1742 Returns : List of numbers mixed with ranges of numbers (see above).
|
|
1743 Throws : n/a
|
|
1744 Comments : Probably belongs in a more general utility class.
|
|
1745
|
|
1746 See Also : L<seq_inds()|seq_inds>
|
|
1747
|
|
1748 =cut
|
|
1749
|
|
1750 #------------------
|
|
1751 sub collapse_nums {
|
|
1752 #------------------
|
|
1753 # This is not the slickest connectivity algorithm, but will do for now.
|
|
1754 my @a = @_;
|
|
1755 my ($from, $to, $i, @ca, $consec);
|
|
1756
|
|
1757 $consec = 0;
|
|
1758 for($i=0; $i < @a; $i++) {
|
|
1759 not $from and do{ $from = $a[$i]; next; };
|
|
1760 if($a[$i] == $a[$i-1]+1) {
|
|
1761 $to = $a[$i];
|
|
1762 $consec++;
|
|
1763 } else {
|
|
1764 if($consec == 1) { $from .= ",$to"; }
|
|
1765 else { $from .= $consec>1 ? "\-$to" : ""; }
|
|
1766 push @ca, split(',', $from);
|
|
1767 $from = $a[$i];
|
|
1768 $consec = 0;
|
|
1769 $to = undef;
|
|
1770 }
|
|
1771 }
|
|
1772 if(defined $to) {
|
|
1773 if($consec == 1) { $from .= ",$to"; }
|
|
1774 else { $from .= $consec>1 ? "\-$to" : ""; }
|
|
1775 }
|
|
1776 push @ca, split(',', $from) if $from;
|
|
1777
|
|
1778 @ca;
|
|
1779 }
|
|
1780
|
|
1781
|
|
1782 1;
|
|
1783 __END__
|
|
1784
|
|
1785 #####################################################################################
|
|
1786 # END OF CLASS
|
|
1787 #####################################################################################
|
|
1788
|
|
1789 =head1 FOR DEVELOPERS ONLY
|
|
1790
|
|
1791 =head2 Data Members
|
|
1792
|
|
1793 Information about the various data members of this module is provided for those
|
|
1794 wishing to modify or understand the code. Two things to bear in mind:
|
|
1795
|
|
1796 =over 4
|
|
1797
|
|
1798 =item 1 Do NOT rely on these in any code outside of this module.
|
|
1799
|
|
1800 All data members are prefixed with an underscore to signify that they are private.
|
|
1801 Always use accessor methods. If the accessor doesn't exist or is inadequate,
|
|
1802 create or modify an accessor (and let me know, too!).
|
|
1803
|
|
1804 =item 2 This documentation may be incomplete and out of date.
|
|
1805
|
|
1806 It is easy for these data member descriptions to become obsolete as
|
|
1807 this module is still evolving. Always double check this info and search
|
|
1808 for members not described here.
|
|
1809
|
|
1810 =back
|
|
1811
|
|
1812 An instance of Bio::Tools::Blast::HSP.pm is a blessed reference to a hash containing
|
|
1813 all or some of the following fields:
|
|
1814
|
|
1815 FIELD VALUE
|
|
1816 --------------------------------------------------------------
|
|
1817 (member names are mostly self-explanatory)
|
|
1818
|
|
1819 _score :
|
|
1820 _bits :
|
|
1821 _p :
|
|
1822 _n : Integer. The 'N' value listed in parenthesis with P/Expect value:
|
|
1823 : e.g., P(3) = 1.2e-30 ---> (N = 3).
|
|
1824 : Not defined in NCBI Blast2 with gaps.
|
|
1825 : To obtain the number of HSPs, use Bio::Tools::Blast::Sbjct::num_hsps().
|
|
1826 _expect :
|
|
1827 _queryLength :
|
|
1828 _queryGaps :
|
|
1829 _queryStart :
|
|
1830 _queryStop :
|
|
1831 _querySeq :
|
|
1832 _sbjctLength :
|
|
1833 _sbjctGaps :
|
|
1834 _sbjctStart :
|
|
1835 _sbjctStop :
|
|
1836 _sbjctSeq :
|
|
1837 _matchSeq : String. Contains the symbols between the query and sbjct lines
|
|
1838 which indicate identical (letter) and conserved ('+') matches
|
|
1839 or a mismatch (' ').
|
|
1840 _numIdentical :
|
|
1841 _numConserved :
|
|
1842 _identicalRes_query :
|
|
1843 _identicalRes_sbjct :
|
|
1844 _conservedRes_query :
|
|
1845 _conservedRes_sbjct :
|
|
1846 _match_indent : The number of leading space characters on each line containing
|
|
1847 the match symbols. _match_indent is 13 in this example:
|
|
1848 Query: 285 QNSAPWGLARISHRERLNLGSFNKYLYDDDAG
|
|
1849 Q +APWGLARIS G+ + Y YD+ AG
|
|
1850 ^^^^^^^^^^^^^
|
|
1851
|
|
1852 INHERITED DATA MEMBERS
|
|
1853
|
|
1854 _name : From Bio::Root::Object.pm.
|
|
1855 :
|
|
1856 _parent : From Bio::Root::Object.pm. This member contains a reference to the
|
|
1857 : Bio::Tools::Blast::Sbjct.pm object to which this hit belongs.
|
|
1858
|
|
1859
|
|
1860 =cut
|
|
1861
|
|
1862 1;
|
|
1863
|