0
|
1 # $Id: Sbjct.pm,v 1.23.2.1 2003/02/20 00:39:03 jason Exp $
|
|
2 ###############################################################################
|
|
3 # Bio::Tools::BPlite::Sbjct
|
|
4 ###############################################################################
|
|
5 #
|
|
6 # The original BPlite.pm module has been written by Ian Korf !
|
|
7 # see http://sapiens.wustl.edu/~ikorf
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 #
|
|
11 # BioPerl module for Bio::Tools::BPlite::Sbjct
|
|
12 #
|
|
13 # Cared for by Peter Schattner <schattner@alum.mit.edu>
|
|
14 #
|
|
15 # Copyright Peter Schattner
|
|
16 #
|
|
17 # You may distribute this module under the same terms as perl itself
|
|
18
|
|
19 # POD documentation - main docs before the code
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::Tools::BPlite::Sbjct - A Blast Subject (database search Hit)
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 use Bio::Tools::BPlite
|
|
28 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
|
|
29 while(my $sbjct = $report->nextSbjct) {
|
|
30 $sbjct->name; # access to the hit name
|
|
31 "$sbjct"; # overloaded to return name
|
|
32 $sbjct->nextHSP; # gets the next HSP from the sbjct
|
|
33 while(my $hsp = $sbjct->nextHSP) {
|
|
34 # canonical form is again a while loop
|
|
35 }
|
|
36
|
|
37 =head1 DESCRIPTION
|
|
38
|
|
39 See L<Bio::Tools::BPlite> for a more detailed information about the
|
|
40 BPlite BLAST parsing objects.
|
|
41
|
|
42 The original BPlite.pm module has been written by Ian Korf!
|
|
43 See http://sapiens.wustl.edu/~ikorf
|
|
44
|
|
45 The Sbjct object encapsulates a Hit in a Blast database
|
|
46 search. The Subjects are the "Hits" for a particular query. A
|
|
47 Subject may be made up of multiple High Scoring Pairs (HSP) which are
|
|
48 accessed through the nextHSP method.
|
|
49
|
|
50 If you are searching for the P-value or percent identity that is
|
|
51 specific to each HSP and you will need to use the nextHSP method to
|
|
52 get access to that data.
|
|
53
|
|
54 =head1 FEEDBACK
|
|
55
|
|
56 =head2 Mailing Lists
|
|
57
|
|
58 User feedback is an integral part of the evolution of this and other
|
|
59 Bioperl modules. Send your comments and suggestions preferably to
|
|
60 the Bioperl mailing list. Your participation is much appreciated.
|
|
61
|
|
62 bioperl-l@bioperl.org - General discussion
|
|
63 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
64
|
|
65 =head2 Reporting Bugs
|
|
66
|
|
67 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
68 of the bugs and their resolution. Bug reports can be submitted via
|
|
69 email or the web:
|
|
70
|
|
71 bioperl-bugs@bioperl.org
|
|
72 http://bugzilla.bioperl.org/
|
|
73
|
|
74 =head1 AUTHOR - Peter Schattner
|
|
75
|
|
76 Email: schattner@alum.mit.edu
|
|
77
|
|
78 =head1 CONTRIBUTORS
|
|
79
|
|
80 Jason Stajich, jason@bioperl.org
|
|
81
|
|
82 =head1 APPENDIX
|
|
83
|
|
84 The rest of the documentation details each of the object methods.
|
|
85 Internal methods are usually preceded with a _
|
|
86
|
|
87 =cut
|
|
88
|
|
89 # Let the code begin...
|
|
90
|
|
91 package Bio::Tools::BPlite::Sbjct;
|
|
92
|
|
93 use strict;
|
|
94
|
|
95 use Bio::Root::Root; # root object to inherit from
|
|
96 use Bio::Tools::BPlite::HSP; # we want to use HSP
|
|
97 #use overload '""' => 'name';
|
|
98 use vars qw(@ISA);
|
|
99
|
|
100 @ISA = qw(Bio::Root::Root);
|
|
101
|
|
102 sub new {
|
|
103 my ($class, @args) = @_;
|
|
104 my $self = $class->SUPER::new(@args);
|
|
105
|
|
106 ($self->{'NAME'},$self->{'LENGTH'},
|
|
107 $self->{'PARENT'}) =
|
|
108 $self->_rearrange([qw(NAME
|
|
109 LENGTH
|
|
110 PARENT
|
|
111 )],@args);
|
|
112 $self->report_type($self->{'PARENT'}->{'BLAST_TYPE'} || 'UNKNOWN');
|
|
113 $self->{'HSP_ALL_PARSED'} = 0;
|
|
114
|
|
115 return $self;
|
|
116 }
|
|
117
|
|
118 =head2 name
|
|
119
|
|
120 Title : name
|
|
121 Usage : $name = $obj->name();
|
|
122 Function : returns the name of the Sbjct
|
|
123 Example :
|
|
124 Returns : name of the Sbjct
|
|
125 Args :
|
|
126
|
|
127 =cut
|
|
128
|
|
129 sub name {shift->{'NAME'}}
|
|
130
|
|
131 =head2 report_type
|
|
132
|
|
133 Title : report_type
|
|
134 Usage : $type = $sbjct->report_type()
|
|
135 Function : Returns the type of report from which this hit was obtained.
|
|
136 This usually pertains only to BLAST and friends reports, for which
|
|
137 the report type denotes what type of sequence was aligned against
|
|
138 what (BLASTN: dna-dna, BLASTP prt-prt, BLASTX translated dna-prt,
|
|
139 TBLASTN prt-translated dna, TBLASTX translated dna-translated dna).
|
|
140 Example :
|
|
141 Returns : A string (BLASTN, BLASTP, BLASTX, TBLASTN, TBLASTX, UNKNOWN)
|
|
142 Args : a string on set (you should know what you are doing)
|
|
143
|
|
144 =cut
|
|
145
|
|
146 sub report_type {
|
|
147 my ($self, $rpt) = @_;
|
|
148 if($rpt) {
|
|
149 $self->{'_report_type'} = $rpt;
|
|
150 }
|
|
151 return $self->{'_report_type'};
|
|
152 }
|
|
153
|
|
154 =head2 nextFeaturePair
|
|
155
|
|
156 Title : nextFeaturePair
|
|
157 Usage : $name = $obj->nextFeaturePair();
|
|
158 Function : same as the nextHSP function
|
|
159 Example :
|
|
160 Returns : next FeaturePair
|
|
161 Args :
|
|
162
|
|
163 =cut
|
|
164
|
|
165 sub nextFeaturePair {shift->nextHSP}; # just another name
|
|
166
|
|
167 =head2 nextHSP
|
|
168
|
|
169 Title : nextHSP
|
|
170 Usage : $hsp = $obj->nextHSP();
|
|
171 Function : returns the next available High Scoring Pair
|
|
172 Example :
|
|
173 Returns : Bio::Tools::HSP or null if finished
|
|
174 Args :
|
|
175
|
|
176 =cut
|
|
177
|
|
178 sub nextHSP {
|
|
179 my ($self) = @_;
|
|
180 return undef if $self->{'HSP_ALL_PARSED'};
|
|
181
|
|
182 ############################
|
|
183 # get and parse scorelines #
|
|
184 ############################
|
|
185 my ($qframe, $sframe);
|
|
186 my $scoreline = $self->_readline();
|
|
187 my $nextline = $self->_readline();
|
|
188 return undef if not defined $nextline;
|
|
189 $scoreline .= $nextline;
|
|
190 my ($score, $bits);
|
|
191 if ($scoreline =~ /\d bits\)/) {
|
|
192 ($score, $bits) = $scoreline =~
|
|
193 /Score = (\d+) \((\S+) bits\)/; # WU-BLAST
|
|
194 }
|
|
195 else {
|
|
196 ($bits, $score) = $scoreline =~
|
|
197 /Score =\s+(\S+) bits \((\d+)/; # NCBI-BLAST
|
|
198 }
|
|
199
|
|
200 my ($match, $hsplength) = ($scoreline =~ /Identities = (\d+)\/(\d+)/);
|
|
201 my ($positive) = ($scoreline =~ /Positives = (\d+)/);
|
|
202 my ($gaps) = ($scoreline =~ /Gaps = (\d+)/);
|
|
203 if($self->report_type() eq 'TBLASTX') {
|
|
204 ($qframe, $sframe) = $scoreline =~ /Frame =\s+([+-]\d)\s+\/\s+([+-]\d)/;
|
|
205 } elsif ($self->report_type() eq 'TBLASTN') {
|
|
206 ($sframe) = $scoreline =~ /Frame =\s+([+-]\d)/;
|
|
207 } else {
|
|
208 ($qframe) = $scoreline =~ /Frame =\s+([+-]\d)/;
|
|
209 }
|
|
210 $positive = $match if not defined $positive;
|
|
211 $gaps = '0' if not defined $gaps;
|
|
212 my ($p) = ($scoreline =~ /[Sum ]*P[\(\d+\)]* = (\S+)/);
|
|
213 unless (defined $p) {(undef, $p) = $scoreline =~ /Expect(\(\d+\))? =\s+(\S+)/}
|
|
214 my ($exp) = ($scoreline =~ /Expect(?:\(\d+\))? =\s+([^\s,]+)/);
|
|
215 $exp = -1 unless( defined $exp );
|
|
216
|
|
217 $self->throw("Unable to parse '$scoreline'") unless defined $score;
|
|
218
|
|
219 #######################
|
|
220 # get alignment lines #
|
|
221 #######################
|
|
222 my (@hspline);
|
|
223 while( defined($_ = $self->_readline()) ) {
|
|
224 if ($_ =~ /^WARNING:|^NOTE:/) {
|
|
225 while(defined($_ = $self->_readline())) {last if $_ !~ /\S/}
|
|
226 }
|
|
227 elsif ($_ !~ /\S/) {next}
|
|
228 elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data
|
|
229 elsif ($_ =~ /^\s*Strand/) {next} # NCBI-BLAST non-data
|
|
230 elsif ($_ =~ /^\s*Score/) {$self->_pushback($_); last}
|
|
231
|
|
232 elsif ($_ =~ /^>|^Histogram|^Searching|^Parameters|^\s+Database:|^CPU\stime|^\s*Lambda/)
|
|
233 {
|
|
234 #ps 5/28/01
|
|
235 # elsif ($_ =~ /^>|^Parameters|^\s+Database:|^CPU\stime/) {
|
|
236 $self->_pushback($_);
|
|
237
|
|
238 $self->{'HSP_ALL_PARSED'} = 1;
|
|
239 last;
|
|
240 }
|
|
241 elsif( $_ =~ /^\s*Frame/ ) {
|
|
242 if ($self->report_type() eq 'TBLASTX') {
|
|
243 ($qframe, $sframe) = $_ =~ /Frame = ([\+-]\d)\s+\/\s+([\+-]\d)/;
|
|
244 } elsif ($self->report_type() eq 'TBLASTN') {
|
|
245 ($sframe) = $_ =~ /Frame = ([\+-]\d)/;
|
|
246 } else {
|
|
247 ($qframe) = $_ =~ /Frame = ([\+-]\d)/;
|
|
248 }
|
|
249 }
|
|
250 else {
|
|
251 push @hspline, $_; # store the query line
|
|
252 $nextline = $self->_readline();
|
|
253 # Skip "pattern" line when parsing PHIBLAST reports, otherwise store the alignment line
|
|
254 my $l1 = ($nextline =~ /^\s*pattern/) ? $self->_readline() : $nextline;
|
|
255 push @hspline, $l1; # store the alignment line
|
|
256 my $l2 = $self->_readline(); push @hspline, $l2; # grab/store the sbjct line
|
|
257 }
|
|
258 }
|
|
259
|
|
260 #########################
|
|
261 # parse alignment lines #
|
|
262 #########################
|
|
263 my ($ql, $sl, $as) = ("", "", "");
|
|
264 my ($qb, $qe, $sb, $se) = (0,0,0,0);
|
|
265 my (@QL, @SL, @AS); # for better memory management
|
|
266
|
|
267 for(my $i=0;$i<@hspline;$i+=3) {
|
|
268 # warn $hspline[$i], $hspline[$i+2];
|
|
269 $hspline[$i] =~ /^(?:Query|Trans):\s+(\d+)\s*([\D\S]+)\s+(\d+)/;
|
|
270 $ql = $2; $qb = $1 unless $qb; $qe = $3;
|
|
271
|
|
272 my $offset = index($hspline[$i], $ql);
|
|
273 $as = substr($hspline[$i+1], $offset, CORE::length($ql));
|
|
274
|
|
275 $hspline[$i+2] =~ /^Sbjct:\s+(\d+)\s*([\D\S]+)\s+(\d+)/;
|
|
276 $sl = $2; $sb = $1 unless $sb; $se = $3;
|
|
277
|
|
278 push @QL, $ql; push @SL, $sl; push @AS, $as;
|
|
279 }
|
|
280
|
|
281 ##################
|
|
282 # the HSP object #
|
|
283 ##################
|
|
284 $ql = join("", @QL);
|
|
285 $sl = join("", @SL);
|
|
286 $as = join("", @AS);
|
|
287 # Query name and length are not in the report for a bl2seq report so {'PARENT'}->query and
|
|
288 # {'PARENT'}->qlength will not be available.
|
|
289 my ($qname, $qlength) = ('unknown','unknown');
|
|
290 if ($self->{'PARENT'}->can('query')) {
|
|
291 $qname = $self->{'PARENT'}->query;
|
|
292 $qlength = $self->{'PARENT'}->qlength;
|
|
293 }
|
|
294
|
|
295 my $hsp = new Bio::Tools::BPlite::HSP
|
|
296 ('-score' => $score,
|
|
297 '-bits' => $bits,
|
|
298 '-match' => $match,
|
|
299 '-positive' => $positive,
|
|
300 '-gaps' => $gaps,
|
|
301 '-hsplength' => $hsplength,
|
|
302 '-p' => $p,
|
|
303 '-exp' => $exp,
|
|
304 '-queryBegin' => $qb,
|
|
305 '-queryEnd' => $qe,
|
|
306 '-sbjctBegin' => $sb,
|
|
307 '-sbjctEnd' => $se,
|
|
308 '-querySeq' => $ql,
|
|
309 '-sbjctSeq' => $sl,
|
|
310 '-homologySeq'=> $as,
|
|
311 '-queryName' => $qname,
|
|
312 # '-queryName'=>$self->{'PARENT'}->query,
|
|
313 '-sbjctName' => $self->{'NAME'},
|
|
314 '-queryLength'=> $qlength,
|
|
315 # '-queryLength'=>$self->{'PARENT'}->qlength,
|
|
316 '-sbjctLength'=> $self->{'LENGTH'},
|
|
317 '-queryFrame' => $qframe,
|
|
318 '-sbjctFrame' => $sframe,
|
|
319 '-blastType' => $self->report_type());
|
|
320 return $hsp;
|
|
321 }
|
|
322
|
|
323 =head2 _readline
|
|
324
|
|
325 Title : _readline
|
|
326 Usage : $obj->_readline
|
|
327 Function: Reads a line of input.
|
|
328
|
|
329 Note that this method implicitely uses the value of $/ that is
|
|
330 in effect when called.
|
|
331
|
|
332 Note also that the current implementation does not handle pushed
|
|
333 back input correctly unless the pushed back input ends with the
|
|
334 value of $/.
|
|
335 Example :
|
|
336 Returns :
|
|
337
|
|
338 =cut
|
|
339
|
|
340 sub _readline{
|
|
341 my ($self) = @_;
|
|
342 return $self->{'PARENT'}->_readline();
|
|
343 }
|
|
344
|
|
345 =head2 _pushback
|
|
346
|
|
347 Title : _pushback
|
|
348 Usage : $obj->_pushback($newvalue)
|
|
349 Function: puts a line previously read with _readline back into a buffer
|
|
350 Example :
|
|
351 Returns :
|
|
352 Args : newvalue
|
|
353
|
|
354 =cut
|
|
355
|
|
356 sub _pushback {
|
|
357 my ($self, $arg) = @_;
|
|
358 return $self->{'PARENT'}->_pushback($arg);
|
|
359 }
|
|
360
|
|
361 1;
|