0
|
1 # $Id: BPpsilite.pm,v 1.22 2002/10/22 07:38:45 lapp Exp $
|
|
2 # Bioperl module Bio::Tools::BPpsilite
|
|
3 ############################################################
|
|
4 # based closely on the Bio::Tools::BPlite modules
|
|
5 # Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
|
|
6 # Lorenz Pollak (lorenz@ist.org, bioperl port)
|
|
7 #
|
|
8 #
|
|
9 # Copyright Peter Schattner
|
|
10 #
|
|
11 # You may distribute this module under the same terms as perl itself
|
|
12 # _history
|
|
13 # October 20, 2000
|
|
14 # POD documentation - main docs before the code
|
|
15
|
|
16 =head1 NAME
|
|
17
|
|
18 Bio::Tools::BPpsilite - Lightweight BLAST parser for (iterated) psiblast reports
|
|
19
|
|
20 =head1 SYNOPSIS
|
|
21
|
|
22 use Bio::Tools::BPpsilite;
|
|
23 open FH, "t/psiblastreport.out";
|
|
24 $report = Bio::Tools::BPpsilite->new(-fh=>\*FH);
|
|
25
|
|
26 # determine number of iterations executed by psiblast
|
|
27 $total_iterations = $report->number_of_iterations;
|
|
28 $last_iteration = $report->round($total_iterations);
|
|
29
|
|
30 # Process only hits found in last iteration ...
|
|
31 $oldhitarray_ref = $last_iteration->oldhits;
|
|
32 HIT: while($sbjct = $last_iteration->nextSbjct) {
|
|
33 $id = $sbjct->name;
|
|
34 $is_old = grep /\Q$id\E/, @$oldhitarray_ref;
|
|
35 if ($is_old ){next HIT;}
|
|
36 # do something with new hit...
|
|
37 }
|
|
38
|
|
39
|
|
40 =head1 DESCRIPTION
|
|
41
|
|
42 BPpsilite is a package for parsing multiple iteration PSIBLAST
|
|
43 reports. It is based closely on Ian Korf's BPlite.pm module for
|
|
44 parsing single iteration BLAST reports (as modified by Lorenz Pollak).
|
|
45
|
|
46 Two of the four basic objects of BPpsilite.pm are identical to the
|
|
47 corresponding objects in BPlite - the "HSP.pm" and "Sbjct.pm" objects.
|
|
48 This DESCRIPTION documents only the one new object, the "iteration",
|
|
49 as well as the additional methods that are implemented in BPpsilite
|
|
50 that are not in BPlite. See the BPlite documentation for information
|
|
51 on the BPlite, SBJCT and HSP objects.
|
|
52
|
|
53 The essential difference between PSIBLAST and the other BLAST programs
|
|
54 (in terms of report parsing) is that PSIBLAST performs multiple
|
|
55 iterations of the BLASTing of the database and the results of all of
|
|
56 these iterations are stored in a single PSIBLAST report. (For general
|
|
57 information on PSIBLAST see the README.bla file in the standalone
|
|
58 BLAST distribution and references therein). PSIBLAST's use of multiple
|
|
59 iterations imposes additional demands on the report parser: * There
|
|
60 are several iterations of hits. Many of those hits will be repeated
|
|
61 in more than one iteration. Often only the last iteration will be of
|
|
62 interest. * Each iteration will list two different kinds of hits -
|
|
63 repeated hits that were used in the model and newly identified hits -
|
|
64 which may need to be processed in different manners * The total number
|
|
65 of iterations performed is not displayed in the report until (almost)
|
|
66 the very end of the report. (The user can specify a maximum number of
|
|
67 iterations for the PSIBLAST search, but the program may perform fewer
|
|
68 iterations if convergence is reached)
|
|
69
|
|
70 BPpsilite addresses these issues by offering the following methods:
|
|
71
|
|
72 * The total number of iteration used is given by the method
|
|
73 number_of_iterations as in:
|
|
74
|
|
75 $total_iterations = $report->number_of_iterations;
|
|
76
|
|
77 * Results from an arbitrary iteration round can be accessed by using
|
|
78 the 'round' method:
|
|
79
|
|
80 $iteration3_report = $report->round(3);
|
|
81
|
|
82 * The ids of the sequences which passed the significance threshold for
|
|
83 the first time in the "nth" iteration can be identified by using the
|
|
84 newhits method. Previously identified hits are identified by using
|
|
85 the oldhits method, as in:
|
|
86
|
|
87 $oldhitarray_ref = $iteration3_report->oldhits;
|
|
88 $newhitarray_ref = $iteration3_report->newhits;
|
|
89
|
|
90 BPpsilite.pm should work equally well on reports generated by the
|
|
91 StandAloneBlast.pm local BLAST module as with reports generated by
|
|
92 remote psiblast searches. For examples of usage of BPpsilite.pm, the
|
|
93 user is referred to the BPpsilite.t script in the "t" directory.
|
|
94
|
|
95 =head1 FEEDBACK
|
|
96
|
|
97 =head2 Mailing Lists
|
|
98
|
|
99 User feedback is an integral part of the evolution of this and other
|
|
100 Bioperl modules. Send your comments and suggestions preferably to one
|
|
101 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
102
|
|
103 bioperl-l@bioperl.org - General discussion
|
|
104 http://bio.perl.org/MailList.html - About the mailing lists
|
|
105
|
|
106 =head2 Reporting Bugs
|
|
107
|
|
108 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
109 the bugs and their resolution. Bug reports can be submitted via email
|
|
110 or the web:
|
|
111
|
|
112 bioperl-bugs@bio.perl.org
|
|
113 http://bugzilla.bioperl.org/
|
|
114
|
|
115 =head1 AUTHOR - Peter Schattner
|
|
116
|
|
117 Email: schattner@alum.mit.edu
|
|
118
|
|
119 =head1 CONTRIBUTORS
|
|
120
|
|
121 Jason Stajich, jason@cgt.mc.duke.edu
|
|
122
|
|
123 =head1 ACKNOWLEDGEMENTS
|
|
124
|
|
125 Based on work of:
|
|
126 Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
|
|
127 Lorenz Pollak (lorenz@ist.org, bioperl port)
|
|
128
|
|
129 =head1 COPYRIGHT
|
|
130
|
|
131 BPlite.pm is copyright (C) 1999 by Ian Korf.
|
|
132
|
|
133 =head1 DISCLAIMER
|
|
134
|
|
135 This software is provided "as is" without warranty of any kind.
|
|
136
|
|
137 =cut
|
|
138
|
|
139 package Bio::Tools::BPpsilite;
|
|
140
|
|
141 use strict;
|
|
142 use vars qw(@ISA);
|
|
143 use Bio::Tools::BPlite::Iteration; #
|
|
144 use Bio::Tools::BPlite::Sbjct; # Debug code
|
|
145 use Bio::Root::Root; # root interface to inherit from
|
|
146 use Bio::Root::IO;
|
|
147 use Bio::Tools::BPlite;
|
|
148
|
|
149 @ISA = qw(Bio::Root::Root Bio::Root::IO);
|
|
150
|
|
151 sub new {
|
|
152 my ($class, @args) = @_;
|
|
153 my $self = $class->SUPER::new(@args);
|
|
154
|
|
155 # initialize IO
|
|
156 $self->_initialize_io(@args);
|
|
157 $self->{'_tempdir'} = $self->tempdir('CLEANUP' => 1);
|
|
158 $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST
|
|
159 $self->{'NEXT_ITERATION_NUMBER'} = 1;
|
|
160 $self->{'TOTAL_ITERATION_NUMBER'} = -1; # -1 indicates preprocessing not yet done
|
|
161
|
|
162 if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments
|
|
163 else {$self->{'REPORT_DONE'} = 1} # empty report
|
|
164
|
|
165 return $self; # success - we hope!
|
|
166 }
|
|
167
|
|
168 =head2 query
|
|
169
|
|
170 Title : query
|
|
171 Usage : $query = $obj->query();
|
|
172 Function : returns the query object
|
|
173 Returns : query object
|
|
174 Args :
|
|
175
|
|
176 =cut
|
|
177
|
|
178 sub query {shift->{'QUERY'}}
|
|
179
|
|
180 =head2 qlength
|
|
181
|
|
182 Title : qlength
|
|
183 Usage : $len = $obj->qlength();
|
|
184 Function : returns the length of the query
|
|
185 Returns : length of query
|
|
186 Args :
|
|
187
|
|
188 =cut
|
|
189
|
|
190 sub qlength {shift->{'LENGTH'}}
|
|
191
|
|
192 =head2 database
|
|
193
|
|
194 Title : database
|
|
195 Usage : $db = $obj->database();
|
|
196 Function : returns the database used in this search
|
|
197 Returns : database used for search
|
|
198 Args :
|
|
199
|
|
200 =cut
|
|
201
|
|
202 sub database {shift->{'DATABASE'}}
|
|
203
|
|
204 =head2 number_of_iterations
|
|
205
|
|
206 Title : number_of_iterations
|
|
207 Usage : $total_iterations = $obj-> number_of_iterations();
|
|
208 Function : returns the total number of iterations used in this search
|
|
209 Returns : total number of iterations used for search
|
|
210 Args : none
|
|
211
|
|
212 =cut
|
|
213
|
|
214
|
|
215 =head2 pattern
|
|
216
|
|
217 Title : database
|
|
218 Usage : $pattern = $obj->pattern();
|
|
219 Function : returns the pattern used in a PHIBLAST search
|
|
220
|
|
221 =cut
|
|
222
|
|
223 sub pattern {shift->{'PATTERN'}}
|
|
224
|
|
225 =head2 query_pattern_location
|
|
226
|
|
227 Title : query_pattern_location
|
|
228 Usage : $qpl = $obj->query_pattern_location();
|
|
229 Function : returns reference to array of locations in the query sequence
|
|
230 of pattern used in a PHIBLAST search
|
|
231
|
|
232 =cut
|
|
233
|
|
234 sub query_pattern_location {shift->{'QPATLOCATION'}}
|
|
235
|
|
236
|
|
237
|
|
238
|
|
239 sub number_of_iterations {
|
|
240 my $self = shift;
|
|
241 if ($self->{'TOTAL_ITERATION_NUMBER'} == -1){&_preprocess($self);}
|
|
242 $self->{'TOTAL_ITERATION_NUMBER'};
|
|
243 }
|
|
244
|
|
245 =head2 round
|
|
246
|
|
247 Title : round
|
|
248 Usage : $Iteration3 = $report->round(3);
|
|
249 Function : Method of retrieving data from a specific iteration
|
|
250 Example :
|
|
251 Returns : reference to requested Iteration object or null if argument
|
|
252 is greater than total number of iterations
|
|
253 Args : number of the requested iteration
|
|
254
|
|
255 =cut
|
|
256
|
|
257 sub round {
|
|
258 my $self = shift;
|
|
259 my $iter_num = shift;
|
|
260 $self->_initialize_io(-file => Bio::Root::IO->catfile
|
|
261 ($self->{'_tempdir'},"iteration".$iter_num.".tmp"));
|
|
262 if( ! $self->_fh ) {
|
|
263 $self->throw("unable to re-open iteration file for round ".$iter_num);
|
|
264 }
|
|
265 return Bio::Tools::BPlite::Iteration->new(-round=>$iter_num,
|
|
266 -parent=>$self);
|
|
267 }
|
|
268
|
|
269 # begin private routines
|
|
270
|
|
271 sub _parseHeader {
|
|
272 my ($self) = @_;
|
|
273
|
|
274
|
|
275 while(defined ($_ = $self->_readline) ) {
|
|
276 if ($_ =~ /^Query=\s+([^\(]+)/) {
|
|
277 my $query = $1;
|
|
278 while(defined ($_ = $self->_readline)) {
|
|
279 last if $_ !~ /\S/;
|
|
280 $query .= $_;
|
|
281 }
|
|
282 $query =~ s/\s+/ /g;
|
|
283 $query =~ s/^>//;
|
|
284 $query =~ /\((\d+)\s+\S+\)\s*$/;
|
|
285 my $length = $1;
|
|
286 $self->{'QUERY'} = $query;
|
|
287 $self->{'LENGTH'} = $length;
|
|
288 }
|
|
289 elsif ($_ =~ /^Database:\s+(.+)/) {$self->{'DATABASE'} = $1}
|
|
290 elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/)
|
|
291 { # For PHIBLAST reports
|
|
292 $self->{'PATTERN'} = $1;
|
|
293 push (@{$self->{'QPATLOCATION'}}, $2);
|
|
294 } elsif ($_ =~ /^>|^Results from round 1/) {
|
|
295 $self->_pushback($_);
|
|
296 return 1;
|
|
297 } elsif ($_ =~ /^Parameters|^\s+Database:/) {
|
|
298 $self->_pushback($_);
|
|
299 return 0; # there's nothing in the report
|
|
300 }
|
|
301 }
|
|
302 }
|
|
303
|
|
304 =head2 _preprocess
|
|
305
|
|
306 Title : _preprocess
|
|
307 Usage : internal routine, not called directly
|
|
308 Function : determines number of iterations in report and prepares
|
|
309 data so individual iterations canbe parsed in non-sequential
|
|
310 order
|
|
311 Example :
|
|
312 Returns : nothing. Sets TOTAL_ITERATION_NUMBER in object's hash
|
|
313 Args : reference to calling object
|
|
314
|
|
315 =cut
|
|
316
|
|
317 #'
|
|
318 sub _preprocess {
|
|
319 my $self = shift;
|
|
320 # $self->throw(" PSIBLAST report preprocessing not implemented yet!");
|
|
321
|
|
322 my $oldround = 0;
|
|
323 my ($currentline, $currentfile, $round);
|
|
324
|
|
325 # open output file for data from iteration round #1
|
|
326 $round = 1;
|
|
327 $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'},
|
|
328 "iteration$round.tmp");
|
|
329 open (FILEHANDLE, ">$currentfile") ||
|
|
330 $self->throw("cannot open filehandle to write to file $currentfile");
|
|
331
|
|
332 while(defined ($currentline = $self->_readline()) ) {
|
|
333 if ($currentline =~ /^Results from round\s+(\d+)/) {
|
|
334 if ($oldround) { close (FILEHANDLE) ;}
|
|
335 $round = $1;
|
|
336 $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'},
|
|
337 "iteration$round.tmp");
|
|
338
|
|
339 close FILEHANDLE;
|
|
340 open (FILEHANDLE, ">$currentfile") ||
|
|
341 $self->throw("cannot open filehandle to write to file $currentfile");
|
|
342 $oldround = $round;
|
|
343 }elsif ($currentline =~ /CONVERGED/){ # This is a fix for psiblast parsing with -m 6 /AE
|
|
344 $round--;
|
|
345 }
|
|
346 print FILEHANDLE $currentline ;
|
|
347
|
|
348 }
|
|
349 $self->{'TOTAL_ITERATION_NUMBER'}= $round;
|
|
350 # It is necessary to close filehandle otherwise the whole
|
|
351 # file will not be read later !!
|
|
352 close FILEHANDLE;
|
|
353 }
|
|
354
|
|
355 1;
|
|
356
|
|
357 __END__
|