comparison variant_effect_predictor/Bio/Tools/BPpsilite.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: BPpsilite.pm,v 1.22 2002/10/22 07:38:45 lapp Exp $
2 # Bioperl module Bio::Tools::BPpsilite
3 ############################################################
4 # based closely on the Bio::Tools::BPlite modules
5 # Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
6 # Lorenz Pollak (lorenz@ist.org, bioperl port)
7 #
8 #
9 # Copyright Peter Schattner
10 #
11 # You may distribute this module under the same terms as perl itself
12 # _history
13 # October 20, 2000
14 # POD documentation - main docs before the code
15
16 =head1 NAME
17
18 Bio::Tools::BPpsilite - Lightweight BLAST parser for (iterated) psiblast reports
19
20 =head1 SYNOPSIS
21
22 use Bio::Tools::BPpsilite;
23 open FH, "t/psiblastreport.out";
24 $report = Bio::Tools::BPpsilite->new(-fh=>\*FH);
25
26 # determine number of iterations executed by psiblast
27 $total_iterations = $report->number_of_iterations;
28 $last_iteration = $report->round($total_iterations);
29
30 # Process only hits found in last iteration ...
31 $oldhitarray_ref = $last_iteration->oldhits;
32 HIT: while($sbjct = $last_iteration->nextSbjct) {
33 $id = $sbjct->name;
34 $is_old = grep /\Q$id\E/, @$oldhitarray_ref;
35 if ($is_old ){next HIT;}
36 # do something with new hit...
37 }
38
39
40 =head1 DESCRIPTION
41
42 BPpsilite is a package for parsing multiple iteration PSIBLAST
43 reports. It is based closely on Ian Korf's BPlite.pm module for
44 parsing single iteration BLAST reports (as modified by Lorenz Pollak).
45
46 Two of the four basic objects of BPpsilite.pm are identical to the
47 corresponding objects in BPlite - the "HSP.pm" and "Sbjct.pm" objects.
48 This DESCRIPTION documents only the one new object, the "iteration",
49 as well as the additional methods that are implemented in BPpsilite
50 that are not in BPlite. See the BPlite documentation for information
51 on the BPlite, SBJCT and HSP objects.
52
53 The essential difference between PSIBLAST and the other BLAST programs
54 (in terms of report parsing) is that PSIBLAST performs multiple
55 iterations of the BLASTing of the database and the results of all of
56 these iterations are stored in a single PSIBLAST report. (For general
57 information on PSIBLAST see the README.bla file in the standalone
58 BLAST distribution and references therein). PSIBLAST's use of multiple
59 iterations imposes additional demands on the report parser: * There
60 are several iterations of hits. Many of those hits will be repeated
61 in more than one iteration. Often only the last iteration will be of
62 interest. * Each iteration will list two different kinds of hits -
63 repeated hits that were used in the model and newly identified hits -
64 which may need to be processed in different manners * The total number
65 of iterations performed is not displayed in the report until (almost)
66 the very end of the report. (The user can specify a maximum number of
67 iterations for the PSIBLAST search, but the program may perform fewer
68 iterations if convergence is reached)
69
70 BPpsilite addresses these issues by offering the following methods:
71
72 * The total number of iteration used is given by the method
73 number_of_iterations as in:
74
75 $total_iterations = $report->number_of_iterations;
76
77 * Results from an arbitrary iteration round can be accessed by using
78 the 'round' method:
79
80 $iteration3_report = $report->round(3);
81
82 * The ids of the sequences which passed the significance threshold for
83 the first time in the "nth" iteration can be identified by using the
84 newhits method. Previously identified hits are identified by using
85 the oldhits method, as in:
86
87 $oldhitarray_ref = $iteration3_report->oldhits;
88 $newhitarray_ref = $iteration3_report->newhits;
89
90 BPpsilite.pm should work equally well on reports generated by the
91 StandAloneBlast.pm local BLAST module as with reports generated by
92 remote psiblast searches. For examples of usage of BPpsilite.pm, the
93 user is referred to the BPpsilite.t script in the "t" directory.
94
95 =head1 FEEDBACK
96
97 =head2 Mailing Lists
98
99 User feedback is an integral part of the evolution of this and other
100 Bioperl modules. Send your comments and suggestions preferably to one
101 of the Bioperl mailing lists. Your participation is much appreciated.
102
103 bioperl-l@bioperl.org - General discussion
104 http://bio.perl.org/MailList.html - About the mailing lists
105
106 =head2 Reporting Bugs
107
108 Report bugs to the Bioperl bug tracking system to help us keep track
109 the bugs and their resolution. Bug reports can be submitted via email
110 or the web:
111
112 bioperl-bugs@bio.perl.org
113 http://bugzilla.bioperl.org/
114
115 =head1 AUTHOR - Peter Schattner
116
117 Email: schattner@alum.mit.edu
118
119 =head1 CONTRIBUTORS
120
121 Jason Stajich, jason@cgt.mc.duke.edu
122
123 =head1 ACKNOWLEDGEMENTS
124
125 Based on work of:
126 Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
127 Lorenz Pollak (lorenz@ist.org, bioperl port)
128
129 =head1 COPYRIGHT
130
131 BPlite.pm is copyright (C) 1999 by Ian Korf.
132
133 =head1 DISCLAIMER
134
135 This software is provided "as is" without warranty of any kind.
136
137 =cut
138
139 package Bio::Tools::BPpsilite;
140
141 use strict;
142 use vars qw(@ISA);
143 use Bio::Tools::BPlite::Iteration; #
144 use Bio::Tools::BPlite::Sbjct; # Debug code
145 use Bio::Root::Root; # root interface to inherit from
146 use Bio::Root::IO;
147 use Bio::Tools::BPlite;
148
149 @ISA = qw(Bio::Root::Root Bio::Root::IO);
150
151 sub new {
152 my ($class, @args) = @_;
153 my $self = $class->SUPER::new(@args);
154
155 # initialize IO
156 $self->_initialize_io(@args);
157 $self->{'_tempdir'} = $self->tempdir('CLEANUP' => 1);
158 $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST
159 $self->{'NEXT_ITERATION_NUMBER'} = 1;
160 $self->{'TOTAL_ITERATION_NUMBER'} = -1; # -1 indicates preprocessing not yet done
161
162 if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments
163 else {$self->{'REPORT_DONE'} = 1} # empty report
164
165 return $self; # success - we hope!
166 }
167
168 =head2 query
169
170 Title : query
171 Usage : $query = $obj->query();
172 Function : returns the query object
173 Returns : query object
174 Args :
175
176 =cut
177
178 sub query {shift->{'QUERY'}}
179
180 =head2 qlength
181
182 Title : qlength
183 Usage : $len = $obj->qlength();
184 Function : returns the length of the query
185 Returns : length of query
186 Args :
187
188 =cut
189
190 sub qlength {shift->{'LENGTH'}}
191
192 =head2 database
193
194 Title : database
195 Usage : $db = $obj->database();
196 Function : returns the database used in this search
197 Returns : database used for search
198 Args :
199
200 =cut
201
202 sub database {shift->{'DATABASE'}}
203
204 =head2 number_of_iterations
205
206 Title : number_of_iterations
207 Usage : $total_iterations = $obj-> number_of_iterations();
208 Function : returns the total number of iterations used in this search
209 Returns : total number of iterations used for search
210 Args : none
211
212 =cut
213
214
215 =head2 pattern
216
217 Title : database
218 Usage : $pattern = $obj->pattern();
219 Function : returns the pattern used in a PHIBLAST search
220
221 =cut
222
223 sub pattern {shift->{'PATTERN'}}
224
225 =head2 query_pattern_location
226
227 Title : query_pattern_location
228 Usage : $qpl = $obj->query_pattern_location();
229 Function : returns reference to array of locations in the query sequence
230 of pattern used in a PHIBLAST search
231
232 =cut
233
234 sub query_pattern_location {shift->{'QPATLOCATION'}}
235
236
237
238
239 sub number_of_iterations {
240 my $self = shift;
241 if ($self->{'TOTAL_ITERATION_NUMBER'} == -1){&_preprocess($self);}
242 $self->{'TOTAL_ITERATION_NUMBER'};
243 }
244
245 =head2 round
246
247 Title : round
248 Usage : $Iteration3 = $report->round(3);
249 Function : Method of retrieving data from a specific iteration
250 Example :
251 Returns : reference to requested Iteration object or null if argument
252 is greater than total number of iterations
253 Args : number of the requested iteration
254
255 =cut
256
257 sub round {
258 my $self = shift;
259 my $iter_num = shift;
260 $self->_initialize_io(-file => Bio::Root::IO->catfile
261 ($self->{'_tempdir'},"iteration".$iter_num.".tmp"));
262 if( ! $self->_fh ) {
263 $self->throw("unable to re-open iteration file for round ".$iter_num);
264 }
265 return Bio::Tools::BPlite::Iteration->new(-round=>$iter_num,
266 -parent=>$self);
267 }
268
269 # begin private routines
270
271 sub _parseHeader {
272 my ($self) = @_;
273
274
275 while(defined ($_ = $self->_readline) ) {
276 if ($_ =~ /^Query=\s+([^\(]+)/) {
277 my $query = $1;
278 while(defined ($_ = $self->_readline)) {
279 last if $_ !~ /\S/;
280 $query .= $_;
281 }
282 $query =~ s/\s+/ /g;
283 $query =~ s/^>//;
284 $query =~ /\((\d+)\s+\S+\)\s*$/;
285 my $length = $1;
286 $self->{'QUERY'} = $query;
287 $self->{'LENGTH'} = $length;
288 }
289 elsif ($_ =~ /^Database:\s+(.+)/) {$self->{'DATABASE'} = $1}
290 elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/)
291 { # For PHIBLAST reports
292 $self->{'PATTERN'} = $1;
293 push (@{$self->{'QPATLOCATION'}}, $2);
294 } elsif ($_ =~ /^>|^Results from round 1/) {
295 $self->_pushback($_);
296 return 1;
297 } elsif ($_ =~ /^Parameters|^\s+Database:/) {
298 $self->_pushback($_);
299 return 0; # there's nothing in the report
300 }
301 }
302 }
303
304 =head2 _preprocess
305
306 Title : _preprocess
307 Usage : internal routine, not called directly
308 Function : determines number of iterations in report and prepares
309 data so individual iterations canbe parsed in non-sequential
310 order
311 Example :
312 Returns : nothing. Sets TOTAL_ITERATION_NUMBER in object's hash
313 Args : reference to calling object
314
315 =cut
316
317 #'
318 sub _preprocess {
319 my $self = shift;
320 # $self->throw(" PSIBLAST report preprocessing not implemented yet!");
321
322 my $oldround = 0;
323 my ($currentline, $currentfile, $round);
324
325 # open output file for data from iteration round #1
326 $round = 1;
327 $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'},
328 "iteration$round.tmp");
329 open (FILEHANDLE, ">$currentfile") ||
330 $self->throw("cannot open filehandle to write to file $currentfile");
331
332 while(defined ($currentline = $self->_readline()) ) {
333 if ($currentline =~ /^Results from round\s+(\d+)/) {
334 if ($oldround) { close (FILEHANDLE) ;}
335 $round = $1;
336 $currentfile = Bio::Root::IO->catfile($self->{'_tempdir'},
337 "iteration$round.tmp");
338
339 close FILEHANDLE;
340 open (FILEHANDLE, ">$currentfile") ||
341 $self->throw("cannot open filehandle to write to file $currentfile");
342 $oldround = $round;
343 }elsif ($currentline =~ /CONVERGED/){ # This is a fix for psiblast parsing with -m 6 /AE
344 $round--;
345 }
346 print FILEHANDLE $currentline ;
347
348 }
349 $self->{'TOTAL_ITERATION_NUMBER'}= $round;
350 # It is necessary to close filehandle otherwise the whole
351 # file will not be read later !!
352 close FILEHANDLE;
353 }
354
355 1;
356
357 __END__