0
|
1 # $Id: Iteration.pm,v 1.15 2002/06/19 00:27:49 jason Exp $
|
|
2 # Bioperl module Bio::Tools::BPlite::Iteration
|
|
3 # based closely on the Bio::Tools::BPlite modules
|
|
4 # Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
|
|
5 # Lorenz Pollak (lorenz@ist.org, bioperl port)
|
|
6 #
|
|
7 # Copyright Peter Schattner
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10 # _history
|
|
11 # October 20, 2000
|
|
12 # POD documentation - main docs before the code
|
|
13 #
|
|
14 # Added to get a simple_align object for a psiblast run with the -m 6 flag /AE
|
|
15 #
|
|
16
|
|
17 =head1 NAME
|
|
18
|
|
19 Bio::Tools::BPlite::Iteration - object for parsing single iteration
|
|
20 of a PSIBLAST report
|
|
21
|
|
22 =head1 SYNOPSIS
|
|
23
|
|
24 use Bio::Tools:: BPpsilite;
|
|
25
|
|
26 open FH, "t/psiblastreport.out";
|
|
27 $report = Bio::Tools::BPpsilite->new(-fh=>\*FH);
|
|
28
|
|
29 # determine number of iterations executed by psiblast
|
|
30 $total_iterations = $report->number_of_iterations;
|
|
31 $last_iteration = $report->round($total_iterations);
|
|
32
|
|
33 # Process only hits found in last iteration ...
|
|
34 $oldhitarray_ref = $last_iteration->oldhits;
|
|
35 HIT: while($sbjct = $last_iteration->nextSbjct) {
|
|
36 $id = $sbjct->name;
|
|
37 $is_old = grep /\Q$id\E/, @$oldhitarray_ref;
|
|
38 if ($is_old ){next HIT;}
|
|
39 # do something with new hit...
|
|
40 }
|
|
41
|
|
42 =head2 ALIGNMENTS
|
|
43
|
|
44 # This assumed that you have $db pointing to a database, $out to an output file
|
|
45 # $slxdir to a directory and $psiout
|
|
46 # note the alignments can only be obtained if the flag "-m 6" is run.
|
|
47 # It might also be necessary to use the flag -v to get all alignments
|
|
48 #
|
|
49 my @psiparams = ('database' => $db , 'output' => $out, 'j' => 3, 'm' => 6,
|
|
50 'h' => 1.e-3 , 'F' => 'T' , 'Q' => $psiout );
|
|
51 my $factory = Bio::Tools::Run::StandAloneBlast->new(@psiparams);
|
|
52 my $report = $factory->blastpgp($seq);
|
|
53 my $total_iterations = $report->number_of_iterations();
|
|
54 my $last_iteration = $report->round($total_iterations);
|
|
55 my $align=$last_iteration->Align;
|
|
56 my $slxfile=$slxdir.$id.".slx";
|
|
57 my $slx = Bio::AlignIO->new('-format' => 'selex','-file' => ">".$slxfile );
|
|
58 $slx->write_aln($align);
|
|
59
|
|
60 =head1 DESCRIPTION
|
|
61
|
|
62 See the documentation for BPpsilite.pm for a description of the
|
|
63 Iteration.pm module.
|
|
64
|
|
65 =head1 AUTHORS - Peter Schattner
|
|
66
|
|
67 Email: schattner@alum.mit.edu
|
|
68
|
|
69 =head1 CONTRIBUTORS
|
|
70
|
|
71 Jason Stajich, jason@cgt.mc.duke.edu
|
|
72
|
|
73 =head1 ACKNOWLEDGEMENTS
|
|
74
|
|
75 Based on work of:
|
|
76 Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
|
|
77 Lorenz Pollak (lorenz@ist.org, bioperl port)
|
|
78
|
|
79 =head1 COPYRIGHT
|
|
80
|
|
81 BPlite.pm is copyright (C) 1999 by Ian Korf.
|
|
82
|
|
83 =head1 DISCLAIMER
|
|
84
|
|
85 This software is provided "as is" without warranty of any kind.
|
|
86
|
|
87 =cut
|
|
88
|
|
89 package Bio::Tools::BPlite::Iteration;
|
|
90
|
|
91 use strict;
|
|
92 use vars qw(@ISA);
|
|
93 use Bio::Root::Root; # root object to inherit from
|
|
94 use Bio::Tools::BPlite; #
|
|
95 use Bio::Tools::BPlite::Sbjct;
|
|
96
|
|
97 @ISA = qw(Bio::Root::Root);
|
|
98
|
|
99 sub new {
|
|
100 my ($class, @args) = @_;
|
|
101 my $self = $class->SUPER::new(@args);
|
|
102
|
|
103 ($self->{'PARENT'},$self->{'ROUND'}) =
|
|
104 $self->_rearrange([qw(PARENT
|
|
105 ROUND
|
|
106 )],@args);
|
|
107
|
|
108 $self->{'QUERY'} = $self->{'PARENT'}->{'QUERY'};
|
|
109 $self->{'LENGTH'} = $self->{'PARENT'}->{'LENGTH'};
|
|
110
|
|
111 if($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments
|
|
112 else {$self->{'REPORT_DONE'} = 1} # empty report
|
|
113
|
|
114 return $self; # success - we hope!
|
|
115 }
|
|
116
|
|
117 =head2 query
|
|
118
|
|
119 Title : query
|
|
120 Usage : $query = $obj->query();
|
|
121 Function : returns the query object
|
|
122 Example :
|
|
123 Returns : query object
|
|
124 Args :
|
|
125
|
|
126 =cut
|
|
127
|
|
128 sub query {shift->{'QUERY'}}
|
|
129
|
|
130 =head2 qlength
|
|
131
|
|
132 Title : qlength
|
|
133 Usage : $len = $obj->qlength();
|
|
134 Returns : length of query
|
|
135 Args : none
|
|
136
|
|
137 =cut
|
|
138
|
|
139 sub qlength {shift->{'LENGTH'}}
|
|
140
|
|
141 =head2 newhits
|
|
142
|
|
143 Title : newhits
|
|
144 Usage : $newhits = $obj->newhits();
|
|
145 Returns : reference to an array listing all the hits
|
|
146 from the current iteration which were not identified
|
|
147 in the previous iteration
|
|
148 Args : none
|
|
149
|
|
150 =cut
|
|
151
|
|
152 sub newhits {shift->{'NEWHITS'}}
|
|
153
|
|
154 =head2 oldhits
|
|
155
|
|
156 Title : oldhits
|
|
157 Usage : $oldhits = $obj->oldhits();
|
|
158 Returns : reference to an array listing all the hits from
|
|
159 the current iteration which were identified and
|
|
160 above threshold in the previous iteration
|
|
161 Args : none
|
|
162
|
|
163 =cut
|
|
164
|
|
165 sub oldhits {shift->{'OLDHITS'}}
|
|
166
|
|
167
|
|
168 =head2 nextSbjct
|
|
169
|
|
170 Title : nextSbjct
|
|
171 Usage : $sbjct = $obj->nextSbjct();
|
|
172 Function : Method of iterating through all the Sbjct retrieved
|
|
173 from parsing the report
|
|
174 #Example : while ( my $sbjct = $obj->nextSbjct ) {}
|
|
175 Returns : next Sbjct object or undef if finished
|
|
176 Args :
|
|
177
|
|
178 =cut
|
|
179
|
|
180 sub nextSbjct {
|
|
181 my ($self) = @_;
|
|
182 $self->_fastForward or return undef;
|
|
183
|
|
184 #######################
|
|
185 # get all sbjct lines #
|
|
186 #######################
|
|
187 my $def = $self->_readline();
|
|
188
|
|
189 while(defined ($_ = $self->_readline) ) {
|
|
190 if ($_ !~ /\w/) {next}
|
|
191 elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data
|
|
192 elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback( $_); last}
|
|
193 elsif ($_ =~ /^(\d+) .* \d+$/) { # This is not correct at all
|
|
194 $self->_pushback($_); # 1: HSP does not work for -m 6 flag
|
|
195 $def = $1; # 2: length/name are incorrect
|
|
196 my $length = undef; # 3: Names are repeated many times.
|
|
197 my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def,
|
|
198 '-length'=>$length,
|
|
199 '-parent'=>$self);
|
|
200 return $sbjct;
|
|
201 } # m-6
|
|
202 elsif ($_ =~ /^Parameters|^\s+Database:|^\s+Posted date:/) {
|
|
203 $self->_pushback( $_);
|
|
204 last;
|
|
205 } else {$def .= $_}
|
|
206 }
|
|
207 $def =~ s/\s+/ /g;
|
|
208 $def =~ s/\s+$//g;
|
|
209 $def =~ s/Length = ([\d,]+)$//g;
|
|
210 my $length = $1;
|
|
211 return 0 unless $def =~ /^>/;
|
|
212 $def =~ s/^>//;
|
|
213
|
|
214 ####################
|
|
215 # the Sbjct object #
|
|
216 ####################
|
|
217 my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def,
|
|
218 '-length'=>$length,
|
|
219 '-parent'=>$self);
|
|
220 return $sbjct;
|
|
221 }
|
|
222
|
|
223
|
|
224 # This is added by /AE
|
|
225
|
|
226 =head2 Align
|
|
227
|
|
228 Title : Align
|
|
229 Usage : $SimpleAlign = $obj->Align();
|
|
230 Function : Method to obtain a simpleAlign object from psiblast
|
|
231 Example : $SimpleAlign = $obj->Align();
|
|
232 Returns : SimpleAlign object or undef if not found.
|
|
233 BUG : Only works if psiblast has been run with m 6 flag
|
|
234 Args :
|
|
235
|
|
236 =cut
|
|
237
|
|
238 sub Align {
|
|
239 use Bio::SimpleAlign;
|
|
240 my ($self) = @_;
|
|
241 $self->_fastForward or return undef;
|
|
242 my $lastline = $self->_readline();
|
|
243 return undef unless $lastline =~ /^QUERY/; # If psiblast not run correctly
|
|
244 my (%sequence,%first,%last,$num);
|
|
245
|
|
246 if ( $lastline =~ /^QUERY\s+(\d*)\s*([-\w]+)\s*(\d*)\s*$/){
|
|
247 my $name='QUERY';
|
|
248 my $start=$1;
|
|
249 my $seq=$2;
|
|
250 my $stop=$3;
|
|
251 $seq =~ s/-/\./g;
|
|
252 $start =~ s/ //g;
|
|
253 $stop =~ s/ //g;
|
|
254 $sequence{$name} .= $seq;
|
|
255 if ($first{$name} eq ''){$first{$name}=$start;}
|
|
256 if ($stop ne ''){$last{$name}=$stop;}
|
|
257 # print "FOUND:\t$seq\t$start\t$stop\n";
|
|
258 $num=0;
|
|
259 }
|
|
260 while(defined($_ = $self->_readline()) ){
|
|
261 chomp($_);
|
|
262 if ( $_ =~ /^QUERY\s+(\d+)\s*([\-A-Z]+)\s*(\+)\s*$/){
|
|
263 my $name='QUERY';
|
|
264 my $start=$1;
|
|
265 my $seq=$2;
|
|
266 my $stop=$3;
|
|
267 $seq =~ s/-/\./g;
|
|
268 $start =~ s/ //g;
|
|
269 $stop =~ s/ //g;
|
|
270 $sequence{$name} .= $seq;
|
|
271 if ($first{$name} eq '') { $first{$name} = $start;}
|
|
272 if ($stop ne '') { $last{$name}=$stop;}
|
|
273 $num=0;
|
|
274 } elsif ( $_ =~ /^(\d+)\s+(\d+)\s*([\-A-Z]+)\s*(\d+)\s*$/ ){
|
|
275 my $name=$1.".".$num;
|
|
276 my $start=$2;
|
|
277 my $seq=$3;
|
|
278 my $stop=$4;
|
|
279 $seq =~ s/-/\./g;
|
|
280 $start =~ s/ //g;
|
|
281 $stop =~ s/ //g;
|
|
282 $sequence{$name} .= $seq;
|
|
283 if ($first{$name} eq ''){$first{$name}=$start;}
|
|
284 if ($stop ne ''){$last{$name}=$stop;}
|
|
285 $num++;
|
|
286 }
|
|
287 }
|
|
288 my $align = new Bio::SimpleAlign();
|
|
289 my @keys=sort keys(%sequence);
|
|
290 foreach my $name (@keys){
|
|
291 my $nse = $name."/".$first{$name}."-".$last{$name};
|
|
292 my $seqobj = Bio::LocatableSeq->new( -seq => $sequence{$name},
|
|
293 -id => $name,
|
|
294 -name => $nse,
|
|
295 -start => $first{$name},
|
|
296 -end => $last{$name}
|
|
297 );
|
|
298
|
|
299 $align->add_seq($seqobj);
|
|
300 }
|
|
301 return $align;
|
|
302 }
|
|
303
|
|
304 # Start of internal subroutines.
|
|
305
|
|
306 sub _parseHeader {
|
|
307 my ($self) = @_;
|
|
308 my (@old_hits, @new_hits);
|
|
309
|
|
310 my $newhits_true = ($self->{'ROUND'} < 2) ? 1 : 0 ;
|
|
311 while(defined($_ = $self->_readline()) ) {
|
|
312 if ($_ =~ /(\w\w|.*|\w+.*)\s\s+(\d+)\s+([-\.e\d]+)$/) {
|
|
313 my $id = $1;
|
|
314 my $score= $2; #not used currently
|
|
315 my $evalue= $3; #not used currently
|
|
316 if ($newhits_true) { push ( @new_hits, $id);}
|
|
317 else { push (@old_hits, $id);}
|
|
318 }
|
|
319 elsif ($_ =~ /^Sequences not found previously/) {$newhits_true = 1 ;}
|
|
320 # This is changed for "-m 6" option /AE
|
|
321 elsif ($_ =~ /^>/ || $_ =~ /^QUERY/)
|
|
322 {
|
|
323 $self->_pushback($_);
|
|
324 $self->{'OLDHITS'} = \@old_hits;
|
|
325 $self->{'NEWHITS'} = \@new_hits;
|
|
326 return 1;
|
|
327 }
|
|
328 elsif ($_ =~ /^Parameters|^\s+Database:|^\s*Results from round\s+(d+)/) {
|
|
329 $self->_pushback($_);
|
|
330 return 0; # no sequences found in this iteration
|
|
331 }
|
|
332 }
|
|
333 return 0; # no sequences found in this iteration
|
|
334 }
|
|
335
|
|
336 sub _fastForward {
|
|
337 my ($self) = @_;
|
|
338 return 0 if $self->{'REPORT_DONE'}; # empty report
|
|
339
|
|
340 while(defined($_ = $self->_readline()) ) {
|
|
341 if( $_ =~ /^>/ ||
|
|
342 $_ =~ /^QUERY|^\d+ .* \d+$/ ) { # Changed to also handle "-m 6" /AE
|
|
343 $self->_pushback($_);
|
|
344 return 1;
|
|
345 }
|
|
346 # print "FASTFORWARD",$_,"\n";
|
|
347 if ($_ =~ /^>|^Parameters|^\s+Database:/) {
|
|
348 $self->_pushback($_);
|
|
349 return 1;
|
|
350 }
|
|
351 }
|
|
352 $self->warn("Possible error (2) while parsing BLAST report!");
|
|
353 }
|
|
354
|
|
355
|
|
356 =head2 _readline
|
|
357
|
|
358 Title : _readline
|
|
359 Usage : $obj->_readline
|
|
360 Function: Reads a line of input.
|
|
361
|
|
362 Note that this method implicitely uses the value of $/ that is
|
|
363 in effect when called.
|
|
364
|
|
365 Note also that the current implementation does not handle pushed
|
|
366 back input correctly unless the pushed back input ends with the
|
|
367 value of $/.
|
|
368 Example :
|
|
369 Returns :
|
|
370
|
|
371 =cut
|
|
372
|
|
373 sub _readline{
|
|
374 my ($self) = @_;
|
|
375 return $self->{'PARENT'}->_readline();
|
|
376 }
|
|
377
|
|
378 =head2 _pushback
|
|
379
|
|
380 Title : _pushback
|
|
381 Usage : $obj->_pushback($newvalue)
|
|
382 Function: puts a line previously read with _readline back into a buffer
|
|
383 Example :
|
|
384 Returns :
|
|
385 Args : newvalue
|
|
386
|
|
387 =cut
|
|
388
|
|
389 sub _pushback {
|
|
390 my ($self, $arg) = @_;
|
|
391 return $self->{'PARENT'}->_pushback($arg);
|
|
392 }
|
|
393 1;
|
|
394 __END__
|