comparison variant_effect_predictor/Bio/Tools/BPlite.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: BPlite.pm,v 1.36.2.2 2003/02/20 00:39:03 jason Exp $
2 ##############################################################################
3 # Bioperl module Bio::Tools::BPlite
4 ##############################################################################
5 #
6 # The original BPlite.pm module has been written by Ian Korf !
7 # see http://sapiens.wustl.edu/~ikorf
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 =head1 NAME
12
13 Bio::Tools::BPlite - Lightweight BLAST parser
14
15 =head1 SYNOPSIS
16
17 use Bio::Tools::BPlite;
18 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
19
20 {
21 $report->query;
22 $report->database;
23 while(my $sbjct = $report->nextSbjct) {
24 $sbjct->name;
25 while (my $hsp = $sbjct->nextHSP) {
26 $hsp->score;
27 $hsp->bits;
28 $hsp->percent;
29 $hsp->P;
30 $hsp->EXP;
31 $hsp->match;
32 $hsp->positive;
33 $hsp->length;
34 $hsp->querySeq;
35 $hsp->sbjctSeq;
36 $hsp->homologySeq;
37 $hsp->query->start;
38 $hsp->query->end;
39 $hsp->hit->start;
40 $hsp->hit->end;
41 $hsp->hit->seq_id;
42 $hsp->hit->overlaps($exon);
43 }
44 }
45
46 # the following line takes you to the next report in the stream/file
47 # it will return 0 if that report is empty,
48 # but that is valid for an empty blast report.
49 # Returns -1 for EOF.
50
51 last if ($report->_parseHeader == -1);
52 redo;
53 }
54
55
56 =head1 DESCRIPTION
57
58 BPlite is a package for parsing BLAST reports. The BLAST programs are a family
59 of widely used algorithms for sequence database searches. The reports are
60 non-trivial to parse, and there are differences in the formats of the various
61 flavors of BLAST. BPlite parses BLASTN, BLASTP, BLASTX, TBLASTN, and TBLASTX
62 reports from both the high performance WU-BLAST, and the more generic
63 NCBI-BLAST.
64
65 Many people have developed BLAST parsers (I myself have made at least three).
66 BPlite is for those people who would rather not have a giant object
67 specification, but rather a simple handle to a BLAST report that works well
68 in pipes.
69
70 =head2 Object
71
72 BPlite has three kinds of objects, the report, the subject, and the HSP. To
73 create a new report, you pass a filehandle reference to the BPlite constructor.
74
75 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); # or any other filehandle
76
77 The report has two attributes (query and database), and one method (nextSbjct).
78
79 $report->query; # access to the query name
80 $report->database; # access to the database name
81 $report->nextSbjct; # gets the next subject
82 while(my $sbjct = $report->nextSbjct) {
83 # canonical form of use is in a while loop
84 }
85
86 A subject is a BLAST hit, which should not be confused with an HSP (below). A
87 BLAST hit may have several alignments associated with it. A useful way of
88 thinking about it is that a subject is a gene and HSPs are the exons. Subjects
89 have one attribute (name) and one method (nextHSP).
90
91 $sbjct->name; # access to the subject name
92 $sbjct->nextHSP; # gets the next HSP from the sbjct
93 while(my $hsp = $sbjct->nextHSP) {
94 # canonical form is again a while loop
95 }
96
97 An HSP is a high scoring pair, or simply an alignment. HSP objects
98 inherit all the useful methods from RangeI/SeqFeatureI/FeaturePair,
99 but provide an additional set of attributes (score, bits, percent, P,
100 match, EXP, positive, length, querySeq, sbjctSeq, homologySeq) that
101 should be familiar to anyone who has seen a blast report.
102
103 For lazy/efficient coders, two-letter abbreviations are available for the
104 attributes with long names (qs, ss, hs). Ranges of the aligned sequences in
105 query/subject and other information (like seqname) are stored
106 in SeqFeature objects (i.e.: $hsp-E<gt>query, $hsp-E<gt>subject which is equal to
107 $hsp-E<gt>feature1, $hsp-E<gt>feature2). querySeq, sbjctSeq and homologySeq do only
108 contain the alignment sequences from the blast report.
109
110 $hsp->score;
111 $hsp->bits;
112 $hsp->percent;
113 $hsp->P;
114 $hsp->match;
115 $hsp->positive;
116 $hsp->length;
117 $hsp->querySeq; $hsp->qs;
118 $hsp->sbjctSeq; $hsp->ss;
119 $hsp->homologySeq; $hsp->hs;
120 $hsp->query->start;
121 $hsp->query->end;
122 $hsp->query->seq_id;
123 $hsp->hit->primary_tag; # "similarity"
124 $hsp->hit->source_tag; # "BLAST"
125 $hsp->hit->start;
126 $hsp->hit->end;
127 ...
128
129 So a very simple look into a BLAST report might look like this.
130
131 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN);
132 while(my $sbjct = $report->nextSbjct) {
133 print ">",$sbjct->name,"\n";
134 while(my $hsp = $sbjct->nextHSP) {
135 print "\t",$hsp->start,"..",$hsp->end," ",$hsp->bits,"\n";
136 }
137 }
138
139 The output of such code might look like this:
140
141 >foo
142 100..155 29.5
143 268..300 20.1
144 >bar
145 100..153 28.5
146 265..290 22.1
147
148
149 =head1 AUTHORS
150
151 Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf),
152 Lorenz Pollak (lorenz@ist.org, bioperl port)
153
154 =head1 ACKNOWLEDGEMENTS
155
156 This software was developed at the Genome Sequencing Center at Washington
157 Univeristy, St. Louis, MO.
158
159 =head1 CONTRIBUTORS
160
161 Jason Stajich, jason@bioperl.org
162
163 =head1 COPYRIGHT
164
165 Copyright (C) 1999 Ian Korf. All Rights Reserved.
166
167 =head1 DISCLAIMER
168
169 This software is provided "as is" without warranty of any kind.
170
171 =cut
172
173 package Bio::Tools::BPlite;
174
175 use strict;
176 use vars qw(@ISA);
177
178 use Bio::Root::Root;
179 use Bio::Root::IO;
180 use Bio::Tools::BPlite::Sbjct; # we want to use Sbjct
181 use Bio::SeqAnalysisParserI;
182 use Symbol;
183
184 @ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO);
185
186 # new comes from a RootI now
187
188 =head2 new
189
190 Title : new
191 Function: Create a new Bio::Tools::BPlite object
192 Returns : Bio::Tools::BPlite
193 Args : -file input file (alternative to -fh)
194 -fh input stream (alternative to -file)
195
196 =cut
197
198 sub new {
199 my ($class, @args) = @_;
200 my $self = $class->SUPER::new(@args);
201
202 # initialize IO
203 $self->_initialize_io(@args);
204
205 $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST
206
207 if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments
208 else {$self->{'REPORT_DONE'} = 1} # empty report
209
210 return $self; # success - we hope!
211 }
212
213 # for SeqAnalysisParserI compliance
214
215 =head2 next_feature
216
217 Title : next_feature
218 Usage : while( my $feat = $res->next_feature ) { # do something }
219 Function: SeqAnalysisParserI implementing function. This implementation
220 iterates over all HSPs. If the HSPs of the current subject match
221 are exhausted, it will automatically call nextSbjct().
222 Example :
223 Returns : A Bio::SeqFeatureI compliant object, in this case a
224 Bio::Tools::BPlite::HSP object, and FALSE if there are no more
225 HSPs.
226 Args : None
227
228 =cut
229
230 sub next_feature{
231 my ($self) = @_;
232 my ($sbjct, $hsp);
233 $sbjct = $self->{'_current_sbjct'};
234 unless( defined $sbjct ) {
235 $sbjct = $self->{'_current_sbjct'} = $self->nextSbjct;
236 return undef unless defined $sbjct;
237 }
238 $hsp = $sbjct->nextHSP;
239 unless( defined $hsp ) {
240 $self->{'_current_sbjct'} = undef;
241 return $self->next_feature;
242 }
243 return $hsp || undef;
244 }
245
246 =head2 query
247
248 Title : query
249 Usage : $query = $obj->query();
250 Function : returns the query object
251 Example :
252 Returns : query object
253 Args :
254
255 =cut
256
257 sub query {shift->{'QUERY'}}
258
259 =head2 qlength
260
261 Title : qlength
262 Usage : $len = $obj->qlength();
263 Function : returns the length of the query
264 Example :
265 Returns : length of query
266 Args :
267
268 =cut
269
270 sub qlength {shift->{'LENGTH'}}
271
272 =head2 pattern
273
274 Title : pattern
275 Usage : $pattern = $obj->pattern();
276 Function : returns the pattern used in a PHIBLAST search
277
278 =cut
279
280 sub pattern {shift->{'PATTERN'}}
281
282 =head2 query_pattern_location
283
284 Title : query_pattern_location
285 Usage : $qpl = $obj->query_pattern_location();
286 Function : returns reference to array of locations in the query sequence
287 of pattern used in a PHIBLAST search
288
289 =cut
290
291 sub query_pattern_location {shift->{'QPATLOCATION'}}
292
293 =head2 database
294
295 Title : database
296 Usage : $db = $obj->database();
297 Function : returns the database used in this search
298 Example :
299 Returns : database used for search
300 Args :
301
302 =cut
303
304 sub database {shift->{'DATABASE'}}
305
306 =head2 nextSbjct
307
308 Title : nextSbjct
309 Usage : $sbjct = $obj->nextSbjct();
310 Function : Method of iterating through all the Sbjct retrieved
311 from parsing the report
312 Example : while ( my $sbjct = $obj->nextSbjct ) {}
313 Returns : next Sbjct object or null if finished
314 Args :
315
316 =cut
317
318 sub nextSbjct {
319 my ($self) = @_;
320
321 $self->_fastForward or return undef;
322
323 #######################
324 # get all sbjct lines #
325 #######################
326 my $def = $self->_readline();
327 while(defined ($_ = $self->_readline() ) ) {
328 if ($_ !~ /\w/) {next}
329 elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data
330 elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback($_); last}
331 elsif ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) {
332 $self->_pushback($_);
333 last;
334 }
335 else {$def .= $_}
336 }
337 $def =~ s/\s+/ /g;
338 $def =~ s/\s+$//g;
339 $def =~ s/Length = ([\d,]+)$//g;
340 my $length = $1;
341 return undef unless $def =~ /^>/;
342 $def =~ s/^>//;
343
344 ####################
345 # the Sbjct object #
346 ####################
347 my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def,
348 '-length'=>$length,
349 '-parent'=>$self);
350 return $sbjct;
351 }
352
353 # begin private routines
354
355 sub _parseHeader {
356 my ($self) = @_;
357
358 # normally, _parseHeader will break out of the parse as soon as it
359 # reaches a new Subject (i.e. the first one after the header) if you
360 # call _parseHeader twice in a row, with nothing in between, all you
361 # accomplish is a ->nextSubject call.. so we need a flag to
362 # indicate that we have *entered* a header, before we are allowed to
363 # leave it!
364
365 my $header_flag = 0; # here is the flag/ It is "false" at first, and
366 # is set to "true" when any valid header element
367 # is encountered
368
369 $self->{'REPORT_DONE'} = 0; # reset this bit for a new report
370 while(defined($_ = $self->_readline() ) ) {
371 s/\(\s*\)//;
372 if ($_ =~ /^Query=(?:\s+([^\(]+))?/) {
373 $header_flag = 1; # valid header element found
374 my $query = $1;
375 while( defined($_ = $self->_readline() ) ) {
376 # Continue reading query name until encountering either
377 # a line that starts with "Database" or a blank line.
378 # The latter condition is needed in order to be able to
379 # parse megablast output correctly, since Database comes
380 # before (not after) the query.
381 if( ($_ =~ /^Database/) || ($_ =~ /^$/) ) {
382 $self->_pushback($_); last;
383 }
384 $query .= $_;
385 }
386 $query =~ s/\s+/ /g;
387 $query =~ s/^>//;
388
389 my $length = 0;
390 if( $query =~ /\(([\d,]+)\s+\S+\)\s*$/ ) {
391 $length = $1;
392 $length =~ s/,//g;
393 } else {
394 $self->debug("length is 0 for '$query'\n");
395 }
396 $self->{'QUERY'} = $query;
397 $self->{'LENGTH'} = $length;
398 }
399 elsif ($_ =~ /^(<b>)?(T?BLAST[NPX])\s+([\w\.-]+)\s+(\[[\w-]*\])/) {
400 $self->{'BLAST_TYPE'} = $2;
401 $self->{'BLAST_VERSION'} = $3;
402 } # BLAST report type - not a valid header element # JB949
403
404 # Support Paracel BTK output
405 elsif ( $_ =~ /(^[A-Z0-9_]+)\s+BTK\s+/ ) {
406 $self->{'BLAST_TYPE'} = $1;
407 $self->{'BTK'} = 1;
408 }
409 elsif ($_ =~ /^Database:\s+(.+)/) {$header_flag = 1;$self->{'DATABASE'} = $1} # valid header element found
410 elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) {
411 # For PHIBLAST reports
412 $header_flag = 1; # valid header element found
413 $self->{'PATTERN'} = $1;
414 push (@{$self->{'QPATLOCATION'}}, $2);
415 }
416 elsif (($_ =~ /^>/) && ($header_flag==1)) {$self->_pushback($_); return 1} # only leave if we have actually parsed a valid header!
417 elsif (($_ =~ /^Parameters|^\s+Database:/) && ($header_flag==1)) { # if we entered a header, and saw nothing before the stats at the end, then it was empty
418 $self->_pushback($_);
419 return 0; # there's nothing in the report
420 }
421 # bug fix suggested by MI Sadowski via Martin Lomas
422 # see bug report #1118
423 if( ref($self->_fh()) !~ /GLOB/ && $self->_fh()->can('EOF') && eof($self->_fh()) ) {
424 $self->warn("unexpected EOF in file\n");
425 return -1;
426 }
427 }
428 return -1; # EOF
429 }
430
431 sub _fastForward {
432 my ($self) = @_;
433 return 0 if $self->{'REPORT_DONE'}; # empty report
434 while(defined( $_ = $self->_readline() ) ) {
435 if ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) {
436 return 0;
437 } elsif( $_ =~ /^>/ ) {
438 $self->_pushback($_);
439 return 1;
440 }
441 }
442 unless( $self->{'BTK'} ) { # Paracel BTK reports have no footer
443 $self->warn("Possible error (1) while parsing BLAST report!");
444 }
445 }
446
447 1;
448 __END__