Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/BPlite.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: BPlite.pm,v 1.36.2.2 2003/02/20 00:39:03 jason Exp $ | |
2 ############################################################################## | |
3 # Bioperl module Bio::Tools::BPlite | |
4 ############################################################################## | |
5 # | |
6 # The original BPlite.pm module has been written by Ian Korf ! | |
7 # see http://sapiens.wustl.edu/~ikorf | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 =head1 NAME | |
12 | |
13 Bio::Tools::BPlite - Lightweight BLAST parser | |
14 | |
15 =head1 SYNOPSIS | |
16 | |
17 use Bio::Tools::BPlite; | |
18 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); | |
19 | |
20 { | |
21 $report->query; | |
22 $report->database; | |
23 while(my $sbjct = $report->nextSbjct) { | |
24 $sbjct->name; | |
25 while (my $hsp = $sbjct->nextHSP) { | |
26 $hsp->score; | |
27 $hsp->bits; | |
28 $hsp->percent; | |
29 $hsp->P; | |
30 $hsp->EXP; | |
31 $hsp->match; | |
32 $hsp->positive; | |
33 $hsp->length; | |
34 $hsp->querySeq; | |
35 $hsp->sbjctSeq; | |
36 $hsp->homologySeq; | |
37 $hsp->query->start; | |
38 $hsp->query->end; | |
39 $hsp->hit->start; | |
40 $hsp->hit->end; | |
41 $hsp->hit->seq_id; | |
42 $hsp->hit->overlaps($exon); | |
43 } | |
44 } | |
45 | |
46 # the following line takes you to the next report in the stream/file | |
47 # it will return 0 if that report is empty, | |
48 # but that is valid for an empty blast report. | |
49 # Returns -1 for EOF. | |
50 | |
51 last if ($report->_parseHeader == -1); | |
52 redo; | |
53 } | |
54 | |
55 | |
56 =head1 DESCRIPTION | |
57 | |
58 BPlite is a package for parsing BLAST reports. The BLAST programs are a family | |
59 of widely used algorithms for sequence database searches. The reports are | |
60 non-trivial to parse, and there are differences in the formats of the various | |
61 flavors of BLAST. BPlite parses BLASTN, BLASTP, BLASTX, TBLASTN, and TBLASTX | |
62 reports from both the high performance WU-BLAST, and the more generic | |
63 NCBI-BLAST. | |
64 | |
65 Many people have developed BLAST parsers (I myself have made at least three). | |
66 BPlite is for those people who would rather not have a giant object | |
67 specification, but rather a simple handle to a BLAST report that works well | |
68 in pipes. | |
69 | |
70 =head2 Object | |
71 | |
72 BPlite has three kinds of objects, the report, the subject, and the HSP. To | |
73 create a new report, you pass a filehandle reference to the BPlite constructor. | |
74 | |
75 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); # or any other filehandle | |
76 | |
77 The report has two attributes (query and database), and one method (nextSbjct). | |
78 | |
79 $report->query; # access to the query name | |
80 $report->database; # access to the database name | |
81 $report->nextSbjct; # gets the next subject | |
82 while(my $sbjct = $report->nextSbjct) { | |
83 # canonical form of use is in a while loop | |
84 } | |
85 | |
86 A subject is a BLAST hit, which should not be confused with an HSP (below). A | |
87 BLAST hit may have several alignments associated with it. A useful way of | |
88 thinking about it is that a subject is a gene and HSPs are the exons. Subjects | |
89 have one attribute (name) and one method (nextHSP). | |
90 | |
91 $sbjct->name; # access to the subject name | |
92 $sbjct->nextHSP; # gets the next HSP from the sbjct | |
93 while(my $hsp = $sbjct->nextHSP) { | |
94 # canonical form is again a while loop | |
95 } | |
96 | |
97 An HSP is a high scoring pair, or simply an alignment. HSP objects | |
98 inherit all the useful methods from RangeI/SeqFeatureI/FeaturePair, | |
99 but provide an additional set of attributes (score, bits, percent, P, | |
100 match, EXP, positive, length, querySeq, sbjctSeq, homologySeq) that | |
101 should be familiar to anyone who has seen a blast report. | |
102 | |
103 For lazy/efficient coders, two-letter abbreviations are available for the | |
104 attributes with long names (qs, ss, hs). Ranges of the aligned sequences in | |
105 query/subject and other information (like seqname) are stored | |
106 in SeqFeature objects (i.e.: $hsp-E<gt>query, $hsp-E<gt>subject which is equal to | |
107 $hsp-E<gt>feature1, $hsp-E<gt>feature2). querySeq, sbjctSeq and homologySeq do only | |
108 contain the alignment sequences from the blast report. | |
109 | |
110 $hsp->score; | |
111 $hsp->bits; | |
112 $hsp->percent; | |
113 $hsp->P; | |
114 $hsp->match; | |
115 $hsp->positive; | |
116 $hsp->length; | |
117 $hsp->querySeq; $hsp->qs; | |
118 $hsp->sbjctSeq; $hsp->ss; | |
119 $hsp->homologySeq; $hsp->hs; | |
120 $hsp->query->start; | |
121 $hsp->query->end; | |
122 $hsp->query->seq_id; | |
123 $hsp->hit->primary_tag; # "similarity" | |
124 $hsp->hit->source_tag; # "BLAST" | |
125 $hsp->hit->start; | |
126 $hsp->hit->end; | |
127 ... | |
128 | |
129 So a very simple look into a BLAST report might look like this. | |
130 | |
131 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); | |
132 while(my $sbjct = $report->nextSbjct) { | |
133 print ">",$sbjct->name,"\n"; | |
134 while(my $hsp = $sbjct->nextHSP) { | |
135 print "\t",$hsp->start,"..",$hsp->end," ",$hsp->bits,"\n"; | |
136 } | |
137 } | |
138 | |
139 The output of such code might look like this: | |
140 | |
141 >foo | |
142 100..155 29.5 | |
143 268..300 20.1 | |
144 >bar | |
145 100..153 28.5 | |
146 265..290 22.1 | |
147 | |
148 | |
149 =head1 AUTHORS | |
150 | |
151 Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), | |
152 Lorenz Pollak (lorenz@ist.org, bioperl port) | |
153 | |
154 =head1 ACKNOWLEDGEMENTS | |
155 | |
156 This software was developed at the Genome Sequencing Center at Washington | |
157 Univeristy, St. Louis, MO. | |
158 | |
159 =head1 CONTRIBUTORS | |
160 | |
161 Jason Stajich, jason@bioperl.org | |
162 | |
163 =head1 COPYRIGHT | |
164 | |
165 Copyright (C) 1999 Ian Korf. All Rights Reserved. | |
166 | |
167 =head1 DISCLAIMER | |
168 | |
169 This software is provided "as is" without warranty of any kind. | |
170 | |
171 =cut | |
172 | |
173 package Bio::Tools::BPlite; | |
174 | |
175 use strict; | |
176 use vars qw(@ISA); | |
177 | |
178 use Bio::Root::Root; | |
179 use Bio::Root::IO; | |
180 use Bio::Tools::BPlite::Sbjct; # we want to use Sbjct | |
181 use Bio::SeqAnalysisParserI; | |
182 use Symbol; | |
183 | |
184 @ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); | |
185 | |
186 # new comes from a RootI now | |
187 | |
188 =head2 new | |
189 | |
190 Title : new | |
191 Function: Create a new Bio::Tools::BPlite object | |
192 Returns : Bio::Tools::BPlite | |
193 Args : -file input file (alternative to -fh) | |
194 -fh input stream (alternative to -file) | |
195 | |
196 =cut | |
197 | |
198 sub new { | |
199 my ($class, @args) = @_; | |
200 my $self = $class->SUPER::new(@args); | |
201 | |
202 # initialize IO | |
203 $self->_initialize_io(@args); | |
204 | |
205 $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST | |
206 | |
207 if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments | |
208 else {$self->{'REPORT_DONE'} = 1} # empty report | |
209 | |
210 return $self; # success - we hope! | |
211 } | |
212 | |
213 # for SeqAnalysisParserI compliance | |
214 | |
215 =head2 next_feature | |
216 | |
217 Title : next_feature | |
218 Usage : while( my $feat = $res->next_feature ) { # do something } | |
219 Function: SeqAnalysisParserI implementing function. This implementation | |
220 iterates over all HSPs. If the HSPs of the current subject match | |
221 are exhausted, it will automatically call nextSbjct(). | |
222 Example : | |
223 Returns : A Bio::SeqFeatureI compliant object, in this case a | |
224 Bio::Tools::BPlite::HSP object, and FALSE if there are no more | |
225 HSPs. | |
226 Args : None | |
227 | |
228 =cut | |
229 | |
230 sub next_feature{ | |
231 my ($self) = @_; | |
232 my ($sbjct, $hsp); | |
233 $sbjct = $self->{'_current_sbjct'}; | |
234 unless( defined $sbjct ) { | |
235 $sbjct = $self->{'_current_sbjct'} = $self->nextSbjct; | |
236 return undef unless defined $sbjct; | |
237 } | |
238 $hsp = $sbjct->nextHSP; | |
239 unless( defined $hsp ) { | |
240 $self->{'_current_sbjct'} = undef; | |
241 return $self->next_feature; | |
242 } | |
243 return $hsp || undef; | |
244 } | |
245 | |
246 =head2 query | |
247 | |
248 Title : query | |
249 Usage : $query = $obj->query(); | |
250 Function : returns the query object | |
251 Example : | |
252 Returns : query object | |
253 Args : | |
254 | |
255 =cut | |
256 | |
257 sub query {shift->{'QUERY'}} | |
258 | |
259 =head2 qlength | |
260 | |
261 Title : qlength | |
262 Usage : $len = $obj->qlength(); | |
263 Function : returns the length of the query | |
264 Example : | |
265 Returns : length of query | |
266 Args : | |
267 | |
268 =cut | |
269 | |
270 sub qlength {shift->{'LENGTH'}} | |
271 | |
272 =head2 pattern | |
273 | |
274 Title : pattern | |
275 Usage : $pattern = $obj->pattern(); | |
276 Function : returns the pattern used in a PHIBLAST search | |
277 | |
278 =cut | |
279 | |
280 sub pattern {shift->{'PATTERN'}} | |
281 | |
282 =head2 query_pattern_location | |
283 | |
284 Title : query_pattern_location | |
285 Usage : $qpl = $obj->query_pattern_location(); | |
286 Function : returns reference to array of locations in the query sequence | |
287 of pattern used in a PHIBLAST search | |
288 | |
289 =cut | |
290 | |
291 sub query_pattern_location {shift->{'QPATLOCATION'}} | |
292 | |
293 =head2 database | |
294 | |
295 Title : database | |
296 Usage : $db = $obj->database(); | |
297 Function : returns the database used in this search | |
298 Example : | |
299 Returns : database used for search | |
300 Args : | |
301 | |
302 =cut | |
303 | |
304 sub database {shift->{'DATABASE'}} | |
305 | |
306 =head2 nextSbjct | |
307 | |
308 Title : nextSbjct | |
309 Usage : $sbjct = $obj->nextSbjct(); | |
310 Function : Method of iterating through all the Sbjct retrieved | |
311 from parsing the report | |
312 Example : while ( my $sbjct = $obj->nextSbjct ) {} | |
313 Returns : next Sbjct object or null if finished | |
314 Args : | |
315 | |
316 =cut | |
317 | |
318 sub nextSbjct { | |
319 my ($self) = @_; | |
320 | |
321 $self->_fastForward or return undef; | |
322 | |
323 ####################### | |
324 # get all sbjct lines # | |
325 ####################### | |
326 my $def = $self->_readline(); | |
327 while(defined ($_ = $self->_readline() ) ) { | |
328 if ($_ !~ /\w/) {next} | |
329 elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data | |
330 elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback($_); last} | |
331 elsif ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) { | |
332 $self->_pushback($_); | |
333 last; | |
334 } | |
335 else {$def .= $_} | |
336 } | |
337 $def =~ s/\s+/ /g; | |
338 $def =~ s/\s+$//g; | |
339 $def =~ s/Length = ([\d,]+)$//g; | |
340 my $length = $1; | |
341 return undef unless $def =~ /^>/; | |
342 $def =~ s/^>//; | |
343 | |
344 #################### | |
345 # the Sbjct object # | |
346 #################### | |
347 my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, | |
348 '-length'=>$length, | |
349 '-parent'=>$self); | |
350 return $sbjct; | |
351 } | |
352 | |
353 # begin private routines | |
354 | |
355 sub _parseHeader { | |
356 my ($self) = @_; | |
357 | |
358 # normally, _parseHeader will break out of the parse as soon as it | |
359 # reaches a new Subject (i.e. the first one after the header) if you | |
360 # call _parseHeader twice in a row, with nothing in between, all you | |
361 # accomplish is a ->nextSubject call.. so we need a flag to | |
362 # indicate that we have *entered* a header, before we are allowed to | |
363 # leave it! | |
364 | |
365 my $header_flag = 0; # here is the flag/ It is "false" at first, and | |
366 # is set to "true" when any valid header element | |
367 # is encountered | |
368 | |
369 $self->{'REPORT_DONE'} = 0; # reset this bit for a new report | |
370 while(defined($_ = $self->_readline() ) ) { | |
371 s/\(\s*\)//; | |
372 if ($_ =~ /^Query=(?:\s+([^\(]+))?/) { | |
373 $header_flag = 1; # valid header element found | |
374 my $query = $1; | |
375 while( defined($_ = $self->_readline() ) ) { | |
376 # Continue reading query name until encountering either | |
377 # a line that starts with "Database" or a blank line. | |
378 # The latter condition is needed in order to be able to | |
379 # parse megablast output correctly, since Database comes | |
380 # before (not after) the query. | |
381 if( ($_ =~ /^Database/) || ($_ =~ /^$/) ) { | |
382 $self->_pushback($_); last; | |
383 } | |
384 $query .= $_; | |
385 } | |
386 $query =~ s/\s+/ /g; | |
387 $query =~ s/^>//; | |
388 | |
389 my $length = 0; | |
390 if( $query =~ /\(([\d,]+)\s+\S+\)\s*$/ ) { | |
391 $length = $1; | |
392 $length =~ s/,//g; | |
393 } else { | |
394 $self->debug("length is 0 for '$query'\n"); | |
395 } | |
396 $self->{'QUERY'} = $query; | |
397 $self->{'LENGTH'} = $length; | |
398 } | |
399 elsif ($_ =~ /^(<b>)?(T?BLAST[NPX])\s+([\w\.-]+)\s+(\[[\w-]*\])/) { | |
400 $self->{'BLAST_TYPE'} = $2; | |
401 $self->{'BLAST_VERSION'} = $3; | |
402 } # BLAST report type - not a valid header element # JB949 | |
403 | |
404 # Support Paracel BTK output | |
405 elsif ( $_ =~ /(^[A-Z0-9_]+)\s+BTK\s+/ ) { | |
406 $self->{'BLAST_TYPE'} = $1; | |
407 $self->{'BTK'} = 1; | |
408 } | |
409 elsif ($_ =~ /^Database:\s+(.+)/) {$header_flag = 1;$self->{'DATABASE'} = $1} # valid header element found | |
410 elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) { | |
411 # For PHIBLAST reports | |
412 $header_flag = 1; # valid header element found | |
413 $self->{'PATTERN'} = $1; | |
414 push (@{$self->{'QPATLOCATION'}}, $2); | |
415 } | |
416 elsif (($_ =~ /^>/) && ($header_flag==1)) {$self->_pushback($_); return 1} # only leave if we have actually parsed a valid header! | |
417 elsif (($_ =~ /^Parameters|^\s+Database:/) && ($header_flag==1)) { # if we entered a header, and saw nothing before the stats at the end, then it was empty | |
418 $self->_pushback($_); | |
419 return 0; # there's nothing in the report | |
420 } | |
421 # bug fix suggested by MI Sadowski via Martin Lomas | |
422 # see bug report #1118 | |
423 if( ref($self->_fh()) !~ /GLOB/ && $self->_fh()->can('EOF') && eof($self->_fh()) ) { | |
424 $self->warn("unexpected EOF in file\n"); | |
425 return -1; | |
426 } | |
427 } | |
428 return -1; # EOF | |
429 } | |
430 | |
431 sub _fastForward { | |
432 my ($self) = @_; | |
433 return 0 if $self->{'REPORT_DONE'}; # empty report | |
434 while(defined( $_ = $self->_readline() ) ) { | |
435 if ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) { | |
436 return 0; | |
437 } elsif( $_ =~ /^>/ ) { | |
438 $self->_pushback($_); | |
439 return 1; | |
440 } | |
441 } | |
442 unless( $self->{'BTK'} ) { # Paracel BTK reports have no footer | |
443 $self->warn("Possible error (1) while parsing BLAST report!"); | |
444 } | |
445 } | |
446 | |
447 1; | |
448 __END__ |