Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/Tools/BPlite.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: BPlite.pm,v 1.36.2.2 2003/02/20 00:39:03 jason Exp $ | |
| 2 ############################################################################## | |
| 3 # Bioperl module Bio::Tools::BPlite | |
| 4 ############################################################################## | |
| 5 # | |
| 6 # The original BPlite.pm module has been written by Ian Korf ! | |
| 7 # see http://sapiens.wustl.edu/~ikorf | |
| 8 # | |
| 9 # You may distribute this module under the same terms as perl itself | |
| 10 | |
| 11 =head1 NAME | |
| 12 | |
| 13 Bio::Tools::BPlite - Lightweight BLAST parser | |
| 14 | |
| 15 =head1 SYNOPSIS | |
| 16 | |
| 17 use Bio::Tools::BPlite; | |
| 18 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); | |
| 19 | |
| 20 { | |
| 21 $report->query; | |
| 22 $report->database; | |
| 23 while(my $sbjct = $report->nextSbjct) { | |
| 24 $sbjct->name; | |
| 25 while (my $hsp = $sbjct->nextHSP) { | |
| 26 $hsp->score; | |
| 27 $hsp->bits; | |
| 28 $hsp->percent; | |
| 29 $hsp->P; | |
| 30 $hsp->EXP; | |
| 31 $hsp->match; | |
| 32 $hsp->positive; | |
| 33 $hsp->length; | |
| 34 $hsp->querySeq; | |
| 35 $hsp->sbjctSeq; | |
| 36 $hsp->homologySeq; | |
| 37 $hsp->query->start; | |
| 38 $hsp->query->end; | |
| 39 $hsp->hit->start; | |
| 40 $hsp->hit->end; | |
| 41 $hsp->hit->seq_id; | |
| 42 $hsp->hit->overlaps($exon); | |
| 43 } | |
| 44 } | |
| 45 | |
| 46 # the following line takes you to the next report in the stream/file | |
| 47 # it will return 0 if that report is empty, | |
| 48 # but that is valid for an empty blast report. | |
| 49 # Returns -1 for EOF. | |
| 50 | |
| 51 last if ($report->_parseHeader == -1); | |
| 52 redo; | |
| 53 } | |
| 54 | |
| 55 | |
| 56 =head1 DESCRIPTION | |
| 57 | |
| 58 BPlite is a package for parsing BLAST reports. The BLAST programs are a family | |
| 59 of widely used algorithms for sequence database searches. The reports are | |
| 60 non-trivial to parse, and there are differences in the formats of the various | |
| 61 flavors of BLAST. BPlite parses BLASTN, BLASTP, BLASTX, TBLASTN, and TBLASTX | |
| 62 reports from both the high performance WU-BLAST, and the more generic | |
| 63 NCBI-BLAST. | |
| 64 | |
| 65 Many people have developed BLAST parsers (I myself have made at least three). | |
| 66 BPlite is for those people who would rather not have a giant object | |
| 67 specification, but rather a simple handle to a BLAST report that works well | |
| 68 in pipes. | |
| 69 | |
| 70 =head2 Object | |
| 71 | |
| 72 BPlite has three kinds of objects, the report, the subject, and the HSP. To | |
| 73 create a new report, you pass a filehandle reference to the BPlite constructor. | |
| 74 | |
| 75 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); # or any other filehandle | |
| 76 | |
| 77 The report has two attributes (query and database), and one method (nextSbjct). | |
| 78 | |
| 79 $report->query; # access to the query name | |
| 80 $report->database; # access to the database name | |
| 81 $report->nextSbjct; # gets the next subject | |
| 82 while(my $sbjct = $report->nextSbjct) { | |
| 83 # canonical form of use is in a while loop | |
| 84 } | |
| 85 | |
| 86 A subject is a BLAST hit, which should not be confused with an HSP (below). A | |
| 87 BLAST hit may have several alignments associated with it. A useful way of | |
| 88 thinking about it is that a subject is a gene and HSPs are the exons. Subjects | |
| 89 have one attribute (name) and one method (nextHSP). | |
| 90 | |
| 91 $sbjct->name; # access to the subject name | |
| 92 $sbjct->nextHSP; # gets the next HSP from the sbjct | |
| 93 while(my $hsp = $sbjct->nextHSP) { | |
| 94 # canonical form is again a while loop | |
| 95 } | |
| 96 | |
| 97 An HSP is a high scoring pair, or simply an alignment. HSP objects | |
| 98 inherit all the useful methods from RangeI/SeqFeatureI/FeaturePair, | |
| 99 but provide an additional set of attributes (score, bits, percent, P, | |
| 100 match, EXP, positive, length, querySeq, sbjctSeq, homologySeq) that | |
| 101 should be familiar to anyone who has seen a blast report. | |
| 102 | |
| 103 For lazy/efficient coders, two-letter abbreviations are available for the | |
| 104 attributes with long names (qs, ss, hs). Ranges of the aligned sequences in | |
| 105 query/subject and other information (like seqname) are stored | |
| 106 in SeqFeature objects (i.e.: $hsp-E<gt>query, $hsp-E<gt>subject which is equal to | |
| 107 $hsp-E<gt>feature1, $hsp-E<gt>feature2). querySeq, sbjctSeq and homologySeq do only | |
| 108 contain the alignment sequences from the blast report. | |
| 109 | |
| 110 $hsp->score; | |
| 111 $hsp->bits; | |
| 112 $hsp->percent; | |
| 113 $hsp->P; | |
| 114 $hsp->match; | |
| 115 $hsp->positive; | |
| 116 $hsp->length; | |
| 117 $hsp->querySeq; $hsp->qs; | |
| 118 $hsp->sbjctSeq; $hsp->ss; | |
| 119 $hsp->homologySeq; $hsp->hs; | |
| 120 $hsp->query->start; | |
| 121 $hsp->query->end; | |
| 122 $hsp->query->seq_id; | |
| 123 $hsp->hit->primary_tag; # "similarity" | |
| 124 $hsp->hit->source_tag; # "BLAST" | |
| 125 $hsp->hit->start; | |
| 126 $hsp->hit->end; | |
| 127 ... | |
| 128 | |
| 129 So a very simple look into a BLAST report might look like this. | |
| 130 | |
| 131 my $report = new Bio::Tools::BPlite(-fh=>\*STDIN); | |
| 132 while(my $sbjct = $report->nextSbjct) { | |
| 133 print ">",$sbjct->name,"\n"; | |
| 134 while(my $hsp = $sbjct->nextHSP) { | |
| 135 print "\t",$hsp->start,"..",$hsp->end," ",$hsp->bits,"\n"; | |
| 136 } | |
| 137 } | |
| 138 | |
| 139 The output of such code might look like this: | |
| 140 | |
| 141 >foo | |
| 142 100..155 29.5 | |
| 143 268..300 20.1 | |
| 144 >bar | |
| 145 100..153 28.5 | |
| 146 265..290 22.1 | |
| 147 | |
| 148 | |
| 149 =head1 AUTHORS | |
| 150 | |
| 151 Ian Korf (ikorf@sapiens.wustl.edu, http://sapiens.wustl.edu/~ikorf), | |
| 152 Lorenz Pollak (lorenz@ist.org, bioperl port) | |
| 153 | |
| 154 =head1 ACKNOWLEDGEMENTS | |
| 155 | |
| 156 This software was developed at the Genome Sequencing Center at Washington | |
| 157 Univeristy, St. Louis, MO. | |
| 158 | |
| 159 =head1 CONTRIBUTORS | |
| 160 | |
| 161 Jason Stajich, jason@bioperl.org | |
| 162 | |
| 163 =head1 COPYRIGHT | |
| 164 | |
| 165 Copyright (C) 1999 Ian Korf. All Rights Reserved. | |
| 166 | |
| 167 =head1 DISCLAIMER | |
| 168 | |
| 169 This software is provided "as is" without warranty of any kind. | |
| 170 | |
| 171 =cut | |
| 172 | |
| 173 package Bio::Tools::BPlite; | |
| 174 | |
| 175 use strict; | |
| 176 use vars qw(@ISA); | |
| 177 | |
| 178 use Bio::Root::Root; | |
| 179 use Bio::Root::IO; | |
| 180 use Bio::Tools::BPlite::Sbjct; # we want to use Sbjct | |
| 181 use Bio::SeqAnalysisParserI; | |
| 182 use Symbol; | |
| 183 | |
| 184 @ISA = qw(Bio::Root::Root Bio::SeqAnalysisParserI Bio::Root::IO); | |
| 185 | |
| 186 # new comes from a RootI now | |
| 187 | |
| 188 =head2 new | |
| 189 | |
| 190 Title : new | |
| 191 Function: Create a new Bio::Tools::BPlite object | |
| 192 Returns : Bio::Tools::BPlite | |
| 193 Args : -file input file (alternative to -fh) | |
| 194 -fh input stream (alternative to -file) | |
| 195 | |
| 196 =cut | |
| 197 | |
| 198 sub new { | |
| 199 my ($class, @args) = @_; | |
| 200 my $self = $class->SUPER::new(@args); | |
| 201 | |
| 202 # initialize IO | |
| 203 $self->_initialize_io(@args); | |
| 204 | |
| 205 $self->{'QPATLOCATION'} = []; # Anonymous array of query pattern locations for PHIBLAST | |
| 206 | |
| 207 if ($self->_parseHeader) {$self->{'REPORT_DONE'} = 0} # there are alignments | |
| 208 else {$self->{'REPORT_DONE'} = 1} # empty report | |
| 209 | |
| 210 return $self; # success - we hope! | |
| 211 } | |
| 212 | |
| 213 # for SeqAnalysisParserI compliance | |
| 214 | |
| 215 =head2 next_feature | |
| 216 | |
| 217 Title : next_feature | |
| 218 Usage : while( my $feat = $res->next_feature ) { # do something } | |
| 219 Function: SeqAnalysisParserI implementing function. This implementation | |
| 220 iterates over all HSPs. If the HSPs of the current subject match | |
| 221 are exhausted, it will automatically call nextSbjct(). | |
| 222 Example : | |
| 223 Returns : A Bio::SeqFeatureI compliant object, in this case a | |
| 224 Bio::Tools::BPlite::HSP object, and FALSE if there are no more | |
| 225 HSPs. | |
| 226 Args : None | |
| 227 | |
| 228 =cut | |
| 229 | |
| 230 sub next_feature{ | |
| 231 my ($self) = @_; | |
| 232 my ($sbjct, $hsp); | |
| 233 $sbjct = $self->{'_current_sbjct'}; | |
| 234 unless( defined $sbjct ) { | |
| 235 $sbjct = $self->{'_current_sbjct'} = $self->nextSbjct; | |
| 236 return undef unless defined $sbjct; | |
| 237 } | |
| 238 $hsp = $sbjct->nextHSP; | |
| 239 unless( defined $hsp ) { | |
| 240 $self->{'_current_sbjct'} = undef; | |
| 241 return $self->next_feature; | |
| 242 } | |
| 243 return $hsp || undef; | |
| 244 } | |
| 245 | |
| 246 =head2 query | |
| 247 | |
| 248 Title : query | |
| 249 Usage : $query = $obj->query(); | |
| 250 Function : returns the query object | |
| 251 Example : | |
| 252 Returns : query object | |
| 253 Args : | |
| 254 | |
| 255 =cut | |
| 256 | |
| 257 sub query {shift->{'QUERY'}} | |
| 258 | |
| 259 =head2 qlength | |
| 260 | |
| 261 Title : qlength | |
| 262 Usage : $len = $obj->qlength(); | |
| 263 Function : returns the length of the query | |
| 264 Example : | |
| 265 Returns : length of query | |
| 266 Args : | |
| 267 | |
| 268 =cut | |
| 269 | |
| 270 sub qlength {shift->{'LENGTH'}} | |
| 271 | |
| 272 =head2 pattern | |
| 273 | |
| 274 Title : pattern | |
| 275 Usage : $pattern = $obj->pattern(); | |
| 276 Function : returns the pattern used in a PHIBLAST search | |
| 277 | |
| 278 =cut | |
| 279 | |
| 280 sub pattern {shift->{'PATTERN'}} | |
| 281 | |
| 282 =head2 query_pattern_location | |
| 283 | |
| 284 Title : query_pattern_location | |
| 285 Usage : $qpl = $obj->query_pattern_location(); | |
| 286 Function : returns reference to array of locations in the query sequence | |
| 287 of pattern used in a PHIBLAST search | |
| 288 | |
| 289 =cut | |
| 290 | |
| 291 sub query_pattern_location {shift->{'QPATLOCATION'}} | |
| 292 | |
| 293 =head2 database | |
| 294 | |
| 295 Title : database | |
| 296 Usage : $db = $obj->database(); | |
| 297 Function : returns the database used in this search | |
| 298 Example : | |
| 299 Returns : database used for search | |
| 300 Args : | |
| 301 | |
| 302 =cut | |
| 303 | |
| 304 sub database {shift->{'DATABASE'}} | |
| 305 | |
| 306 =head2 nextSbjct | |
| 307 | |
| 308 Title : nextSbjct | |
| 309 Usage : $sbjct = $obj->nextSbjct(); | |
| 310 Function : Method of iterating through all the Sbjct retrieved | |
| 311 from parsing the report | |
| 312 Example : while ( my $sbjct = $obj->nextSbjct ) {} | |
| 313 Returns : next Sbjct object or null if finished | |
| 314 Args : | |
| 315 | |
| 316 =cut | |
| 317 | |
| 318 sub nextSbjct { | |
| 319 my ($self) = @_; | |
| 320 | |
| 321 $self->_fastForward or return undef; | |
| 322 | |
| 323 ####################### | |
| 324 # get all sbjct lines # | |
| 325 ####################### | |
| 326 my $def = $self->_readline(); | |
| 327 while(defined ($_ = $self->_readline() ) ) { | |
| 328 if ($_ !~ /\w/) {next} | |
| 329 elsif ($_ =~ /Strand HSP/) {next} # WU-BLAST non-data | |
| 330 elsif ($_ =~ /^\s{0,2}Score/) {$self->_pushback($_); last} | |
| 331 elsif ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) { | |
| 332 $self->_pushback($_); | |
| 333 last; | |
| 334 } | |
| 335 else {$def .= $_} | |
| 336 } | |
| 337 $def =~ s/\s+/ /g; | |
| 338 $def =~ s/\s+$//g; | |
| 339 $def =~ s/Length = ([\d,]+)$//g; | |
| 340 my $length = $1; | |
| 341 return undef unless $def =~ /^>/; | |
| 342 $def =~ s/^>//; | |
| 343 | |
| 344 #################### | |
| 345 # the Sbjct object # | |
| 346 #################### | |
| 347 my $sbjct = new Bio::Tools::BPlite::Sbjct('-name'=>$def, | |
| 348 '-length'=>$length, | |
| 349 '-parent'=>$self); | |
| 350 return $sbjct; | |
| 351 } | |
| 352 | |
| 353 # begin private routines | |
| 354 | |
| 355 sub _parseHeader { | |
| 356 my ($self) = @_; | |
| 357 | |
| 358 # normally, _parseHeader will break out of the parse as soon as it | |
| 359 # reaches a new Subject (i.e. the first one after the header) if you | |
| 360 # call _parseHeader twice in a row, with nothing in between, all you | |
| 361 # accomplish is a ->nextSubject call.. so we need a flag to | |
| 362 # indicate that we have *entered* a header, before we are allowed to | |
| 363 # leave it! | |
| 364 | |
| 365 my $header_flag = 0; # here is the flag/ It is "false" at first, and | |
| 366 # is set to "true" when any valid header element | |
| 367 # is encountered | |
| 368 | |
| 369 $self->{'REPORT_DONE'} = 0; # reset this bit for a new report | |
| 370 while(defined($_ = $self->_readline() ) ) { | |
| 371 s/\(\s*\)//; | |
| 372 if ($_ =~ /^Query=(?:\s+([^\(]+))?/) { | |
| 373 $header_flag = 1; # valid header element found | |
| 374 my $query = $1; | |
| 375 while( defined($_ = $self->_readline() ) ) { | |
| 376 # Continue reading query name until encountering either | |
| 377 # a line that starts with "Database" or a blank line. | |
| 378 # The latter condition is needed in order to be able to | |
| 379 # parse megablast output correctly, since Database comes | |
| 380 # before (not after) the query. | |
| 381 if( ($_ =~ /^Database/) || ($_ =~ /^$/) ) { | |
| 382 $self->_pushback($_); last; | |
| 383 } | |
| 384 $query .= $_; | |
| 385 } | |
| 386 $query =~ s/\s+/ /g; | |
| 387 $query =~ s/^>//; | |
| 388 | |
| 389 my $length = 0; | |
| 390 if( $query =~ /\(([\d,]+)\s+\S+\)\s*$/ ) { | |
| 391 $length = $1; | |
| 392 $length =~ s/,//g; | |
| 393 } else { | |
| 394 $self->debug("length is 0 for '$query'\n"); | |
| 395 } | |
| 396 $self->{'QUERY'} = $query; | |
| 397 $self->{'LENGTH'} = $length; | |
| 398 } | |
| 399 elsif ($_ =~ /^(<b>)?(T?BLAST[NPX])\s+([\w\.-]+)\s+(\[[\w-]*\])/) { | |
| 400 $self->{'BLAST_TYPE'} = $2; | |
| 401 $self->{'BLAST_VERSION'} = $3; | |
| 402 } # BLAST report type - not a valid header element # JB949 | |
| 403 | |
| 404 # Support Paracel BTK output | |
| 405 elsif ( $_ =~ /(^[A-Z0-9_]+)\s+BTK\s+/ ) { | |
| 406 $self->{'BLAST_TYPE'} = $1; | |
| 407 $self->{'BTK'} = 1; | |
| 408 } | |
| 409 elsif ($_ =~ /^Database:\s+(.+)/) {$header_flag = 1;$self->{'DATABASE'} = $1} # valid header element found | |
| 410 elsif ($_ =~ /^\s*pattern\s+(\S+).*position\s+(\d+)\D/) { | |
| 411 # For PHIBLAST reports | |
| 412 $header_flag = 1; # valid header element found | |
| 413 $self->{'PATTERN'} = $1; | |
| 414 push (@{$self->{'QPATLOCATION'}}, $2); | |
| 415 } | |
| 416 elsif (($_ =~ /^>/) && ($header_flag==1)) {$self->_pushback($_); return 1} # only leave if we have actually parsed a valid header! | |
| 417 elsif (($_ =~ /^Parameters|^\s+Database:/) && ($header_flag==1)) { # if we entered a header, and saw nothing before the stats at the end, then it was empty | |
| 418 $self->_pushback($_); | |
| 419 return 0; # there's nothing in the report | |
| 420 } | |
| 421 # bug fix suggested by MI Sadowski via Martin Lomas | |
| 422 # see bug report #1118 | |
| 423 if( ref($self->_fh()) !~ /GLOB/ && $self->_fh()->can('EOF') && eof($self->_fh()) ) { | |
| 424 $self->warn("unexpected EOF in file\n"); | |
| 425 return -1; | |
| 426 } | |
| 427 } | |
| 428 return -1; # EOF | |
| 429 } | |
| 430 | |
| 431 sub _fastForward { | |
| 432 my ($self) = @_; | |
| 433 return 0 if $self->{'REPORT_DONE'}; # empty report | |
| 434 while(defined( $_ = $self->_readline() ) ) { | |
| 435 if ($_ =~ /^Histogram|^Searching|^Parameters|^\s+Database:|^\s+Posted date:/) { | |
| 436 return 0; | |
| 437 } elsif( $_ =~ /^>/ ) { | |
| 438 $self->_pushback($_); | |
| 439 return 1; | |
| 440 } | |
| 441 } | |
| 442 unless( $self->{'BTK'} ) { # Paracel BTK reports have no footer | |
| 443 $self->warn("Possible error (1) while parsing BLAST report!"); | |
| 444 } | |
| 445 } | |
| 446 | |
| 447 1; | |
| 448 __END__ |
