Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/Fasta.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 =head1 NAME | |
| 2 | |
| 3 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files | |
| 4 | |
| 5 =head1 SYNOPSIS | |
| 6 | |
| 7 use Bio::DB::Fasta; | |
| 8 | |
| 9 # create database from directory of fasta files | |
| 10 my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); | |
| 11 | |
| 12 # simple access (for those without Bioperl) | |
| 13 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000); | |
| 14 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000); | |
| 15 my @ids = $db->ids; | |
| 16 my $length = $db->length('CHROMOSOME_I'); | |
| 17 my $alphabet = $db->alphabet('CHROMOSOME_I'); | |
| 18 my $header = $db->header('CHROMOSOME_I'); | |
| 19 | |
| 20 # Bioperl-style access | |
| 21 my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); | |
| 22 | |
| 23 my $obj = $db->get_Seq_by_id('CHROMOSOME_I'); | |
| 24 my $seq = $obj->seq; | |
| 25 my $subseq = $obj->subseq(4_000_000 => 4_100_000); | |
| 26 my $length = $obj->length; | |
| 27 # (etc) | |
| 28 | |
| 29 # Bio::SeqIO-style access | |
| 30 my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream; | |
| 31 while (my $seq = $stream->next_seq) { | |
| 32 # Bio::PrimarySeqI stuff | |
| 33 } | |
| 34 | |
| 35 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); | |
| 36 while (my $seq = <$fh>) { | |
| 37 # Bio::PrimarySeqI stuff | |
| 38 } | |
| 39 | |
| 40 # tied hash access | |
| 41 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files'; | |
| 42 print $sequences{'CHROMOSOME_I:1,20000'}; | |
| 43 | |
| 44 =head1 DESCRIPTION | |
| 45 | |
| 46 Bio::DB::Fasta provides indexed access to one or more Fasta files. It | |
| 47 provides random access to each sequence entry, and to subsequences | |
| 48 within each entry, allowing you to retrieve portions of very large | |
| 49 sequences without bringing the entire sequence into memory. | |
| 50 | |
| 51 When you initialize the module, you point it at a single fasta file or | |
| 52 a directory of multiple such files. The first time it is run, the | |
| 53 module generates an index of the contents of the file or directory | |
| 54 using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File, | |
| 55 NDBM_File, and SDBM_File). Thereafter it uses the index file to find | |
| 56 the file and offset for any requested sequence. If one of the source | |
| 57 fasta files is updated, the module reindexes just that one file. (You | |
| 58 can also force reindexing manually). For improved performance, the | |
| 59 module keeps a cache of open filehandles, closing less-recently used | |
| 60 ones when the cache is full. | |
| 61 | |
| 62 The fasta files may contain any combination of nucleotide and protein | |
| 63 sequences; during indexing the module guesses the molecular type. | |
| 64 Entries may have any line length, and different line lengths are | |
| 65 allowed in the same file. However, within a sequence entry, all lines | |
| 66 must be the same length except for the last. | |
| 67 | |
| 68 The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from | |
| 69 the Fasta header. During indexing, you may pass a callback routine to | |
| 70 modify this primary ID. For example, you may wish to extract a | |
| 71 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use. | |
| 72 The original header line can be recovered later. | |
| 73 | |
| 74 This module was developed for use with the C. elegans and human | |
| 75 genomes, and has been tested with sequence segments as large as 20 | |
| 76 megabases. Indexing the C. elegans genome (100 megabases of genomic | |
| 77 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium | |
| 78 laptop. On the same system, average access time for any 200-mer within | |
| 79 the C. elegans genome was E<lt>0.02s. | |
| 80 | |
| 81 =head1 DATABASE CREATION AND INDEXING | |
| 82 | |
| 83 The two constructors for this class are new() and newFh(). The former | |
| 84 creates a Bio::DB::Fasta object which is accessed via method calls. | |
| 85 The latter creates a tied filehandle which can be used Bio::SeqIO | |
| 86 style to fetch sequence objects in a stream fashion. There is also a | |
| 87 tied hash interface. | |
| 88 | |
| 89 =over 4 | |
| 90 | |
| 91 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options]) | |
| 92 | |
| 93 Create a new Bio::DB::Fasta object from the Fasta file or files | |
| 94 indicated by $fasta_path. Indexing will be performed automatically if | |
| 95 needed. If successful, new() will return the database accessor | |
| 96 object. Otherwise it will return undef. | |
| 97 | |
| 98 $fasta_path may be an individual Fasta file, or may refer to a | |
| 99 directory containing one or more of such files. Following the path, | |
| 100 you may pass a series of name=E<gt>value options or a hash with these | |
| 101 same name=E<gt>value pairs. Valid options are: | |
| 102 | |
| 103 Option Name Description Default | |
| 104 ----------- ----------- ------- | |
| 105 | |
| 106 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna} | |
| 107 for searching for Fasta | |
| 108 files in directories. | |
| 109 | |
| 110 -makeid A code subroutine for None | |
| 111 transforming Fasta IDs. | |
| 112 | |
| 113 -maxopen Maximum size of 32 | |
| 114 filehandle cache. | |
| 115 | |
| 116 -debug Turn on status 0 | |
| 117 messages. | |
| 118 | |
| 119 -reindex Force the index to be 0 | |
| 120 rebuilt. | |
| 121 | |
| 122 -dbmargs Additional arguments none | |
| 123 to pass to the DBM | |
| 124 routines when tied | |
| 125 (scalar or array ref). | |
| 126 | |
| 127 -dbmargs can be used to control the format of the index. For example, | |
| 128 you can pass $DB_BTREE to this argument so as to force the IDs to be | |
| 129 sorted and retrieved alphabetically. Note that you must use the same | |
| 130 arguments every time you open the index! | |
| 131 | |
| 132 -reindex can be used to force the index to be recreated from scratch. | |
| 133 | |
| 134 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options]) | |
| 135 | |
| 136 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading | |
| 137 from this filehandle with E<lt>E<gt> will return a stream of sequence objects, | |
| 138 Bio::SeqIO style. | |
| 139 | |
| 140 =back | |
| 141 | |
| 142 The -makeid option gives you a chance to modify sequence IDs during | |
| 143 indexing. The option's value should be a code reference that will | |
| 144 take a scalar argument and return a scalar result, like this: | |
| 145 | |
| 146 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id); | |
| 147 | |
| 148 sub make_my_id { | |
| 149 my $description_line = shift; | |
| 150 # get a new id from the fasta header | |
| 151 return $new_id; | |
| 152 } | |
| 153 | |
| 154 make_my_id() will be called with the full fasta id line (including the | |
| 155 "E<gt>" symbol!). For example: | |
| 156 | |
| 157 >A12345.3 Predicted C. elegans protein egl-2 | |
| 158 | |
| 159 By default, this module will use the regular expression /^E<gt>(\S+)/ | |
| 160 to extract "A12345.3" for use as the ID. If you pass a -makeid | |
| 161 callback, you can extract any portion of this, such as the "egl-2" | |
| 162 symbol. | |
| 163 | |
| 164 The -makeid option is ignored after the index is constructed. | |
| 165 | |
| 166 =head1 OBJECT METHODS | |
| 167 | |
| 168 The following object methods are provided. | |
| 169 | |
| 170 =over 4 | |
| 171 | |
| 172 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop]) | |
| 173 | |
| 174 Return the raw sequence (a string) given an ID and optionally a start | |
| 175 and stop position in the sequence. In the case of DNA sequence, if | |
| 176 $stop is less than $start, then the reverse complement of the sequence | |
| 177 is returned (this violates Bio::Seq conventions). | |
| 178 | |
| 179 For your convenience, subsequences can be indicated with this compound | |
| 180 ID: | |
| 181 | |
| 182 $db->seq("$id:$start,$stop") | |
| 183 | |
| 184 =item $length = $db-E<gt>length($id) | |
| 185 | |
| 186 Return the length of the indicated sequence. | |
| 187 | |
| 188 =item $header = $db-E<gt>header($id) | |
| 189 | |
| 190 Return the header line for the ID, including the initial "E<gt>". | |
| 191 | |
| 192 =item $type = $db-E<gt>alphabet($id) | |
| 193 | |
| 194 Return the molecular type of the indicated sequence. One of "dna", | |
| 195 "rna" or "protein". | |
| 196 | |
| 197 =item $filename = $db-E<gt>file($id) | |
| 198 | |
| 199 Return the name of the file in which the indicated sequence can be | |
| 200 found. | |
| 201 | |
| 202 =item $offset = $db-E<gt>offset($id) | |
| 203 | |
| 204 Return the offset of the indicated sequence from the beginning of the | |
| 205 file in which it is located. The offset points to the beginning of | |
| 206 the sequence, not the beginning of the header line. | |
| 207 | |
| 208 =item $header_length = $db-E<gt>headerlen($id) | |
| 209 | |
| 210 Return the length of the header line for the indicated sequence. | |
| 211 | |
| 212 =item $header_offset = $db-E<gt>header_offset($id) | |
| 213 | |
| 214 Return the offset of the header line for the indicated sequence from | |
| 215 the beginning of the file in which it is located. | |
| 216 | |
| 217 =item $index_name = $db-E<gt>index_name | |
| 218 | |
| 219 Return the path to the index file. | |
| 220 | |
| 221 =item $path = $db-E<gt>path | |
| 222 | |
| 223 Return the path to the Fasta file(s). | |
| 224 | |
| 225 =back | |
| 226 | |
| 227 For BioPerl-style access, the following methods are provided: | |
| 228 | |
| 229 =over 4 | |
| 230 | |
| 231 =item $seq = $db-E<gt>get_Seq_by_id($id) | |
| 232 | |
| 233 Return a Bio::PrimarySeq::Fasta object, which obeys the | |
| 234 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or | |
| 235 protein sequence, call $seq-E<gt>seq(). | |
| 236 | |
| 237 Note that get_Seq_by_id() does not bring the entire sequence into | |
| 238 memory until requested. Internally, the returned object uses the | |
| 239 accessor to generate subsequences as needed. | |
| 240 | |
| 241 =item $seq = $db-E<gt>get_Seq_by_acc($id) | |
| 242 | |
| 243 =item $seq = $db-E<gt>get_Seq_by_primary_id($id) | |
| 244 | |
| 245 These methods all do the same thing as get_Seq_by_id(). | |
| 246 | |
| 247 =item $stream = $db-E<gt>get_PrimarySeq_stream() | |
| 248 | |
| 249 Return a Bio::DB::Fasta::Stream object, which supports a single method | |
| 250 next_seq(). Each call to next_seq() returns a new | |
| 251 Bio::PrimarySeq::Fasta object, until no more sequences remain. | |
| 252 | |
| 253 =back | |
| 254 | |
| 255 See L<Bio::PrimarySeqI> for methods provided by the sequence objects | |
| 256 returned from get_Seq_by_id() and get_PrimarySeq_stream(). | |
| 257 | |
| 258 =head1 TIED INTERFACES | |
| 259 | |
| 260 This module provides two tied interfaces, one which allows you to | |
| 261 treat the sequence database as a hash, and the other which allows you | |
| 262 to treat the database as an I/O stream. | |
| 263 | |
| 264 =head2 Creating a Tied Hash | |
| 265 | |
| 266 The tied hash interface is very straightforward | |
| 267 | |
| 268 =over 4 | |
| 269 | |
| 270 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args] | |
| 271 | |
| 272 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files. | |
| 273 The optional @args list is the same set of named argument/value pairs | |
| 274 used by Bio::DB::Fasta-E<gt>new(). | |
| 275 | |
| 276 If successful, tie() will return the tied object. Otherwise it will | |
| 277 return undef. | |
| 278 | |
| 279 =back | |
| 280 | |
| 281 Once tied, you can use the hash to retrieve an individual sequence by | |
| 282 its ID, like this: | |
| 283 | |
| 284 my $seq = $db{CHROMOSOME_I}; | |
| 285 | |
| 286 You may select a subsequence by appending the comma-separated range to | |
| 287 the sequence ID in the format "$id:$start,$stop". For example, here | |
| 288 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I": | |
| 289 | |
| 290 my $seq = $db{'CHROMOSOME_I:1,1000'}; | |
| 291 | |
| 292 (The regular expression used to parse this format allows sequence IDs | |
| 293 to contain colons.) | |
| 294 | |
| 295 When selecting subsequences, if $start E<gt> stop, then the reverse | |
| 296 complement will be returned for DNA sequences. | |
| 297 | |
| 298 The keys() and values() functions will return the sequence IDs and | |
| 299 their sequences, respectively. In addition, each() can be used to | |
| 300 iterate over the entire data set: | |
| 301 | |
| 302 while (my ($id,$sequence) = each %db) { | |
| 303 print "$id => $sequence\n"; | |
| 304 } | |
| 305 | |
| 306 When dealing with very large sequences, you can avoid bringing them | |
| 307 into memory by calling each() in a scalar context. This returns the | |
| 308 key only. You can then use tied(%db) to recover the Bio::DB::Fasta | |
| 309 object and call its methods. | |
| 310 | |
| 311 while (my $id = each %db) { | |
| 312 print "$id => $db{$sequence:1,100}\n"; | |
| 313 print "$id => ",tied(%db)->length($id),"\n"; | |
| 314 } | |
| 315 | |
| 316 You may, in addition invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied | |
| 317 hash methods directly. | |
| 318 | |
| 319 =over 4 | |
| 320 | |
| 321 =item $id = $db-E<gt>FIRSTKEY | |
| 322 | |
| 323 Return the first ID in the database. | |
| 324 | |
| 325 =item $id = $db-E<gt>NEXTKEY($id) | |
| 326 | |
| 327 Given an ID, return the next ID in sequence. | |
| 328 | |
| 329 =back | |
| 330 | |
| 331 This allows you to write the following iterative loop using just the | |
| 332 object-oriented interface: | |
| 333 | |
| 334 my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); | |
| 335 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) { | |
| 336 # do something with sequence | |
| 337 } | |
| 338 | |
| 339 =head2 Creating a Tied Filehandle | |
| 340 | |
| 341 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from | |
| 342 which you can read Bio::PrimarySeq::Fasta sequence objects | |
| 343 sequentially. The following bit of code will iterate sequentially | |
| 344 over all sequences in the database: | |
| 345 | |
| 346 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); | |
| 347 while (my $seq = <$fh>) { | |
| 348 print $seq->id,' => ',$seq->length,"\n"; | |
| 349 } | |
| 350 | |
| 351 When no more sequences remain to be retrieved, the stream will return | |
| 352 undef. | |
| 353 | |
| 354 =head1 BUGS | |
| 355 | |
| 356 When a sequence is deleted from one of the Fasta files, this deletion | |
| 357 is not detected by the module and removed from the index. As a | |
| 358 result, a "ghost" entry will remain in the index and will return | |
| 359 garbage results if accessed. | |
| 360 | |
| 361 Currently, the only way to accomodate deletions is to rebuild the | |
| 362 entire index, either by deleting it manually, or by passing | |
| 363 -reindex=E<gt>1 to new() when initializing the module. | |
| 364 | |
| 365 =head1 SEE ALSO | |
| 366 | |
| 367 L<bioperl> | |
| 368 | |
| 369 =head1 AUTHOR | |
| 370 | |
| 371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>. | |
| 372 | |
| 373 Copyright (c) 2001 Cold Spring Harbor Laboratory. | |
| 374 | |
| 375 This library is free software; you can redistribute it and/or modify | |
| 376 it under the same terms as Perl itself. See DISCLAIMER.txt for | |
| 377 disclaimers of warranty. | |
| 378 | |
| 379 =cut | |
| 380 | |
| 381 #' | |
| 382 package Bio::DB::Fasta; | |
| 383 | |
| 384 BEGIN { | |
| 385 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File) | |
| 386 } | |
| 387 | |
| 388 use strict; | |
| 389 use IO::File; | |
| 390 use AnyDBM_File; | |
| 391 use Fcntl; | |
| 392 use File::Basename qw(basename dirname); | |
| 393 use Bio::DB::SeqI; | |
| 394 use Bio::Root::Root; | |
| 395 use vars qw($VERSION @ISA); | |
| 396 | |
| 397 @ISA = qw(Bio::DB::SeqI Bio::Root::Root); | |
| 398 | |
| 399 $VERSION = '1.03'; | |
| 400 | |
| 401 *seq = *sequence = \&subseq; | |
| 402 *ids = \&get_all_ids; | |
| 403 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id; | |
| 404 | |
| 405 use constant STRUCT =>'NNnnCa*'; | |
| 406 use constant DNA => 1; | |
| 407 use constant RNA => 2; | |
| 408 use constant PROTEIN => 3; | |
| 409 | |
| 410 # Bio::DB-like object | |
| 411 # providing fast random access to a directory of FASTA files | |
| 412 | |
| 413 =head2 new | |
| 414 | |
| 415 Title : new | |
| 416 Usage : my $db = new Bio::DB::Fasta( $path, @options); | |
| 417 Function: initialize a new Bio::DB::Fasta object | |
| 418 Returns : new Bio::DB::Fasta object | |
| 419 Args : path to dir of fasta files or a single filename | |
| 420 | |
| 421 These are optional arguments to pass in as well. | |
| 422 | |
| 423 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST} | |
| 424 for searching for Fasta | |
| 425 files in directories. | |
| 426 | |
| 427 -makeid A code subroutine for None | |
| 428 transforming Fasta IDs. | |
| 429 | |
| 430 -maxopen Maximum size of 32 | |
| 431 filehandle cache. | |
| 432 | |
| 433 -debug Turn on status 0 | |
| 434 messages. | |
| 435 | |
| 436 -reindex Force the index to be 0 | |
| 437 rebuilt. | |
| 438 | |
| 439 -dbmargs Additional arguments none | |
| 440 to pass to the DBM | |
| 441 routines when tied | |
| 442 (scalar or array ref). | |
| 443 | |
| 444 =cut | |
| 445 | |
| 446 sub new { | |
| 447 my $class = shift; | |
| 448 my $path = shift; | |
| 449 my %opts = @_; | |
| 450 | |
| 451 my $self = bless { debug => $opts{-debug}, | |
| 452 makeid => $opts{-makeid}, | |
| 453 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}', | |
| 454 maxopen => $opts{-maxfh} || 32, | |
| 455 dbmargs => $opts{-dbmargs} || undef, | |
| 456 fhcache => {}, | |
| 457 cacheseq => {}, | |
| 458 curopen => 0, | |
| 459 openseq => 1, | |
| 460 dirname => undef, | |
| 461 offsets => undef, | |
| 462 }, $class; | |
| 463 my ($offsets,$dirname); | |
| 464 | |
| 465 if (-d $path) { | |
| 466 $offsets = $self->index_dir($path,$opts{-reindex}); | |
| 467 $dirname = $path; | |
| 468 } elsif (-f _) { | |
| 469 $offsets = $self->index_file($path,$opts{-reindex}); | |
| 470 $dirname = dirname($path); | |
| 471 } else { | |
| 472 $self->throw( "$path: Invalid file or dirname"); | |
| 473 } | |
| 474 @{$self}{qw(dirname offsets)} = ($dirname,$offsets); | |
| 475 | |
| 476 $self; | |
| 477 } | |
| 478 | |
| 479 =head2 newFh | |
| 480 | |
| 481 Title : newFh | |
| 482 Function: gets a new Fh for a file | |
| 483 Example : internal method | |
| 484 Returns : GLOB | |
| 485 Args : | |
| 486 | |
| 487 =cut | |
| 488 | |
| 489 sub newFh { | |
| 490 my $class = shift; | |
| 491 my $self = $class->new(@_); | |
| 492 require Symbol; | |
| 493 my $fh = Symbol::gensym or return; | |
| 494 tie $$fh,'Bio::DB::Fasta::Stream',$self or return; | |
| 495 $fh; | |
| 496 } | |
| 497 | |
| 498 sub _open_index { | |
| 499 my $self = shift; | |
| 500 my ($index,$write) = @_; | |
| 501 my %offsets; | |
| 502 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY; | |
| 503 my @dbmargs = $self->dbmargs; | |
| 504 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs or $self->throw( "Can't open cache file: $!"); | |
| 505 return \%offsets; | |
| 506 } | |
| 507 | |
| 508 =head2 index_dir | |
| 509 | |
| 510 Title : index_dir | |
| 511 Usage : $db->index_dir($dir) | |
| 512 Function: set the index dir and load all files in the dir | |
| 513 Returns : hashref of seq offsets in each file | |
| 514 Args : dirname, boolean to force a reload of all files | |
| 515 | |
| 516 =cut | |
| 517 | |
| 518 sub index_dir { | |
| 519 my $self = shift; | |
| 520 my $dir = shift; | |
| 521 my $force_reindex = shift; | |
| 522 | |
| 523 # find all fasta files | |
| 524 my @files = glob("$dir/$self->{glob}"); | |
| 525 $self->throw( "no fasta files in $dir") unless @files; | |
| 526 | |
| 527 # get name of index | |
| 528 my $index = $self->index_name($dir,1); | |
| 529 | |
| 530 # if caller has requested reindexing, then unlink | |
| 531 # the index file. | |
| 532 unlink $index if $force_reindex; | |
| 533 | |
| 534 # get the modification time of the index | |
| 535 my $indextime = (stat($index))[9] || 0; | |
| 536 | |
| 537 # get the most recent modification time of any of the contents | |
| 538 my $modtime = 0; | |
| 539 my %modtime; | |
| 540 foreach (@files) { | |
| 541 my $m = (stat($_))[9]; | |
| 542 $modtime{$_} = $m; | |
| 543 $modtime = $m if $modtime < $m; | |
| 544 } | |
| 545 | |
| 546 my $reindex = $force_reindex || $indextime < $modtime; | |
| 547 my $offsets = $self->_open_index($index,$reindex) or return; | |
| 548 $self->{offsets} = $offsets; | |
| 549 | |
| 550 # no indexing needed | |
| 551 return $offsets unless $reindex; | |
| 552 | |
| 553 # otherwise reindex contents of changed files | |
| 554 $self->{indexing} = $index; | |
| 555 foreach (@files) { | |
| 556 next if( defined $indextime && $modtime{$_} <= $indextime); | |
| 557 $self->calculate_offsets($_,$offsets); | |
| 558 } | |
| 559 delete $self->{indexing}; | |
| 560 return $self->{offsets}; | |
| 561 } | |
| 562 | |
| 563 =head2 get_Seq_by_id | |
| 564 | |
| 565 Title : get_Seq_by_id | |
| 566 Usage : my $seq = $db->get_Seq_by_id($id) | |
| 567 Function: Bio::DB::RandomAccessI method implemented | |
| 568 Returns : Bio::PrimarySeqI object | |
| 569 Args : id | |
| 570 | |
| 571 =cut | |
| 572 | |
| 573 sub get_Seq_by_id { | |
| 574 my $self = shift; | |
| 575 my $id = shift; | |
| 576 return Bio::PrimarySeq::Fasta->new($self,$id); | |
| 577 } | |
| 578 | |
| 579 =head2 index_file | |
| 580 | |
| 581 Title : index_file | |
| 582 Usage : $db->index_file($filename) | |
| 583 Function: (re)loads a sequence file and indexes sequences offsets in the file | |
| 584 Returns : seq offsets in the file | |
| 585 Args : filename, | |
| 586 boolean to force reloading a file | |
| 587 | |
| 588 =cut | |
| 589 | |
| 590 | |
| 591 sub index_file { | |
| 592 my $self = shift; | |
| 593 my $file = shift; | |
| 594 my $force_reindex = shift; | |
| 595 | |
| 596 my $index = $self->index_name($file); | |
| 597 # if caller has requested reindexing, then unlink the index | |
| 598 unlink $index if $force_reindex; | |
| 599 | |
| 600 # get the modification time of the index | |
| 601 my $indextime = (stat($index))[9]; | |
| 602 my $modtime = (stat($file))[9]; | |
| 603 | |
| 604 my $reindex = $force_reindex || $indextime < $modtime; | |
| 605 my $offsets = $self->_open_index($index,$reindex) or return; | |
| 606 $self->{offsets} = $offsets; | |
| 607 | |
| 608 return $self->{offsets} unless $reindex; | |
| 609 | |
| 610 $self->{indexing} = $index; | |
| 611 $self->calculate_offsets($file,$offsets); | |
| 612 delete $self->{indexing}; | |
| 613 return $self->{offsets}; | |
| 614 } | |
| 615 | |
| 616 =head2 dbmargs | |
| 617 | |
| 618 Title : dbmargs | |
| 619 Usage : my @args = $db->dbmargs; | |
| 620 Function: gets stored dbm arguments | |
| 621 Returns : array | |
| 622 Args : none | |
| 623 | |
| 624 | |
| 625 =cut | |
| 626 | |
| 627 sub dbmargs { | |
| 628 my $self = shift; | |
| 629 my $args = $self->{dbmargs} or return; | |
| 630 return ref($args) eq 'ARRAY' ? @$args : $args; | |
| 631 } | |
| 632 | |
| 633 =head2 index_name | |
| 634 | |
| 635 Title : index_name | |
| 636 Usage : my $indexname = $db->index_name($path,$isdir); | |
| 637 Function: returns the name of the index for a specific path | |
| 638 Returns : string | |
| 639 Args : path to check, | |
| 640 boolean if it is a dir | |
| 641 | |
| 642 =cut | |
| 643 | |
| 644 sub index_name { | |
| 645 my $self = shift; | |
| 646 my ($path,$isdir) = @_; | |
| 647 unless ($path) { | |
| 648 my $dir = $self->{dirname} or return; | |
| 649 return $self->index_name($dir,-d $dir); | |
| 650 } | |
| 651 return "$path/directory.index" if $isdir; | |
| 652 return "$path.index"; | |
| 653 } | |
| 654 | |
| 655 =head2 calculate_offsets | |
| 656 | |
| 657 Title : calculate_offsets | |
| 658 Usage : $db->calculate_offsets($filename,$offsets); | |
| 659 Function: calculates the sequence offsets in a file based on id | |
| 660 Returns : offset hash for each file | |
| 661 Args : file to process | |
| 662 $offsets - hashref of id to offset storage | |
| 663 | |
| 664 =cut | |
| 665 | |
| 666 sub calculate_offsets { | |
| 667 my $self = shift; | |
| 668 my ($file,$offsets) = @_; | |
| 669 my $base = $self->path2fileno(basename($file)); | |
| 670 | |
| 671 my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!"); | |
| 672 warn "indexing $file\n" if $self->{debug}; | |
| 673 my ($offset,$id,$linelength,$type,$firstline,$count,%offsets); | |
| 674 while (<$fh>) { # don't try this at home | |
| 675 if (/^>(\S+)/) { | |
| 676 print STDERR "indexed $count sequences...\n" | |
| 677 if $self->{debug} && (++$count%1000) == 0; | |
| 678 my $pos = tell($fh); | |
| 679 if ($id) { | |
| 680 my $seqlength = $pos - $offset - length($_) - 1; | |
| 681 $seqlength -= int($seqlength/$linelength); | |
| 682 $offsets->{$id} = $self->_pack($offset,$seqlength, | |
| 683 $linelength,$firstline, | |
| 684 $type,$base); | |
| 685 } | |
| 686 $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1; | |
| 687 ($offset,$firstline,$linelength) = ($pos,length($_),0); | |
| 688 } else { | |
| 689 $linelength ||= length($_); | |
| 690 $type ||= $self->_type($_); | |
| 691 } | |
| 692 } | |
| 693 # deal with last entry | |
| 694 if ($id) { | |
| 695 my $pos = tell($fh); | |
| 696 | |
| 697 # my $seqlength = $pos - $offset - length($_) - 1; | |
| 698 # $_ is always null should not be part of this calculation | |
| 699 my $seqlength = $pos - $offset - 1; | |
| 700 | |
| 701 if ($linelength == 0) { # yet another pesky empty chr_random.fa file | |
| 702 $seqlength = 0; | |
| 703 } else { | |
| 704 $seqlength -= int($seqlength/$linelength); | |
| 705 }; | |
| 706 $offsets->{$id} = $self->_pack($offset,$seqlength, | |
| 707 $linelength,$firstline, | |
| 708 $type,$base); | |
| 709 } | |
| 710 return \%offsets; | |
| 711 } | |
| 712 | |
| 713 =head2 get_all_ids | |
| 714 | |
| 715 Title : get_all_ids | |
| 716 Usage : my @ids = $db->get_all_ids | |
| 717 Function: gets all the stored ids in all indexes | |
| 718 Returns : list of ids | |
| 719 Args : none | |
| 720 | |
| 721 =cut | |
| 722 | |
| 723 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} } | |
| 724 | |
| 725 sub offset { | |
| 726 my $self = shift; | |
| 727 my $id = shift; | |
| 728 my $offset = $self->{offsets}{$id} or return; | |
| 729 ($self->_unpack($offset))[0]; | |
| 730 } | |
| 731 | |
| 732 sub length { | |
| 733 my $self = shift; | |
| 734 my $id = shift; | |
| 735 my $offset = $self->{offsets}{$id} or return; | |
| 736 ($self->_unpack($offset))[1]; | |
| 737 } | |
| 738 | |
| 739 sub linelen { | |
| 740 my $self = shift; | |
| 741 my $id = shift; | |
| 742 my $offset = $self->{offsets}{$id} or return; | |
| 743 ($self->_unpack($offset))[2]; | |
| 744 } | |
| 745 | |
| 746 sub headerlen { | |
| 747 my $self = shift; | |
| 748 my $id = shift; | |
| 749 my $offset = $self->{offsets}{$id} or return; | |
| 750 ($self->_unpack($offset))[3]; | |
| 751 } | |
| 752 | |
| 753 sub alphabet { | |
| 754 my $self = shift; | |
| 755 my $id = shift; | |
| 756 my $offset = $self->{offsets}{$id} or return; | |
| 757 my $type = ($self->_unpack($offset))[4]; | |
| 758 return $type == DNA ? 'dna' | |
| 759 : $type == RNA ? 'rna' | |
| 760 : 'protein'; | |
| 761 | |
| 762 } | |
| 763 | |
| 764 sub path { shift->{dirname} } | |
| 765 | |
| 766 sub header_offset { | |
| 767 my $self = shift; | |
| 768 my $id = shift; | |
| 769 return unless $self->{offsets}{$id}; | |
| 770 return $self->offset($id) - $self->headerlen($id); | |
| 771 } | |
| 772 | |
| 773 sub file { | |
| 774 my $self = shift; | |
| 775 my $id = shift; | |
| 776 my $offset = $self->{offsets}{$id} or return; | |
| 777 $self->fileno2path(($self->_unpack($offset))[5]); | |
| 778 } | |
| 779 | |
| 780 sub fileno2path { | |
| 781 my $self = shift; | |
| 782 my $no = shift; | |
| 783 return $self->{offsets}{"__file_$no"}; | |
| 784 } | |
| 785 | |
| 786 sub path2fileno { | |
| 787 my $self = shift; | |
| 788 my $path = shift; | |
| 789 if ( !defined $self->{offsets}{"__path_$path"} ) { | |
| 790 my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++); | |
| 791 $self->{offsets}{"__file_$fileno"} = $path; | |
| 792 } | |
| 793 return $self->{offsets}{"__path_$path"} | |
| 794 } | |
| 795 | |
| 796 =head2 subseq | |
| 797 | |
| 798 Title : subseq | |
| 799 Usage : $seqdb->subseq($id,$start,$stop); | |
| 800 Function: returns a subseq of a sequence in the db | |
| 801 Returns : subsequence data | |
| 802 Args : id of sequence, starting point, ending point | |
| 803 | |
| 804 =cut | |
| 805 | |
| 806 sub subseq { | |
| 807 my ($self,$id,$start,$stop) = @_; | |
| 808 if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) { | |
| 809 ($id,$start,$stop) = ($1,$2,$3); | |
| 810 $start =~ s/_//g; | |
| 811 $stop =~ s/_//g; | |
| 812 } | |
| 813 $start ||= 1; | |
| 814 $stop ||= $self->length($id); | |
| 815 | |
| 816 my $reversed; | |
| 817 if ($start > $stop) { | |
| 818 ($start,$stop) = ($stop,$start); | |
| 819 $reversed++; | |
| 820 } | |
| 821 | |
| 822 my $data; | |
| 823 | |
| 824 my $fh = $self->fh($id) or return; | |
| 825 my $filestart = $self->caloffset($id,$start); | |
| 826 my $filestop = $self->caloffset($id,$stop); | |
| 827 | |
| 828 seek($fh,$filestart,0); | |
| 829 read($fh,$data,$filestop-$filestart+1); | |
| 830 $data =~ s/\n//g; | |
| 831 if ($reversed) { | |
| 832 $data = reverse $data; | |
| 833 $data =~ tr/gatcGATC/ctagCTAG/; | |
| 834 } | |
| 835 $data; | |
| 836 } | |
| 837 | |
| 838 sub fh { | |
| 839 my $self = shift; | |
| 840 my $id = shift; | |
| 841 my $file = $self->file($id) or return; | |
| 842 $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file"); | |
| 843 } | |
| 844 | |
| 845 sub header { | |
| 846 my $self = shift; | |
| 847 my $id = shift; | |
| 848 my ($offset,$seqlength,$linelength,$firstline,$type,$file) | |
| 849 = $self->_unpack($self->{offsets}{$id}) or return; | |
| 850 $offset -= $firstline; | |
| 851 my $data; | |
| 852 my $fh = $self->fh($id) or return; | |
| 853 seek($fh,$offset,0); | |
| 854 read($fh,$data,$firstline); | |
| 855 chomp $data; | |
| 856 substr($data,0,1) = ''; | |
| 857 $data; | |
| 858 } | |
| 859 | |
| 860 sub caloffset { | |
| 861 my $self = shift; | |
| 862 my $id = shift; | |
| 863 my $a = shift()-1; | |
| 864 my ($offset,$seqlength,$linelength,$firstline,$type,$file) = $self->_unpack($self->{offsets}{$id}); | |
| 865 $a = 0 if $a < 0; | |
| 866 $a = $seqlength-1 if $a >= $seqlength; | |
| 867 $offset + $linelength * int($a/($linelength-1)) + $a % ($linelength-1); | |
| 868 } | |
| 869 | |
| 870 sub fhcache { | |
| 871 my $self = shift; | |
| 872 my $path = shift; | |
| 873 if (!$self->{fhcache}{$path}) { | |
| 874 if ($self->{curopen} >= $self->{maxopen}) { | |
| 875 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}}; | |
| 876 splice(@lru, $self->{maxopen} / 3); | |
| 877 $self->{curopen} -= @lru; | |
| 878 for (@lru) { delete $self->{fhcache}{$_} } | |
| 879 } | |
| 880 $self->{fhcache}{$path} = IO::File->new($path) or return; | |
| 881 $self->{curopen}++; | |
| 882 } | |
| 883 $self->{cacheseq}{$path}++; | |
| 884 $self->{fhcache}{$path} | |
| 885 } | |
| 886 | |
| 887 sub _pack { | |
| 888 shift; | |
| 889 pack STRUCT,@_; | |
| 890 } | |
| 891 | |
| 892 sub _unpack { | |
| 893 shift; | |
| 894 unpack STRUCT,shift; | |
| 895 } | |
| 896 | |
| 897 sub _type { | |
| 898 shift; | |
| 899 local $_ = shift; | |
| 900 return /^[gatcnGATCN*-]+$/ ? DNA | |
| 901 : /^[gaucnGAUCN*-]+$/ ? RNA | |
| 902 : PROTEIN; | |
| 903 } | |
| 904 | |
| 905 =head2 get_PrimarySeq_stream | |
| 906 | |
| 907 Title : get_PrimarySeq_stream | |
| 908 Usage : | |
| 909 Function: | |
| 910 Example : | |
| 911 Returns : | |
| 912 Args : | |
| 913 | |
| 914 | |
| 915 =cut | |
| 916 | |
| 917 sub get_PrimarySeq_stream { | |
| 918 my $self = shift; | |
| 919 return Bio::DB::Fasta::Stream->new($self); | |
| 920 } | |
| 921 | |
| 922 sub TIEHASH { | |
| 923 my $self = shift; | |
| 924 return $self->new(@_); | |
| 925 } | |
| 926 | |
| 927 sub FETCH { | |
| 928 shift->subseq(@_); | |
| 929 } | |
| 930 sub STORE { | |
| 931 shift->throw("Read-only database"); | |
| 932 } | |
| 933 sub DELETE { | |
| 934 shift->throw("Read-only database"); | |
| 935 } | |
| 936 sub CLEAR { | |
| 937 shift->throw("Read-only database"); | |
| 938 } | |
| 939 sub EXISTS { | |
| 940 defined shift->offset(@_); | |
| 941 } | |
| 942 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); } | |
| 943 sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); } | |
| 944 | |
| 945 sub DESTROY { | |
| 946 my $self = shift; | |
| 947 if ($self->{indexing}) { # killed prematurely, so index file is no good! | |
| 948 warn "indexing was interrupted, so unlinking $self->{indexing}"; | |
| 949 unlink $self->{indexing}; | |
| 950 } | |
| 951 } | |
| 952 | |
| 953 #------------------------------------------------------------- | |
| 954 # Bio::PrimarySeqI compatibility | |
| 955 # | |
| 956 package Bio::PrimarySeq::Fasta; | |
| 957 use overload '""' => 'display_id'; | |
| 958 | |
| 959 use vars '@ISA'; | |
| 960 eval { | |
| 961 require Bio::PrimarySeqI; | |
| 962 require Bio::Root::Root; | |
| 963 } && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI')); | |
| 964 | |
| 965 sub new { | |
| 966 my $class = shift; | |
| 967 $class = ref($class) if ref $class; | |
| 968 my ($db,$id,$start,$stop) = @_; | |
| 969 return bless { db => $db, | |
| 970 id => $id, | |
| 971 start => $start || 1, | |
| 972 stop => $stop || $db->length($id) | |
| 973 },$class; | |
| 974 } | |
| 975 | |
| 976 sub seq { | |
| 977 my $self = shift; | |
| 978 return $self->{db}->seq($self->{id},$self->{start},$self->{stop}); | |
| 979 } | |
| 980 | |
| 981 sub subseq { | |
| 982 my $self = shift; | |
| 983 my ($start,$stop) = @_; | |
| 984 $self->throw("Stop cannot be smaller than start") unless $start <= $stop; | |
| 985 return $self->{start} <= $self->{stop} ? $self->new($self->{db}, | |
| 986 $self->{id}, | |
| 987 $self->{start}+$start-1, | |
| 988 $self->{start}+$stop-1) | |
| 989 : $self->new($self->{db}, | |
| 990 $self->{id}, | |
| 991 $self->{start}-($start-1), | |
| 992 $self->{start}-($stop-1) | |
| 993 ); | |
| 994 | |
| 995 } | |
| 996 | |
| 997 sub display_id { | |
| 998 my $self = shift; | |
| 999 return $self->{id}; | |
| 1000 } | |
| 1001 | |
| 1002 sub accession_number { | |
| 1003 my $self = shift; | |
| 1004 return "unknown"; | |
| 1005 } | |
| 1006 | |
| 1007 sub primary_id { | |
| 1008 my $self = shift; | |
| 1009 return overload::StrVal($self); | |
| 1010 } | |
| 1011 | |
| 1012 sub can_call_new { return 0 } | |
| 1013 | |
| 1014 sub alphabet { | |
| 1015 my $self = shift; | |
| 1016 return $self->{db}->alphabet($self->{id}); | |
| 1017 } | |
| 1018 | |
| 1019 sub revcom { | |
| 1020 my $self = shift; | |
| 1021 return $self->new(@{$self}{'db','id','stop','start'}); | |
| 1022 } | |
| 1023 | |
| 1024 sub length { | |
| 1025 my $self = shift; | |
| 1026 return $self->{db}->length($self->{id}); | |
| 1027 } | |
| 1028 | |
| 1029 sub desc { | |
| 1030 my $self = shift; | |
| 1031 return ''; | |
| 1032 } | |
| 1033 | |
| 1034 #------------------------------------------------------------- | |
| 1035 # stream-based access to the database | |
| 1036 # | |
| 1037 package Bio::DB::Fasta::Stream; | |
| 1038 use Tie::Handle; | |
| 1039 use vars qw(@ISA); | |
| 1040 @ISA = qw(Tie::Handle); | |
| 1041 eval { | |
| 1042 require Bio::DB::SeqI; | |
| 1043 } && (push @ISA,'Bio::DB::SeqI'); | |
| 1044 | |
| 1045 | |
| 1046 sub new { | |
| 1047 my $class = shift; | |
| 1048 my $db = shift; | |
| 1049 my $key = $db->FIRSTKEY; | |
| 1050 return bless { db=>$db,key=>$key },$class; | |
| 1051 } | |
| 1052 | |
| 1053 sub next_seq { | |
| 1054 my $self = shift; | |
| 1055 my ($key,$db) = @{$self}{'key','db'}; | |
| 1056 my $value = $db->get_Seq_by_id($key); | |
| 1057 $self->{key} = $db->NEXTKEY($key); | |
| 1058 $value; | |
| 1059 } | |
| 1060 | |
| 1061 sub TIEHANDLE { | |
| 1062 my $class = shift; | |
| 1063 my $db = shift; | |
| 1064 return $class->new($db); | |
| 1065 } | |
| 1066 sub READLINE { | |
| 1067 my $self = shift; | |
| 1068 $self->next_seq; | |
| 1069 } | |
| 1070 | |
| 1071 1; | |
| 1072 | |
| 1073 __END__ | |
| 1074 |
