0
|
1 =head1 NAME
|
|
2
|
|
3 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
|
|
4
|
|
5 =head1 SYNOPSIS
|
|
6
|
|
7 use Bio::DB::Fasta;
|
|
8
|
|
9 # create database from directory of fasta files
|
|
10 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
|
|
11
|
|
12 # simple access (for those without Bioperl)
|
|
13 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
|
|
14 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
|
|
15 my @ids = $db->ids;
|
|
16 my $length = $db->length('CHROMOSOME_I');
|
|
17 my $alphabet = $db->alphabet('CHROMOSOME_I');
|
|
18 my $header = $db->header('CHROMOSOME_I');
|
|
19
|
|
20 # Bioperl-style access
|
|
21 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
|
|
22
|
|
23 my $obj = $db->get_Seq_by_id('CHROMOSOME_I');
|
|
24 my $seq = $obj->seq;
|
|
25 my $subseq = $obj->subseq(4_000_000 => 4_100_000);
|
|
26 my $length = $obj->length;
|
|
27 # (etc)
|
|
28
|
|
29 # Bio::SeqIO-style access
|
|
30 my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream;
|
|
31 while (my $seq = $stream->next_seq) {
|
|
32 # Bio::PrimarySeqI stuff
|
|
33 }
|
|
34
|
|
35 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
|
|
36 while (my $seq = <$fh>) {
|
|
37 # Bio::PrimarySeqI stuff
|
|
38 }
|
|
39
|
|
40 # tied hash access
|
|
41 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
|
|
42 print $sequences{'CHROMOSOME_I:1,20000'};
|
|
43
|
|
44 =head1 DESCRIPTION
|
|
45
|
|
46 Bio::DB::Fasta provides indexed access to one or more Fasta files. It
|
|
47 provides random access to each sequence entry, and to subsequences
|
|
48 within each entry, allowing you to retrieve portions of very large
|
|
49 sequences without bringing the entire sequence into memory.
|
|
50
|
|
51 When you initialize the module, you point it at a single fasta file or
|
|
52 a directory of multiple such files. The first time it is run, the
|
|
53 module generates an index of the contents of the file or directory
|
|
54 using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File,
|
|
55 NDBM_File, and SDBM_File). Thereafter it uses the index file to find
|
|
56 the file and offset for any requested sequence. If one of the source
|
|
57 fasta files is updated, the module reindexes just that one file. (You
|
|
58 can also force reindexing manually). For improved performance, the
|
|
59 module keeps a cache of open filehandles, closing less-recently used
|
|
60 ones when the cache is full.
|
|
61
|
|
62 The fasta files may contain any combination of nucleotide and protein
|
|
63 sequences; during indexing the module guesses the molecular type.
|
|
64 Entries may have any line length, and different line lengths are
|
|
65 allowed in the same file. However, within a sequence entry, all lines
|
|
66 must be the same length except for the last.
|
|
67
|
|
68 The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from
|
|
69 the Fasta header. During indexing, you may pass a callback routine to
|
|
70 modify this primary ID. For example, you may wish to extract a
|
|
71 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
|
|
72 The original header line can be recovered later.
|
|
73
|
|
74 This module was developed for use with the C. elegans and human
|
|
75 genomes, and has been tested with sequence segments as large as 20
|
|
76 megabases. Indexing the C. elegans genome (100 megabases of genomic
|
|
77 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
|
|
78 laptop. On the same system, average access time for any 200-mer within
|
|
79 the C. elegans genome was E<lt>0.02s.
|
|
80
|
|
81 =head1 DATABASE CREATION AND INDEXING
|
|
82
|
|
83 The two constructors for this class are new() and newFh(). The former
|
|
84 creates a Bio::DB::Fasta object which is accessed via method calls.
|
|
85 The latter creates a tied filehandle which can be used Bio::SeqIO
|
|
86 style to fetch sequence objects in a stream fashion. There is also a
|
|
87 tied hash interface.
|
|
88
|
|
89 =over 4
|
|
90
|
|
91 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
|
|
92
|
|
93 Create a new Bio::DB::Fasta object from the Fasta file or files
|
|
94 indicated by $fasta_path. Indexing will be performed automatically if
|
|
95 needed. If successful, new() will return the database accessor
|
|
96 object. Otherwise it will return undef.
|
|
97
|
|
98 $fasta_path may be an individual Fasta file, or may refer to a
|
|
99 directory containing one or more of such files. Following the path,
|
|
100 you may pass a series of name=E<gt>value options or a hash with these
|
|
101 same name=E<gt>value pairs. Valid options are:
|
|
102
|
|
103 Option Name Description Default
|
|
104 ----------- ----------- -------
|
|
105
|
|
106 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna}
|
|
107 for searching for Fasta
|
|
108 files in directories.
|
|
109
|
|
110 -makeid A code subroutine for None
|
|
111 transforming Fasta IDs.
|
|
112
|
|
113 -maxopen Maximum size of 32
|
|
114 filehandle cache.
|
|
115
|
|
116 -debug Turn on status 0
|
|
117 messages.
|
|
118
|
|
119 -reindex Force the index to be 0
|
|
120 rebuilt.
|
|
121
|
|
122 -dbmargs Additional arguments none
|
|
123 to pass to the DBM
|
|
124 routines when tied
|
|
125 (scalar or array ref).
|
|
126
|
|
127 -dbmargs can be used to control the format of the index. For example,
|
|
128 you can pass $DB_BTREE to this argument so as to force the IDs to be
|
|
129 sorted and retrieved alphabetically. Note that you must use the same
|
|
130 arguments every time you open the index!
|
|
131
|
|
132 -reindex can be used to force the index to be recreated from scratch.
|
|
133
|
|
134 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
|
|
135
|
|
136 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading
|
|
137 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
|
|
138 Bio::SeqIO style.
|
|
139
|
|
140 =back
|
|
141
|
|
142 The -makeid option gives you a chance to modify sequence IDs during
|
|
143 indexing. The option's value should be a code reference that will
|
|
144 take a scalar argument and return a scalar result, like this:
|
|
145
|
|
146 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
|
|
147
|
|
148 sub make_my_id {
|
|
149 my $description_line = shift;
|
|
150 # get a new id from the fasta header
|
|
151 return $new_id;
|
|
152 }
|
|
153
|
|
154 make_my_id() will be called with the full fasta id line (including the
|
|
155 "E<gt>" symbol!). For example:
|
|
156
|
|
157 >A12345.3 Predicted C. elegans protein egl-2
|
|
158
|
|
159 By default, this module will use the regular expression /^E<gt>(\S+)/
|
|
160 to extract "A12345.3" for use as the ID. If you pass a -makeid
|
|
161 callback, you can extract any portion of this, such as the "egl-2"
|
|
162 symbol.
|
|
163
|
|
164 The -makeid option is ignored after the index is constructed.
|
|
165
|
|
166 =head1 OBJECT METHODS
|
|
167
|
|
168 The following object methods are provided.
|
|
169
|
|
170 =over 4
|
|
171
|
|
172 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
|
|
173
|
|
174 Return the raw sequence (a string) given an ID and optionally a start
|
|
175 and stop position in the sequence. In the case of DNA sequence, if
|
|
176 $stop is less than $start, then the reverse complement of the sequence
|
|
177 is returned (this violates Bio::Seq conventions).
|
|
178
|
|
179 For your convenience, subsequences can be indicated with this compound
|
|
180 ID:
|
|
181
|
|
182 $db->seq("$id:$start,$stop")
|
|
183
|
|
184 =item $length = $db-E<gt>length($id)
|
|
185
|
|
186 Return the length of the indicated sequence.
|
|
187
|
|
188 =item $header = $db-E<gt>header($id)
|
|
189
|
|
190 Return the header line for the ID, including the initial "E<gt>".
|
|
191
|
|
192 =item $type = $db-E<gt>alphabet($id)
|
|
193
|
|
194 Return the molecular type of the indicated sequence. One of "dna",
|
|
195 "rna" or "protein".
|
|
196
|
|
197 =item $filename = $db-E<gt>file($id)
|
|
198
|
|
199 Return the name of the file in which the indicated sequence can be
|
|
200 found.
|
|
201
|
|
202 =item $offset = $db-E<gt>offset($id)
|
|
203
|
|
204 Return the offset of the indicated sequence from the beginning of the
|
|
205 file in which it is located. The offset points to the beginning of
|
|
206 the sequence, not the beginning of the header line.
|
|
207
|
|
208 =item $header_length = $db-E<gt>headerlen($id)
|
|
209
|
|
210 Return the length of the header line for the indicated sequence.
|
|
211
|
|
212 =item $header_offset = $db-E<gt>header_offset($id)
|
|
213
|
|
214 Return the offset of the header line for the indicated sequence from
|
|
215 the beginning of the file in which it is located.
|
|
216
|
|
217 =item $index_name = $db-E<gt>index_name
|
|
218
|
|
219 Return the path to the index file.
|
|
220
|
|
221 =item $path = $db-E<gt>path
|
|
222
|
|
223 Return the path to the Fasta file(s).
|
|
224
|
|
225 =back
|
|
226
|
|
227 For BioPerl-style access, the following methods are provided:
|
|
228
|
|
229 =over 4
|
|
230
|
|
231 =item $seq = $db-E<gt>get_Seq_by_id($id)
|
|
232
|
|
233 Return a Bio::PrimarySeq::Fasta object, which obeys the
|
|
234 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or
|
|
235 protein sequence, call $seq-E<gt>seq().
|
|
236
|
|
237 Note that get_Seq_by_id() does not bring the entire sequence into
|
|
238 memory until requested. Internally, the returned object uses the
|
|
239 accessor to generate subsequences as needed.
|
|
240
|
|
241 =item $seq = $db-E<gt>get_Seq_by_acc($id)
|
|
242
|
|
243 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
|
|
244
|
|
245 These methods all do the same thing as get_Seq_by_id().
|
|
246
|
|
247 =item $stream = $db-E<gt>get_PrimarySeq_stream()
|
|
248
|
|
249 Return a Bio::DB::Fasta::Stream object, which supports a single method
|
|
250 next_seq(). Each call to next_seq() returns a new
|
|
251 Bio::PrimarySeq::Fasta object, until no more sequences remain.
|
|
252
|
|
253 =back
|
|
254
|
|
255 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
|
|
256 returned from get_Seq_by_id() and get_PrimarySeq_stream().
|
|
257
|
|
258 =head1 TIED INTERFACES
|
|
259
|
|
260 This module provides two tied interfaces, one which allows you to
|
|
261 treat the sequence database as a hash, and the other which allows you
|
|
262 to treat the database as an I/O stream.
|
|
263
|
|
264 =head2 Creating a Tied Hash
|
|
265
|
|
266 The tied hash interface is very straightforward
|
|
267
|
|
268 =over 4
|
|
269
|
|
270 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
|
|
271
|
|
272 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
|
|
273 The optional @args list is the same set of named argument/value pairs
|
|
274 used by Bio::DB::Fasta-E<gt>new().
|
|
275
|
|
276 If successful, tie() will return the tied object. Otherwise it will
|
|
277 return undef.
|
|
278
|
|
279 =back
|
|
280
|
|
281 Once tied, you can use the hash to retrieve an individual sequence by
|
|
282 its ID, like this:
|
|
283
|
|
284 my $seq = $db{CHROMOSOME_I};
|
|
285
|
|
286 You may select a subsequence by appending the comma-separated range to
|
|
287 the sequence ID in the format "$id:$start,$stop". For example, here
|
|
288 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
|
|
289
|
|
290 my $seq = $db{'CHROMOSOME_I:1,1000'};
|
|
291
|
|
292 (The regular expression used to parse this format allows sequence IDs
|
|
293 to contain colons.)
|
|
294
|
|
295 When selecting subsequences, if $start E<gt> stop, then the reverse
|
|
296 complement will be returned for DNA sequences.
|
|
297
|
|
298 The keys() and values() functions will return the sequence IDs and
|
|
299 their sequences, respectively. In addition, each() can be used to
|
|
300 iterate over the entire data set:
|
|
301
|
|
302 while (my ($id,$sequence) = each %db) {
|
|
303 print "$id => $sequence\n";
|
|
304 }
|
|
305
|
|
306 When dealing with very large sequences, you can avoid bringing them
|
|
307 into memory by calling each() in a scalar context. This returns the
|
|
308 key only. You can then use tied(%db) to recover the Bio::DB::Fasta
|
|
309 object and call its methods.
|
|
310
|
|
311 while (my $id = each %db) {
|
|
312 print "$id => $db{$sequence:1,100}\n";
|
|
313 print "$id => ",tied(%db)->length($id),"\n";
|
|
314 }
|
|
315
|
|
316 You may, in addition invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied
|
|
317 hash methods directly.
|
|
318
|
|
319 =over 4
|
|
320
|
|
321 =item $id = $db-E<gt>FIRSTKEY
|
|
322
|
|
323 Return the first ID in the database.
|
|
324
|
|
325 =item $id = $db-E<gt>NEXTKEY($id)
|
|
326
|
|
327 Given an ID, return the next ID in sequence.
|
|
328
|
|
329 =back
|
|
330
|
|
331 This allows you to write the following iterative loop using just the
|
|
332 object-oriented interface:
|
|
333
|
|
334 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
|
|
335 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
|
|
336 # do something with sequence
|
|
337 }
|
|
338
|
|
339 =head2 Creating a Tied Filehandle
|
|
340
|
|
341 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
|
|
342 which you can read Bio::PrimarySeq::Fasta sequence objects
|
|
343 sequentially. The following bit of code will iterate sequentially
|
|
344 over all sequences in the database:
|
|
345
|
|
346 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
|
|
347 while (my $seq = <$fh>) {
|
|
348 print $seq->id,' => ',$seq->length,"\n";
|
|
349 }
|
|
350
|
|
351 When no more sequences remain to be retrieved, the stream will return
|
|
352 undef.
|
|
353
|
|
354 =head1 BUGS
|
|
355
|
|
356 When a sequence is deleted from one of the Fasta files, this deletion
|
|
357 is not detected by the module and removed from the index. As a
|
|
358 result, a "ghost" entry will remain in the index and will return
|
|
359 garbage results if accessed.
|
|
360
|
|
361 Currently, the only way to accomodate deletions is to rebuild the
|
|
362 entire index, either by deleting it manually, or by passing
|
|
363 -reindex=E<gt>1 to new() when initializing the module.
|
|
364
|
|
365 =head1 SEE ALSO
|
|
366
|
|
367 L<bioperl>
|
|
368
|
|
369 =head1 AUTHOR
|
|
370
|
|
371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
|
|
372
|
|
373 Copyright (c) 2001 Cold Spring Harbor Laboratory.
|
|
374
|
|
375 This library is free software; you can redistribute it and/or modify
|
|
376 it under the same terms as Perl itself. See DISCLAIMER.txt for
|
|
377 disclaimers of warranty.
|
|
378
|
|
379 =cut
|
|
380
|
|
381 #'
|
|
382 package Bio::DB::Fasta;
|
|
383
|
|
384 BEGIN {
|
|
385 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
|
|
386 }
|
|
387
|
|
388 use strict;
|
|
389 use IO::File;
|
|
390 use AnyDBM_File;
|
|
391 use Fcntl;
|
|
392 use File::Basename qw(basename dirname);
|
|
393 use Bio::DB::SeqI;
|
|
394 use Bio::Root::Root;
|
|
395 use vars qw($VERSION @ISA);
|
|
396
|
|
397 @ISA = qw(Bio::DB::SeqI Bio::Root::Root);
|
|
398
|
|
399 $VERSION = '1.03';
|
|
400
|
|
401 *seq = *sequence = \&subseq;
|
|
402 *ids = \&get_all_ids;
|
|
403 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
|
|
404
|
|
405 use constant STRUCT =>'NNnnCa*';
|
|
406 use constant DNA => 1;
|
|
407 use constant RNA => 2;
|
|
408 use constant PROTEIN => 3;
|
|
409
|
|
410 # Bio::DB-like object
|
|
411 # providing fast random access to a directory of FASTA files
|
|
412
|
|
413 =head2 new
|
|
414
|
|
415 Title : new
|
|
416 Usage : my $db = new Bio::DB::Fasta( $path, @options);
|
|
417 Function: initialize a new Bio::DB::Fasta object
|
|
418 Returns : new Bio::DB::Fasta object
|
|
419 Args : path to dir of fasta files or a single filename
|
|
420
|
|
421 These are optional arguments to pass in as well.
|
|
422
|
|
423 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST}
|
|
424 for searching for Fasta
|
|
425 files in directories.
|
|
426
|
|
427 -makeid A code subroutine for None
|
|
428 transforming Fasta IDs.
|
|
429
|
|
430 -maxopen Maximum size of 32
|
|
431 filehandle cache.
|
|
432
|
|
433 -debug Turn on status 0
|
|
434 messages.
|
|
435
|
|
436 -reindex Force the index to be 0
|
|
437 rebuilt.
|
|
438
|
|
439 -dbmargs Additional arguments none
|
|
440 to pass to the DBM
|
|
441 routines when tied
|
|
442 (scalar or array ref).
|
|
443
|
|
444 =cut
|
|
445
|
|
446 sub new {
|
|
447 my $class = shift;
|
|
448 my $path = shift;
|
|
449 my %opts = @_;
|
|
450
|
|
451 my $self = bless { debug => $opts{-debug},
|
|
452 makeid => $opts{-makeid},
|
|
453 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}',
|
|
454 maxopen => $opts{-maxfh} || 32,
|
|
455 dbmargs => $opts{-dbmargs} || undef,
|
|
456 fhcache => {},
|
|
457 cacheseq => {},
|
|
458 curopen => 0,
|
|
459 openseq => 1,
|
|
460 dirname => undef,
|
|
461 offsets => undef,
|
|
462 }, $class;
|
|
463 my ($offsets,$dirname);
|
|
464
|
|
465 if (-d $path) {
|
|
466 $offsets = $self->index_dir($path,$opts{-reindex});
|
|
467 $dirname = $path;
|
|
468 } elsif (-f _) {
|
|
469 $offsets = $self->index_file($path,$opts{-reindex});
|
|
470 $dirname = dirname($path);
|
|
471 } else {
|
|
472 $self->throw( "$path: Invalid file or dirname");
|
|
473 }
|
|
474 @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
|
|
475
|
|
476 $self;
|
|
477 }
|
|
478
|
|
479 =head2 newFh
|
|
480
|
|
481 Title : newFh
|
|
482 Function: gets a new Fh for a file
|
|
483 Example : internal method
|
|
484 Returns : GLOB
|
|
485 Args :
|
|
486
|
|
487 =cut
|
|
488
|
|
489 sub newFh {
|
|
490 my $class = shift;
|
|
491 my $self = $class->new(@_);
|
|
492 require Symbol;
|
|
493 my $fh = Symbol::gensym or return;
|
|
494 tie $$fh,'Bio::DB::Fasta::Stream',$self or return;
|
|
495 $fh;
|
|
496 }
|
|
497
|
|
498 sub _open_index {
|
|
499 my $self = shift;
|
|
500 my ($index,$write) = @_;
|
|
501 my %offsets;
|
|
502 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
|
|
503 my @dbmargs = $self->dbmargs;
|
|
504 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs or $self->throw( "Can't open cache file: $!");
|
|
505 return \%offsets;
|
|
506 }
|
|
507
|
|
508 =head2 index_dir
|
|
509
|
|
510 Title : index_dir
|
|
511 Usage : $db->index_dir($dir)
|
|
512 Function: set the index dir and load all files in the dir
|
|
513 Returns : hashref of seq offsets in each file
|
|
514 Args : dirname, boolean to force a reload of all files
|
|
515
|
|
516 =cut
|
|
517
|
|
518 sub index_dir {
|
|
519 my $self = shift;
|
|
520 my $dir = shift;
|
|
521 my $force_reindex = shift;
|
|
522
|
|
523 # find all fasta files
|
|
524 my @files = glob("$dir/$self->{glob}");
|
|
525 $self->throw( "no fasta files in $dir") unless @files;
|
|
526
|
|
527 # get name of index
|
|
528 my $index = $self->index_name($dir,1);
|
|
529
|
|
530 # if caller has requested reindexing, then unlink
|
|
531 # the index file.
|
|
532 unlink $index if $force_reindex;
|
|
533
|
|
534 # get the modification time of the index
|
|
535 my $indextime = (stat($index))[9] || 0;
|
|
536
|
|
537 # get the most recent modification time of any of the contents
|
|
538 my $modtime = 0;
|
|
539 my %modtime;
|
|
540 foreach (@files) {
|
|
541 my $m = (stat($_))[9];
|
|
542 $modtime{$_} = $m;
|
|
543 $modtime = $m if $modtime < $m;
|
|
544 }
|
|
545
|
|
546 my $reindex = $force_reindex || $indextime < $modtime;
|
|
547 my $offsets = $self->_open_index($index,$reindex) or return;
|
|
548 $self->{offsets} = $offsets;
|
|
549
|
|
550 # no indexing needed
|
|
551 return $offsets unless $reindex;
|
|
552
|
|
553 # otherwise reindex contents of changed files
|
|
554 $self->{indexing} = $index;
|
|
555 foreach (@files) {
|
|
556 next if( defined $indextime && $modtime{$_} <= $indextime);
|
|
557 $self->calculate_offsets($_,$offsets);
|
|
558 }
|
|
559 delete $self->{indexing};
|
|
560 return $self->{offsets};
|
|
561 }
|
|
562
|
|
563 =head2 get_Seq_by_id
|
|
564
|
|
565 Title : get_Seq_by_id
|
|
566 Usage : my $seq = $db->get_Seq_by_id($id)
|
|
567 Function: Bio::DB::RandomAccessI method implemented
|
|
568 Returns : Bio::PrimarySeqI object
|
|
569 Args : id
|
|
570
|
|
571 =cut
|
|
572
|
|
573 sub get_Seq_by_id {
|
|
574 my $self = shift;
|
|
575 my $id = shift;
|
|
576 return Bio::PrimarySeq::Fasta->new($self,$id);
|
|
577 }
|
|
578
|
|
579 =head2 index_file
|
|
580
|
|
581 Title : index_file
|
|
582 Usage : $db->index_file($filename)
|
|
583 Function: (re)loads a sequence file and indexes sequences offsets in the file
|
|
584 Returns : seq offsets in the file
|
|
585 Args : filename,
|
|
586 boolean to force reloading a file
|
|
587
|
|
588 =cut
|
|
589
|
|
590
|
|
591 sub index_file {
|
|
592 my $self = shift;
|
|
593 my $file = shift;
|
|
594 my $force_reindex = shift;
|
|
595
|
|
596 my $index = $self->index_name($file);
|
|
597 # if caller has requested reindexing, then unlink the index
|
|
598 unlink $index if $force_reindex;
|
|
599
|
|
600 # get the modification time of the index
|
|
601 my $indextime = (stat($index))[9];
|
|
602 my $modtime = (stat($file))[9];
|
|
603
|
|
604 my $reindex = $force_reindex || $indextime < $modtime;
|
|
605 my $offsets = $self->_open_index($index,$reindex) or return;
|
|
606 $self->{offsets} = $offsets;
|
|
607
|
|
608 return $self->{offsets} unless $reindex;
|
|
609
|
|
610 $self->{indexing} = $index;
|
|
611 $self->calculate_offsets($file,$offsets);
|
|
612 delete $self->{indexing};
|
|
613 return $self->{offsets};
|
|
614 }
|
|
615
|
|
616 =head2 dbmargs
|
|
617
|
|
618 Title : dbmargs
|
|
619 Usage : my @args = $db->dbmargs;
|
|
620 Function: gets stored dbm arguments
|
|
621 Returns : array
|
|
622 Args : none
|
|
623
|
|
624
|
|
625 =cut
|
|
626
|
|
627 sub dbmargs {
|
|
628 my $self = shift;
|
|
629 my $args = $self->{dbmargs} or return;
|
|
630 return ref($args) eq 'ARRAY' ? @$args : $args;
|
|
631 }
|
|
632
|
|
633 =head2 index_name
|
|
634
|
|
635 Title : index_name
|
|
636 Usage : my $indexname = $db->index_name($path,$isdir);
|
|
637 Function: returns the name of the index for a specific path
|
|
638 Returns : string
|
|
639 Args : path to check,
|
|
640 boolean if it is a dir
|
|
641
|
|
642 =cut
|
|
643
|
|
644 sub index_name {
|
|
645 my $self = shift;
|
|
646 my ($path,$isdir) = @_;
|
|
647 unless ($path) {
|
|
648 my $dir = $self->{dirname} or return;
|
|
649 return $self->index_name($dir,-d $dir);
|
|
650 }
|
|
651 return "$path/directory.index" if $isdir;
|
|
652 return "$path.index";
|
|
653 }
|
|
654
|
|
655 =head2 calculate_offsets
|
|
656
|
|
657 Title : calculate_offsets
|
|
658 Usage : $db->calculate_offsets($filename,$offsets);
|
|
659 Function: calculates the sequence offsets in a file based on id
|
|
660 Returns : offset hash for each file
|
|
661 Args : file to process
|
|
662 $offsets - hashref of id to offset storage
|
|
663
|
|
664 =cut
|
|
665
|
|
666 sub calculate_offsets {
|
|
667 my $self = shift;
|
|
668 my ($file,$offsets) = @_;
|
|
669 my $base = $self->path2fileno(basename($file));
|
|
670
|
|
671 my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
|
|
672 warn "indexing $file\n" if $self->{debug};
|
|
673 my ($offset,$id,$linelength,$type,$firstline,$count,%offsets);
|
|
674 while (<$fh>) { # don't try this at home
|
|
675 if (/^>(\S+)/) {
|
|
676 print STDERR "indexed $count sequences...\n"
|
|
677 if $self->{debug} && (++$count%1000) == 0;
|
|
678 my $pos = tell($fh);
|
|
679 if ($id) {
|
|
680 my $seqlength = $pos - $offset - length($_) - 1;
|
|
681 $seqlength -= int($seqlength/$linelength);
|
|
682 $offsets->{$id} = $self->_pack($offset,$seqlength,
|
|
683 $linelength,$firstline,
|
|
684 $type,$base);
|
|
685 }
|
|
686 $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
|
|
687 ($offset,$firstline,$linelength) = ($pos,length($_),0);
|
|
688 } else {
|
|
689 $linelength ||= length($_);
|
|
690 $type ||= $self->_type($_);
|
|
691 }
|
|
692 }
|
|
693 # deal with last entry
|
|
694 if ($id) {
|
|
695 my $pos = tell($fh);
|
|
696
|
|
697 # my $seqlength = $pos - $offset - length($_) - 1;
|
|
698 # $_ is always null should not be part of this calculation
|
|
699 my $seqlength = $pos - $offset - 1;
|
|
700
|
|
701 if ($linelength == 0) { # yet another pesky empty chr_random.fa file
|
|
702 $seqlength = 0;
|
|
703 } else {
|
|
704 $seqlength -= int($seqlength/$linelength);
|
|
705 };
|
|
706 $offsets->{$id} = $self->_pack($offset,$seqlength,
|
|
707 $linelength,$firstline,
|
|
708 $type,$base);
|
|
709 }
|
|
710 return \%offsets;
|
|
711 }
|
|
712
|
|
713 =head2 get_all_ids
|
|
714
|
|
715 Title : get_all_ids
|
|
716 Usage : my @ids = $db->get_all_ids
|
|
717 Function: gets all the stored ids in all indexes
|
|
718 Returns : list of ids
|
|
719 Args : none
|
|
720
|
|
721 =cut
|
|
722
|
|
723 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} }
|
|
724
|
|
725 sub offset {
|
|
726 my $self = shift;
|
|
727 my $id = shift;
|
|
728 my $offset = $self->{offsets}{$id} or return;
|
|
729 ($self->_unpack($offset))[0];
|
|
730 }
|
|
731
|
|
732 sub length {
|
|
733 my $self = shift;
|
|
734 my $id = shift;
|
|
735 my $offset = $self->{offsets}{$id} or return;
|
|
736 ($self->_unpack($offset))[1];
|
|
737 }
|
|
738
|
|
739 sub linelen {
|
|
740 my $self = shift;
|
|
741 my $id = shift;
|
|
742 my $offset = $self->{offsets}{$id} or return;
|
|
743 ($self->_unpack($offset))[2];
|
|
744 }
|
|
745
|
|
746 sub headerlen {
|
|
747 my $self = shift;
|
|
748 my $id = shift;
|
|
749 my $offset = $self->{offsets}{$id} or return;
|
|
750 ($self->_unpack($offset))[3];
|
|
751 }
|
|
752
|
|
753 sub alphabet {
|
|
754 my $self = shift;
|
|
755 my $id = shift;
|
|
756 my $offset = $self->{offsets}{$id} or return;
|
|
757 my $type = ($self->_unpack($offset))[4];
|
|
758 return $type == DNA ? 'dna'
|
|
759 : $type == RNA ? 'rna'
|
|
760 : 'protein';
|
|
761
|
|
762 }
|
|
763
|
|
764 sub path { shift->{dirname} }
|
|
765
|
|
766 sub header_offset {
|
|
767 my $self = shift;
|
|
768 my $id = shift;
|
|
769 return unless $self->{offsets}{$id};
|
|
770 return $self->offset($id) - $self->headerlen($id);
|
|
771 }
|
|
772
|
|
773 sub file {
|
|
774 my $self = shift;
|
|
775 my $id = shift;
|
|
776 my $offset = $self->{offsets}{$id} or return;
|
|
777 $self->fileno2path(($self->_unpack($offset))[5]);
|
|
778 }
|
|
779
|
|
780 sub fileno2path {
|
|
781 my $self = shift;
|
|
782 my $no = shift;
|
|
783 return $self->{offsets}{"__file_$no"};
|
|
784 }
|
|
785
|
|
786 sub path2fileno {
|
|
787 my $self = shift;
|
|
788 my $path = shift;
|
|
789 if ( !defined $self->{offsets}{"__path_$path"} ) {
|
|
790 my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
|
|
791 $self->{offsets}{"__file_$fileno"} = $path;
|
|
792 }
|
|
793 return $self->{offsets}{"__path_$path"}
|
|
794 }
|
|
795
|
|
796 =head2 subseq
|
|
797
|
|
798 Title : subseq
|
|
799 Usage : $seqdb->subseq($id,$start,$stop);
|
|
800 Function: returns a subseq of a sequence in the db
|
|
801 Returns : subsequence data
|
|
802 Args : id of sequence, starting point, ending point
|
|
803
|
|
804 =cut
|
|
805
|
|
806 sub subseq {
|
|
807 my ($self,$id,$start,$stop) = @_;
|
|
808 if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) {
|
|
809 ($id,$start,$stop) = ($1,$2,$3);
|
|
810 $start =~ s/_//g;
|
|
811 $stop =~ s/_//g;
|
|
812 }
|
|
813 $start ||= 1;
|
|
814 $stop ||= $self->length($id);
|
|
815
|
|
816 my $reversed;
|
|
817 if ($start > $stop) {
|
|
818 ($start,$stop) = ($stop,$start);
|
|
819 $reversed++;
|
|
820 }
|
|
821
|
|
822 my $data;
|
|
823
|
|
824 my $fh = $self->fh($id) or return;
|
|
825 my $filestart = $self->caloffset($id,$start);
|
|
826 my $filestop = $self->caloffset($id,$stop);
|
|
827
|
|
828 seek($fh,$filestart,0);
|
|
829 read($fh,$data,$filestop-$filestart+1);
|
|
830 $data =~ s/\n//g;
|
|
831 if ($reversed) {
|
|
832 $data = reverse $data;
|
|
833 $data =~ tr/gatcGATC/ctagCTAG/;
|
|
834 }
|
|
835 $data;
|
|
836 }
|
|
837
|
|
838 sub fh {
|
|
839 my $self = shift;
|
|
840 my $id = shift;
|
|
841 my $file = $self->file($id) or return;
|
|
842 $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file");
|
|
843 }
|
|
844
|
|
845 sub header {
|
|
846 my $self = shift;
|
|
847 my $id = shift;
|
|
848 my ($offset,$seqlength,$linelength,$firstline,$type,$file)
|
|
849 = $self->_unpack($self->{offsets}{$id}) or return;
|
|
850 $offset -= $firstline;
|
|
851 my $data;
|
|
852 my $fh = $self->fh($id) or return;
|
|
853 seek($fh,$offset,0);
|
|
854 read($fh,$data,$firstline);
|
|
855 chomp $data;
|
|
856 substr($data,0,1) = '';
|
|
857 $data;
|
|
858 }
|
|
859
|
|
860 sub caloffset {
|
|
861 my $self = shift;
|
|
862 my $id = shift;
|
|
863 my $a = shift()-1;
|
|
864 my ($offset,$seqlength,$linelength,$firstline,$type,$file) = $self->_unpack($self->{offsets}{$id});
|
|
865 $a = 0 if $a < 0;
|
|
866 $a = $seqlength-1 if $a >= $seqlength;
|
|
867 $offset + $linelength * int($a/($linelength-1)) + $a % ($linelength-1);
|
|
868 }
|
|
869
|
|
870 sub fhcache {
|
|
871 my $self = shift;
|
|
872 my $path = shift;
|
|
873 if (!$self->{fhcache}{$path}) {
|
|
874 if ($self->{curopen} >= $self->{maxopen}) {
|
|
875 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}};
|
|
876 splice(@lru, $self->{maxopen} / 3);
|
|
877 $self->{curopen} -= @lru;
|
|
878 for (@lru) { delete $self->{fhcache}{$_} }
|
|
879 }
|
|
880 $self->{fhcache}{$path} = IO::File->new($path) or return;
|
|
881 $self->{curopen}++;
|
|
882 }
|
|
883 $self->{cacheseq}{$path}++;
|
|
884 $self->{fhcache}{$path}
|
|
885 }
|
|
886
|
|
887 sub _pack {
|
|
888 shift;
|
|
889 pack STRUCT,@_;
|
|
890 }
|
|
891
|
|
892 sub _unpack {
|
|
893 shift;
|
|
894 unpack STRUCT,shift;
|
|
895 }
|
|
896
|
|
897 sub _type {
|
|
898 shift;
|
|
899 local $_ = shift;
|
|
900 return /^[gatcnGATCN*-]+$/ ? DNA
|
|
901 : /^[gaucnGAUCN*-]+$/ ? RNA
|
|
902 : PROTEIN;
|
|
903 }
|
|
904
|
|
905 =head2 get_PrimarySeq_stream
|
|
906
|
|
907 Title : get_PrimarySeq_stream
|
|
908 Usage :
|
|
909 Function:
|
|
910 Example :
|
|
911 Returns :
|
|
912 Args :
|
|
913
|
|
914
|
|
915 =cut
|
|
916
|
|
917 sub get_PrimarySeq_stream {
|
|
918 my $self = shift;
|
|
919 return Bio::DB::Fasta::Stream->new($self);
|
|
920 }
|
|
921
|
|
922 sub TIEHASH {
|
|
923 my $self = shift;
|
|
924 return $self->new(@_);
|
|
925 }
|
|
926
|
|
927 sub FETCH {
|
|
928 shift->subseq(@_);
|
|
929 }
|
|
930 sub STORE {
|
|
931 shift->throw("Read-only database");
|
|
932 }
|
|
933 sub DELETE {
|
|
934 shift->throw("Read-only database");
|
|
935 }
|
|
936 sub CLEAR {
|
|
937 shift->throw("Read-only database");
|
|
938 }
|
|
939 sub EXISTS {
|
|
940 defined shift->offset(@_);
|
|
941 }
|
|
942 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
|
|
943 sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); }
|
|
944
|
|
945 sub DESTROY {
|
|
946 my $self = shift;
|
|
947 if ($self->{indexing}) { # killed prematurely, so index file is no good!
|
|
948 warn "indexing was interrupted, so unlinking $self->{indexing}";
|
|
949 unlink $self->{indexing};
|
|
950 }
|
|
951 }
|
|
952
|
|
953 #-------------------------------------------------------------
|
|
954 # Bio::PrimarySeqI compatibility
|
|
955 #
|
|
956 package Bio::PrimarySeq::Fasta;
|
|
957 use overload '""' => 'display_id';
|
|
958
|
|
959 use vars '@ISA';
|
|
960 eval {
|
|
961 require Bio::PrimarySeqI;
|
|
962 require Bio::Root::Root;
|
|
963 } && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI'));
|
|
964
|
|
965 sub new {
|
|
966 my $class = shift;
|
|
967 $class = ref($class) if ref $class;
|
|
968 my ($db,$id,$start,$stop) = @_;
|
|
969 return bless { db => $db,
|
|
970 id => $id,
|
|
971 start => $start || 1,
|
|
972 stop => $stop || $db->length($id)
|
|
973 },$class;
|
|
974 }
|
|
975
|
|
976 sub seq {
|
|
977 my $self = shift;
|
|
978 return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
|
|
979 }
|
|
980
|
|
981 sub subseq {
|
|
982 my $self = shift;
|
|
983 my ($start,$stop) = @_;
|
|
984 $self->throw("Stop cannot be smaller than start") unless $start <= $stop;
|
|
985 return $self->{start} <= $self->{stop} ? $self->new($self->{db},
|
|
986 $self->{id},
|
|
987 $self->{start}+$start-1,
|
|
988 $self->{start}+$stop-1)
|
|
989 : $self->new($self->{db},
|
|
990 $self->{id},
|
|
991 $self->{start}-($start-1),
|
|
992 $self->{start}-($stop-1)
|
|
993 );
|
|
994
|
|
995 }
|
|
996
|
|
997 sub display_id {
|
|
998 my $self = shift;
|
|
999 return $self->{id};
|
|
1000 }
|
|
1001
|
|
1002 sub accession_number {
|
|
1003 my $self = shift;
|
|
1004 return "unknown";
|
|
1005 }
|
|
1006
|
|
1007 sub primary_id {
|
|
1008 my $self = shift;
|
|
1009 return overload::StrVal($self);
|
|
1010 }
|
|
1011
|
|
1012 sub can_call_new { return 0 }
|
|
1013
|
|
1014 sub alphabet {
|
|
1015 my $self = shift;
|
|
1016 return $self->{db}->alphabet($self->{id});
|
|
1017 }
|
|
1018
|
|
1019 sub revcom {
|
|
1020 my $self = shift;
|
|
1021 return $self->new(@{$self}{'db','id','stop','start'});
|
|
1022 }
|
|
1023
|
|
1024 sub length {
|
|
1025 my $self = shift;
|
|
1026 return $self->{db}->length($self->{id});
|
|
1027 }
|
|
1028
|
|
1029 sub desc {
|
|
1030 my $self = shift;
|
|
1031 return '';
|
|
1032 }
|
|
1033
|
|
1034 #-------------------------------------------------------------
|
|
1035 # stream-based access to the database
|
|
1036 #
|
|
1037 package Bio::DB::Fasta::Stream;
|
|
1038 use Tie::Handle;
|
|
1039 use vars qw(@ISA);
|
|
1040 @ISA = qw(Tie::Handle);
|
|
1041 eval {
|
|
1042 require Bio::DB::SeqI;
|
|
1043 } && (push @ISA,'Bio::DB::SeqI');
|
|
1044
|
|
1045
|
|
1046 sub new {
|
|
1047 my $class = shift;
|
|
1048 my $db = shift;
|
|
1049 my $key = $db->FIRSTKEY;
|
|
1050 return bless { db=>$db,key=>$key },$class;
|
|
1051 }
|
|
1052
|
|
1053 sub next_seq {
|
|
1054 my $self = shift;
|
|
1055 my ($key,$db) = @{$self}{'key','db'};
|
|
1056 my $value = $db->get_Seq_by_id($key);
|
|
1057 $self->{key} = $db->NEXTKEY($key);
|
|
1058 $value;
|
|
1059 }
|
|
1060
|
|
1061 sub TIEHANDLE {
|
|
1062 my $class = shift;
|
|
1063 my $db = shift;
|
|
1064 return $class->new($db);
|
|
1065 }
|
|
1066 sub READLINE {
|
|
1067 my $self = shift;
|
|
1068 $self->next_seq;
|
|
1069 }
|
|
1070
|
|
1071 1;
|
|
1072
|
|
1073 __END__
|
|
1074
|