Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/DB/Fasta.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 NAME | |
2 | |
3 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files | |
4 | |
5 =head1 SYNOPSIS | |
6 | |
7 use Bio::DB::Fasta; | |
8 | |
9 # create database from directory of fasta files | |
10 my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); | |
11 | |
12 # simple access (for those without Bioperl) | |
13 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000); | |
14 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000); | |
15 my @ids = $db->ids; | |
16 my $length = $db->length('CHROMOSOME_I'); | |
17 my $alphabet = $db->alphabet('CHROMOSOME_I'); | |
18 my $header = $db->header('CHROMOSOME_I'); | |
19 | |
20 # Bioperl-style access | |
21 my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); | |
22 | |
23 my $obj = $db->get_Seq_by_id('CHROMOSOME_I'); | |
24 my $seq = $obj->seq; | |
25 my $subseq = $obj->subseq(4_000_000 => 4_100_000); | |
26 my $length = $obj->length; | |
27 # (etc) | |
28 | |
29 # Bio::SeqIO-style access | |
30 my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream; | |
31 while (my $seq = $stream->next_seq) { | |
32 # Bio::PrimarySeqI stuff | |
33 } | |
34 | |
35 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); | |
36 while (my $seq = <$fh>) { | |
37 # Bio::PrimarySeqI stuff | |
38 } | |
39 | |
40 # tied hash access | |
41 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files'; | |
42 print $sequences{'CHROMOSOME_I:1,20000'}; | |
43 | |
44 =head1 DESCRIPTION | |
45 | |
46 Bio::DB::Fasta provides indexed access to one or more Fasta files. It | |
47 provides random access to each sequence entry, and to subsequences | |
48 within each entry, allowing you to retrieve portions of very large | |
49 sequences without bringing the entire sequence into memory. | |
50 | |
51 When you initialize the module, you point it at a single fasta file or | |
52 a directory of multiple such files. The first time it is run, the | |
53 module generates an index of the contents of the file or directory | |
54 using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File, | |
55 NDBM_File, and SDBM_File). Thereafter it uses the index file to find | |
56 the file and offset for any requested sequence. If one of the source | |
57 fasta files is updated, the module reindexes just that one file. (You | |
58 can also force reindexing manually). For improved performance, the | |
59 module keeps a cache of open filehandles, closing less-recently used | |
60 ones when the cache is full. | |
61 | |
62 The fasta files may contain any combination of nucleotide and protein | |
63 sequences; during indexing the module guesses the molecular type. | |
64 Entries may have any line length, and different line lengths are | |
65 allowed in the same file. However, within a sequence entry, all lines | |
66 must be the same length except for the last. | |
67 | |
68 The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from | |
69 the Fasta header. During indexing, you may pass a callback routine to | |
70 modify this primary ID. For example, you may wish to extract a | |
71 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use. | |
72 The original header line can be recovered later. | |
73 | |
74 This module was developed for use with the C. elegans and human | |
75 genomes, and has been tested with sequence segments as large as 20 | |
76 megabases. Indexing the C. elegans genome (100 megabases of genomic | |
77 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium | |
78 laptop. On the same system, average access time for any 200-mer within | |
79 the C. elegans genome was E<lt>0.02s. | |
80 | |
81 =head1 DATABASE CREATION AND INDEXING | |
82 | |
83 The two constructors for this class are new() and newFh(). The former | |
84 creates a Bio::DB::Fasta object which is accessed via method calls. | |
85 The latter creates a tied filehandle which can be used Bio::SeqIO | |
86 style to fetch sequence objects in a stream fashion. There is also a | |
87 tied hash interface. | |
88 | |
89 =over 4 | |
90 | |
91 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options]) | |
92 | |
93 Create a new Bio::DB::Fasta object from the Fasta file or files | |
94 indicated by $fasta_path. Indexing will be performed automatically if | |
95 needed. If successful, new() will return the database accessor | |
96 object. Otherwise it will return undef. | |
97 | |
98 $fasta_path may be an individual Fasta file, or may refer to a | |
99 directory containing one or more of such files. Following the path, | |
100 you may pass a series of name=E<gt>value options or a hash with these | |
101 same name=E<gt>value pairs. Valid options are: | |
102 | |
103 Option Name Description Default | |
104 ----------- ----------- ------- | |
105 | |
106 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna} | |
107 for searching for Fasta | |
108 files in directories. | |
109 | |
110 -makeid A code subroutine for None | |
111 transforming Fasta IDs. | |
112 | |
113 -maxopen Maximum size of 32 | |
114 filehandle cache. | |
115 | |
116 -debug Turn on status 0 | |
117 messages. | |
118 | |
119 -reindex Force the index to be 0 | |
120 rebuilt. | |
121 | |
122 -dbmargs Additional arguments none | |
123 to pass to the DBM | |
124 routines when tied | |
125 (scalar or array ref). | |
126 | |
127 -dbmargs can be used to control the format of the index. For example, | |
128 you can pass $DB_BTREE to this argument so as to force the IDs to be | |
129 sorted and retrieved alphabetically. Note that you must use the same | |
130 arguments every time you open the index! | |
131 | |
132 -reindex can be used to force the index to be recreated from scratch. | |
133 | |
134 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options]) | |
135 | |
136 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading | |
137 from this filehandle with E<lt>E<gt> will return a stream of sequence objects, | |
138 Bio::SeqIO style. | |
139 | |
140 =back | |
141 | |
142 The -makeid option gives you a chance to modify sequence IDs during | |
143 indexing. The option's value should be a code reference that will | |
144 take a scalar argument and return a scalar result, like this: | |
145 | |
146 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id); | |
147 | |
148 sub make_my_id { | |
149 my $description_line = shift; | |
150 # get a new id from the fasta header | |
151 return $new_id; | |
152 } | |
153 | |
154 make_my_id() will be called with the full fasta id line (including the | |
155 "E<gt>" symbol!). For example: | |
156 | |
157 >A12345.3 Predicted C. elegans protein egl-2 | |
158 | |
159 By default, this module will use the regular expression /^E<gt>(\S+)/ | |
160 to extract "A12345.3" for use as the ID. If you pass a -makeid | |
161 callback, you can extract any portion of this, such as the "egl-2" | |
162 symbol. | |
163 | |
164 The -makeid option is ignored after the index is constructed. | |
165 | |
166 =head1 OBJECT METHODS | |
167 | |
168 The following object methods are provided. | |
169 | |
170 =over 4 | |
171 | |
172 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop]) | |
173 | |
174 Return the raw sequence (a string) given an ID and optionally a start | |
175 and stop position in the sequence. In the case of DNA sequence, if | |
176 $stop is less than $start, then the reverse complement of the sequence | |
177 is returned (this violates Bio::Seq conventions). | |
178 | |
179 For your convenience, subsequences can be indicated with this compound | |
180 ID: | |
181 | |
182 $db->seq("$id:$start,$stop") | |
183 | |
184 =item $length = $db-E<gt>length($id) | |
185 | |
186 Return the length of the indicated sequence. | |
187 | |
188 =item $header = $db-E<gt>header($id) | |
189 | |
190 Return the header line for the ID, including the initial "E<gt>". | |
191 | |
192 =item $type = $db-E<gt>alphabet($id) | |
193 | |
194 Return the molecular type of the indicated sequence. One of "dna", | |
195 "rna" or "protein". | |
196 | |
197 =item $filename = $db-E<gt>file($id) | |
198 | |
199 Return the name of the file in which the indicated sequence can be | |
200 found. | |
201 | |
202 =item $offset = $db-E<gt>offset($id) | |
203 | |
204 Return the offset of the indicated sequence from the beginning of the | |
205 file in which it is located. The offset points to the beginning of | |
206 the sequence, not the beginning of the header line. | |
207 | |
208 =item $header_length = $db-E<gt>headerlen($id) | |
209 | |
210 Return the length of the header line for the indicated sequence. | |
211 | |
212 =item $header_offset = $db-E<gt>header_offset($id) | |
213 | |
214 Return the offset of the header line for the indicated sequence from | |
215 the beginning of the file in which it is located. | |
216 | |
217 =item $index_name = $db-E<gt>index_name | |
218 | |
219 Return the path to the index file. | |
220 | |
221 =item $path = $db-E<gt>path | |
222 | |
223 Return the path to the Fasta file(s). | |
224 | |
225 =back | |
226 | |
227 For BioPerl-style access, the following methods are provided: | |
228 | |
229 =over 4 | |
230 | |
231 =item $seq = $db-E<gt>get_Seq_by_id($id) | |
232 | |
233 Return a Bio::PrimarySeq::Fasta object, which obeys the | |
234 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or | |
235 protein sequence, call $seq-E<gt>seq(). | |
236 | |
237 Note that get_Seq_by_id() does not bring the entire sequence into | |
238 memory until requested. Internally, the returned object uses the | |
239 accessor to generate subsequences as needed. | |
240 | |
241 =item $seq = $db-E<gt>get_Seq_by_acc($id) | |
242 | |
243 =item $seq = $db-E<gt>get_Seq_by_primary_id($id) | |
244 | |
245 These methods all do the same thing as get_Seq_by_id(). | |
246 | |
247 =item $stream = $db-E<gt>get_PrimarySeq_stream() | |
248 | |
249 Return a Bio::DB::Fasta::Stream object, which supports a single method | |
250 next_seq(). Each call to next_seq() returns a new | |
251 Bio::PrimarySeq::Fasta object, until no more sequences remain. | |
252 | |
253 =back | |
254 | |
255 See L<Bio::PrimarySeqI> for methods provided by the sequence objects | |
256 returned from get_Seq_by_id() and get_PrimarySeq_stream(). | |
257 | |
258 =head1 TIED INTERFACES | |
259 | |
260 This module provides two tied interfaces, one which allows you to | |
261 treat the sequence database as a hash, and the other which allows you | |
262 to treat the database as an I/O stream. | |
263 | |
264 =head2 Creating a Tied Hash | |
265 | |
266 The tied hash interface is very straightforward | |
267 | |
268 =over 4 | |
269 | |
270 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args] | |
271 | |
272 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files. | |
273 The optional @args list is the same set of named argument/value pairs | |
274 used by Bio::DB::Fasta-E<gt>new(). | |
275 | |
276 If successful, tie() will return the tied object. Otherwise it will | |
277 return undef. | |
278 | |
279 =back | |
280 | |
281 Once tied, you can use the hash to retrieve an individual sequence by | |
282 its ID, like this: | |
283 | |
284 my $seq = $db{CHROMOSOME_I}; | |
285 | |
286 You may select a subsequence by appending the comma-separated range to | |
287 the sequence ID in the format "$id:$start,$stop". For example, here | |
288 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I": | |
289 | |
290 my $seq = $db{'CHROMOSOME_I:1,1000'}; | |
291 | |
292 (The regular expression used to parse this format allows sequence IDs | |
293 to contain colons.) | |
294 | |
295 When selecting subsequences, if $start E<gt> stop, then the reverse | |
296 complement will be returned for DNA sequences. | |
297 | |
298 The keys() and values() functions will return the sequence IDs and | |
299 their sequences, respectively. In addition, each() can be used to | |
300 iterate over the entire data set: | |
301 | |
302 while (my ($id,$sequence) = each %db) { | |
303 print "$id => $sequence\n"; | |
304 } | |
305 | |
306 When dealing with very large sequences, you can avoid bringing them | |
307 into memory by calling each() in a scalar context. This returns the | |
308 key only. You can then use tied(%db) to recover the Bio::DB::Fasta | |
309 object and call its methods. | |
310 | |
311 while (my $id = each %db) { | |
312 print "$id => $db{$sequence:1,100}\n"; | |
313 print "$id => ",tied(%db)->length($id),"\n"; | |
314 } | |
315 | |
316 You may, in addition invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied | |
317 hash methods directly. | |
318 | |
319 =over 4 | |
320 | |
321 =item $id = $db-E<gt>FIRSTKEY | |
322 | |
323 Return the first ID in the database. | |
324 | |
325 =item $id = $db-E<gt>NEXTKEY($id) | |
326 | |
327 Given an ID, return the next ID in sequence. | |
328 | |
329 =back | |
330 | |
331 This allows you to write the following iterative loop using just the | |
332 object-oriented interface: | |
333 | |
334 my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); | |
335 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) { | |
336 # do something with sequence | |
337 } | |
338 | |
339 =head2 Creating a Tied Filehandle | |
340 | |
341 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from | |
342 which you can read Bio::PrimarySeq::Fasta sequence objects | |
343 sequentially. The following bit of code will iterate sequentially | |
344 over all sequences in the database: | |
345 | |
346 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); | |
347 while (my $seq = <$fh>) { | |
348 print $seq->id,' => ',$seq->length,"\n"; | |
349 } | |
350 | |
351 When no more sequences remain to be retrieved, the stream will return | |
352 undef. | |
353 | |
354 =head1 BUGS | |
355 | |
356 When a sequence is deleted from one of the Fasta files, this deletion | |
357 is not detected by the module and removed from the index. As a | |
358 result, a "ghost" entry will remain in the index and will return | |
359 garbage results if accessed. | |
360 | |
361 Currently, the only way to accomodate deletions is to rebuild the | |
362 entire index, either by deleting it manually, or by passing | |
363 -reindex=E<gt>1 to new() when initializing the module. | |
364 | |
365 =head1 SEE ALSO | |
366 | |
367 L<bioperl> | |
368 | |
369 =head1 AUTHOR | |
370 | |
371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>. | |
372 | |
373 Copyright (c) 2001 Cold Spring Harbor Laboratory. | |
374 | |
375 This library is free software; you can redistribute it and/or modify | |
376 it under the same terms as Perl itself. See DISCLAIMER.txt for | |
377 disclaimers of warranty. | |
378 | |
379 =cut | |
380 | |
381 #' | |
382 package Bio::DB::Fasta; | |
383 | |
384 BEGIN { | |
385 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File) | |
386 } | |
387 | |
388 use strict; | |
389 use IO::File; | |
390 use AnyDBM_File; | |
391 use Fcntl; | |
392 use File::Basename qw(basename dirname); | |
393 use Bio::DB::SeqI; | |
394 use Bio::Root::Root; | |
395 use vars qw($VERSION @ISA); | |
396 | |
397 @ISA = qw(Bio::DB::SeqI Bio::Root::Root); | |
398 | |
399 $VERSION = '1.03'; | |
400 | |
401 *seq = *sequence = \&subseq; | |
402 *ids = \&get_all_ids; | |
403 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id; | |
404 | |
405 use constant STRUCT =>'NNnnCa*'; | |
406 use constant DNA => 1; | |
407 use constant RNA => 2; | |
408 use constant PROTEIN => 3; | |
409 | |
410 # Bio::DB-like object | |
411 # providing fast random access to a directory of FASTA files | |
412 | |
413 =head2 new | |
414 | |
415 Title : new | |
416 Usage : my $db = new Bio::DB::Fasta( $path, @options); | |
417 Function: initialize a new Bio::DB::Fasta object | |
418 Returns : new Bio::DB::Fasta object | |
419 Args : path to dir of fasta files or a single filename | |
420 | |
421 These are optional arguments to pass in as well. | |
422 | |
423 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST} | |
424 for searching for Fasta | |
425 files in directories. | |
426 | |
427 -makeid A code subroutine for None | |
428 transforming Fasta IDs. | |
429 | |
430 -maxopen Maximum size of 32 | |
431 filehandle cache. | |
432 | |
433 -debug Turn on status 0 | |
434 messages. | |
435 | |
436 -reindex Force the index to be 0 | |
437 rebuilt. | |
438 | |
439 -dbmargs Additional arguments none | |
440 to pass to the DBM | |
441 routines when tied | |
442 (scalar or array ref). | |
443 | |
444 =cut | |
445 | |
446 sub new { | |
447 my $class = shift; | |
448 my $path = shift; | |
449 my %opts = @_; | |
450 | |
451 my $self = bless { debug => $opts{-debug}, | |
452 makeid => $opts{-makeid}, | |
453 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}', | |
454 maxopen => $opts{-maxfh} || 32, | |
455 dbmargs => $opts{-dbmargs} || undef, | |
456 fhcache => {}, | |
457 cacheseq => {}, | |
458 curopen => 0, | |
459 openseq => 1, | |
460 dirname => undef, | |
461 offsets => undef, | |
462 }, $class; | |
463 my ($offsets,$dirname); | |
464 | |
465 if (-d $path) { | |
466 $offsets = $self->index_dir($path,$opts{-reindex}); | |
467 $dirname = $path; | |
468 } elsif (-f _) { | |
469 $offsets = $self->index_file($path,$opts{-reindex}); | |
470 $dirname = dirname($path); | |
471 } else { | |
472 $self->throw( "$path: Invalid file or dirname"); | |
473 } | |
474 @{$self}{qw(dirname offsets)} = ($dirname,$offsets); | |
475 | |
476 $self; | |
477 } | |
478 | |
479 =head2 newFh | |
480 | |
481 Title : newFh | |
482 Function: gets a new Fh for a file | |
483 Example : internal method | |
484 Returns : GLOB | |
485 Args : | |
486 | |
487 =cut | |
488 | |
489 sub newFh { | |
490 my $class = shift; | |
491 my $self = $class->new(@_); | |
492 require Symbol; | |
493 my $fh = Symbol::gensym or return; | |
494 tie $$fh,'Bio::DB::Fasta::Stream',$self or return; | |
495 $fh; | |
496 } | |
497 | |
498 sub _open_index { | |
499 my $self = shift; | |
500 my ($index,$write) = @_; | |
501 my %offsets; | |
502 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY; | |
503 my @dbmargs = $self->dbmargs; | |
504 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs or $self->throw( "Can't open cache file: $!"); | |
505 return \%offsets; | |
506 } | |
507 | |
508 =head2 index_dir | |
509 | |
510 Title : index_dir | |
511 Usage : $db->index_dir($dir) | |
512 Function: set the index dir and load all files in the dir | |
513 Returns : hashref of seq offsets in each file | |
514 Args : dirname, boolean to force a reload of all files | |
515 | |
516 =cut | |
517 | |
518 sub index_dir { | |
519 my $self = shift; | |
520 my $dir = shift; | |
521 my $force_reindex = shift; | |
522 | |
523 # find all fasta files | |
524 my @files = glob("$dir/$self->{glob}"); | |
525 $self->throw( "no fasta files in $dir") unless @files; | |
526 | |
527 # get name of index | |
528 my $index = $self->index_name($dir,1); | |
529 | |
530 # if caller has requested reindexing, then unlink | |
531 # the index file. | |
532 unlink $index if $force_reindex; | |
533 | |
534 # get the modification time of the index | |
535 my $indextime = (stat($index))[9] || 0; | |
536 | |
537 # get the most recent modification time of any of the contents | |
538 my $modtime = 0; | |
539 my %modtime; | |
540 foreach (@files) { | |
541 my $m = (stat($_))[9]; | |
542 $modtime{$_} = $m; | |
543 $modtime = $m if $modtime < $m; | |
544 } | |
545 | |
546 my $reindex = $force_reindex || $indextime < $modtime; | |
547 my $offsets = $self->_open_index($index,$reindex) or return; | |
548 $self->{offsets} = $offsets; | |
549 | |
550 # no indexing needed | |
551 return $offsets unless $reindex; | |
552 | |
553 # otherwise reindex contents of changed files | |
554 $self->{indexing} = $index; | |
555 foreach (@files) { | |
556 next if( defined $indextime && $modtime{$_} <= $indextime); | |
557 $self->calculate_offsets($_,$offsets); | |
558 } | |
559 delete $self->{indexing}; | |
560 return $self->{offsets}; | |
561 } | |
562 | |
563 =head2 get_Seq_by_id | |
564 | |
565 Title : get_Seq_by_id | |
566 Usage : my $seq = $db->get_Seq_by_id($id) | |
567 Function: Bio::DB::RandomAccessI method implemented | |
568 Returns : Bio::PrimarySeqI object | |
569 Args : id | |
570 | |
571 =cut | |
572 | |
573 sub get_Seq_by_id { | |
574 my $self = shift; | |
575 my $id = shift; | |
576 return Bio::PrimarySeq::Fasta->new($self,$id); | |
577 } | |
578 | |
579 =head2 index_file | |
580 | |
581 Title : index_file | |
582 Usage : $db->index_file($filename) | |
583 Function: (re)loads a sequence file and indexes sequences offsets in the file | |
584 Returns : seq offsets in the file | |
585 Args : filename, | |
586 boolean to force reloading a file | |
587 | |
588 =cut | |
589 | |
590 | |
591 sub index_file { | |
592 my $self = shift; | |
593 my $file = shift; | |
594 my $force_reindex = shift; | |
595 | |
596 my $index = $self->index_name($file); | |
597 # if caller has requested reindexing, then unlink the index | |
598 unlink $index if $force_reindex; | |
599 | |
600 # get the modification time of the index | |
601 my $indextime = (stat($index))[9]; | |
602 my $modtime = (stat($file))[9]; | |
603 | |
604 my $reindex = $force_reindex || $indextime < $modtime; | |
605 my $offsets = $self->_open_index($index,$reindex) or return; | |
606 $self->{offsets} = $offsets; | |
607 | |
608 return $self->{offsets} unless $reindex; | |
609 | |
610 $self->{indexing} = $index; | |
611 $self->calculate_offsets($file,$offsets); | |
612 delete $self->{indexing}; | |
613 return $self->{offsets}; | |
614 } | |
615 | |
616 =head2 dbmargs | |
617 | |
618 Title : dbmargs | |
619 Usage : my @args = $db->dbmargs; | |
620 Function: gets stored dbm arguments | |
621 Returns : array | |
622 Args : none | |
623 | |
624 | |
625 =cut | |
626 | |
627 sub dbmargs { | |
628 my $self = shift; | |
629 my $args = $self->{dbmargs} or return; | |
630 return ref($args) eq 'ARRAY' ? @$args : $args; | |
631 } | |
632 | |
633 =head2 index_name | |
634 | |
635 Title : index_name | |
636 Usage : my $indexname = $db->index_name($path,$isdir); | |
637 Function: returns the name of the index for a specific path | |
638 Returns : string | |
639 Args : path to check, | |
640 boolean if it is a dir | |
641 | |
642 =cut | |
643 | |
644 sub index_name { | |
645 my $self = shift; | |
646 my ($path,$isdir) = @_; | |
647 unless ($path) { | |
648 my $dir = $self->{dirname} or return; | |
649 return $self->index_name($dir,-d $dir); | |
650 } | |
651 return "$path/directory.index" if $isdir; | |
652 return "$path.index"; | |
653 } | |
654 | |
655 =head2 calculate_offsets | |
656 | |
657 Title : calculate_offsets | |
658 Usage : $db->calculate_offsets($filename,$offsets); | |
659 Function: calculates the sequence offsets in a file based on id | |
660 Returns : offset hash for each file | |
661 Args : file to process | |
662 $offsets - hashref of id to offset storage | |
663 | |
664 =cut | |
665 | |
666 sub calculate_offsets { | |
667 my $self = shift; | |
668 my ($file,$offsets) = @_; | |
669 my $base = $self->path2fileno(basename($file)); | |
670 | |
671 my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!"); | |
672 warn "indexing $file\n" if $self->{debug}; | |
673 my ($offset,$id,$linelength,$type,$firstline,$count,%offsets); | |
674 while (<$fh>) { # don't try this at home | |
675 if (/^>(\S+)/) { | |
676 print STDERR "indexed $count sequences...\n" | |
677 if $self->{debug} && (++$count%1000) == 0; | |
678 my $pos = tell($fh); | |
679 if ($id) { | |
680 my $seqlength = $pos - $offset - length($_) - 1; | |
681 $seqlength -= int($seqlength/$linelength); | |
682 $offsets->{$id} = $self->_pack($offset,$seqlength, | |
683 $linelength,$firstline, | |
684 $type,$base); | |
685 } | |
686 $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1; | |
687 ($offset,$firstline,$linelength) = ($pos,length($_),0); | |
688 } else { | |
689 $linelength ||= length($_); | |
690 $type ||= $self->_type($_); | |
691 } | |
692 } | |
693 # deal with last entry | |
694 if ($id) { | |
695 my $pos = tell($fh); | |
696 | |
697 # my $seqlength = $pos - $offset - length($_) - 1; | |
698 # $_ is always null should not be part of this calculation | |
699 my $seqlength = $pos - $offset - 1; | |
700 | |
701 if ($linelength == 0) { # yet another pesky empty chr_random.fa file | |
702 $seqlength = 0; | |
703 } else { | |
704 $seqlength -= int($seqlength/$linelength); | |
705 }; | |
706 $offsets->{$id} = $self->_pack($offset,$seqlength, | |
707 $linelength,$firstline, | |
708 $type,$base); | |
709 } | |
710 return \%offsets; | |
711 } | |
712 | |
713 =head2 get_all_ids | |
714 | |
715 Title : get_all_ids | |
716 Usage : my @ids = $db->get_all_ids | |
717 Function: gets all the stored ids in all indexes | |
718 Returns : list of ids | |
719 Args : none | |
720 | |
721 =cut | |
722 | |
723 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} } | |
724 | |
725 sub offset { | |
726 my $self = shift; | |
727 my $id = shift; | |
728 my $offset = $self->{offsets}{$id} or return; | |
729 ($self->_unpack($offset))[0]; | |
730 } | |
731 | |
732 sub length { | |
733 my $self = shift; | |
734 my $id = shift; | |
735 my $offset = $self->{offsets}{$id} or return; | |
736 ($self->_unpack($offset))[1]; | |
737 } | |
738 | |
739 sub linelen { | |
740 my $self = shift; | |
741 my $id = shift; | |
742 my $offset = $self->{offsets}{$id} or return; | |
743 ($self->_unpack($offset))[2]; | |
744 } | |
745 | |
746 sub headerlen { | |
747 my $self = shift; | |
748 my $id = shift; | |
749 my $offset = $self->{offsets}{$id} or return; | |
750 ($self->_unpack($offset))[3]; | |
751 } | |
752 | |
753 sub alphabet { | |
754 my $self = shift; | |
755 my $id = shift; | |
756 my $offset = $self->{offsets}{$id} or return; | |
757 my $type = ($self->_unpack($offset))[4]; | |
758 return $type == DNA ? 'dna' | |
759 : $type == RNA ? 'rna' | |
760 : 'protein'; | |
761 | |
762 } | |
763 | |
764 sub path { shift->{dirname} } | |
765 | |
766 sub header_offset { | |
767 my $self = shift; | |
768 my $id = shift; | |
769 return unless $self->{offsets}{$id}; | |
770 return $self->offset($id) - $self->headerlen($id); | |
771 } | |
772 | |
773 sub file { | |
774 my $self = shift; | |
775 my $id = shift; | |
776 my $offset = $self->{offsets}{$id} or return; | |
777 $self->fileno2path(($self->_unpack($offset))[5]); | |
778 } | |
779 | |
780 sub fileno2path { | |
781 my $self = shift; | |
782 my $no = shift; | |
783 return $self->{offsets}{"__file_$no"}; | |
784 } | |
785 | |
786 sub path2fileno { | |
787 my $self = shift; | |
788 my $path = shift; | |
789 if ( !defined $self->{offsets}{"__path_$path"} ) { | |
790 my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++); | |
791 $self->{offsets}{"__file_$fileno"} = $path; | |
792 } | |
793 return $self->{offsets}{"__path_$path"} | |
794 } | |
795 | |
796 =head2 subseq | |
797 | |
798 Title : subseq | |
799 Usage : $seqdb->subseq($id,$start,$stop); | |
800 Function: returns a subseq of a sequence in the db | |
801 Returns : subsequence data | |
802 Args : id of sequence, starting point, ending point | |
803 | |
804 =cut | |
805 | |
806 sub subseq { | |
807 my ($self,$id,$start,$stop) = @_; | |
808 if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) { | |
809 ($id,$start,$stop) = ($1,$2,$3); | |
810 $start =~ s/_//g; | |
811 $stop =~ s/_//g; | |
812 } | |
813 $start ||= 1; | |
814 $stop ||= $self->length($id); | |
815 | |
816 my $reversed; | |
817 if ($start > $stop) { | |
818 ($start,$stop) = ($stop,$start); | |
819 $reversed++; | |
820 } | |
821 | |
822 my $data; | |
823 | |
824 my $fh = $self->fh($id) or return; | |
825 my $filestart = $self->caloffset($id,$start); | |
826 my $filestop = $self->caloffset($id,$stop); | |
827 | |
828 seek($fh,$filestart,0); | |
829 read($fh,$data,$filestop-$filestart+1); | |
830 $data =~ s/\n//g; | |
831 if ($reversed) { | |
832 $data = reverse $data; | |
833 $data =~ tr/gatcGATC/ctagCTAG/; | |
834 } | |
835 $data; | |
836 } | |
837 | |
838 sub fh { | |
839 my $self = shift; | |
840 my $id = shift; | |
841 my $file = $self->file($id) or return; | |
842 $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file"); | |
843 } | |
844 | |
845 sub header { | |
846 my $self = shift; | |
847 my $id = shift; | |
848 my ($offset,$seqlength,$linelength,$firstline,$type,$file) | |
849 = $self->_unpack($self->{offsets}{$id}) or return; | |
850 $offset -= $firstline; | |
851 my $data; | |
852 my $fh = $self->fh($id) or return; | |
853 seek($fh,$offset,0); | |
854 read($fh,$data,$firstline); | |
855 chomp $data; | |
856 substr($data,0,1) = ''; | |
857 $data; | |
858 } | |
859 | |
860 sub caloffset { | |
861 my $self = shift; | |
862 my $id = shift; | |
863 my $a = shift()-1; | |
864 my ($offset,$seqlength,$linelength,$firstline,$type,$file) = $self->_unpack($self->{offsets}{$id}); | |
865 $a = 0 if $a < 0; | |
866 $a = $seqlength-1 if $a >= $seqlength; | |
867 $offset + $linelength * int($a/($linelength-1)) + $a % ($linelength-1); | |
868 } | |
869 | |
870 sub fhcache { | |
871 my $self = shift; | |
872 my $path = shift; | |
873 if (!$self->{fhcache}{$path}) { | |
874 if ($self->{curopen} >= $self->{maxopen}) { | |
875 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}}; | |
876 splice(@lru, $self->{maxopen} / 3); | |
877 $self->{curopen} -= @lru; | |
878 for (@lru) { delete $self->{fhcache}{$_} } | |
879 } | |
880 $self->{fhcache}{$path} = IO::File->new($path) or return; | |
881 $self->{curopen}++; | |
882 } | |
883 $self->{cacheseq}{$path}++; | |
884 $self->{fhcache}{$path} | |
885 } | |
886 | |
887 sub _pack { | |
888 shift; | |
889 pack STRUCT,@_; | |
890 } | |
891 | |
892 sub _unpack { | |
893 shift; | |
894 unpack STRUCT,shift; | |
895 } | |
896 | |
897 sub _type { | |
898 shift; | |
899 local $_ = shift; | |
900 return /^[gatcnGATCN*-]+$/ ? DNA | |
901 : /^[gaucnGAUCN*-]+$/ ? RNA | |
902 : PROTEIN; | |
903 } | |
904 | |
905 =head2 get_PrimarySeq_stream | |
906 | |
907 Title : get_PrimarySeq_stream | |
908 Usage : | |
909 Function: | |
910 Example : | |
911 Returns : | |
912 Args : | |
913 | |
914 | |
915 =cut | |
916 | |
917 sub get_PrimarySeq_stream { | |
918 my $self = shift; | |
919 return Bio::DB::Fasta::Stream->new($self); | |
920 } | |
921 | |
922 sub TIEHASH { | |
923 my $self = shift; | |
924 return $self->new(@_); | |
925 } | |
926 | |
927 sub FETCH { | |
928 shift->subseq(@_); | |
929 } | |
930 sub STORE { | |
931 shift->throw("Read-only database"); | |
932 } | |
933 sub DELETE { | |
934 shift->throw("Read-only database"); | |
935 } | |
936 sub CLEAR { | |
937 shift->throw("Read-only database"); | |
938 } | |
939 sub EXISTS { | |
940 defined shift->offset(@_); | |
941 } | |
942 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); } | |
943 sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); } | |
944 | |
945 sub DESTROY { | |
946 my $self = shift; | |
947 if ($self->{indexing}) { # killed prematurely, so index file is no good! | |
948 warn "indexing was interrupted, so unlinking $self->{indexing}"; | |
949 unlink $self->{indexing}; | |
950 } | |
951 } | |
952 | |
953 #------------------------------------------------------------- | |
954 # Bio::PrimarySeqI compatibility | |
955 # | |
956 package Bio::PrimarySeq::Fasta; | |
957 use overload '""' => 'display_id'; | |
958 | |
959 use vars '@ISA'; | |
960 eval { | |
961 require Bio::PrimarySeqI; | |
962 require Bio::Root::Root; | |
963 } && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI')); | |
964 | |
965 sub new { | |
966 my $class = shift; | |
967 $class = ref($class) if ref $class; | |
968 my ($db,$id,$start,$stop) = @_; | |
969 return bless { db => $db, | |
970 id => $id, | |
971 start => $start || 1, | |
972 stop => $stop || $db->length($id) | |
973 },$class; | |
974 } | |
975 | |
976 sub seq { | |
977 my $self = shift; | |
978 return $self->{db}->seq($self->{id},$self->{start},$self->{stop}); | |
979 } | |
980 | |
981 sub subseq { | |
982 my $self = shift; | |
983 my ($start,$stop) = @_; | |
984 $self->throw("Stop cannot be smaller than start") unless $start <= $stop; | |
985 return $self->{start} <= $self->{stop} ? $self->new($self->{db}, | |
986 $self->{id}, | |
987 $self->{start}+$start-1, | |
988 $self->{start}+$stop-1) | |
989 : $self->new($self->{db}, | |
990 $self->{id}, | |
991 $self->{start}-($start-1), | |
992 $self->{start}-($stop-1) | |
993 ); | |
994 | |
995 } | |
996 | |
997 sub display_id { | |
998 my $self = shift; | |
999 return $self->{id}; | |
1000 } | |
1001 | |
1002 sub accession_number { | |
1003 my $self = shift; | |
1004 return "unknown"; | |
1005 } | |
1006 | |
1007 sub primary_id { | |
1008 my $self = shift; | |
1009 return overload::StrVal($self); | |
1010 } | |
1011 | |
1012 sub can_call_new { return 0 } | |
1013 | |
1014 sub alphabet { | |
1015 my $self = shift; | |
1016 return $self->{db}->alphabet($self->{id}); | |
1017 } | |
1018 | |
1019 sub revcom { | |
1020 my $self = shift; | |
1021 return $self->new(@{$self}{'db','id','stop','start'}); | |
1022 } | |
1023 | |
1024 sub length { | |
1025 my $self = shift; | |
1026 return $self->{db}->length($self->{id}); | |
1027 } | |
1028 | |
1029 sub desc { | |
1030 my $self = shift; | |
1031 return ''; | |
1032 } | |
1033 | |
1034 #------------------------------------------------------------- | |
1035 # stream-based access to the database | |
1036 # | |
1037 package Bio::DB::Fasta::Stream; | |
1038 use Tie::Handle; | |
1039 use vars qw(@ISA); | |
1040 @ISA = qw(Tie::Handle); | |
1041 eval { | |
1042 require Bio::DB::SeqI; | |
1043 } && (push @ISA,'Bio::DB::SeqI'); | |
1044 | |
1045 | |
1046 sub new { | |
1047 my $class = shift; | |
1048 my $db = shift; | |
1049 my $key = $db->FIRSTKEY; | |
1050 return bless { db=>$db,key=>$key },$class; | |
1051 } | |
1052 | |
1053 sub next_seq { | |
1054 my $self = shift; | |
1055 my ($key,$db) = @{$self}{'key','db'}; | |
1056 my $value = $db->get_Seq_by_id($key); | |
1057 $self->{key} = $db->NEXTKEY($key); | |
1058 $value; | |
1059 } | |
1060 | |
1061 sub TIEHANDLE { | |
1062 my $class = shift; | |
1063 my $db = shift; | |
1064 return $class->new($db); | |
1065 } | |
1066 sub READLINE { | |
1067 my $self = shift; | |
1068 $self->next_seq; | |
1069 } | |
1070 | |
1071 1; | |
1072 | |
1073 __END__ | |
1074 |