comparison variant_effect_predictor/Bio/DB/Fasta.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 NAME
2
3 Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
4
5 =head1 SYNOPSIS
6
7 use Bio::DB::Fasta;
8
9 # create database from directory of fasta files
10 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
11
12 # simple access (for those without Bioperl)
13 my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
14 my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
15 my @ids = $db->ids;
16 my $length = $db->length('CHROMOSOME_I');
17 my $alphabet = $db->alphabet('CHROMOSOME_I');
18 my $header = $db->header('CHROMOSOME_I');
19
20 # Bioperl-style access
21 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
22
23 my $obj = $db->get_Seq_by_id('CHROMOSOME_I');
24 my $seq = $obj->seq;
25 my $subseq = $obj->subseq(4_000_000 => 4_100_000);
26 my $length = $obj->length;
27 # (etc)
28
29 # Bio::SeqIO-style access
30 my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream;
31 while (my $seq = $stream->next_seq) {
32 # Bio::PrimarySeqI stuff
33 }
34
35 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
36 while (my $seq = <$fh>) {
37 # Bio::PrimarySeqI stuff
38 }
39
40 # tied hash access
41 tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
42 print $sequences{'CHROMOSOME_I:1,20000'};
43
44 =head1 DESCRIPTION
45
46 Bio::DB::Fasta provides indexed access to one or more Fasta files. It
47 provides random access to each sequence entry, and to subsequences
48 within each entry, allowing you to retrieve portions of very large
49 sequences without bringing the entire sequence into memory.
50
51 When you initialize the module, you point it at a single fasta file or
52 a directory of multiple such files. The first time it is run, the
53 module generates an index of the contents of the file or directory
54 using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File,
55 NDBM_File, and SDBM_File). Thereafter it uses the index file to find
56 the file and offset for any requested sequence. If one of the source
57 fasta files is updated, the module reindexes just that one file. (You
58 can also force reindexing manually). For improved performance, the
59 module keeps a cache of open filehandles, closing less-recently used
60 ones when the cache is full.
61
62 The fasta files may contain any combination of nucleotide and protein
63 sequences; during indexing the module guesses the molecular type.
64 Entries may have any line length, and different line lengths are
65 allowed in the same file. However, within a sequence entry, all lines
66 must be the same length except for the last.
67
68 The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from
69 the Fasta header. During indexing, you may pass a callback routine to
70 modify this primary ID. For example, you may wish to extract a
71 portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
72 The original header line can be recovered later.
73
74 This module was developed for use with the C. elegans and human
75 genomes, and has been tested with sequence segments as large as 20
76 megabases. Indexing the C. elegans genome (100 megabases of genomic
77 sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
78 laptop. On the same system, average access time for any 200-mer within
79 the C. elegans genome was E<lt>0.02s.
80
81 =head1 DATABASE CREATION AND INDEXING
82
83 The two constructors for this class are new() and newFh(). The former
84 creates a Bio::DB::Fasta object which is accessed via method calls.
85 The latter creates a tied filehandle which can be used Bio::SeqIO
86 style to fetch sequence objects in a stream fashion. There is also a
87 tied hash interface.
88
89 =over 4
90
91 =item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
92
93 Create a new Bio::DB::Fasta object from the Fasta file or files
94 indicated by $fasta_path. Indexing will be performed automatically if
95 needed. If successful, new() will return the database accessor
96 object. Otherwise it will return undef.
97
98 $fasta_path may be an individual Fasta file, or may refer to a
99 directory containing one or more of such files. Following the path,
100 you may pass a series of name=E<gt>value options or a hash with these
101 same name=E<gt>value pairs. Valid options are:
102
103 Option Name Description Default
104 ----------- ----------- -------
105
106 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna}
107 for searching for Fasta
108 files in directories.
109
110 -makeid A code subroutine for None
111 transforming Fasta IDs.
112
113 -maxopen Maximum size of 32
114 filehandle cache.
115
116 -debug Turn on status 0
117 messages.
118
119 -reindex Force the index to be 0
120 rebuilt.
121
122 -dbmargs Additional arguments none
123 to pass to the DBM
124 routines when tied
125 (scalar or array ref).
126
127 -dbmargs can be used to control the format of the index. For example,
128 you can pass $DB_BTREE to this argument so as to force the IDs to be
129 sorted and retrieved alphabetically. Note that you must use the same
130 arguments every time you open the index!
131
132 -reindex can be used to force the index to be recreated from scratch.
133
134 =item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
135
136 Create a tied filehandle opened on a Bio::DB::Fasta object. Reading
137 from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
138 Bio::SeqIO style.
139
140 =back
141
142 The -makeid option gives you a chance to modify sequence IDs during
143 indexing. The option's value should be a code reference that will
144 take a scalar argument and return a scalar result, like this:
145
146 $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
147
148 sub make_my_id {
149 my $description_line = shift;
150 # get a new id from the fasta header
151 return $new_id;
152 }
153
154 make_my_id() will be called with the full fasta id line (including the
155 "E<gt>" symbol!). For example:
156
157 >A12345.3 Predicted C. elegans protein egl-2
158
159 By default, this module will use the regular expression /^E<gt>(\S+)/
160 to extract "A12345.3" for use as the ID. If you pass a -makeid
161 callback, you can extract any portion of this, such as the "egl-2"
162 symbol.
163
164 The -makeid option is ignored after the index is constructed.
165
166 =head1 OBJECT METHODS
167
168 The following object methods are provided.
169
170 =over 4
171
172 =item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
173
174 Return the raw sequence (a string) given an ID and optionally a start
175 and stop position in the sequence. In the case of DNA sequence, if
176 $stop is less than $start, then the reverse complement of the sequence
177 is returned (this violates Bio::Seq conventions).
178
179 For your convenience, subsequences can be indicated with this compound
180 ID:
181
182 $db->seq("$id:$start,$stop")
183
184 =item $length = $db-E<gt>length($id)
185
186 Return the length of the indicated sequence.
187
188 =item $header = $db-E<gt>header($id)
189
190 Return the header line for the ID, including the initial "E<gt>".
191
192 =item $type = $db-E<gt>alphabet($id)
193
194 Return the molecular type of the indicated sequence. One of "dna",
195 "rna" or "protein".
196
197 =item $filename = $db-E<gt>file($id)
198
199 Return the name of the file in which the indicated sequence can be
200 found.
201
202 =item $offset = $db-E<gt>offset($id)
203
204 Return the offset of the indicated sequence from the beginning of the
205 file in which it is located. The offset points to the beginning of
206 the sequence, not the beginning of the header line.
207
208 =item $header_length = $db-E<gt>headerlen($id)
209
210 Return the length of the header line for the indicated sequence.
211
212 =item $header_offset = $db-E<gt>header_offset($id)
213
214 Return the offset of the header line for the indicated sequence from
215 the beginning of the file in which it is located.
216
217 =item $index_name = $db-E<gt>index_name
218
219 Return the path to the index file.
220
221 =item $path = $db-E<gt>path
222
223 Return the path to the Fasta file(s).
224
225 =back
226
227 For BioPerl-style access, the following methods are provided:
228
229 =over 4
230
231 =item $seq = $db-E<gt>get_Seq_by_id($id)
232
233 Return a Bio::PrimarySeq::Fasta object, which obeys the
234 Bio::PrimarySeqI conventions. For example, to recover the raw DNA or
235 protein sequence, call $seq-E<gt>seq().
236
237 Note that get_Seq_by_id() does not bring the entire sequence into
238 memory until requested. Internally, the returned object uses the
239 accessor to generate subsequences as needed.
240
241 =item $seq = $db-E<gt>get_Seq_by_acc($id)
242
243 =item $seq = $db-E<gt>get_Seq_by_primary_id($id)
244
245 These methods all do the same thing as get_Seq_by_id().
246
247 =item $stream = $db-E<gt>get_PrimarySeq_stream()
248
249 Return a Bio::DB::Fasta::Stream object, which supports a single method
250 next_seq(). Each call to next_seq() returns a new
251 Bio::PrimarySeq::Fasta object, until no more sequences remain.
252
253 =back
254
255 See L<Bio::PrimarySeqI> for methods provided by the sequence objects
256 returned from get_Seq_by_id() and get_PrimarySeq_stream().
257
258 =head1 TIED INTERFACES
259
260 This module provides two tied interfaces, one which allows you to
261 treat the sequence database as a hash, and the other which allows you
262 to treat the database as an I/O stream.
263
264 =head2 Creating a Tied Hash
265
266 The tied hash interface is very straightforward
267
268 =over 4
269
270 =item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
271
272 Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
273 The optional @args list is the same set of named argument/value pairs
274 used by Bio::DB::Fasta-E<gt>new().
275
276 If successful, tie() will return the tied object. Otherwise it will
277 return undef.
278
279 =back
280
281 Once tied, you can use the hash to retrieve an individual sequence by
282 its ID, like this:
283
284 my $seq = $db{CHROMOSOME_I};
285
286 You may select a subsequence by appending the comma-separated range to
287 the sequence ID in the format "$id:$start,$stop". For example, here
288 is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
289
290 my $seq = $db{'CHROMOSOME_I:1,1000'};
291
292 (The regular expression used to parse this format allows sequence IDs
293 to contain colons.)
294
295 When selecting subsequences, if $start E<gt> stop, then the reverse
296 complement will be returned for DNA sequences.
297
298 The keys() and values() functions will return the sequence IDs and
299 their sequences, respectively. In addition, each() can be used to
300 iterate over the entire data set:
301
302 while (my ($id,$sequence) = each %db) {
303 print "$id => $sequence\n";
304 }
305
306 When dealing with very large sequences, you can avoid bringing them
307 into memory by calling each() in a scalar context. This returns the
308 key only. You can then use tied(%db) to recover the Bio::DB::Fasta
309 object and call its methods.
310
311 while (my $id = each %db) {
312 print "$id => $db{$sequence:1,100}\n";
313 print "$id => ",tied(%db)->length($id),"\n";
314 }
315
316 You may, in addition invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied
317 hash methods directly.
318
319 =over 4
320
321 =item $id = $db-E<gt>FIRSTKEY
322
323 Return the first ID in the database.
324
325 =item $id = $db-E<gt>NEXTKEY($id)
326
327 Given an ID, return the next ID in sequence.
328
329 =back
330
331 This allows you to write the following iterative loop using just the
332 object-oriented interface:
333
334 my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
335 for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
336 # do something with sequence
337 }
338
339 =head2 Creating a Tied Filehandle
340
341 The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
342 which you can read Bio::PrimarySeq::Fasta sequence objects
343 sequentially. The following bit of code will iterate sequentially
344 over all sequences in the database:
345
346 my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
347 while (my $seq = <$fh>) {
348 print $seq->id,' => ',$seq->length,"\n";
349 }
350
351 When no more sequences remain to be retrieved, the stream will return
352 undef.
353
354 =head1 BUGS
355
356 When a sequence is deleted from one of the Fasta files, this deletion
357 is not detected by the module and removed from the index. As a
358 result, a "ghost" entry will remain in the index and will return
359 garbage results if accessed.
360
361 Currently, the only way to accomodate deletions is to rebuild the
362 entire index, either by deleting it manually, or by passing
363 -reindex=E<gt>1 to new() when initializing the module.
364
365 =head1 SEE ALSO
366
367 L<bioperl>
368
369 =head1 AUTHOR
370
371 Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
372
373 Copyright (c) 2001 Cold Spring Harbor Laboratory.
374
375 This library is free software; you can redistribute it and/or modify
376 it under the same terms as Perl itself. See DISCLAIMER.txt for
377 disclaimers of warranty.
378
379 =cut
380
381 #'
382 package Bio::DB::Fasta;
383
384 BEGIN {
385 @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
386 }
387
388 use strict;
389 use IO::File;
390 use AnyDBM_File;
391 use Fcntl;
392 use File::Basename qw(basename dirname);
393 use Bio::DB::SeqI;
394 use Bio::Root::Root;
395 use vars qw($VERSION @ISA);
396
397 @ISA = qw(Bio::DB::SeqI Bio::Root::Root);
398
399 $VERSION = '1.03';
400
401 *seq = *sequence = \&subseq;
402 *ids = \&get_all_ids;
403 *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
404
405 use constant STRUCT =>'NNnnCa*';
406 use constant DNA => 1;
407 use constant RNA => 2;
408 use constant PROTEIN => 3;
409
410 # Bio::DB-like object
411 # providing fast random access to a directory of FASTA files
412
413 =head2 new
414
415 Title : new
416 Usage : my $db = new Bio::DB::Fasta( $path, @options);
417 Function: initialize a new Bio::DB::Fasta object
418 Returns : new Bio::DB::Fasta object
419 Args : path to dir of fasta files or a single filename
420
421 These are optional arguments to pass in as well.
422
423 -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST}
424 for searching for Fasta
425 files in directories.
426
427 -makeid A code subroutine for None
428 transforming Fasta IDs.
429
430 -maxopen Maximum size of 32
431 filehandle cache.
432
433 -debug Turn on status 0
434 messages.
435
436 -reindex Force the index to be 0
437 rebuilt.
438
439 -dbmargs Additional arguments none
440 to pass to the DBM
441 routines when tied
442 (scalar or array ref).
443
444 =cut
445
446 sub new {
447 my $class = shift;
448 my $path = shift;
449 my %opts = @_;
450
451 my $self = bless { debug => $opts{-debug},
452 makeid => $opts{-makeid},
453 glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}',
454 maxopen => $opts{-maxfh} || 32,
455 dbmargs => $opts{-dbmargs} || undef,
456 fhcache => {},
457 cacheseq => {},
458 curopen => 0,
459 openseq => 1,
460 dirname => undef,
461 offsets => undef,
462 }, $class;
463 my ($offsets,$dirname);
464
465 if (-d $path) {
466 $offsets = $self->index_dir($path,$opts{-reindex});
467 $dirname = $path;
468 } elsif (-f _) {
469 $offsets = $self->index_file($path,$opts{-reindex});
470 $dirname = dirname($path);
471 } else {
472 $self->throw( "$path: Invalid file or dirname");
473 }
474 @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
475
476 $self;
477 }
478
479 =head2 newFh
480
481 Title : newFh
482 Function: gets a new Fh for a file
483 Example : internal method
484 Returns : GLOB
485 Args :
486
487 =cut
488
489 sub newFh {
490 my $class = shift;
491 my $self = $class->new(@_);
492 require Symbol;
493 my $fh = Symbol::gensym or return;
494 tie $$fh,'Bio::DB::Fasta::Stream',$self or return;
495 $fh;
496 }
497
498 sub _open_index {
499 my $self = shift;
500 my ($index,$write) = @_;
501 my %offsets;
502 my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
503 my @dbmargs = $self->dbmargs;
504 tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs or $self->throw( "Can't open cache file: $!");
505 return \%offsets;
506 }
507
508 =head2 index_dir
509
510 Title : index_dir
511 Usage : $db->index_dir($dir)
512 Function: set the index dir and load all files in the dir
513 Returns : hashref of seq offsets in each file
514 Args : dirname, boolean to force a reload of all files
515
516 =cut
517
518 sub index_dir {
519 my $self = shift;
520 my $dir = shift;
521 my $force_reindex = shift;
522
523 # find all fasta files
524 my @files = glob("$dir/$self->{glob}");
525 $self->throw( "no fasta files in $dir") unless @files;
526
527 # get name of index
528 my $index = $self->index_name($dir,1);
529
530 # if caller has requested reindexing, then unlink
531 # the index file.
532 unlink $index if $force_reindex;
533
534 # get the modification time of the index
535 my $indextime = (stat($index))[9] || 0;
536
537 # get the most recent modification time of any of the contents
538 my $modtime = 0;
539 my %modtime;
540 foreach (@files) {
541 my $m = (stat($_))[9];
542 $modtime{$_} = $m;
543 $modtime = $m if $modtime < $m;
544 }
545
546 my $reindex = $force_reindex || $indextime < $modtime;
547 my $offsets = $self->_open_index($index,$reindex) or return;
548 $self->{offsets} = $offsets;
549
550 # no indexing needed
551 return $offsets unless $reindex;
552
553 # otherwise reindex contents of changed files
554 $self->{indexing} = $index;
555 foreach (@files) {
556 next if( defined $indextime && $modtime{$_} <= $indextime);
557 $self->calculate_offsets($_,$offsets);
558 }
559 delete $self->{indexing};
560 return $self->{offsets};
561 }
562
563 =head2 get_Seq_by_id
564
565 Title : get_Seq_by_id
566 Usage : my $seq = $db->get_Seq_by_id($id)
567 Function: Bio::DB::RandomAccessI method implemented
568 Returns : Bio::PrimarySeqI object
569 Args : id
570
571 =cut
572
573 sub get_Seq_by_id {
574 my $self = shift;
575 my $id = shift;
576 return Bio::PrimarySeq::Fasta->new($self,$id);
577 }
578
579 =head2 index_file
580
581 Title : index_file
582 Usage : $db->index_file($filename)
583 Function: (re)loads a sequence file and indexes sequences offsets in the file
584 Returns : seq offsets in the file
585 Args : filename,
586 boolean to force reloading a file
587
588 =cut
589
590
591 sub index_file {
592 my $self = shift;
593 my $file = shift;
594 my $force_reindex = shift;
595
596 my $index = $self->index_name($file);
597 # if caller has requested reindexing, then unlink the index
598 unlink $index if $force_reindex;
599
600 # get the modification time of the index
601 my $indextime = (stat($index))[9];
602 my $modtime = (stat($file))[9];
603
604 my $reindex = $force_reindex || $indextime < $modtime;
605 my $offsets = $self->_open_index($index,$reindex) or return;
606 $self->{offsets} = $offsets;
607
608 return $self->{offsets} unless $reindex;
609
610 $self->{indexing} = $index;
611 $self->calculate_offsets($file,$offsets);
612 delete $self->{indexing};
613 return $self->{offsets};
614 }
615
616 =head2 dbmargs
617
618 Title : dbmargs
619 Usage : my @args = $db->dbmargs;
620 Function: gets stored dbm arguments
621 Returns : array
622 Args : none
623
624
625 =cut
626
627 sub dbmargs {
628 my $self = shift;
629 my $args = $self->{dbmargs} or return;
630 return ref($args) eq 'ARRAY' ? @$args : $args;
631 }
632
633 =head2 index_name
634
635 Title : index_name
636 Usage : my $indexname = $db->index_name($path,$isdir);
637 Function: returns the name of the index for a specific path
638 Returns : string
639 Args : path to check,
640 boolean if it is a dir
641
642 =cut
643
644 sub index_name {
645 my $self = shift;
646 my ($path,$isdir) = @_;
647 unless ($path) {
648 my $dir = $self->{dirname} or return;
649 return $self->index_name($dir,-d $dir);
650 }
651 return "$path/directory.index" if $isdir;
652 return "$path.index";
653 }
654
655 =head2 calculate_offsets
656
657 Title : calculate_offsets
658 Usage : $db->calculate_offsets($filename,$offsets);
659 Function: calculates the sequence offsets in a file based on id
660 Returns : offset hash for each file
661 Args : file to process
662 $offsets - hashref of id to offset storage
663
664 =cut
665
666 sub calculate_offsets {
667 my $self = shift;
668 my ($file,$offsets) = @_;
669 my $base = $self->path2fileno(basename($file));
670
671 my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
672 warn "indexing $file\n" if $self->{debug};
673 my ($offset,$id,$linelength,$type,$firstline,$count,%offsets);
674 while (<$fh>) { # don't try this at home
675 if (/^>(\S+)/) {
676 print STDERR "indexed $count sequences...\n"
677 if $self->{debug} && (++$count%1000) == 0;
678 my $pos = tell($fh);
679 if ($id) {
680 my $seqlength = $pos - $offset - length($_) - 1;
681 $seqlength -= int($seqlength/$linelength);
682 $offsets->{$id} = $self->_pack($offset,$seqlength,
683 $linelength,$firstline,
684 $type,$base);
685 }
686 $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1;
687 ($offset,$firstline,$linelength) = ($pos,length($_),0);
688 } else {
689 $linelength ||= length($_);
690 $type ||= $self->_type($_);
691 }
692 }
693 # deal with last entry
694 if ($id) {
695 my $pos = tell($fh);
696
697 # my $seqlength = $pos - $offset - length($_) - 1;
698 # $_ is always null should not be part of this calculation
699 my $seqlength = $pos - $offset - 1;
700
701 if ($linelength == 0) { # yet another pesky empty chr_random.fa file
702 $seqlength = 0;
703 } else {
704 $seqlength -= int($seqlength/$linelength);
705 };
706 $offsets->{$id} = $self->_pack($offset,$seqlength,
707 $linelength,$firstline,
708 $type,$base);
709 }
710 return \%offsets;
711 }
712
713 =head2 get_all_ids
714
715 Title : get_all_ids
716 Usage : my @ids = $db->get_all_ids
717 Function: gets all the stored ids in all indexes
718 Returns : list of ids
719 Args : none
720
721 =cut
722
723 sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} }
724
725 sub offset {
726 my $self = shift;
727 my $id = shift;
728 my $offset = $self->{offsets}{$id} or return;
729 ($self->_unpack($offset))[0];
730 }
731
732 sub length {
733 my $self = shift;
734 my $id = shift;
735 my $offset = $self->{offsets}{$id} or return;
736 ($self->_unpack($offset))[1];
737 }
738
739 sub linelen {
740 my $self = shift;
741 my $id = shift;
742 my $offset = $self->{offsets}{$id} or return;
743 ($self->_unpack($offset))[2];
744 }
745
746 sub headerlen {
747 my $self = shift;
748 my $id = shift;
749 my $offset = $self->{offsets}{$id} or return;
750 ($self->_unpack($offset))[3];
751 }
752
753 sub alphabet {
754 my $self = shift;
755 my $id = shift;
756 my $offset = $self->{offsets}{$id} or return;
757 my $type = ($self->_unpack($offset))[4];
758 return $type == DNA ? 'dna'
759 : $type == RNA ? 'rna'
760 : 'protein';
761
762 }
763
764 sub path { shift->{dirname} }
765
766 sub header_offset {
767 my $self = shift;
768 my $id = shift;
769 return unless $self->{offsets}{$id};
770 return $self->offset($id) - $self->headerlen($id);
771 }
772
773 sub file {
774 my $self = shift;
775 my $id = shift;
776 my $offset = $self->{offsets}{$id} or return;
777 $self->fileno2path(($self->_unpack($offset))[5]);
778 }
779
780 sub fileno2path {
781 my $self = shift;
782 my $no = shift;
783 return $self->{offsets}{"__file_$no"};
784 }
785
786 sub path2fileno {
787 my $self = shift;
788 my $path = shift;
789 if ( !defined $self->{offsets}{"__path_$path"} ) {
790 my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++);
791 $self->{offsets}{"__file_$fileno"} = $path;
792 }
793 return $self->{offsets}{"__path_$path"}
794 }
795
796 =head2 subseq
797
798 Title : subseq
799 Usage : $seqdb->subseq($id,$start,$stop);
800 Function: returns a subseq of a sequence in the db
801 Returns : subsequence data
802 Args : id of sequence, starting point, ending point
803
804 =cut
805
806 sub subseq {
807 my ($self,$id,$start,$stop) = @_;
808 if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) {
809 ($id,$start,$stop) = ($1,$2,$3);
810 $start =~ s/_//g;
811 $stop =~ s/_//g;
812 }
813 $start ||= 1;
814 $stop ||= $self->length($id);
815
816 my $reversed;
817 if ($start > $stop) {
818 ($start,$stop) = ($stop,$start);
819 $reversed++;
820 }
821
822 my $data;
823
824 my $fh = $self->fh($id) or return;
825 my $filestart = $self->caloffset($id,$start);
826 my $filestop = $self->caloffset($id,$stop);
827
828 seek($fh,$filestart,0);
829 read($fh,$data,$filestop-$filestart+1);
830 $data =~ s/\n//g;
831 if ($reversed) {
832 $data = reverse $data;
833 $data =~ tr/gatcGATC/ctagCTAG/;
834 }
835 $data;
836 }
837
838 sub fh {
839 my $self = shift;
840 my $id = shift;
841 my $file = $self->file($id) or return;
842 $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file");
843 }
844
845 sub header {
846 my $self = shift;
847 my $id = shift;
848 my ($offset,$seqlength,$linelength,$firstline,$type,$file)
849 = $self->_unpack($self->{offsets}{$id}) or return;
850 $offset -= $firstline;
851 my $data;
852 my $fh = $self->fh($id) or return;
853 seek($fh,$offset,0);
854 read($fh,$data,$firstline);
855 chomp $data;
856 substr($data,0,1) = '';
857 $data;
858 }
859
860 sub caloffset {
861 my $self = shift;
862 my $id = shift;
863 my $a = shift()-1;
864 my ($offset,$seqlength,$linelength,$firstline,$type,$file) = $self->_unpack($self->{offsets}{$id});
865 $a = 0 if $a < 0;
866 $a = $seqlength-1 if $a >= $seqlength;
867 $offset + $linelength * int($a/($linelength-1)) + $a % ($linelength-1);
868 }
869
870 sub fhcache {
871 my $self = shift;
872 my $path = shift;
873 if (!$self->{fhcache}{$path}) {
874 if ($self->{curopen} >= $self->{maxopen}) {
875 my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}};
876 splice(@lru, $self->{maxopen} / 3);
877 $self->{curopen} -= @lru;
878 for (@lru) { delete $self->{fhcache}{$_} }
879 }
880 $self->{fhcache}{$path} = IO::File->new($path) or return;
881 $self->{curopen}++;
882 }
883 $self->{cacheseq}{$path}++;
884 $self->{fhcache}{$path}
885 }
886
887 sub _pack {
888 shift;
889 pack STRUCT,@_;
890 }
891
892 sub _unpack {
893 shift;
894 unpack STRUCT,shift;
895 }
896
897 sub _type {
898 shift;
899 local $_ = shift;
900 return /^[gatcnGATCN*-]+$/ ? DNA
901 : /^[gaucnGAUCN*-]+$/ ? RNA
902 : PROTEIN;
903 }
904
905 =head2 get_PrimarySeq_stream
906
907 Title : get_PrimarySeq_stream
908 Usage :
909 Function:
910 Example :
911 Returns :
912 Args :
913
914
915 =cut
916
917 sub get_PrimarySeq_stream {
918 my $self = shift;
919 return Bio::DB::Fasta::Stream->new($self);
920 }
921
922 sub TIEHASH {
923 my $self = shift;
924 return $self->new(@_);
925 }
926
927 sub FETCH {
928 shift->subseq(@_);
929 }
930 sub STORE {
931 shift->throw("Read-only database");
932 }
933 sub DELETE {
934 shift->throw("Read-only database");
935 }
936 sub CLEAR {
937 shift->throw("Read-only database");
938 }
939 sub EXISTS {
940 defined shift->offset(@_);
941 }
942 sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
943 sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); }
944
945 sub DESTROY {
946 my $self = shift;
947 if ($self->{indexing}) { # killed prematurely, so index file is no good!
948 warn "indexing was interrupted, so unlinking $self->{indexing}";
949 unlink $self->{indexing};
950 }
951 }
952
953 #-------------------------------------------------------------
954 # Bio::PrimarySeqI compatibility
955 #
956 package Bio::PrimarySeq::Fasta;
957 use overload '""' => 'display_id';
958
959 use vars '@ISA';
960 eval {
961 require Bio::PrimarySeqI;
962 require Bio::Root::Root;
963 } && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI'));
964
965 sub new {
966 my $class = shift;
967 $class = ref($class) if ref $class;
968 my ($db,$id,$start,$stop) = @_;
969 return bless { db => $db,
970 id => $id,
971 start => $start || 1,
972 stop => $stop || $db->length($id)
973 },$class;
974 }
975
976 sub seq {
977 my $self = shift;
978 return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
979 }
980
981 sub subseq {
982 my $self = shift;
983 my ($start,$stop) = @_;
984 $self->throw("Stop cannot be smaller than start") unless $start <= $stop;
985 return $self->{start} <= $self->{stop} ? $self->new($self->{db},
986 $self->{id},
987 $self->{start}+$start-1,
988 $self->{start}+$stop-1)
989 : $self->new($self->{db},
990 $self->{id},
991 $self->{start}-($start-1),
992 $self->{start}-($stop-1)
993 );
994
995 }
996
997 sub display_id {
998 my $self = shift;
999 return $self->{id};
1000 }
1001
1002 sub accession_number {
1003 my $self = shift;
1004 return "unknown";
1005 }
1006
1007 sub primary_id {
1008 my $self = shift;
1009 return overload::StrVal($self);
1010 }
1011
1012 sub can_call_new { return 0 }
1013
1014 sub alphabet {
1015 my $self = shift;
1016 return $self->{db}->alphabet($self->{id});
1017 }
1018
1019 sub revcom {
1020 my $self = shift;
1021 return $self->new(@{$self}{'db','id','stop','start'});
1022 }
1023
1024 sub length {
1025 my $self = shift;
1026 return $self->{db}->length($self->{id});
1027 }
1028
1029 sub desc {
1030 my $self = shift;
1031 return '';
1032 }
1033
1034 #-------------------------------------------------------------
1035 # stream-based access to the database
1036 #
1037 package Bio::DB::Fasta::Stream;
1038 use Tie::Handle;
1039 use vars qw(@ISA);
1040 @ISA = qw(Tie::Handle);
1041 eval {
1042 require Bio::DB::SeqI;
1043 } && (push @ISA,'Bio::DB::SeqI');
1044
1045
1046 sub new {
1047 my $class = shift;
1048 my $db = shift;
1049 my $key = $db->FIRSTKEY;
1050 return bless { db=>$db,key=>$key },$class;
1051 }
1052
1053 sub next_seq {
1054 my $self = shift;
1055 my ($key,$db) = @{$self}{'key','db'};
1056 my $value = $db->get_Seq_by_id($key);
1057 $self->{key} = $db->NEXTKEY($key);
1058 $value;
1059 }
1060
1061 sub TIEHANDLE {
1062 my $class = shift;
1063 my $db = shift;
1064 return $class->new($db);
1065 }
1066 sub READLINE {
1067 my $self = shift;
1068 $self->next_seq;
1069 }
1070
1071 1;
1072
1073 __END__
1074