Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/DB/Fasta.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Fasta.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,1074 @@ +=head1 NAME + +Bio::DB::Fasta -- Fast indexed access to a directory of fasta files + +=head1 SYNOPSIS + + use Bio::DB::Fasta; + + # create database from directory of fasta files + my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); + + # simple access (for those without Bioperl) + my $seq = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000); + my $revseq = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000); + my @ids = $db->ids; + my $length = $db->length('CHROMOSOME_I'); + my $alphabet = $db->alphabet('CHROMOSOME_I'); + my $header = $db->header('CHROMOSOME_I'); + + # Bioperl-style access + my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); + + my $obj = $db->get_Seq_by_id('CHROMOSOME_I'); + my $seq = $obj->seq; + my $subseq = $obj->subseq(4_000_000 => 4_100_000); + my $length = $obj->length; + # (etc) + + # Bio::SeqIO-style access + my $stream = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream; + while (my $seq = $stream->next_seq) { + # Bio::PrimarySeqI stuff + } + + my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); + while (my $seq = <$fh>) { + # Bio::PrimarySeqI stuff + } + + # tied hash access + tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files'; + print $sequences{'CHROMOSOME_I:1,20000'}; + +=head1 DESCRIPTION + +Bio::DB::Fasta provides indexed access to one or more Fasta files. It +provides random access to each sequence entry, and to subsequences +within each entry, allowing you to retrieve portions of very large +sequences without bringing the entire sequence into memory. + +When you initialize the module, you point it at a single fasta file or +a directory of multiple such files. The first time it is run, the +module generates an index of the contents of the file or directory +using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File, +NDBM_File, and SDBM_File). Thereafter it uses the index file to find +the file and offset for any requested sequence. If one of the source +fasta files is updated, the module reindexes just that one file. (You +can also force reindexing manually). For improved performance, the +module keeps a cache of open filehandles, closing less-recently used +ones when the cache is full. + +The fasta files may contain any combination of nucleotide and protein +sequences; during indexing the module guesses the molecular type. +Entries may have any line length, and different line lengths are +allowed in the same file. However, within a sequence entry, all lines +must be the same length except for the last. + +The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from +the Fasta header. During indexing, you may pass a callback routine to +modify this primary ID. For example, you may wish to extract a +portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use. +The original header line can be recovered later. + +This module was developed for use with the C. elegans and human +genomes, and has been tested with sequence segments as large as 20 +megabases. Indexing the C. elegans genome (100 megabases of genomic +sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium +laptop. On the same system, average access time for any 200-mer within +the C. elegans genome was E<lt>0.02s. + +=head1 DATABASE CREATION AND INDEXING + +The two constructors for this class are new() and newFh(). The former +creates a Bio::DB::Fasta object which is accessed via method calls. +The latter creates a tied filehandle which can be used Bio::SeqIO +style to fetch sequence objects in a stream fashion. There is also a +tied hash interface. + +=over 4 + +=item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options]) + +Create a new Bio::DB::Fasta object from the Fasta file or files +indicated by $fasta_path. Indexing will be performed automatically if +needed. If successful, new() will return the database accessor +object. Otherwise it will return undef. + +$fasta_path may be an individual Fasta file, or may refer to a +directory containing one or more of such files. Following the path, +you may pass a series of name=E<gt>value options or a hash with these +same name=E<gt>value pairs. Valid options are: + + Option Name Description Default + ----------- ----------- ------- + + -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST,dna} + for searching for Fasta + files in directories. + + -makeid A code subroutine for None + transforming Fasta IDs. + + -maxopen Maximum size of 32 + filehandle cache. + + -debug Turn on status 0 + messages. + + -reindex Force the index to be 0 + rebuilt. + + -dbmargs Additional arguments none + to pass to the DBM + routines when tied + (scalar or array ref). + +-dbmargs can be used to control the format of the index. For example, +you can pass $DB_BTREE to this argument so as to force the IDs to be +sorted and retrieved alphabetically. Note that you must use the same +arguments every time you open the index! + +-reindex can be used to force the index to be recreated from scratch. + +=item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options]) + +Create a tied filehandle opened on a Bio::DB::Fasta object. Reading +from this filehandle with E<lt>E<gt> will return a stream of sequence objects, +Bio::SeqIO style. + +=back + +The -makeid option gives you a chance to modify sequence IDs during +indexing. The option's value should be a code reference that will +take a scalar argument and return a scalar result, like this: + + $db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id); + + sub make_my_id { + my $description_line = shift; + # get a new id from the fasta header + return $new_id; + } + +make_my_id() will be called with the full fasta id line (including the +"E<gt>" symbol!). For example: + + >A12345.3 Predicted C. elegans protein egl-2 + +By default, this module will use the regular expression /^E<gt>(\S+)/ +to extract "A12345.3" for use as the ID. If you pass a -makeid +callback, you can extract any portion of this, such as the "egl-2" +symbol. + +The -makeid option is ignored after the index is constructed. + +=head1 OBJECT METHODS + +The following object methods are provided. + +=over 4 + +=item $raw_seq = $db-E<gt>seq($id [,$start, $stop]) + +Return the raw sequence (a string) given an ID and optionally a start +and stop position in the sequence. In the case of DNA sequence, if +$stop is less than $start, then the reverse complement of the sequence +is returned (this violates Bio::Seq conventions). + +For your convenience, subsequences can be indicated with this compound +ID: + + $db->seq("$id:$start,$stop") + +=item $length = $db-E<gt>length($id) + +Return the length of the indicated sequence. + +=item $header = $db-E<gt>header($id) + +Return the header line for the ID, including the initial "E<gt>". + +=item $type = $db-E<gt>alphabet($id) + +Return the molecular type of the indicated sequence. One of "dna", +"rna" or "protein". + +=item $filename = $db-E<gt>file($id) + +Return the name of the file in which the indicated sequence can be +found. + +=item $offset = $db-E<gt>offset($id) + +Return the offset of the indicated sequence from the beginning of the +file in which it is located. The offset points to the beginning of +the sequence, not the beginning of the header line. + +=item $header_length = $db-E<gt>headerlen($id) + +Return the length of the header line for the indicated sequence. + +=item $header_offset = $db-E<gt>header_offset($id) + +Return the offset of the header line for the indicated sequence from +the beginning of the file in which it is located. + +=item $index_name = $db-E<gt>index_name + +Return the path to the index file. + +=item $path = $db-E<gt>path + +Return the path to the Fasta file(s). + +=back + +For BioPerl-style access, the following methods are provided: + +=over 4 + +=item $seq = $db-E<gt>get_Seq_by_id($id) + +Return a Bio::PrimarySeq::Fasta object, which obeys the +Bio::PrimarySeqI conventions. For example, to recover the raw DNA or +protein sequence, call $seq-E<gt>seq(). + +Note that get_Seq_by_id() does not bring the entire sequence into +memory until requested. Internally, the returned object uses the +accessor to generate subsequences as needed. + +=item $seq = $db-E<gt>get_Seq_by_acc($id) + +=item $seq = $db-E<gt>get_Seq_by_primary_id($id) + +These methods all do the same thing as get_Seq_by_id(). + +=item $stream = $db-E<gt>get_PrimarySeq_stream() + +Return a Bio::DB::Fasta::Stream object, which supports a single method +next_seq(). Each call to next_seq() returns a new +Bio::PrimarySeq::Fasta object, until no more sequences remain. + +=back + +See L<Bio::PrimarySeqI> for methods provided by the sequence objects +returned from get_Seq_by_id() and get_PrimarySeq_stream(). + +=head1 TIED INTERFACES + +This module provides two tied interfaces, one which allows you to +treat the sequence database as a hash, and the other which allows you +to treat the database as an I/O stream. + +=head2 Creating a Tied Hash + +The tied hash interface is very straightforward + +=over 4 + +=item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args] + +Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files. +The optional @args list is the same set of named argument/value pairs +used by Bio::DB::Fasta-E<gt>new(). + +If successful, tie() will return the tied object. Otherwise it will +return undef. + +=back + +Once tied, you can use the hash to retrieve an individual sequence by +its ID, like this: + + my $seq = $db{CHROMOSOME_I}; + +You may select a subsequence by appending the comma-separated range to +the sequence ID in the format "$id:$start,$stop". For example, here +is the first 1000 bp of the sequence with the ID "CHROMOSOME_I": + + my $seq = $db{'CHROMOSOME_I:1,1000'}; + +(The regular expression used to parse this format allows sequence IDs +to contain colons.) + +When selecting subsequences, if $start E<gt> stop, then the reverse +complement will be returned for DNA sequences. + +The keys() and values() functions will return the sequence IDs and +their sequences, respectively. In addition, each() can be used to +iterate over the entire data set: + + while (my ($id,$sequence) = each %db) { + print "$id => $sequence\n"; + } + +When dealing with very large sequences, you can avoid bringing them +into memory by calling each() in a scalar context. This returns the +key only. You can then use tied(%db) to recover the Bio::DB::Fasta +object and call its methods. + + while (my $id = each %db) { + print "$id => $db{$sequence:1,100}\n"; + print "$id => ",tied(%db)->length($id),"\n"; + } + +You may, in addition invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied +hash methods directly. + +=over 4 + +=item $id = $db-E<gt>FIRSTKEY + +Return the first ID in the database. + +=item $id = $db-E<gt>NEXTKEY($id) + +Given an ID, return the next ID in sequence. + +=back + +This allows you to write the following iterative loop using just the +object-oriented interface: + + my $db = Bio::DB::Fasta->new('/path/to/fasta/files'); + for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) { + # do something with sequence + } + +=head2 Creating a Tied Filehandle + +The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from +which you can read Bio::PrimarySeq::Fasta sequence objects +sequentially. The following bit of code will iterate sequentially +over all sequences in the database: + + my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files'); + while (my $seq = <$fh>) { + print $seq->id,' => ',$seq->length,"\n"; + } + +When no more sequences remain to be retrieved, the stream will return +undef. + +=head1 BUGS + +When a sequence is deleted from one of the Fasta files, this deletion +is not detected by the module and removed from the index. As a +result, a "ghost" entry will remain in the index and will return +garbage results if accessed. + +Currently, the only way to accomodate deletions is to rebuild the +entire index, either by deleting it manually, or by passing +-reindex=E<gt>1 to new() when initializing the module. + +=head1 SEE ALSO + +L<bioperl> + +=head1 AUTHOR + +Lincoln Stein E<lt>lstein@cshl.orgE<gt>. + +Copyright (c) 2001 Cold Spring Harbor Laboratory. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. See DISCLAIMER.txt for +disclaimers of warranty. + +=cut + +#' +package Bio::DB::Fasta; + +BEGIN { + @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File) +} + +use strict; +use IO::File; +use AnyDBM_File; +use Fcntl; +use File::Basename qw(basename dirname); +use Bio::DB::SeqI; +use Bio::Root::Root; +use vars qw($VERSION @ISA); + +@ISA = qw(Bio::DB::SeqI Bio::Root::Root); + +$VERSION = '1.03'; + +*seq = *sequence = \&subseq; +*ids = \&get_all_ids; +*get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id; + +use constant STRUCT =>'NNnnCa*'; +use constant DNA => 1; +use constant RNA => 2; +use constant PROTEIN => 3; + +# Bio::DB-like object +# providing fast random access to a directory of FASTA files + +=head2 new + + Title : new + Usage : my $db = new Bio::DB::Fasta( $path, @options); + Function: initialize a new Bio::DB::Fasta object + Returns : new Bio::DB::Fasta object + Args : path to dir of fasta files or a single filename + +These are optional arguments to pass in as well. + + -glob Glob expression to use *.{fa,fasta,fast,FA,FASTA,FAST} + for searching for Fasta + files in directories. + + -makeid A code subroutine for None + transforming Fasta IDs. + + -maxopen Maximum size of 32 + filehandle cache. + + -debug Turn on status 0 + messages. + + -reindex Force the index to be 0 + rebuilt. + + -dbmargs Additional arguments none + to pass to the DBM + routines when tied + (scalar or array ref). + +=cut + +sub new { + my $class = shift; + my $path = shift; + my %opts = @_; + + my $self = bless { debug => $opts{-debug}, + makeid => $opts{-makeid}, + glob => $opts{-glob} || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}', + maxopen => $opts{-maxfh} || 32, + dbmargs => $opts{-dbmargs} || undef, + fhcache => {}, + cacheseq => {}, + curopen => 0, + openseq => 1, + dirname => undef, + offsets => undef, + }, $class; + my ($offsets,$dirname); + + if (-d $path) { + $offsets = $self->index_dir($path,$opts{-reindex}); + $dirname = $path; + } elsif (-f _) { + $offsets = $self->index_file($path,$opts{-reindex}); + $dirname = dirname($path); + } else { + $self->throw( "$path: Invalid file or dirname"); + } + @{$self}{qw(dirname offsets)} = ($dirname,$offsets); + + $self; +} + +=head2 newFh + + Title : newFh + Function: gets a new Fh for a file + Example : internal method + Returns : GLOB + Args : + +=cut + +sub newFh { + my $class = shift; + my $self = $class->new(@_); + require Symbol; + my $fh = Symbol::gensym or return; + tie $$fh,'Bio::DB::Fasta::Stream',$self or return; + $fh; +} + +sub _open_index { + my $self = shift; + my ($index,$write) = @_; + my %offsets; + my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY; + my @dbmargs = $self->dbmargs; + tie %offsets,'AnyDBM_File',$index,$flags,0644,@dbmargs or $self->throw( "Can't open cache file: $!"); + return \%offsets; +} + +=head2 index_dir + + Title : index_dir + Usage : $db->index_dir($dir) + Function: set the index dir and load all files in the dir + Returns : hashref of seq offsets in each file + Args : dirname, boolean to force a reload of all files + +=cut + +sub index_dir { + my $self = shift; + my $dir = shift; + my $force_reindex = shift; + + # find all fasta files + my @files = glob("$dir/$self->{glob}"); + $self->throw( "no fasta files in $dir") unless @files; + + # get name of index + my $index = $self->index_name($dir,1); + + # if caller has requested reindexing, then unlink + # the index file. + unlink $index if $force_reindex; + + # get the modification time of the index + my $indextime = (stat($index))[9] || 0; + + # get the most recent modification time of any of the contents + my $modtime = 0; + my %modtime; + foreach (@files) { + my $m = (stat($_))[9]; + $modtime{$_} = $m; + $modtime = $m if $modtime < $m; + } + + my $reindex = $force_reindex || $indextime < $modtime; + my $offsets = $self->_open_index($index,$reindex) or return; + $self->{offsets} = $offsets; + + # no indexing needed + return $offsets unless $reindex; + + # otherwise reindex contents of changed files + $self->{indexing} = $index; + foreach (@files) { + next if( defined $indextime && $modtime{$_} <= $indextime); + $self->calculate_offsets($_,$offsets); + } + delete $self->{indexing}; + return $self->{offsets}; +} + +=head2 get_Seq_by_id + + Title : get_Seq_by_id + Usage : my $seq = $db->get_Seq_by_id($id) + Function: Bio::DB::RandomAccessI method implemented + Returns : Bio::PrimarySeqI object + Args : id + +=cut + +sub get_Seq_by_id { + my $self = shift; + my $id = shift; + return Bio::PrimarySeq::Fasta->new($self,$id); +} + +=head2 index_file + + Title : index_file + Usage : $db->index_file($filename) + Function: (re)loads a sequence file and indexes sequences offsets in the file + Returns : seq offsets in the file + Args : filename, + boolean to force reloading a file + +=cut + + +sub index_file { + my $self = shift; + my $file = shift; + my $force_reindex = shift; + + my $index = $self->index_name($file); + # if caller has requested reindexing, then unlink the index + unlink $index if $force_reindex; + + # get the modification time of the index + my $indextime = (stat($index))[9]; + my $modtime = (stat($file))[9]; + + my $reindex = $force_reindex || $indextime < $modtime; + my $offsets = $self->_open_index($index,$reindex) or return; + $self->{offsets} = $offsets; + + return $self->{offsets} unless $reindex; + + $self->{indexing} = $index; + $self->calculate_offsets($file,$offsets); + delete $self->{indexing}; + return $self->{offsets}; +} + +=head2 dbmargs + + Title : dbmargs + Usage : my @args = $db->dbmargs; + Function: gets stored dbm arguments + Returns : array + Args : none + + +=cut + +sub dbmargs { + my $self = shift; + my $args = $self->{dbmargs} or return; + return ref($args) eq 'ARRAY' ? @$args : $args; +} + +=head2 index_name + + Title : index_name + Usage : my $indexname = $db->index_name($path,$isdir); + Function: returns the name of the index for a specific path + Returns : string + Args : path to check, + boolean if it is a dir + +=cut + +sub index_name { + my $self = shift; + my ($path,$isdir) = @_; + unless ($path) { + my $dir = $self->{dirname} or return; + return $self->index_name($dir,-d $dir); + } + return "$path/directory.index" if $isdir; + return "$path.index"; +} + +=head2 calculate_offsets + + Title : calculate_offsets + Usage : $db->calculate_offsets($filename,$offsets); + Function: calculates the sequence offsets in a file based on id + Returns : offset hash for each file + Args : file to process + $offsets - hashref of id to offset storage + +=cut + +sub calculate_offsets { + my $self = shift; + my ($file,$offsets) = @_; + my $base = $self->path2fileno(basename($file)); + + my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!"); + warn "indexing $file\n" if $self->{debug}; + my ($offset,$id,$linelength,$type,$firstline,$count,%offsets); + while (<$fh>) { # don't try this at home + if (/^>(\S+)/) { + print STDERR "indexed $count sequences...\n" + if $self->{debug} && (++$count%1000) == 0; + my $pos = tell($fh); + if ($id) { + my $seqlength = $pos - $offset - length($_) - 1; + $seqlength -= int($seqlength/$linelength); + $offsets->{$id} = $self->_pack($offset,$seqlength, + $linelength,$firstline, + $type,$base); + } + $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $1; + ($offset,$firstline,$linelength) = ($pos,length($_),0); + } else { + $linelength ||= length($_); + $type ||= $self->_type($_); + } + } + # deal with last entry + if ($id) { + my $pos = tell($fh); + +# my $seqlength = $pos - $offset - length($_) - 1; + # $_ is always null should not be part of this calculation + my $seqlength = $pos - $offset - 1; + + if ($linelength == 0) { # yet another pesky empty chr_random.fa file + $seqlength = 0; + } else { + $seqlength -= int($seqlength/$linelength); + }; + $offsets->{$id} = $self->_pack($offset,$seqlength, + $linelength,$firstline, + $type,$base); + } + return \%offsets; +} + +=head2 get_all_ids + + Title : get_all_ids + Usage : my @ids = $db->get_all_ids + Function: gets all the stored ids in all indexes + Returns : list of ids + Args : none + +=cut + +sub get_all_ids { grep {!/^__/} keys %{shift->{offsets}} } + +sub offset { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[0]; +} + +sub length { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[1]; +} + +sub linelen { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[2]; +} + +sub headerlen { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + ($self->_unpack($offset))[3]; +} + +sub alphabet { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + my $type = ($self->_unpack($offset))[4]; + return $type == DNA ? 'dna' + : $type == RNA ? 'rna' + : 'protein'; + +} + +sub path { shift->{dirname} } + +sub header_offset { + my $self = shift; + my $id = shift; + return unless $self->{offsets}{$id}; + return $self->offset($id) - $self->headerlen($id); +} + +sub file { + my $self = shift; + my $id = shift; + my $offset = $self->{offsets}{$id} or return; + $self->fileno2path(($self->_unpack($offset))[5]); +} + +sub fileno2path { + my $self = shift; + my $no = shift; + return $self->{offsets}{"__file_$no"}; +} + +sub path2fileno { + my $self = shift; + my $path = shift; + if ( !defined $self->{offsets}{"__path_$path"} ) { + my $fileno = ($self->{offsets}{"__path_$path"} = 0+ $self->{fileno}++); + $self->{offsets}{"__file_$fileno"} = $path; + } + return $self->{offsets}{"__path_$path"} +} + +=head2 subseq + + Title : subseq + Usage : $seqdb->subseq($id,$start,$stop); + Function: returns a subseq of a sequence in the db + Returns : subsequence data + Args : id of sequence, starting point, ending point + +=cut + +sub subseq { + my ($self,$id,$start,$stop) = @_; + if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) { + ($id,$start,$stop) = ($1,$2,$3); + $start =~ s/_//g; + $stop =~ s/_//g; + } + $start ||= 1; + $stop ||= $self->length($id); + + my $reversed; + if ($start > $stop) { + ($start,$stop) = ($stop,$start); + $reversed++; + } + + my $data; + + my $fh = $self->fh($id) or return; + my $filestart = $self->caloffset($id,$start); + my $filestop = $self->caloffset($id,$stop); + + seek($fh,$filestart,0); + read($fh,$data,$filestop-$filestart+1); + $data =~ s/\n//g; + if ($reversed) { + $data = reverse $data; + $data =~ tr/gatcGATC/ctagCTAG/; + } + $data; +} + +sub fh { + my $self = shift; + my $id = shift; + my $file = $self->file($id) or return; + $self->fhcache("$self->{dirname}/$file") or $self->throw( "Can't open file $file"); +} + +sub header { + my $self = shift; + my $id = shift; + my ($offset,$seqlength,$linelength,$firstline,$type,$file) + = $self->_unpack($self->{offsets}{$id}) or return; + $offset -= $firstline; + my $data; + my $fh = $self->fh($id) or return; + seek($fh,$offset,0); + read($fh,$data,$firstline); + chomp $data; + substr($data,0,1) = ''; + $data; +} + +sub caloffset { + my $self = shift; + my $id = shift; + my $a = shift()-1; + my ($offset,$seqlength,$linelength,$firstline,$type,$file) = $self->_unpack($self->{offsets}{$id}); + $a = 0 if $a < 0; + $a = $seqlength-1 if $a >= $seqlength; + $offset + $linelength * int($a/($linelength-1)) + $a % ($linelength-1); +} + +sub fhcache { + my $self = shift; + my $path = shift; + if (!$self->{fhcache}{$path}) { + if ($self->{curopen} >= $self->{maxopen}) { + my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}}; + splice(@lru, $self->{maxopen} / 3); + $self->{curopen} -= @lru; + for (@lru) { delete $self->{fhcache}{$_} } + } + $self->{fhcache}{$path} = IO::File->new($path) or return; + $self->{curopen}++; + } + $self->{cacheseq}{$path}++; + $self->{fhcache}{$path} +} + +sub _pack { + shift; + pack STRUCT,@_; +} + +sub _unpack { + shift; + unpack STRUCT,shift; +} + +sub _type { + shift; + local $_ = shift; + return /^[gatcnGATCN*-]+$/ ? DNA + : /^[gaucnGAUCN*-]+$/ ? RNA + : PROTEIN; +} + +=head2 get_PrimarySeq_stream + + Title : get_PrimarySeq_stream + Usage : + Function: + Example : + Returns : + Args : + + +=cut + +sub get_PrimarySeq_stream { + my $self = shift; + return Bio::DB::Fasta::Stream->new($self); +} + +sub TIEHASH { + my $self = shift; + return $self->new(@_); +} + +sub FETCH { + shift->subseq(@_); +} +sub STORE { + shift->throw("Read-only database"); +} +sub DELETE { + shift->throw("Read-only database"); +} +sub CLEAR { + shift->throw("Read-only database"); +} +sub EXISTS { + defined shift->offset(@_); +} +sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); } +sub NEXTKEY { tied(%{shift->{offsets}})->NEXTKEY(@_); } + +sub DESTROY { + my $self = shift; + if ($self->{indexing}) { # killed prematurely, so index file is no good! + warn "indexing was interrupted, so unlinking $self->{indexing}"; + unlink $self->{indexing}; + } +} + +#------------------------------------------------------------- +# Bio::PrimarySeqI compatibility +# +package Bio::PrimarySeq::Fasta; +use overload '""' => 'display_id'; + +use vars '@ISA'; +eval { + require Bio::PrimarySeqI; + require Bio::Root::Root; +} && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI')); + +sub new { + my $class = shift; + $class = ref($class) if ref $class; + my ($db,$id,$start,$stop) = @_; + return bless { db => $db, + id => $id, + start => $start || 1, + stop => $stop || $db->length($id) + },$class; +} + +sub seq { + my $self = shift; + return $self->{db}->seq($self->{id},$self->{start},$self->{stop}); +} + +sub subseq { + my $self = shift; + my ($start,$stop) = @_; + $self->throw("Stop cannot be smaller than start") unless $start <= $stop; + return $self->{start} <= $self->{stop} ? $self->new($self->{db}, + $self->{id}, + $self->{start}+$start-1, + $self->{start}+$stop-1) + : $self->new($self->{db}, + $self->{id}, + $self->{start}-($start-1), + $self->{start}-($stop-1) + ); + +} + +sub display_id { + my $self = shift; + return $self->{id}; +} + +sub accession_number { + my $self = shift; + return "unknown"; +} + +sub primary_id { + my $self = shift; + return overload::StrVal($self); +} + +sub can_call_new { return 0 } + +sub alphabet { + my $self = shift; + return $self->{db}->alphabet($self->{id}); +} + +sub revcom { + my $self = shift; + return $self->new(@{$self}{'db','id','stop','start'}); +} + +sub length { + my $self = shift; + return $self->{db}->length($self->{id}); +} + +sub desc { + my $self = shift; + return ''; +} + +#------------------------------------------------------------- +# stream-based access to the database +# +package Bio::DB::Fasta::Stream; +use Tie::Handle; +use vars qw(@ISA); +@ISA = qw(Tie::Handle); +eval { + require Bio::DB::SeqI; +} && (push @ISA,'Bio::DB::SeqI'); + + +sub new { + my $class = shift; + my $db = shift; + my $key = $db->FIRSTKEY; + return bless { db=>$db,key=>$key },$class; +} + +sub next_seq { + my $self = shift; + my ($key,$db) = @{$self}{'key','db'}; + my $value = $db->get_Seq_by_id($key); + $self->{key} = $db->NEXTKEY($key); + $value; +} + +sub TIEHANDLE { + my $class = shift; + my $db = shift; + return $class->new($db); +} +sub READLINE { + my $self = shift; + $self->next_seq; +} + +1; + +__END__ +