ensembl: variant_effect_predictor/Bio/DB/Fasta.pm comparison

comparison variant_effect_predictor/Bio/DB/Fasta.pm @ 0:1f6dce3d34e0

Uploaded

author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:1f6dce3d34e0
+=head1 NAME
+Bio::DB::Fasta -- Fast indexed access to a directory of fasta files
+=head1 SYNOPSIS
+use Bio::DB::Fasta;
+# create database from directory of fasta files
+my $db      = Bio::DB::Fasta->new('/path/to/fasta/files');
+# simple access (for those without Bioperl)
+my $seq     = $db->seq('CHROMOSOME_I',4_000_000 => 4_100_000);
+my $revseq  = $db->seq('CHROMOSOME_I',4_100_000 => 4_000_000);
+my @ids     = $db->ids;
+my $length  = $db->length('CHROMOSOME_I');
+my $alphabet = $db->alphabet('CHROMOSOME_I');
+my $header  = $db->header('CHROMOSOME_I');
+# Bioperl-style access
+my $db      = Bio::DB::Fasta->new('/path/to/fasta/files');
+my $obj     = $db->get_Seq_by_id('CHROMOSOME_I');
+my $seq     = $obj->seq;
+my $subseq  = $obj->subseq(4_000_000 => 4_100_000);
+my $length  = $obj->length;
+# (etc)
+# Bio::SeqIO-style access
+my $stream  = Bio::DB::Fasta->new('/path/to/fasta/files')->get_PrimarySeq_stream;
+while (my $seq = $stream->next_seq) {
+# Bio::PrimarySeqI stuff
+}
+my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
+while (my $seq = <$fh>) {
+# Bio::PrimarySeqI stuff
+}
+# tied hash access
+tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files';
+print $sequences{'CHROMOSOME_I:1,20000'};
+=head1 DESCRIPTION
+Bio::DB::Fasta provides indexed access to one or more Fasta files.  It
+provides random access to each sequence entry, and to subsequences
+within each entry, allowing you to retrieve portions of very large
+sequences without bringing the entire sequence into memory.
+When you initialize the module, you point it at a single fasta file or
+a directory of multiple such files.  The first time it is run, the
+module generates an index of the contents of the file or directory
+using the AnyDBM module (Berkeley DB preferred, followed by GDBM_File,
+NDBM_File, and SDBM_File).  Thereafter it uses the index file to find
+the file and offset for any requested sequence.  If one of the source
+fasta files is updated, the module reindexes just that one file.  (You
+can also force reindexing manually).  For improved performance, the
+module keeps a cache of open filehandles, closing less-recently used
+ones when the cache is full.
+The fasta files may contain any combination of nucleotide and protein
+sequences; during indexing the module guesses the molecular type.
+Entries may have any line length, and different line lengths are
+allowed in the same file.  However, within a sequence entry, all lines
+must be the same length except for the last.
+The module uses /^E<gt>(\S+)/ to extract each sequence's primary ID from
+the Fasta header.  During indexing, you may pass a callback routine to
+modify this primary ID.  For example, you may wish to extract a
+portion of the gi|gb|abc|xyz nonsense that GenBank Fasta files use.
+The original header line can be recovered later.
+This module was developed for use with the C. elegans and human
+genomes, and has been tested with sequence segments as large as 20
+megabases.  Indexing the C. elegans genome (100 megabases of genomic
+sequence plus 100,000 ESTs) takes ~5 minutes on my 300 MHz pentium
+laptop. On the same system, average access time for any 200-mer within
+the C. elegans genome was E<lt>0.02s.
+=head1 DATABASE CREATION AND INDEXING
+The two constructors for this class are new() and newFh().  The former
+creates a Bio::DB::Fasta object which is accessed via method calls.
+The latter creates a tied filehandle which can be used Bio::SeqIO
+style to fetch sequence objects in a stream fashion.  There is also a
+tied hash interface.
+=over 4
+=item $db = Bio::DB::Fasta-E<gt>new($fasta_path [,%options])
+Create a new Bio::DB::Fasta object from the Fasta file or files
+indicated by $fasta_path.  Indexing will be performed automatically if
+needed.  If successful, new() will return the database accessor
+object.  Otherwise it will return undef.
+$fasta_path may be an individual Fasta file, or may refer to a
+directory containing one or more of such files.  Following the path,
+you may pass a series of name=E<gt>value options or a hash with these
+same name=E<gt>value pairs.  Valid options are:
+Option Name   Description               Default
+-----------   -----------               -------
+-glob         Glob expression to use    *.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}
+for searching for Fasta
+	       files in directories.
+-makeid       A code subroutine for     None
+	       transforming Fasta IDs.
+-maxopen      Maximum size of		 32
+	       filehandle cache.
+-debug        Turn on status		 0
+	       messages.
+-reindex      Force the index to be     0
+rebuilt.
+-dbmargs      Additional arguments      none
+to pass to the DBM
+routines when tied
+(scalar or array ref).
+-dbmargs can be used to control the format of the index.  For example,
+you can pass $DB_BTREE to this argument so as to force the IDs to be
+sorted and retrieved alphabetically.  Note that you must use the same
+arguments every time you open the index!
+-reindex can be used to force the index to be recreated from scratch.
+=item $fh = Bio::DB::Fasta-E<gt>newFh($fasta_path [,%options])
+Create a tied filehandle opened on a Bio::DB::Fasta object.  Reading
+from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
+Bio::SeqIO style.
+=back
+The -makeid option gives you a chance to modify sequence IDs during
+indexing.  The option's value should be a code reference that will
+take a scalar argument and return a scalar result, like this:
+$db = Bio::DB::Fasta->new("file.fa",-makeid=>\&make_my_id);
+sub make_my_id {
+my $description_line = shift;
+# get a new id from the fasta header
+return $new_id;
+}
+make_my_id() will be called with the full fasta id line (including the
+"E<gt>" symbol!).  For example:
+>A12345.3 Predicted C. elegans protein egl-2
+By default, this module will use the regular expression /^E<gt>(\S+)/
+to extract "A12345.3" for use as the ID.  If you pass a -makeid
+callback, you can extract any portion of this, such as the "egl-2"
+symbol.
+The -makeid option is ignored after the index is constructed.
+=head1 OBJECT METHODS
+The following object methods are provided.
+=over 4
+=item $raw_seq = $db-E<gt>seq($id [,$start, $stop])
+Return the raw sequence (a string) given an ID and optionally a start
+and stop position in the sequence.  In the case of DNA sequence, if
+$stop is less than $start, then the reverse complement of the sequence
+is returned (this violates Bio::Seq conventions).
+For your convenience, subsequences can be indicated with this compound
+ID:
+$db->seq("$id:$start,$stop")
+=item $length = $db-E<gt>length($id)
+Return the length of the indicated sequence.
+=item $header = $db-E<gt>header($id)
+Return the header line for the ID, without the initial "E<gt>" and without the trailing newline.
+=item $type  = $db-E<gt>alphabet($id)
+Return the molecular type of the indicated sequence.  One of "dna",
+"rna" or "protein".
+=item $filename  = $db-E<gt>file($id)
+Return the name of the file in which the indicated sequence can be
+found.
+=item $offset    = $db-E<gt>offset($id)
+Return the offset of the indicated sequence from the beginning of the
+file in which it is located.  The offset points to the beginning of
+the sequence, not the beginning of the header line.
+=item $header_length = $db-E<gt>headerlen($id)
+Return the length of the header line for the indicated sequence.
+=item $header_offset = $db-E<gt>header_offset($id)
+Return the offset of the header line for the indicated sequence from
+the beginning of the file in which it is located.
+=item $index_name  = $db-E<gt>index_name
+Return the path to the index file.
+=item $path = $db-E<gt>path
+Return the path to the Fasta file(s).
+=back
+For BioPerl-style access, the following methods are provided:
+=over 4
+=item $seq = $db-E<gt>get_Seq_by_id($id)
+Return a Bio::PrimarySeq::Fasta object, which obeys the
+Bio::PrimarySeqI conventions.  For example, to recover the raw DNA or
+protein sequence, call $seq-E<gt>seq().
+Note that get_Seq_by_id() does not bring the entire sequence into
+memory until requested.  Internally, the returned object uses the
+accessor to generate subsequences as needed.
+=item $seq = $db-E<gt>get_Seq_by_acc($id)
+=item $seq = $db-E<gt>get_Seq_by_primary_id($id)
+These methods all do the same thing as get_Seq_by_id().
+=item $stream = $db-E<gt>get_PrimarySeq_stream()
+Return a Bio::DB::Fasta::Stream object, which supports a single method
+next_seq(). Each call to next_seq() returns a new
+Bio::PrimarySeq::Fasta object, until no more sequences remain.
+=back
+See L<Bio::PrimarySeqI> for methods provided by the sequence objects
+returned from get_Seq_by_id() and get_PrimarySeq_stream().
+=head1 TIED INTERFACES
+This module provides two tied interfaces, one which allows you to
+treat the sequence database as a hash, and the other which allows you
+to treat the database as an I/O stream.
+=head2 Creating a Tied Hash
+The tied hash interface is very straightforward:
+=over 4
+=item $obj = tie %db,'Bio::DB::Fasta','/path/to/fasta/files' [,@args]
+Tie %db to Bio::DB::Fasta using the indicated path to the Fasta files.
+The optional @args list is the same set of named argument/value pairs
+used by Bio::DB::Fasta-E<gt>new().
+If successful, tie() will return the tied object.  Otherwise it will
+return undef.
+=back
+Once tied, you can use the hash to retrieve an individual sequence by
+its ID, like this:
+my $seq = $db{CHROMOSOME_I};
+You may select a subsequence by appending the comma-separated range to
+the sequence ID in the format "$id:$start,$stop".  For example, here
+is the first 1000 bp of the sequence with the ID "CHROMOSOME_I":
+my $seq = $db{'CHROMOSOME_I:1,1000'};
+(The regular expression used to parse this format allows sequence IDs
+to contain colons.)
+When selecting subsequences, if $start E<gt> $stop, then the reverse
+complement will be returned for DNA sequences.
+The keys() and values() functions will return the sequence IDs and
+their sequences, respectively.  In addition, each() can be used to
+iterate over the entire data set:
+while (my ($id,$sequence) = each %db) {
+print "$id => $sequence\n";
+}
+When dealing with very large sequences, you can avoid bringing them
+into memory by calling each() in a scalar context.  This returns the
+key only.  You can then use tied(%db) to recover the Bio::DB::Fasta
+object and call its methods.
+while (my $id = each %db) {
+print "$id => ",$db{"$id:1,100"},"\n";
+print "$id => ",tied(%db)->length($id),"\n";
+}
+You may, in addition invoke Bio::DB::Fasta's FIRSTKEY and NEXTKEY tied
+hash methods directly.
+=over 4
+=item $id = $db-E<gt>FIRSTKEY
+Return the first ID in the database.
+=item $id = $db-E<gt>NEXTKEY($id)
+Given an ID, return the next ID in sequence.
+=back
+This allows you to write the following iterative loop using just the
+object-oriented interface:
+my $db = Bio::DB::Fasta->new('/path/to/fasta/files');
+for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
+# do something with sequence
+}
+=head2 Creating a Tied Filehandle
+The Bio::DB::Fasta-E<gt>newFh() method creates a tied filehandle from
+which you can read Bio::PrimarySeq::Fasta sequence objects
+sequentially.  The following bit of code will iterate sequentially
+over all sequences in the database:
+my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');
+while (my $seq = <$fh>) {
+print $seq->id,' => ',$seq->length,"\n";
+}
+When no more sequences remain to be retrieved, the stream will return
+undef.
+=head1 BUGS
+When a sequence is deleted from one of the Fasta files, this deletion
+is not detected by the module and removed from the index.  As a
+result, a "ghost" entry will remain in the index and will return
+garbage results if accessed.
+Currently, the only way to accommodate deletions is to rebuild the
+entire index, either by deleting it manually, or by passing
+-reindex=E<gt>1 to new() when initializing the module.
+=head1 SEE ALSO
+L<bioperl>
+=head1 AUTHOR
+Lincoln Stein E<lt>lstein@cshl.orgE<gt>.
+Copyright (c) 2001 Cold Spring Harbor Laboratory.
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself.  See DISCLAIMER.txt for
+disclaimers of warranty.
+=cut
+#'
+package Bio::DB::Fasta;
+# Set the order in which AnyDBM_File tries its DBM back ends, before
+# "use AnyDBM_File" is compiled: DB_File first, then GDBM_File, NDBM_File
+# and SDBM_File.
+BEGIN {
+@AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
+}
+use strict;
+use IO::File;
+use AnyDBM_File;
+use Fcntl;
+use File::Basename qw(basename dirname);
+use Bio::DB::SeqI;
+use Bio::Root::Root;
+use vars qw($VERSION @ISA);
+@ISA = qw(Bio::DB::SeqI Bio::Root::Root);
+$VERSION = '1.03';
+# Public method aliases:
+#   seq(), sequence()                  -> subseq()
+#   ids()                              -> get_all_ids()
+#   get_Seq_by_acc(), get_Seq_by_primary_id() -> get_Seq_by_id()
+# The POD documents get_Seq_by_primary_id() (capital "S"); the lower-case
+# get_seq_by_primary_id() spelling is kept so existing callers still work.
+*seq = *sequence = \&subseq;
+*ids = \&get_all_ids;
+*get_Seq_by_primary_id = *get_seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
+# pack()/unpack() template for one index record.  Fields, in order:
+#   N  file offset of the first sequence byte
+#   N  sequence length
+#   n  line length (as read, i.e. including the line terminator)
+#   n  length of the header line
+#   C  molecule type code (DNA, RNA or PROTEIN below)
+#   a* file number, as handed out by path2fileno()
+use constant STRUCT =>'NNnnCa*';
+# Molecule type codes stored in the "C" field of STRUCT.
+use constant DNA     => 1;
+use constant RNA     => 2;
+use constant PROTEIN => 3;
+# Bio::DB-like object
+# providing fast random access to a directory of FASTA files
+=head2 new
+Title   : new
+Usage   : my $db = new Bio::DB::Fasta( $path, @options);
+Function: initialize a new Bio::DB::Fasta object
+Returns : new Bio::DB::Fasta object
+Args    : path to dir of fasta files or a single filename
+These are optional arguments to pass in as well.
+-glob         Glob expression to use    *.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}
+for searching for Fasta
+	       files in directories.
+-makeid       A code subroutine for     None
+	       transforming Fasta IDs.
+-maxopen      Maximum size of		 32
+	       filehandle cache.
+-debug        Turn on status		 0
+	       messages.
+-reindex      Force the index to be     0
+rebuilt.
+-dbmargs      Additional arguments      none
+to pass to the DBM
+routines when tied
+(scalar or array ref).
+=cut
+# Constructor.
+#   $path  - a single Fasta file, or a directory containing Fasta files
+#   %opts  - -glob, -makeid, -maxopen, -debug, -reindex, -dbmargs
+# Opens (building it first when necessary) the index for $path and returns
+# the blessed accessor object.  Throws if $path is neither a file nor a
+# directory.
+sub new {
+    my $class = shift;
+    my $path  = shift;
+    my %opts  = @_;
+    # The documented option name is -maxopen.  -maxfh is the spelling this
+    # constructor historically read, so it is still accepted as a fallback.
+    my $maxopen = $opts{-maxopen} || $opts{-maxfh} || 32;
+    my $self = bless {
+        debug    => $opts{-debug},
+        makeid   => $opts{-makeid},
+        glob     => $opts{-glob}    || '*.{fa,fasta,FA,FASTA,fast,FAST,dna,fsa}',
+        maxopen  => $maxopen,
+        dbmargs  => $opts{-dbmargs} || undef,
+        fhcache  => {},
+        cacheseq => {},
+        curopen  => 0,
+        openseq  => 1,
+        dirname  => undef,
+        offsets  => undef,
+    }, $class;
+    my ($offsets,$dirname);
+    if (-d $path) {
+        $offsets = $self->index_dir($path,$opts{-reindex});
+        $dirname = $path;
+    } elsif (-f _) {
+        # "_" reuses the stat buffer filled by the -d test above
+        $offsets = $self->index_file($path,$opts{-reindex});
+        $dirname = dirname($path);
+    } else {
+        $self->throw( "$path: Invalid file or dirname");
+    }
+    @{$self}{qw(dirname offsets)} = ($dirname,$offsets);
+    $self;
+}
+=head2 newFh
+Title   : newFh
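+Function: creates a new database object and returns a tied filehandle from which its sequences can be read as a stream
+Example : my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files');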
+Returns : GLOB
+Args    :
+=cut
+# Construct a database object from the given arguments and return a fresh
+# glob tied to Bio::DB::Fasta::Stream over it.  Returns nothing if the glob
+# cannot be created or the tie fails.
+sub newFh {
+    my ($class, @args) = @_;
+    my $db = $class->new(@args);
+    require Symbol;
+    my $handle = Symbol::gensym();
+    return unless $handle;
+    my $tied = tie $$handle, 'Bio::DB::Fasta::Stream', $db;
+    return unless $tied;
+    return $handle;
+}
+# Tie a hash to the DBM file $index and return a reference to it.  The file
+# is opened read/write (and created if missing) when $write is true,
+# read-only otherwise.  Extra tie arguments come from dbmargs().  Throws if
+# the tie fails.
+sub _open_index {
+    my ($self, $index, $write) = @_;
+    my $flags = O_RDONLY;
+    $flags = O_CREAT|O_RDWR if $write;
+    my @dbmargs = $self->dbmargs;
+    my %offsets;
+    unless (tie %offsets, 'AnyDBM_File', $index, $flags, 0644, @dbmargs) {
+        $self->throw( "Can't open cache file: $!");
+    }
+    return \%offsets;
+}
+=head2 index_dir
+Title   : index_dir
+Usage   : $db->index_dir($dir)
+Function: set the index dir and load all files in the dir
+Returns : hashref of seq offsets in each file
+Args    : dirname, boolean to force a reload of all files
+=cut
+# Open the index for a directory of Fasta files, (re)indexing any file that
+# is newer than the index.
+#   $dir           - directory to scan using the object's glob pattern
+#   $force_reindex - true to delete the existing index and rebuild it
+# Returns the tied offsets hashref.  Throws if no file matches the glob.
+sub index_dir {
+my $self = shift;
+my $dir  = shift;
+my $force_reindex = shift;
+# find all fasta files
+my @files = glob("$dir/$self->{glob}");
+$self->throw( "no fasta files in $dir") unless @files;
+# get name of index
+my $index = $self->index_name($dir,1);
+# if caller has requested reindexing, then unlink
+# the index file.
+unlink $index if $force_reindex;
+# get the modification time of the index
+# (0 when the index does not exist, so every file counts as newer)
+my $indextime = (stat($index))[9] || 0;
+# get the most recent modification time of any of the contents
+my $modtime = 0;
+my %modtime;
+foreach (@files) {
+my $m = (stat($_))[9];
+$modtime{$_} = $m;
+$modtime = $m if $modtime < $m;
+}
+my $reindex = $force_reindex || $indextime < $modtime;
+# the index is opened writable only when something has to be reindexed
+my $offsets = $self->_open_index($index,$reindex) or return;
+# must be set before calculate_offsets(), whose path2fileno() call
+# reads and writes $self->{offsets}
+$self->{offsets} = $offsets;
+# no indexing needed
+return $offsets unless $reindex;
+# otherwise reindex contents of changed files
+# {indexing} marks the index as incomplete; DESTROY unlinks it if still set
+$self->{indexing} = $index;
+foreach (@files) {
+next if( defined $indextime && $modtime{$_} <= $indextime);
+$self->calculate_offsets($_,$offsets);
+}
+delete $self->{indexing};
+return $self->{offsets};
+}
+=head2 get_Seq_by_id
+Title   : get_Seq_by_id
+Usage   : my $seq = $db->get_Seq_by_id($id)
+Function: Bio::DB::RandomAccessI method implemented
+Returns : Bio::PrimarySeqI object
+Args    : id
+=cut
+# Wrap the sequence named $id in a Bio::PrimarySeq::Fasta object bound to
+# this database.
+sub get_Seq_by_id {
+    my ($self, $id) = @_;
+    my $seq = Bio::PrimarySeq::Fasta->new($self, $id);
+    return $seq;
+}
+=head2 index_file
+Title   : index_file
+Usage   : $db->index_file($filename)
+Function: (re)loads a sequence file and indexes sequences offsets in the file
+Returns : seq offsets in the file
+Args    : filename,
+boolean to force reloading a file
+=cut
+# Open the index for a single Fasta file, rebuilding it when the file is
+# newer than the index or when $force_reindex is true.
+#   $file          - path of the Fasta file
+#   $force_reindex - true to delete the existing index first
+# Returns the tied offsets hashref.
+sub index_file {
+    my $self = shift;
+    my $file = shift;
+    my $force_reindex = shift;
+    my $index = $self->index_name($file);
+    # if caller has requested reindexing, then unlink the index
+    unlink $index if $force_reindex;
+    # A missing index has no mtime; use 0 (as index_dir does) so that the
+    # comparison below is always between two numbers.
+    my $indextime = (stat($index))[9] || 0;
+    my $modtime   = (stat($file))[9]  || 0;
+    my $reindex = $force_reindex || $indextime < $modtime;
+    my $offsets = $self->_open_index($index,$reindex) or return;
+    # must be set before calculate_offsets(), whose path2fileno() call
+    # reads and writes $self->{offsets}
+    $self->{offsets} = $offsets;
+    return $self->{offsets} unless $reindex;
+    # {indexing} marks the index as incomplete; DESTROY unlinks it if still set
+    $self->{indexing} = $index;
+    $self->calculate_offsets($file,$offsets);
+    delete $self->{indexing};
+    return $self->{offsets};
+}
+=head2 dbmargs
+Title   : dbmargs
+Usage   : my @args = $db->dbmargs;
+Function: gets stored dbm arguments
+Returns : array
+Args    : none
+=cut
+# Return the extra DBM tie arguments given at construction time: the
+# elements of the array ref, the single scalar, or nothing when unset.
+sub dbmargs {
+    my ($self) = @_;
+    my $stored = $self->{dbmargs};
+    return unless $stored;
+    return @$stored if ref($stored) eq 'ARRAY';
+    return $stored;
+}
+=head2 index_name
+Title   : index_name
+Usage   : my $indexname = $db->index_name($path,$isdir);
+Function: returns the name of the index for a specific path
+Returns : string
+Args    : path to check,
+boolean if it is a dir
+=cut
+# Name of the index file for $path: "$path/directory.index" when $isdir is
+# true, "$path.index" otherwise.  Called without a path, it answers for the
+# object's own dirname (or returns nothing when that is unset).
+sub index_name {
+    my ($self, $path, $isdir) = @_;
+    if (!$path) {
+        my $dir = $self->{dirname};
+        return unless $dir;
+        return $self->index_name($dir,-d $dir);
+    }
+    return $isdir ? "$path/directory.index" : "$path.index";
+}
+=head2 calculate_offsets
+Title   : calculate_offsets
+Usage   : $db->calculate_offsets($filename,$offsets);
+Function: calculates the sequence offsets in a file based on id
+Returns : offset hash for each file
+Args    : file to process
+$offsets - hashref of id to offset storage
+=cut
+# Scan one Fasta file and store a packed index record for every entry.
+#   $file    - path of the Fasta file to scan
+#   $offsets - hashref (normally the tied index) that receives id => record
+# Returns $offsets, i.e. the hash that was filled in.  Throws if the file
+# cannot be opened.
+sub calculate_offsets {
+    my $self = shift;
+    my ($file,$offsets) = @_;
+    my $base = $self->path2fileno(basename($file));
+    my $fh = IO::File->new($file) or $self->throw( "Can't open $file: $!");
+    warn "indexing $file\n" if $self->{debug};
+    my ($offset,$id,$linelength,$type,$firstline);
+    my $count = 0;
+
+    # Store the record for the entry currently being read.  $end is the file
+    # position just past the entry's last sequence byte.
+    my $store = sub {
+        my $end = shift;
+        # bytes of sequence data, not counting the final line terminator
+        my $seqlength = $end - $offset - 1;
+        if (!$linelength) {
+            # header with no sequence lines: record an empty sequence
+            # rather than dividing by a zero line length
+            $seqlength = 0;
+        } else {
+            # drop one terminator byte per complete line
+            $seqlength -= int($seqlength/$linelength);
+        }
+        $offsets->{$id} = $self->_pack($offset,$seqlength,
+                                       $linelength,$firstline,
+                                       $type,$base);
+    };
+
+    while (<$fh>) {		# don't try this at home
+        if (/^>(\S+)/) {
+            my $default_id = $1;
+            print STDERR "indexed $count sequences...\n"
+                if $self->{debug} && (++$count%1000) == 0;
+            my $pos = tell($fh);
+            # the previous entry ended where this header line begins
+            $store->($pos - length($_)) if $id;
+            $id = ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($_) : $default_id;
+            ($offset,$firstline,$linelength) = ($pos,length($_),0);
+        } else {
+            # NOTE(review): $type is set from the first sequence line of the
+            # file and is not reset per entry -- confirm this is intended.
+            $linelength ||= length($_);
+            $type       ||= $self->_type($_);
+        }
+    }
+    # deal with last entry, which ends at end of file
+    $store->(tell($fh)) if $id;
+    return $offsets;
+}
+=head2 get_all_ids
+Title   : get_all_ids
+Usage   : my @ids = $db->get_all_ids
+Function: gets all the stored ids in all indexes
+Returns : list of ids
+Args    : none
+=cut
+# List every sequence ID in the index, leaving out the internal
+# bookkeeping keys, which all start with "__".
+sub get_all_ids {
+    my ($self) = @_;
+    return grep { $_ !~ /^__/ } keys %{ $self->{offsets} };
+}
+# Index-record accessors.  Each looks up the packed record for $id and
+# returns one field of it, or nothing when $id is not in the index.
+
+# File offset of the first sequence byte.
+sub offset {
+    my ($self, $id) = @_;
+    my $packed = $self->{offsets}{$id};
+    return unless $packed;
+    return ($self->_unpack($packed))[0];
+}
+# Sequence length.
+sub length {
+    my ($self, $id) = @_;
+    my $packed = $self->{offsets}{$id};
+    return unless $packed;
+    return ($self->_unpack($packed))[1];
+}
+# Length of a sequence line as stored in the file.
+sub linelen {
+    my ($self, $id) = @_;
+    my $packed = $self->{offsets}{$id};
+    return unless $packed;
+    return ($self->_unpack($packed))[2];
+}
+# Length of the header line.
+sub headerlen {
+    my ($self, $id) = @_;
+    my $packed = $self->{offsets}{$id};
+    return unless $packed;
+    return ($self->_unpack($packed))[3];
+}
+# Molecule type recorded for $id as a string: 'dna', 'rna', or 'protein'
+# for any other code.  Returns nothing when $id is not in the index.
+sub alphabet {
+    my ($self, $id) = @_;
+    my $packed = $self->{offsets}{$id};
+    return unless $packed;
+    my $code = ($self->_unpack($packed))[4];
+    return 'dna' if $code == DNA;
+    return 'rna' if $code == RNA;
+    return 'protein';
+}
+# Directory holding the Fasta file(s).
+sub path {
+    my ($self) = @_;
+    return $self->{dirname};
+}
+# File offset at which the header line of $id begins (sequence offset minus
+# header length); nothing when $id is not in the index.
+sub header_offset {
+    my ($self, $id) = @_;
+    if ($self->{offsets}{$id}) {
+        my $seq_start  = $self->offset($id);
+        my $header_len = $self->headerlen($id);
+        return $seq_start - $header_len;
+    }
+    return;
+}
+# Name of the file that contains $id; nothing when $id is not in the index.
+sub file {
+    my ($self, $id) = @_;
+    my $packed = $self->{offsets}{$id};
+    return unless $packed;
+    my $fileno = ($self->_unpack($packed))[5];
+    return $self->fileno2path($fileno);
+}
+# Translate a file number stored in an index record back to its file name.
+sub fileno2path {
+    my ($self, $no) = @_;
+    my $key = "__file_$no";
+    return $self->{offsets}{$key};
+}
+# Return the file number under which $path is recorded in the index,
+# allocating a new number (and storing both the "__path_$path" and
+# "__file_$n" bookkeeping keys) the first time a path is seen.
+sub path2fileno {
+    my $self = shift;
+    my $path = shift;
+    if ( !defined $self->{offsets}{"__path_$path"} ) {
+        # The in-memory counter starts from scratch for every new object, but
+        # an index reopened from disk may already hold __file_0, __file_1, ...
+        # Skip numbers that are taken so that a newly added file never
+        # overwrites the mapping of a file indexed earlier.
+        my $fileno = 0 + ($self->{fileno} || 0);
+        $fileno++ while defined $self->{offsets}{"__file_$fileno"};
+        $self->{fileno} = $fileno + 1;
+        $self->{offsets}{"__path_$path"}   = $fileno;
+        $self->{offsets}{"__file_$fileno"} = $path;
+    }
+    return $self->{offsets}{"__path_$path"}
+}
+=head2 subseq
+Title   : subseq
+Usage   : $seqdb->subseq($id,$start,$stop);
+Function: returns a subseq of a sequence in the db
+Returns : subsequence data
+Args    : id of sequence, starting point, ending point
+=cut
+# Fetch raw sequence text for $id between $start and $stop (1-based,
+# inclusive).  $id may also be a compound "id:start,stop" or
+# "id:start-stop" string, in which digits may be grouped with "_".
+# $start defaults to 1 and $stop to the sequence length.  When $start is
+# greater than $stop the two are swapped and the result is reversed and
+# complemented.  Returns nothing when no filehandle is available for $id.
+sub subseq {
+my ($self,$id,$start,$stop) = @_;
+# compound ID: greedy (.+) lets the ID itself contain colons
+if ($id =~ /^(.+):([\d_]+)[,-]([\d_]+)$/) {
+($id,$start,$stop) = ($1,$2,$3);
+$start =~ s/_//g;
+$stop =~ s/_//g;
+}
+$start ||= 1;
+$stop  ||= $self->length($id);
+my $reversed;
+if ($start > $stop) {
+($start,$stop) = ($stop,$start);
+$reversed++;
+}
+my $data;
+my $fh = $self->fh($id) or return;
+# translate sequence coordinates into byte positions in the file
+my $filestart = $self->caloffset($id,$start);
+my $filestop  = $self->caloffset($id,$stop);
+seek($fh,$filestart,0);
+read($fh,$data,$filestop-$filestart+1);
+# the byte range spans line breaks; remove them
+$data =~ s/\n//g;
+if ($reversed) {
+$data = reverse $data;
+# NOTE(review): the complement is applied whatever the alphabet of $id is
+$data =~ tr/gatcGATC/ctagCTAG/;
+}
+$data;
+}
+# Open (or fetch from the cache) a filehandle on the file that holds $id.
+# Returns nothing when $id has no file; throws when the file cannot be
+# opened.
+sub fh {
+    my ($self, $id) = @_;
+    my $file = $self->file($id);
+    return unless $file;
+    my $handle = $self->fhcache("$self->{dirname}/$file");
+    $self->throw( "Can't open file $file") unless $handle;
+    return $handle;
+}
+# Read the header line of $id back from the Fasta file.  The returned
+# string has the trailing newline and the first character (the ">")
+# removed.  Returns nothing when no filehandle is available for $id.
+sub header {
+my $self = shift;
+my $id   = shift;
+my ($offset,$seqlength,$linelength,$firstline,$type,$file)
+= $self->_unpack($self->{offsets}{$id}) or return;
+# the header line ends where the sequence starts, so step back over it
+$offset -= $firstline;
+my $data;
+my $fh = $self->fh($id) or return;
+seek($fh,$offset,0);
+read($fh,$data,$firstline);
+chomp $data;
+# drop the first character of the line
+substr($data,0,1) = '';
+$data;
+}
+# Convert a 1-based sequence position of $id into a byte offset in its
+# file.  Positions outside the sequence are clamped to its first or last
+# residue.  The arithmetic treats every stored line as $linelength bytes of
+# which the last one is the line terminator.
+sub caloffset {
+    my ($self, $id, $position) = @_;
+    my $index = $position - 1;    # 0-based residue index
+    my ($offset,$seqlength,$linelength) = $self->_unpack($self->{offsets}{$id});
+    if ($index < 0) {
+        $index = 0;
+    }
+    if ($index >= $seqlength) {
+        $index = $seqlength - 1;
+    }
+    my $residues_per_line = $linelength - 1;
+    my $full_lines  = int($index / $residues_per_line);
+    my $within_line = $index % $residues_per_line;
+    return $offset + $linelength * $full_lines + $within_line;
+}
+# Return an open IO::File handle for $path, keeping at most {maxopen}
+# handles cached.  Returns nothing if the file cannot be opened.
+sub fhcache {
+my $self = shift;
+my $path = shift;
+if (!$self->{fhcache}{$path}) {
+if ($self->{curopen} >= $self->{maxopen}) {
+# Cache is full: order cached paths by their {cacheseq} counter, lowest
+# first, keep the first maxopen/3 of them in @lru and evict those.
+# NOTE(review): {cacheseq} is incremented on every request (see below),
+# so this orders by use count rather than by recency -- confirm intent.
+my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};} keys %{$self->{fhcache}};
+splice(@lru, $self->{maxopen} / 3);
+$self->{curopen} -= @lru;
+for (@lru) { delete $self->{fhcache}{$_} }
+}
+$self->{fhcache}{$path} = IO::File->new($path) or return;
+$self->{curopen}++;
+}
+# count this request for $path
+$self->{cacheseq}{$path}++;
+$self->{fhcache}{$path}
+}
+# Serialise the fields of one index record using the STRUCT template.
+sub _pack {
+    my ($self, @fields) = @_;
+    return pack(STRUCT, @fields);
+}
+# Split a packed index record back into its fields.
+sub _unpack {
+    my ($self, $packed) = @_;
+    return unpack(STRUCT, $packed);
+}
+# Guess the molecule type from one line of sequence: DNA if it consists
+# only of g/a/t/c/n (either case), "*" and "-"; RNA for the same set with u
+# in place of t; PROTEIN otherwise.
+sub _type {
+    my ($self, $line) = @_;
+    return DNA if $line =~ /^[gatcnGATCN*-]+$/;
+    return RNA if $line =~ /^[gaucnGAUCN*-]+$/;
+    return PROTEIN;
+}
+=head2 get_PrimarySeq_stream
+Title   : get_PrimarySeq_stream
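+Usage   : my $stream = $db->get_PrimarySeq_stream;
+Function: creates a stream object for iterating over the sequences in the database
+Example : while (my $seq = $stream->next_seq) { ... }
+Returns : Bio::DB::Fasta::Stream object
+Args    : none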
+=cut
+# Create a Bio::DB::Fasta::Stream iterator over this database.
+sub get_PrimarySeq_stream {
+    my ($self) = @_;
+    my $stream = Bio::DB::Fasta::Stream->new($self);
+    return $stream;
+}
+# Tied-hash interface.  The database is read-only: lookups are served by
+# subseq()/offset(), and every modifying operation throws.
+
+# tie %h,'Bio::DB::Fasta',$path,@args -- same arguments as new()
+sub TIEHASH {
+my $self = shift;
+return $self->new(@_);
+}
+# $h{$id} or $h{"$id:$start,$stop"} -- delegates to subseq()
+sub FETCH {
+shift->subseq(@_);
+}
+sub STORE {
+shift->throw("Read-only database");
+}
+sub DELETE {
+shift->throw("Read-only database");
+}
+sub CLEAR {
+shift->throw("Read-only database");
+}
+# true when offset() returns a defined value for the key
+sub EXISTS {
+defined shift->offset(@_);
+}
+# Key iteration is forwarded to the DBM object tied to the offsets hash.
+# NOTE(review): nothing here filters the internal "__" bookkeeping keys
+# that get_all_ids() removes.
+sub FIRSTKEY { tied(%{shift->{offsets}})->FIRSTKEY(@_); }
+sub NEXTKEY  { tied(%{shift->{offsets}})->NEXTKEY(@_);  }
+# Destructor.  If the object goes away while {indexing} is still set, the
+# index was only partly written, so warn and delete it.
+sub DESTROY {
+    my ($self) = @_;
+    my $partial_index = $self->{indexing};
+    return unless $partial_index;  # killed prematurely, so index file is no good!
+    warn "indexing was interrupted, so unlinking $self->{indexing}";
+    unlink $partial_index;
+}
+#-------------------------------------------------------------
+# Bio::PrimarySeqI compatibility
+#
+# Lightweight sequence object: a (database, id, start, stop) tuple that
+# fetches sequence text from the Bio::DB::Fasta database only on request.
+# start > stop denotes the reverse strand.
+package Bio::PrimarySeq::Fasta;
+# objects stringify to their display ID
+use overload '""' => 'display_id';
+use vars '@ISA';
+# inherit from the BioPerl base classes only when they can be loaded
+eval {
+    require Bio::PrimarySeqI;
+    require Bio::Root::Root;
+} && (@ISA = ('Bio::Root::Root','Bio::PrimarySeqI'));
+# new($db,$id [,$start,$stop]) -- $start defaults to 1, $stop to the full
+# length of $id in $db.  May be called on a class or on an existing object.
+sub new {
+    my $class = shift;
+    $class = ref($class) if ref $class;
+    my ($db,$id,$start,$stop) = @_;
+    return bless { db    => $db,
+                   id    => $id,
+                   start => $start || 1,
+                   stop  => $stop  || $db->length($id)
+                 },$class;
+}
+# Raw sequence text for this object's range.
+sub seq {
+    my $self = shift;
+    return $self->{db}->seq($self->{id},$self->{start},$self->{stop});
+}
+# New object covering positions $start..$stop relative to this object;
+# on a reverse-strand object the coordinates count down from its start.
+# Throws unless $start <= $stop.
+sub subseq {
+    my $self = shift;
+    my ($start,$stop) = @_;
+    $self->throw("Stop cannot be smaller than start")  unless $start <= $stop;
+    return $self->{start} <= $self->{stop}
+        ? $self->new($self->{db},
+                     $self->{id},
+                     $self->{start}+$start-1,
+                     $self->{start}+$stop-1)
+        : $self->new($self->{db},
+                     $self->{id},
+                     $self->{start}-($start-1),
+                     $self->{start}-($stop-1)
+                    );
+}
+sub display_id {
+    my $self = shift;
+    return $self->{id};
+}
+sub accession_number {
+    my $self = shift;
+    return "unknown";
+}
+# The plain (non-overloaded) string form of the object reference.
+sub primary_id {
+    my $self = shift;
+    return overload::StrVal($self);
+}
+sub can_call_new { return 0 }
+sub alphabet {
+    my $self = shift;
+    return $self->{db}->alphabet($self->{id});
+}
+# Same range on the opposite strand: start and stop swapped.
+sub revcom {
+    my $self = shift;
+    return $self->new(@{$self}{'db','id','stop','start'});
+}
+# Number of residues in this object's own start..stop range.  It used to
+# report the length of the whole database entry, which is wrong for objects
+# produced by subseq().  The range is clamped to the entry in the same way
+# the database clamps coordinates, so the result agrees with what seq()
+# returns without reading the sequence.
+sub length {
+    my $self = shift;
+    my $full = $self->{db}->length($self->{id});
+    return $full unless $full;      # unknown ID or empty entry
+    my ($low,$high) = @{$self}{'start','stop'};
+    ($low,$high) = ($high,$low) if $low > $high;
+    for ($low,$high) {
+        $_ = 1     if $_ < 1;
+        $_ = $full if $_ > $full;
+    }
+    return $high - $low + 1;
+}
+sub desc  {
+    my $self = shift;
+    return '';
+}
+#-------------------------------------------------------------
+# stream-based access to the database
+#
+# Iterator over all sequences of a Bio::DB::Fasta database, usable either
+# through next_seq() or as a tied filehandle read with <$fh>.
+package Bio::DB::Fasta::Stream;
+use Tie::Handle;
+use vars qw(@ISA);
+@ISA = qw(Tie::Handle);
+# additionally inherit from Bio::DB::SeqI when it can be loaded
+eval {
+    require Bio::DB::SeqI;
+} && (push @ISA,'Bio::DB::SeqI');
+# new($db) -- start iterating at the database's first key
+sub new {
+    my $class = shift;
+    my $db    = shift;
+    my $key = $db->FIRSTKEY;
+    return bless { db=>$db,key=>$key },$class;
+}
+# Return the next sequence object, or nothing once the keys are exhausted.
+# The index also holds bookkeeping entries whose keys start with "__"
+# (file number <-> file name); those are skipped, as get_all_ids() does.
+sub next_seq {
+    my $self = shift;
+    my ($key,$db) = @{$self}{'key','db'};
+    while (defined $key && $key =~ /^__/) {
+        $key = $db->NEXTKEY($key);
+    }
+    unless (defined $key) {
+        $self->{key} = undef;
+        return;
+    }
+    my $value = $db->get_Seq_by_id($key);
+    $self->{key} = $db->NEXTKEY($key);
+    return $value;
+}
+sub TIEHANDLE {
+    my $class = shift;
+    my $db    = shift;
+    return $class->new($db);
+}
+# <$fh> yields one sequence object per read
+sub READLINE {
+    my $self = shift;
+    $self->next_seq;
+}
+1;
+__END__

Mercurial > repos > mahtabm > ensembl

comparison variant_effect_predictor/Bio/DB/Fasta.pm @ 0:1f6dce3d34e0