Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/DB/Flat/BDB.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/DB/Flat/BDB.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,472 @@ +# +# $Id: BDB.pm,v 1.6.2.1 2003/03/25 18:46:10 jason Exp $ +# +# BioPerl module for Bio::DB::Flat::BDB +# +# Cared for by Lincoln Stein <lstein@cshl.org> +# +# You may distribute this module under the same terms as perl itself + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::DB::Flat::BDB - Interface for BioHackathon standard BDB-indexed flat file + +=head1 SYNOPSIS + +You should not be using this module directly. See Bio::DB::Flat. + +=head1 DESCRIPTION + +This object provides the basic mechanism to associate positions in +files with primary and secondary name spaces. Unlike +Bio::Index::Abstract (see L<Bio::Index::Abstract>), this is specialized +to work with the BerkeleyDB-indexed "common" flat file format worked +out at the 2002 BioHackathon. + +This object is the guts to the mechanism, which will be used by the +specific objects inheriting from it. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via +email or the web: + + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Lincoln Stein + +Email - lstein@cshl.org + +=head1 SEE ALSO + +L<Bio::DB::Flat>, + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. Internal +methods are usually preceded with an "_" (underscore). + +=cut + + +# Let the code begin... + +package Bio::DB::Flat::BDB; + +use strict; +use DB_File; +use IO::File; +use Fcntl qw(O_CREAT O_RDWR O_RDONLY); +use File::Spec; +use Bio::DB::Flat; +use Bio::SeqIO; +use Bio::DB::RandomAccessI; +use Bio::Root::Root; +use Bio::Root::IO; +use vars '@ISA'; + +@ISA = qw(Bio::DB::Flat); + +sub _initialize { + my $self = shift; + my ($max_open) = $self->_rearrange(['MAXOPEN'],@_); + $self->{bdb_maxopen} = $max_open || 32; +} + +# return a filehandle seeked to the appropriate place +# this only works with the primary namespace +sub _get_stream { + my ($self,$id) = @_; + my ($filepath,$offset,$length) = $self->_lookup_primary($id) + or $self->throw("Unable to find a record for $id in the flat file index"); + my $fh = $self->_fhcache($filepath) + or $self->throw("couldn't open $filepath: $!"); + seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!"); + $fh; +} + +# return records corresponding to the indicated index +# if there are multiple hits will return a list in list context, +# otherwise will throw an exception +sub fetch_raw { + my ($self,$id,$namespace) = @_; + + # secondary lookup + if (defined $namespace && $namespace ne $self->primary_namespace) { + my @hits = $self->_lookup_secondary($namespace,$id); + $self->throw("Multiple records correspond to $namespace=>$id but function called in a scalar context") + unless wantarray; + return map {$self->_read_record(@$_)} @hits; + } + + # primary lookup + my @args = $self->_lookup_primary($id) + or $self->throw("Unable to find a record for $id in the flat file index"); + return $self->_read_record(@args); +} + +# create real live Bio::Seq object +sub get_Seq_by_id { + my $self = shift; + my $id = shift; + my $fh = eval {$self->_get_stream($id)} or return; + my $seqio = + $self->{bdb_cached_parsers}{fileno $fh} ||= Bio::SeqIO->new( -Format => $self->file_format, + -fh => $fh); + return $seqio->next_seq; +} + +# fetch array of Bio::Seq objects +sub get_Seq_by_acc { + my $self = shift; + unshift @_,'ACC' if @_==1; + my ($ns,$key) = @_; + my @primary_ids = $self->expand_ids($ns => $key); + $self->throw("more than one sequences correspond to this accession") + if @primary_ids > 1 && ! wantarray; + my @rc = map {$self->get_Seq_by_id($_)} @primary_ids; + return wantarray ? @rc : $rc[0]; +} + +# fetch array of Bio::Seq objects +sub get_Seq_by_version { + my $self = shift; + unshift @_,'VERSION' if @_==1; + my ($ns,$key) = @_; + my @primary_ids = $self->expand_ids($ns => $key); + $self->throw("more than one sequences correspond to this accession") + if @primary_ids > 1 && !wantarray; + return map {$self->get_Seq_by_id($_)} @primary_ids; +} + +=head2 get_PrimarySeq_stream + + Title : get_PrimarySeq_stream + Usage : $stream = get_PrimarySeq_stream + Function: Makes a Bio::DB::SeqStreamI compliant object + which provides a single method, next_primary_seq + Returns : Bio::DB::SeqStreamI + Args : none + + +=cut + +sub get_PrimarySeq_stream { + my $self = shift; + my @files = $self->files || 0; + my $out = Bio::SeqIO::MultiFile->new( -format => $self->file_format , + -files => \@files); + return $out; +} + +sub get_all_primary_ids { + my $self = shift; + my $db = $self->primary_db; + return keys %$db; +} + +=head2 get_all_primary_ids + + Title : get_all_primary_ids + Usage : @ids = $seqdb->get_all_primary_ids() + Function: gives an array of all the primary_ids of the + sequence objects in the database. + Example : + Returns : an array of strings + Args : none + +=cut + +# this will perform an ID lookup on a (possibly secondary) +# id, returning all the corresponding ids +sub expand_ids { + my $self = shift; + my ($ns,$key) = @_; + return $key unless defined $ns; + return $key if $ns eq $self->primary_namespace; + my $db = $self->secondary_db($ns) + or $self->throw("invalid secondary namespace $ns"); + my $record = $db->{$key} or return; # nothing there + return $self->unpack_secondary($record); +} + +# build index from files listed +sub build_index { + my $self = shift; + my @files = @_; + my $count = 0; + for my $file (@files) { + $file = File::Spec->rel2abs($file) + unless File::Spec->file_name_is_absolute($file); + $count += $self->_index_file($file); + } + $self->write_config; + $count; +} + +sub _index_file { + my $self = shift; + my $file = shift; + + my $fileno = $self->_path2fileno($file); + defined $fileno or $self->throw("could not create a file number for $file"); + + my $fh = $self->_fhcache($file) or $self->throw("could not open $file for indexing: $!"); + my $offset = 0; + my $count = 0; + while (!eof($fh)) { + my ($ids,$adjustment) = $self->parse_one_record($fh) or next; + $adjustment ||= 0; # prevent uninit variable warning + my $pos = tell($fh) + $adjustment; + $self->_store_index($ids,$file,$offset,$pos-$offset); + $offset = $pos; + $count++; + } + $count; +} + +=head2 To Be Implemented in Subclasses + +The following methods MUST be implemented by subclasses. + +=cut + +=head2 May Be Overridden in Subclasses + +The following methods MAY be overridden by subclasses. + +=cut + +sub default_primary_namespace { + return "ACC"; +} + +sub default_secondary_namespaces { + return; +} + +sub _read_record { + my $self = shift; + my ($filepath,$offset,$length) = @_; + my $fh = $self->_fhcache($filepath) + or $self->throw("couldn't open $filepath: $!"); + seek($fh,$offset,0) or $self->throw("can't seek on $filepath: $!"); + my $record; + read($fh,$record,$length) or $self->throw("can't read $filepath: $!"); + $record +} + +# return a list in the form ($filepath,$offset,$length) +sub _lookup_primary { + my $self = shift; + my $primary = shift; + my $db = $self->primary_db + or $self->throw("no primary namespace database is open"); + + my $record = $db->{$primary} or return; # nothing here + + my($fileid,$offset,$length) = $self->unpack_primary($record); + my $filepath = $self->_fileno2path($fileid) + or $self->throw("no file path entry for fileid $fileid"); + return ($filepath,$offset,$length); +} + +# return a list of array refs in the form [$filepath,$offset,$length] +sub _lookup_secondary { + my $self = shift; + my ($namespace,$secondary) = @_; + my @primary = $self->expand_ids($namespace=>$secondary); + return map {[$self->_lookup_primary($_)]} @primary; +} + +# store indexing information into a primary & secondary record +# $namespaces is one of: +# 1. a scalar corresponding to the primary name +# 2. a hashref corresponding to namespace=>id identifiers +# it is valid for secondary id to be an arrayref +sub _store_index { + my $self = shift; + my ($keys,$filepath,$offset,$length) = @_; + my ($primary,%secondary); + + if (ref $keys eq 'HASH') { + my %valid_secondary = map {$_=>1} $self->secondary_namespaces; + while (my($ns,$value) = each %$keys) { + if ($ns eq $self->primary_namespace) { + $primary = $value; + } else { + $valid_secondary{$ns} or $self->throw("invalid secondary namespace $ns"); + push @{$secondary{$ns}},$value; + } + } + $primary or $self->throw("no primary namespace ID provided"); + } else { + $primary = $keys; + } + + $self->throw("invalid primary ID; must be a scalar") + if ref($primary) =~ /^(ARRAY|HASH)$/; # but allow stringified objects + + $self->_store_primary($primary,$filepath,$offset,$length); + for my $ns (keys %secondary) { + my @ids = ref $secondary{$ns} ? @{$secondary{$ns}} : $secondary{$ns}; + $self->_store_secondary($ns,$_,$primary) foreach @ids; + } + + 1; +} + +# store primary index +sub _store_primary { + my $self = shift; + my ($id,$filepath,$offset,$length) = @_; + + my $db = $self->primary_db + or $self->throw("no primary namespace database is open"); + my $fileno = $self->_path2fileno($filepath); + defined $fileno or $self->throw("could not create a file number for $filepath"); + + my $record = $self->pack_primary($fileno,$offset,$length); + $db->{$id} = $record or return; # nothing here + 1; +} + +# store a primary index name under a secondary index +sub _store_secondary { + my $self = shift; + my ($secondary_ns,$secondary_id,$primary_id) = @_; + + my $db = $self->secondary_db($secondary_ns) + or $self->throw("invalid secondary namespace $secondary_ns"); + + # first get whatever secondary ids are already stored there + my @primary = $self->unpack_secondary($db->{$secondary_id}); + # uniqueify + my %unique = map {$_=>undef} @primary,$primary_id; + + my $record = $self->pack_secondary(keys %unique); + $db->{$secondary_id} = $record; +} + +# get output file handle +sub _outfh { + my $self = shift; +#### XXXXX FINISH ##### +# my $ +} + +# unpack a primary record into fileid,offset,length +sub unpack_primary { + my $self = shift; + my $index_record = shift; + return split "\t",$index_record; +} + +# unpack a secondary record into a list of primary ids +sub unpack_secondary { + my $self = shift; + my $index_record = shift or return; + return split "\t",$index_record; +} + +# pack a list of fileid,offset,length into a primary id record +sub pack_primary { + my $self = shift; + my ($fileid,$offset,$length) = @_; + return join "\t",($fileid,$offset,$length); +} + +# pack a list of primary ids into a secondary id record +sub pack_secondary { + my $self = shift; + my @secondaries = @_; + return join "\t",@secondaries; +} + +sub primary_db { + my $self = shift; + # lazy opening + $self->_open_bdb unless exists $self->{bdb_primary_db}; + return $self->{bdb_primary_db}; +} + +sub secondary_db { + my $self = shift; + my $secondary_namespace = shift + or $self->throw("usage: secondary_db(\$secondary_namespace)"); + $self->_open_bdb unless exists $self->{bdb_primary_db}; + return $self->{bdb_secondary_db}{$secondary_namespace}; +} + +sub _open_bdb { + my $self = shift; + + my $flags = $self->write_flag ? O_CREAT|O_RDWR : O_RDONLY; + + my $primary_db = {}; + tie(%$primary_db,'DB_File',$self->_catfile($self->_primary_db_name),$flags,0666,$DB_BTREE) + or $self->throw("Could not open primary index file: $! (did you remember to use -write_flag=>1?)"); + $self->{bdb_primary_db} = $primary_db; + + for my $secondary ($self->secondary_namespaces) { + my $secondary_db = {}; + tie(%$secondary_db,'DB_File',$self->_catfile($self->_secondary_db_name($secondary)),$flags,0666,$DB_BTREE) + or $self->throw("Could not open primary index file"); + $self->{bdb_secondary_db}{$secondary} = $secondary_db; + } + + 1; +} + +sub _primary_db_name { + my $self = shift; + my $pns = $self->primary_namespace or $self->throw('no primary namespace defined'); + return "key_$pns"; +} + +sub _secondary_db_name { + my $self = shift; + my $sns = shift; + return "id_$sns"; +} + +sub _fhcache { + my $self = shift; + my $path = shift; + my $write = shift; + + if (!$self->{bdb_fhcache}{$path}) { + $self->{bdb_curopen} ||= 0; + if ($self->{bdb_curopen} >= $self->{bdb_maxopen}) { + my @lru = sort {$self->{bdb_cacheseq}{$a} <=> $self->{bdb_cacheseq}{$b};} keys %{$self->{bdb_fhcache}}; + splice(@lru, $self->{bdb_maxopen} / 3); + $self->{bdb_curopen} -= @lru; + for (@lru) { delete $self->{bdb_fhcache}{$_} } + } + if ($write) { + my $modifier = $self->{bdb_fhcache_seenit}{$path}++ ? '>' : '>>'; + $self->{bdb_fhcache}{$path} = IO::File->new("${modifier}${path}") or return; + } else { + $self->{bdb_fhcache}{$path} = IO::File->new($path) or return; + } + $self->{bdb_curopen}++; + } + $self->{bdb_cacheseq}{$path}++; + $self->{bdb_fhcache}{$path} +} + +1;