Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Utils/Cache.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Cache.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,703 @@ +# This package, originally distributed by CPAN, has been modified from +# its original version in order to be used by the ensembl project. +# +# 8 July 2002 - changed package name +# + +#package Tie::Cache; # old package +package Bio::EnsEMBL::Utils::Cache; + +use strict; +use vars qw( + $VERSION $Debug $STRUCT_SIZE $REF_SIZE + $BEFORE $AFTER $KEY $VALUE $BYTES $DIRTY +); + +$VERSION = .17; +$Debug = 0; # set to 1 for summary, 2 for debug output +$STRUCT_SIZE = 240; # per cached elem bytes overhead, approximate +$REF_SIZE = 16; + +# NODE ARRAY STRUCT +$KEY = 0; +$VALUE = 1; +$BYTES = 2; +$BEFORE = 3; +$AFTER = 4; +$DIRTY = 5; + +=pod + +=head1 NAME + +Tie::Cache - LRU Cache in Memory + +=head1 SYNOPSIS + + use Tie::Cache; + tie %cache, 'Tie::Cache', 100, { Debug => 1 }; + tie %cache2, 'Tie::Cache', { MaxCount => 100, MaxBytes => 50000 }; + tie %cache3, 'Tie::Cache', 100, { Debug => 1 , WriteSync => 0}; + + # Options ################################################################## + # + # Debug => 0 - DEFAULT, no debugging output + # 1 - prints cache statistics upon destroying + # 2 - prints detailed debugging info + # + # MaxCount => Maximum entries in cache. + # + # MaxBytes => Maximum bytes taken in memory for cache based on approximate + # size of total cache structure in memory + # + # There is approximately 240 bytes used per key/value pair in the cache for + # the cache data structures, so a cache of 5000 entries would take + # at approximately 1.2M plus the size of the data being cached. + # + # MaxSize => Maximum size of each cache entry. Larger entries are not cached. + # This helps prevent much of the cache being flushed when + # you set an exceptionally large entry. Defaults to MaxBytes/10 + # + # WriteSync => 1 - DEFAULT, write() when data is dirtied for + # TRUE CACHE (see below) + # 0 - write() dirty data as late as possible, when leaving + # cache, or when cache is being DESTROY'd + # + ############################################################################ + + # cache supports normal tied hash functions + $cache{1} = 2; # STORE + print "$cache{1}\n"; # FETCH + + # FIRSTKEY, NEXTKEY + while(($k, $v) = each %cache) { print "$k: $v\n"; } + + delete $cache{1}; # DELETE + %cache = (); # CLEAR + +=head1 DESCRIPTION + +This module implements a least recently used (LRU) cache in memory +through a tie interface. Any time data is stored in the tied hash, +that key/value pair has an entry time associated with it, and +as the cache fills up, those members of the cache that are +the oldest are removed to make room for new entries. + +So, the cache only "remembers" the last written entries, up to the +size of the cache. This can be especially useful if you access +great amounts of data, but only access a minority of the data a +majority of the time. + +The implementation is a hash, for quick lookups, +overlaying a doubly linked list for quick insertion and deletion. +On a WinNT PII 300, writes to the hash were done at a rate +3100 per second, and reads from the hash at 6300 per second. +Work has been done to optimize refreshing cache entries that are +frequently read from, code like $cache{entry}, which moves the +entry to the end of the linked list internally. + +=cut Documentation continues at the end of the module. + +sub TIEHASH { + my($class, $max_count, $options) = @_; + + if(ref($max_count)) { + $options = $max_count; + $max_count = $options->{MaxCount}; + } + + unless($max_count || $options->{MaxBytes}) { + die('you must specify cache size with either MaxBytes or MaxCount'); + } + + my $sync = exists($options->{WriteSync}) ? $options->{WriteSync} : 1; + + my $self = bless + { + # how many items to cache + max_count=> $max_count, + + # max bytes to cache + max_bytes => $options->{MaxBytes}, + + # max size (in bytes) of an individual cache entry + max_size => $options->{MaxSize} || ($options->{MaxBytes} ? (int($options->{MaxBytes}/10) + 1) : 0), + + # class track, so know if overridden subs should be used + 'class' => $class, + 'subclass' => $class ne 'Tie::Cache' ? 1 : 0, + + # current sizes + count=>0, + bytes=>0, + + # inner structures + head=>0, + tail=>0, + nodes=>{}, + 'keys'=>[], + + # statistics + hit => 0, + miss => 0, + + # config + sync => $sync, + dbg => $options->{Debug} || $Debug + + + }, $class; + + if (($self->{max_bytes} && ! $self->{max_size})) { + die("MaxSize must be defined when MaxBytes is"); + } + + if($self->{max_bytes} and $self->{max_bytes} < 1000) { + die("cannot set MaxBytes to under 1000, each raw entry takes $STRUCT_SIZE bytes alone"); + } + + if($self->{max_size} && $self->{max_size} < 3) { + die("cannot set MaxSize to under 3 bytes, assuming error in config"); + } + + $self; +} + +# override to write data leaving cache +sub write { undef; } +# commented this section out for speed +# my($self, $key, $value) = @_; +# 1; +#} + +# override to get data if not in cache, should return $value +# associated with $key +sub read { undef; } +# commented this section out for speed +# my($self, $key) = @_; +# undef; +#} + +sub FETCH { + my($self, $key) = @_; + + my $node = $self->{nodes}{$key}; + if($node) { + # refresh node's entry + $self->{hit}++; # if $self->{dbg}; + + # we used to call delete then insert, but we streamlined code + if(my $after = $node->[$AFTER]) { + $self->{dbg} > 1 and $self->print("update() node $node to tail of list"); + # reconnect the nodes + my $before = $after->[$BEFORE] = $node->[$BEFORE]; + if($before) { + $before->[$AFTER] = $after; + } else { + $self->{head} = $after; + } + + # place at the end + $self->{tail}[$AFTER] = $node; + $node->[$BEFORE] = $self->{tail}; + $node->[$AFTER] = undef; + $self->{tail} = $node; # always true after this + } else { + # if there is nothing after node, then we are at the end already + # so don't do anything to move the nodes around + die("this node is the tail, so something's wrong") + unless($self->{tail} eq $node); + } + + $self->print("FETCH [$key, $node->[$VALUE]]") if ($self->{dbg} > 1); + $node->[$VALUE]; + } else { + # we have a cache miss here + $self->{miss}++; # if $self->{dbg}; + + # its fine to always insert a node, even when we have an undef, + # because even if we aren't a sub-class, we should assume use + # that would then set the entry. This model works well with + # sub-classing and reads() that might want to return undef as + # a valid value. + my $value; + if ($self->{subclass}) { + $self->print("read() for key $key") if $self->{dbg} > 1; + $value = $self->read($key); + } + + if(defined $value) { + my $length; + if($self->{max_size}) { + # check max size of entry, that it not exceed max size + $length = &_get_data_length(\$key, \$value); + if($length > $self->{max_size}) { + $self->print("direct read() [$key, $value]") if ($self->{dbg} > 1); + return $value; + } + } + # if we get here, we should insert the new node + $node = &create_node($self, \$key, \$value, $length); + &insert($self, $node); + $value; + } else { + undef; + } + } +} + +sub STORE { + my($self, $key, $value) = @_; + my $node; + + $self->print("STORE [$key,$value]") if ($self->{dbg} > 1); + + # do not cache undefined values + defined($value) || return(undef); + + # check max size of entry, that it not exceed max size + my $length; + if($self->{max_size}) { + $length = &_get_data_length(\$key, \$value); + if($length > $self->{max_size}) { + if ($self->{subclass}) { + $self->print("direct write() [$key, $value]") if ($self->{dbg} > 1); + $self->write($key, $value); + } + return $value; + } + } + + # do we have node already ? + if($self->{nodes}{$key}) { + $node = &delete($self, $key); +# $node = &delete($self, $key); +# $node->[$VALUE] = $value; +# $node->[$BYTES] = $length || &_get_data_length(\$key, \$value); + } + + # insert new node + $node = &create_node($self, \$key, \$value, $length); +# $node ||= &create_node($self, \$key, \$value, $length); + &insert($self, $node); + + # if the data is sync'd call write now, otherwise defer the data + # writing, but mark it dirty so it can be cleanup up at the end + if ($self->{subclass}) { + if($self->{sync}) { + $self->print("sync write() [$key, $value]") if $self->{dbg} > 1; + $self->write($key, $value); + } else { + $node->[$DIRTY] = 1; + } + } + + $value; +} + +sub DELETE { + my($self, $key) = @_; + + $self->print("DELETE $key") if ($self->{dbg} > 1); + my $node = $self->delete($key); + $node ? $node->[$VALUE] : undef; +} + +sub CLEAR { + my($self) = @_; + + $self->print("CLEAR CACHE") if ($self->{dbg} > 1); + + if($self->{subclass}) { + my $flushed = $self->flush(); + $self->print("FLUSH COUNT $flushed") if ($self->{dbg} > 1); + } + + my $node; + while($node = $self->{head}) { + $self->delete($self->{head}[$KEY]); + } + + 1; +} + +sub EXISTS { + my($self, $key) = @_; + exists $self->{nodes}{$key}; +} + +# firstkey / nextkey emulate keys() and each() behavior by +# taking a snapshot of all the nodes at firstkey, and +# iterating through the keys with nextkey +# +# this method therefore will only supports one each() / keys() +# happening during any given time. +# +sub FIRSTKEY { + my($self) = @_; + + $self->{'keys'} = []; + my $node = $self->{head}; + while($node) { + push(@{$self->{'keys'}}, $node->[$KEY]); + $node = $node->[$AFTER]; + } + + shift @{$self->{'keys'}}; +} + +sub NEXTKEY { + my($self, $lastkey) = @_; + shift @{$self->{'keys'}}; +} + +sub DESTROY { + my($self) = @_; + + # if debugging, snapshot cache before clearing + if($self->{dbg}) { + if($self->{hit} || $self->{miss}) { + $self->{hit_ratio} = + sprintf("%4.3f", $self->{hit} / ($self->{hit} + $self->{miss})); + } + $self->print($self->pretty_self()); + if($self->{dbg} > 1) { + $self->print($self->pretty_chains()); + } + } + + $self->print("DESTROYING") if $self->{dbg} > 1; + $self->CLEAR(); + + 1; +} + +####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE +## Helper Routines +####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE + +# we use scalar_refs for the data for speed +sub create_node { + my($self, $key, $value, $length) = @_; + (defined($$key) && defined($$value)) + || die("need more localized data than $$key and $$value"); + + # max_size always defined when max_bytes is + if (($self->{max_size})) { + $length = defined $length ? $length : &_get_data_length($key, $value) + } else { + $length = 0; + } + + # ORDER SPECIFIC, see top for NODE ARRAY STRUCT + my $node = [ $$key, $$value, $length ]; +} + +sub _get_data_length { + my($key, $value) = @_; + my $length = 0; + my %refs; + + my @data = ($$key, $$value); + while(my $elem = shift @data) { + next if $refs{$elem}; + $refs{$elem} = 1; + if(ref $elem && $elem =~ /(SCALAR|HASH|ARRAY)/) { + my $type = $1; + $length += $REF_SIZE; # guess, 16 bytes per ref, probably more + if (($type eq 'SCALAR')) { + $length += length($$elem); + } elsif (($type eq 'HASH')) { + while (my($k,$v) = each %$elem) { + for my $kv($k,$v) { + if ((ref $kv)) { + push(@data, $kv); + } else { + $length += length($kv); + } + } + } + } elsif (($type eq 'ARRAY')) { + for my $val (@$elem){ + if ((ref $val)) { + push(@data, $val); + } else { + $length += length($val); + } + } + } + } else { + $length += length($elem); + } + } + + $length; +} + +sub insert { + my($self, $new_node) = @_; + + $new_node->[$AFTER] = 0; + $new_node->[$BEFORE] = $self->{tail}; + $self->print("insert() [$new_node->[$KEY], $new_node->[$VALUE]]") if ($self->{dbg} > 1); + + $self->{nodes}{$new_node->[$KEY]} = $new_node; + + # current sizes + $self->{count}++; + $self->{bytes} += $new_node->[$BYTES] + $STRUCT_SIZE; + + if($self->{tail}) { + $self->{tail}[$AFTER] = $new_node; + } else { + $self->{head} = $new_node; + } + $self->{tail} = $new_node; + + ## if we are too big now, remove head + while(($self->{max_count} && ($self->{count} > $self->{max_count})) || + ($self->{max_bytes} && ($self->{bytes} > $self->{max_bytes}))) + { + if($self->{dbg} > 1) { + $self->print("current/max: ". + "bytes ($self->{bytes}/$self->{max_bytes}) ". + "count ($self->{count}/$self->{max_count}) " + ); + } + my $old_node = $self->delete($self->{head}[$KEY]); + if ($self->{subclass}) { + if($old_node->[$DIRTY]) { + $self->print("dirty write() [$old_node->[$KEY], $old_node->[$VALUE]]") + if ($self->{dbg} > 1); + $self->write($old_node->[$KEY], $old_node->[$VALUE]); + } + } +# if($self->{dbg} > 1) { +# $self->print("after delete - bytes $self->{bytes}; count $self->{count}"); +# } + } + + 1; +} + +sub delete { + my($self, $key) = @_; + my $node = $self->{nodes}{$key} || return; +# return unless $node; + + $self->print("delete() [$key, $node->[$VALUE]]") if ($self->{dbg} > 1); + + my $before = $node->[$BEFORE]; + my $after = $node->[$AFTER]; + + # my($before, $after) = $node->{before,after}; + if($before) { + ($before->[$AFTER] = $after); + } else { + $self->{head} = $after; + } + + if($after) { + ($after->[$BEFORE] = $before); + } else { + $self->{tail} = $before; + } + + delete $self->{nodes}{$key}; + $self->{bytes} -= ($node->[$BYTES] + $STRUCT_SIZE); + $self->{count}--; + + $node; +} + +sub flush { + my $self = shift; + + $self->print("FLUSH CACHE") if ($self->{dbg} > 1); + + my $node = $self->{head}; + my $flush_count = 0; + while($node) { + if($node->[$DIRTY]) { + $self->print("flush dirty write() [$node->[$KEY], $node->[$VALUE]]") + if ($self->{dbg} > 1); + $self->write($node->[$KEY], $node->[$VALUE]); + $node->[$DIRTY] = 0; + $flush_count++; + } + $node = $node->[$AFTER]; + } + + $flush_count; +} + +sub print { + my($self, $msg) = @_; + print "$self: $msg\n"; +} + +sub pretty_self { + my($self) = @_; + + my(@prints); + for(sort keys %{$self}) { + next unless defined $self->{$_}; + push(@prints, "$_=>$self->{$_}"); + } + + "{ " . join(", ", @prints) . " }"; +} + +sub pretty_chains { + my($self) = @_; + my($str); + my $k = $self->FIRSTKEY(); + + $str .= "[head]->"; + my($curr_node) = $self->{head}; + while($curr_node) { + $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->"; + $curr_node = $curr_node->[$AFTER]; + } + $str .= "[tail]->"; + + $curr_node = $self->{tail}; + while($curr_node) { + $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->"; + $curr_node = $curr_node->[$BEFORE]; + } + $str .= "[head]"; + + $str; +} + +1; + +__END__ + +=head1 INSTALLATION + +Tie::Cache installs easily using the make or nmake commands as +shown below. Otherwise, just copy Cache.pm to $PERLLIB/site/Tie + + > perl Makefile.PL + > make + > make test + > make install + + * use nmake for win32 + ** you can also just copy Cache.pm to $perllib/Tie + +=head1 BENCMARKS + +There is another simpler LRU cache implementation in CPAN, +Tie::Cache::LRU, which has the same basic size limiting +functionality, and for this functionality, the exact same +interface. + +Through healthy competition, Michael G Schwern got +Tie::Cache::LRU mostly faster than Tie::Cache on reads & writes: + + Cache Size 5000 Tie::Cache 0.17 Tie::Cache::LRU 0.21 + 10000 Writes 1.55 CPU sec 1.10 CPU sec + 40000 Reads 1.82 CPU sec 1.58 CPU sec + 10000 Deletes 0.55 CPU sec 0.59 CPU sec + +Unless you are using TRUE CACHE or MaxBytes functionality, +using Tie::Cache::LRU should be an easy replacement for Tie::Cache. + +=head1 TRUE CACHE + +To use class as a true cache, which acts as the sole interface +for some data set, subclass the real cache off Tie::Cache, +with @ISA = qw( 'Tie::Cache' ) notation. Then override +the read() method for behavior when there is a cache miss, +and the write() method for behavior when the cache's data +changes. + +When WriteSync is 1 or TRUE (DEFAULT), write() is called immediately +when data in the cache is modified. If set to 0, data that has +been modified in the cache gets written out when the entries are deleted or +during the DESTROY phase of the cache object, usually at the end of +a script. + +To have the dirty data write() periodically while WriteSync is set to 0, +there is a flush() cache API call that will flush the dirty writes +in this way. Just call the flush() API like: + + my $write_flush_count = tied(%cache)->flush(); + +The flush() API was added in the .17 release thanks to Rob Bloodgood. + +=head1 TRUE CACHE EXAMPLE + + use Tie::Cache; + + # personalize the Tie::Cache object, by inheriting from it + package My::Cache; + @ISA = qw(Tie::Cache); + + # override the read() and write() member functions + # these tell the cache what to do with a cache miss or flush + sub read { + my($self, $key) = @_; + print "cache miss for $key, read() data\n"; + rand() * $key; + } + sub write { + my($self, $key, $value) = @_; + print "flushing [$key, $value] from cache, write() data\n"; + } + + my $cache_size = $ARGV[0] || 2; + my $num_to_cache = $ARGV[1] || 4; + my $Debug = $ARGV[2] || 1; + + tie %cache, 'My::Cache', $cache_size, {Debug => $Debug}; + + # load the cache with new data, each through its contents, + # and then reload in reverse order. + for(1..$num_to_cache) { print "read data $_: $cache{$_}\n" } + while(my($k, $v) = each %cache) { print "each data $k: $v\n"; } + for(my $i=$num_to_cache; $i>0; $i--) { print "read data $i: $cache{$i}\n"; } + + # flush writes now, trivial use since will happen in DESTROY() anyway + tied(%cache)->flush(); + + # clear cache in 2 ways, write will flush out to disk + %cache = (); + undef %cache; + +=head1 NOTES + +Many thanks to all those who helped me make this module a reality, +including: + + :) Tom Hukins who provided me insight and motivation for + finishing this module. + :) Jamie McCarthy, for trying to make Tie::Cache be all + that it can be. + :) Rob Fugina who knows how to "TRULY CACHE". + :) Rob Bloodgood, for the TRUE CACHE flush() API + +=head1 AUTHOR + +Please send any questions or comments to Joshua Chamas +at chamas@alumni.stanford.org + +=head1 COPYRIGHT + +Copyright (c) 1999-2002 Joshua Chamas, Chamas Enterprises Inc. +Sponsored by development on NodeWorks http://www.nodeworks.com + +All rights reserved. This program is free software; +you can redistribute it and/or modify it under the same +terms as Perl itself. + +=cut + + + + + +