comparison variant_effect_predictor/Bio/EnsEMBL/Utils/Cache.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # This package, originally distributed by CPAN, has been modified from
2 # its original version in order to be used by the ensembl project.
3 #
4 # 8 July 2002 - changed package name
5 #
6
7 #package Tie::Cache; # old package
8 package Bio::EnsEMBL::Utils::Cache;
9
10 use strict;
11 use vars qw(
12 $VERSION $Debug $STRUCT_SIZE $REF_SIZE
13 $BEFORE $AFTER $KEY $VALUE $BYTES $DIRTY
14 );
15
16 $VERSION = .17;
17 $Debug = 0; # set to 1 for summary, 2 for debug output
18 $STRUCT_SIZE = 240; # per cached elem bytes overhead, approximate
19 $REF_SIZE = 16;
20
21 # NODE ARRAY STRUCT
22 $KEY = 0;
23 $VALUE = 1;
24 $BYTES = 2;
25 $BEFORE = 3;
26 $AFTER = 4;
27 $DIRTY = 5;
28
29 =pod
30
31 =head1 NAME
32
33 Tie::Cache - LRU Cache in Memory
34
35 =head1 SYNOPSIS
36
37 use Tie::Cache;
38 tie %cache, 'Tie::Cache', 100, { Debug => 1 };
39 tie %cache2, 'Tie::Cache', { MaxCount => 100, MaxBytes => 50000 };
40 tie %cache3, 'Tie::Cache', 100, { Debug => 1 , WriteSync => 0};
41
42 # Options ##################################################################
43 #
44 # Debug => 0 - DEFAULT, no debugging output
45 # 1 - prints cache statistics upon destroying
46 # 2 - prints detailed debugging info
47 #
48 # MaxCount => Maximum entries in cache.
49 #
50 # MaxBytes => Maximum bytes taken in memory for cache based on approximate
51 # size of total cache structure in memory
52 #
53 # There is approximately 240 bytes used per key/value pair in the cache for
54 # the cache data structures, so a cache of 5000 entries would take
55 # at approximately 1.2M plus the size of the data being cached.
56 #
57 # MaxSize => Maximum size of each cache entry. Larger entries are not cached.
58 # This helps prevent much of the cache being flushed when
59 # you set an exceptionally large entry. Defaults to MaxBytes/10
60 #
61 # WriteSync => 1 - DEFAULT, write() when data is dirtied for
62 # TRUE CACHE (see below)
63 # 0 - write() dirty data as late as possible, when leaving
64 # cache, or when cache is being DESTROY'd
65 #
66 ############################################################################
67
68 # cache supports normal tied hash functions
69 $cache{1} = 2; # STORE
70 print "$cache{1}\n"; # FETCH
71
72 # FIRSTKEY, NEXTKEY
73 while(($k, $v) = each %cache) { print "$k: $v\n"; }
74
75 delete $cache{1}; # DELETE
76 %cache = (); # CLEAR
77
78 =head1 DESCRIPTION
79
80 This module implements a least recently used (LRU) cache in memory
81 through a tie interface. Any time data is stored in the tied hash,
82 that key/value pair has an entry time associated with it, and
83 as the cache fills up, those members of the cache that are
84 the oldest are removed to make room for new entries.
85
86 So, the cache only "remembers" the last written entries, up to the
87 size of the cache. This can be especially useful if you access
88 great amounts of data, but only access a minority of the data a
89 majority of the time.
90
91 The implementation is a hash, for quick lookups,
92 overlaying a doubly linked list for quick insertion and deletion.
93 On a WinNT PII 300, writes to the hash were done at a rate
94 3100 per second, and reads from the hash at 6300 per second.
95 Work has been done to optimize refreshing cache entries that are
96 frequently read from, code like $cache{entry}, which moves the
97 entry to the end of the linked list internally.
98
99 =cut Documentation continues at the end of the module.
100
101 sub TIEHASH {
102 my($class, $max_count, $options) = @_;
103
104 if(ref($max_count)) {
105 $options = $max_count;
106 $max_count = $options->{MaxCount};
107 }
108
109 unless($max_count || $options->{MaxBytes}) {
110 die('you must specify cache size with either MaxBytes or MaxCount');
111 }
112
113 my $sync = exists($options->{WriteSync}) ? $options->{WriteSync} : 1;
114
115 my $self = bless
116 {
117 # how many items to cache
118 max_count=> $max_count,
119
120 # max bytes to cache
121 max_bytes => $options->{MaxBytes},
122
123 # max size (in bytes) of an individual cache entry
124 max_size => $options->{MaxSize} || ($options->{MaxBytes} ? (int($options->{MaxBytes}/10) + 1) : 0),
125
126 # class track, so know if overridden subs should be used
127 'class' => $class,
128 'subclass' => $class ne 'Tie::Cache' ? 1 : 0,
129
130 # current sizes
131 count=>0,
132 bytes=>0,
133
134 # inner structures
135 head=>0,
136 tail=>0,
137 nodes=>{},
138 'keys'=>[],
139
140 # statistics
141 hit => 0,
142 miss => 0,
143
144 # config
145 sync => $sync,
146 dbg => $options->{Debug} || $Debug
147
148
149 }, $class;
150
151 if (($self->{max_bytes} && ! $self->{max_size})) {
152 die("MaxSize must be defined when MaxBytes is");
153 }
154
155 if($self->{max_bytes} and $self->{max_bytes} < 1000) {
156 die("cannot set MaxBytes to under 1000, each raw entry takes $STRUCT_SIZE bytes alone");
157 }
158
159 if($self->{max_size} && $self->{max_size} < 3) {
160 die("cannot set MaxSize to under 3 bytes, assuming error in config");
161 }
162
163 $self;
164 }
165
166 # override to write data leaving cache
167 sub write { undef; }
168 # commented this section out for speed
169 # my($self, $key, $value) = @_;
170 # 1;
171 #}
172
173 # override to get data if not in cache, should return $value
174 # associated with $key
175 sub read { undef; }
176 # commented this section out for speed
177 # my($self, $key) = @_;
178 # undef;
179 #}
180
181 sub FETCH {
182 my($self, $key) = @_;
183
184 my $node = $self->{nodes}{$key};
185 if($node) {
186 # refresh node's entry
187 $self->{hit}++; # if $self->{dbg};
188
189 # we used to call delete then insert, but we streamlined code
190 if(my $after = $node->[$AFTER]) {
191 $self->{dbg} > 1 and $self->print("update() node $node to tail of list");
192 # reconnect the nodes
193 my $before = $after->[$BEFORE] = $node->[$BEFORE];
194 if($before) {
195 $before->[$AFTER] = $after;
196 } else {
197 $self->{head} = $after;
198 }
199
200 # place at the end
201 $self->{tail}[$AFTER] = $node;
202 $node->[$BEFORE] = $self->{tail};
203 $node->[$AFTER] = undef;
204 $self->{tail} = $node; # always true after this
205 } else {
206 # if there is nothing after node, then we are at the end already
207 # so don't do anything to move the nodes around
208 die("this node is the tail, so something's wrong")
209 unless($self->{tail} eq $node);
210 }
211
212 $self->print("FETCH [$key, $node->[$VALUE]]") if ($self->{dbg} > 1);
213 $node->[$VALUE];
214 } else {
215 # we have a cache miss here
216 $self->{miss}++; # if $self->{dbg};
217
218 # its fine to always insert a node, even when we have an undef,
219 # because even if we aren't a sub-class, we should assume use
220 # that would then set the entry. This model works well with
221 # sub-classing and reads() that might want to return undef as
222 # a valid value.
223 my $value;
224 if ($self->{subclass}) {
225 $self->print("read() for key $key") if $self->{dbg} > 1;
226 $value = $self->read($key);
227 }
228
229 if(defined $value) {
230 my $length;
231 if($self->{max_size}) {
232 # check max size of entry, that it not exceed max size
233 $length = &_get_data_length(\$key, \$value);
234 if($length > $self->{max_size}) {
235 $self->print("direct read() [$key, $value]") if ($self->{dbg} > 1);
236 return $value;
237 }
238 }
239 # if we get here, we should insert the new node
240 $node = &create_node($self, \$key, \$value, $length);
241 &insert($self, $node);
242 $value;
243 } else {
244 undef;
245 }
246 }
247 }
248
249 sub STORE {
250 my($self, $key, $value) = @_;
251 my $node;
252
253 $self->print("STORE [$key,$value]") if ($self->{dbg} > 1);
254
255 # do not cache undefined values
256 defined($value) || return(undef);
257
258 # check max size of entry, that it not exceed max size
259 my $length;
260 if($self->{max_size}) {
261 $length = &_get_data_length(\$key, \$value);
262 if($length > $self->{max_size}) {
263 if ($self->{subclass}) {
264 $self->print("direct write() [$key, $value]") if ($self->{dbg} > 1);
265 $self->write($key, $value);
266 }
267 return $value;
268 }
269 }
270
271 # do we have node already ?
272 if($self->{nodes}{$key}) {
273 $node = &delete($self, $key);
274 # $node = &delete($self, $key);
275 # $node->[$VALUE] = $value;
276 # $node->[$BYTES] = $length || &_get_data_length(\$key, \$value);
277 }
278
279 # insert new node
280 $node = &create_node($self, \$key, \$value, $length);
281 # $node ||= &create_node($self, \$key, \$value, $length);
282 &insert($self, $node);
283
284 # if the data is sync'd call write now, otherwise defer the data
285 # writing, but mark it dirty so it can be cleanup up at the end
286 if ($self->{subclass}) {
287 if($self->{sync}) {
288 $self->print("sync write() [$key, $value]") if $self->{dbg} > 1;
289 $self->write($key, $value);
290 } else {
291 $node->[$DIRTY] = 1;
292 }
293 }
294
295 $value;
296 }
297
298 sub DELETE {
299 my($self, $key) = @_;
300
301 $self->print("DELETE $key") if ($self->{dbg} > 1);
302 my $node = $self->delete($key);
303 $node ? $node->[$VALUE] : undef;
304 }
305
306 sub CLEAR {
307 my($self) = @_;
308
309 $self->print("CLEAR CACHE") if ($self->{dbg} > 1);
310
311 if($self->{subclass}) {
312 my $flushed = $self->flush();
313 $self->print("FLUSH COUNT $flushed") if ($self->{dbg} > 1);
314 }
315
316 my $node;
317 while($node = $self->{head}) {
318 $self->delete($self->{head}[$KEY]);
319 }
320
321 1;
322 }
323
324 sub EXISTS {
325 my($self, $key) = @_;
326 exists $self->{nodes}{$key};
327 }
328
329 # firstkey / nextkey emulate keys() and each() behavior by
330 # taking a snapshot of all the nodes at firstkey, and
331 # iterating through the keys with nextkey
332 #
333 # this method therefore will only supports one each() / keys()
334 # happening during any given time.
335 #
336 sub FIRSTKEY {
337 my($self) = @_;
338
339 $self->{'keys'} = [];
340 my $node = $self->{head};
341 while($node) {
342 push(@{$self->{'keys'}}, $node->[$KEY]);
343 $node = $node->[$AFTER];
344 }
345
346 shift @{$self->{'keys'}};
347 }
348
349 sub NEXTKEY {
350 my($self, $lastkey) = @_;
351 shift @{$self->{'keys'}};
352 }
353
354 sub DESTROY {
355 my($self) = @_;
356
357 # if debugging, snapshot cache before clearing
358 if($self->{dbg}) {
359 if($self->{hit} || $self->{miss}) {
360 $self->{hit_ratio} =
361 sprintf("%4.3f", $self->{hit} / ($self->{hit} + $self->{miss}));
362 }
363 $self->print($self->pretty_self());
364 if($self->{dbg} > 1) {
365 $self->print($self->pretty_chains());
366 }
367 }
368
369 $self->print("DESTROYING") if $self->{dbg} > 1;
370 $self->CLEAR();
371
372 1;
373 }
374
375 ####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE
376 ## Helper Routines
377 ####PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE##PERL##LRU##TIE##CACHE
378
379 # we use scalar_refs for the data for speed
380 sub create_node {
381 my($self, $key, $value, $length) = @_;
382 (defined($$key) && defined($$value))
383 || die("need more localized data than $$key and $$value");
384
385 # max_size always defined when max_bytes is
386 if (($self->{max_size})) {
387 $length = defined $length ? $length : &_get_data_length($key, $value)
388 } else {
389 $length = 0;
390 }
391
392 # ORDER SPECIFIC, see top for NODE ARRAY STRUCT
393 my $node = [ $$key, $$value, $length ];
394 }
395
396 sub _get_data_length {
397 my($key, $value) = @_;
398 my $length = 0;
399 my %refs;
400
401 my @data = ($$key, $$value);
402 while(my $elem = shift @data) {
403 next if $refs{$elem};
404 $refs{$elem} = 1;
405 if(ref $elem && $elem =~ /(SCALAR|HASH|ARRAY)/) {
406 my $type = $1;
407 $length += $REF_SIZE; # guess, 16 bytes per ref, probably more
408 if (($type eq 'SCALAR')) {
409 $length += length($$elem);
410 } elsif (($type eq 'HASH')) {
411 while (my($k,$v) = each %$elem) {
412 for my $kv($k,$v) {
413 if ((ref $kv)) {
414 push(@data, $kv);
415 } else {
416 $length += length($kv);
417 }
418 }
419 }
420 } elsif (($type eq 'ARRAY')) {
421 for my $val (@$elem){
422 if ((ref $val)) {
423 push(@data, $val);
424 } else {
425 $length += length($val);
426 }
427 }
428 }
429 } else {
430 $length += length($elem);
431 }
432 }
433
434 $length;
435 }
436
437 sub insert {
438 my($self, $new_node) = @_;
439
440 $new_node->[$AFTER] = 0;
441 $new_node->[$BEFORE] = $self->{tail};
442 $self->print("insert() [$new_node->[$KEY], $new_node->[$VALUE]]") if ($self->{dbg} > 1);
443
444 $self->{nodes}{$new_node->[$KEY]} = $new_node;
445
446 # current sizes
447 $self->{count}++;
448 $self->{bytes} += $new_node->[$BYTES] + $STRUCT_SIZE;
449
450 if($self->{tail}) {
451 $self->{tail}[$AFTER] = $new_node;
452 } else {
453 $self->{head} = $new_node;
454 }
455 $self->{tail} = $new_node;
456
457 ## if we are too big now, remove head
458 while(($self->{max_count} && ($self->{count} > $self->{max_count})) ||
459 ($self->{max_bytes} && ($self->{bytes} > $self->{max_bytes})))
460 {
461 if($self->{dbg} > 1) {
462 $self->print("current/max: ".
463 "bytes ($self->{bytes}/$self->{max_bytes}) ".
464 "count ($self->{count}/$self->{max_count}) "
465 );
466 }
467 my $old_node = $self->delete($self->{head}[$KEY]);
468 if ($self->{subclass}) {
469 if($old_node->[$DIRTY]) {
470 $self->print("dirty write() [$old_node->[$KEY], $old_node->[$VALUE]]")
471 if ($self->{dbg} > 1);
472 $self->write($old_node->[$KEY], $old_node->[$VALUE]);
473 }
474 }
475 # if($self->{dbg} > 1) {
476 # $self->print("after delete - bytes $self->{bytes}; count $self->{count}");
477 # }
478 }
479
480 1;
481 }
482
483 sub delete {
484 my($self, $key) = @_;
485 my $node = $self->{nodes}{$key} || return;
486 # return unless $node;
487
488 $self->print("delete() [$key, $node->[$VALUE]]") if ($self->{dbg} > 1);
489
490 my $before = $node->[$BEFORE];
491 my $after = $node->[$AFTER];
492
493 # my($before, $after) = $node->{before,after};
494 if($before) {
495 ($before->[$AFTER] = $after);
496 } else {
497 $self->{head} = $after;
498 }
499
500 if($after) {
501 ($after->[$BEFORE] = $before);
502 } else {
503 $self->{tail} = $before;
504 }
505
506 delete $self->{nodes}{$key};
507 $self->{bytes} -= ($node->[$BYTES] + $STRUCT_SIZE);
508 $self->{count}--;
509
510 $node;
511 }
512
513 sub flush {
514 my $self = shift;
515
516 $self->print("FLUSH CACHE") if ($self->{dbg} > 1);
517
518 my $node = $self->{head};
519 my $flush_count = 0;
520 while($node) {
521 if($node->[$DIRTY]) {
522 $self->print("flush dirty write() [$node->[$KEY], $node->[$VALUE]]")
523 if ($self->{dbg} > 1);
524 $self->write($node->[$KEY], $node->[$VALUE]);
525 $node->[$DIRTY] = 0;
526 $flush_count++;
527 }
528 $node = $node->[$AFTER];
529 }
530
531 $flush_count;
532 }
533
534 sub print {
535 my($self, $msg) = @_;
536 print "$self: $msg\n";
537 }
538
539 sub pretty_self {
540 my($self) = @_;
541
542 my(@prints);
543 for(sort keys %{$self}) {
544 next unless defined $self->{$_};
545 push(@prints, "$_=>$self->{$_}");
546 }
547
548 "{ " . join(", ", @prints) . " }";
549 }
550
551 sub pretty_chains {
552 my($self) = @_;
553 my($str);
554 my $k = $self->FIRSTKEY();
555
556 $str .= "[head]->";
557 my($curr_node) = $self->{head};
558 while($curr_node) {
559 $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->";
560 $curr_node = $curr_node->[$AFTER];
561 }
562 $str .= "[tail]->";
563
564 $curr_node = $self->{tail};
565 while($curr_node) {
566 $str .= "[$curr_node->[$KEY],$curr_node->[$VALUE]]->";
567 $curr_node = $curr_node->[$BEFORE];
568 }
569 $str .= "[head]";
570
571 $str;
572 }
573
574 1;
575
576 __END__
577
578 =head1 INSTALLATION
579
580 Tie::Cache installs easily using the make or nmake commands as
581 shown below. Otherwise, just copy Cache.pm to $PERLLIB/site/Tie
582
583 > perl Makefile.PL
584 > make
585 > make test
586 > make install
587
588 * use nmake for win32
589 ** you can also just copy Cache.pm to $perllib/Tie
590
591 =head1 BENCMARKS
592
593 There is another simpler LRU cache implementation in CPAN,
594 Tie::Cache::LRU, which has the same basic size limiting
595 functionality, and for this functionality, the exact same
596 interface.
597
598 Through healthy competition, Michael G Schwern got
599 Tie::Cache::LRU mostly faster than Tie::Cache on reads & writes:
600
601 Cache Size 5000 Tie::Cache 0.17 Tie::Cache::LRU 0.21
602 10000 Writes 1.55 CPU sec 1.10 CPU sec
603 40000 Reads 1.82 CPU sec 1.58 CPU sec
604 10000 Deletes 0.55 CPU sec 0.59 CPU sec
605
606 Unless you are using TRUE CACHE or MaxBytes functionality,
607 using Tie::Cache::LRU should be an easy replacement for Tie::Cache.
608
609 =head1 TRUE CACHE
610
611 To use class as a true cache, which acts as the sole interface
612 for some data set, subclass the real cache off Tie::Cache,
613 with @ISA = qw( 'Tie::Cache' ) notation. Then override
614 the read() method for behavior when there is a cache miss,
615 and the write() method for behavior when the cache's data
616 changes.
617
618 When WriteSync is 1 or TRUE (DEFAULT), write() is called immediately
619 when data in the cache is modified. If set to 0, data that has
620 been modified in the cache gets written out when the entries are deleted or
621 during the DESTROY phase of the cache object, usually at the end of
622 a script.
623
624 To have the dirty data write() periodically while WriteSync is set to 0,
625 there is a flush() cache API call that will flush the dirty writes
626 in this way. Just call the flush() API like:
627
628 my $write_flush_count = tied(%cache)->flush();
629
630 The flush() API was added in the .17 release thanks to Rob Bloodgood.
631
632 =head1 TRUE CACHE EXAMPLE
633
634 use Tie::Cache;
635
636 # personalize the Tie::Cache object, by inheriting from it
637 package My::Cache;
638 @ISA = qw(Tie::Cache);
639
640 # override the read() and write() member functions
641 # these tell the cache what to do with a cache miss or flush
642 sub read {
643 my($self, $key) = @_;
644 print "cache miss for $key, read() data\n";
645 rand() * $key;
646 }
647 sub write {
648 my($self, $key, $value) = @_;
649 print "flushing [$key, $value] from cache, write() data\n";
650 }
651
652 my $cache_size = $ARGV[0] || 2;
653 my $num_to_cache = $ARGV[1] || 4;
654 my $Debug = $ARGV[2] || 1;
655
656 tie %cache, 'My::Cache', $cache_size, {Debug => $Debug};
657
658 # load the cache with new data, each through its contents,
659 # and then reload in reverse order.
660 for(1..$num_to_cache) { print "read data $_: $cache{$_}\n" }
661 while(my($k, $v) = each %cache) { print "each data $k: $v\n"; }
662 for(my $i=$num_to_cache; $i>0; $i--) { print "read data $i: $cache{$i}\n"; }
663
664 # flush writes now, trivial use since will happen in DESTROY() anyway
665 tied(%cache)->flush();
666
667 # clear cache in 2 ways, write will flush out to disk
668 %cache = ();
669 undef %cache;
670
671 =head1 NOTES
672
673 Many thanks to all those who helped me make this module a reality,
674 including:
675
676 :) Tom Hukins who provided me insight and motivation for
677 finishing this module.
678 :) Jamie McCarthy, for trying to make Tie::Cache be all
679 that it can be.
680 :) Rob Fugina who knows how to "TRULY CACHE".
681 :) Rob Bloodgood, for the TRUE CACHE flush() API
682
683 =head1 AUTHOR
684
685 Please send any questions or comments to Joshua Chamas
686 at chamas@alumni.stanford.org
687
688 =head1 COPYRIGHT
689
690 Copyright (c) 1999-2002 Joshua Chamas, Chamas Enterprises Inc.
691 Sponsored by development on NodeWorks http://www.nodeworks.com
692
693 All rights reserved. This program is free software;
694 you can redistribute it and/or modify it under the same
695 terms as Perl itself.
696
697 =cut
698
699
700
701
702
703