Mercurial > repos > rico > testing_again
comparison cdblib.py @ 0:4d28d3295ac3 default tip
Uploaded
| author | rico |
|---|---|
| date | Fri, 06 Apr 2012 13:46:42 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4d28d3295ac3 |
|---|---|
| 1 #!/usr/bin/env python2.5 | |
| 2 | |
| 3 ''' | |
| 4 Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables | |
| 5 that efficiently handle many keys, while remaining space-efficient. | |
| 6 | |
| 7 http://cr.yp.to/cdb.html | |
| 8 | |
| 9 When generated databases are only used with Python code, consider using hash() | |
| 10 rather than djb_hash() for a tidy speedup. | |
| 11 ''' | |
| 12 | |
| 13 from _struct import Struct | |
| 14 from itertools import chain | |
| 15 | |
| 16 | |
| 17 def py_djb_hash(s): | |
| 18 '''Return the value of DJB's hash function for the given 8-bit string.''' | |
| 19 h = 5381 | |
| 20 for c in s: | |
| 21 h = (((h << 5) + h) ^ ord(c)) & 0xffffffff | |
| 22 return h | |
| 23 | |
| 24 try: | |
| 25 from _cdblib import djb_hash | |
| 26 except ImportError: | |
| 27 djb_hash = py_djb_hash | |
| 28 | |
| 29 read_2_le4 = Struct('<LL').unpack | |
| 30 write_2_le4 = Struct('<LL').pack | |
| 31 | |
| 32 | |
| 33 class Reader(object): | |
| 34 '''A dictionary-like object for reading a Constant Database accessed | |
| 35 through a string or string-like sequence, such as mmap.mmap().''' | |
| 36 | |
| 37 def __init__(self, data, hashfn=djb_hash): | |
| 38 '''Create an instance reading from a sequence and using hashfn to hash | |
| 39 keys.''' | |
| 40 if len(data) < 2048: | |
| 41 raise IOError('CDB too small') | |
| 42 | |
| 43 self.data = data | |
| 44 self.hashfn = hashfn | |
| 45 | |
| 46 self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)] | |
| 47 self.table_start = min(p[0] for p in self.index) | |
| 48 # Assume load load factor is 0.5 like official CDB. | |
| 49 self.length = sum(p[1] >> 1 for p in self.index) | |
| 50 | |
| 51 def iteritems(self): | |
| 52 '''Like dict.iteritems(). Items are returned in insertion order.''' | |
| 53 pos = 2048 | |
| 54 while pos < self.table_start: | |
| 55 klen, dlen = read_2_le4(self.data[pos:pos+8]) | |
| 56 pos += 8 | |
| 57 | |
| 58 key = self.data[pos:pos+klen] | |
| 59 pos += klen | |
| 60 | |
| 61 data = self.data[pos:pos+dlen] | |
| 62 pos += dlen | |
| 63 | |
| 64 yield key, data | |
| 65 | |
| 66 def items(self): | |
| 67 '''Like dict.items().''' | |
| 68 return list(self.iteritems()) | |
| 69 | |
| 70 def iterkeys(self): | |
| 71 '''Like dict.iterkeys().''' | |
| 72 return (p[0] for p in self.iteritems()) | |
| 73 __iter__ = iterkeys | |
| 74 | |
| 75 def itervalues(self): | |
| 76 '''Like dict.itervalues().''' | |
| 77 return (p[1] for p in self.iteritems()) | |
| 78 | |
| 79 def keys(self): | |
| 80 '''Like dict.keys().''' | |
| 81 return [p[0] for p in self.iteritems()] | |
| 82 | |
| 83 def values(self): | |
| 84 '''Like dict.values().''' | |
| 85 return [p[1] for p in self.iteritems()] | |
| 86 | |
| 87 def __getitem__(self, key): | |
| 88 '''Like dict.__getitem__().''' | |
| 89 value = self.get(key) | |
| 90 if value is None: | |
| 91 raise KeyError(key) | |
| 92 return value | |
| 93 | |
| 94 def has_key(self, key): | |
| 95 '''Return True if key exists in the database.''' | |
| 96 return self.get(key) is not None | |
| 97 __contains__ = has_key | |
| 98 | |
| 99 def __len__(self): | |
| 100 '''Return the number of records in the database.''' | |
| 101 return self.length | |
| 102 | |
| 103 def gets(self, key): | |
| 104 '''Yield values for key in insertion order.''' | |
| 105 # Truncate to 32 bits and remove sign. | |
| 106 h = self.hashfn(key) & 0xffffffff | |
| 107 start, nslots = self.index[h & 0xff] | |
| 108 | |
| 109 if nslots: | |
| 110 end = start + (nslots << 3) | |
| 111 slot_off = start + (((h >> 8) % nslots) << 3) | |
| 112 | |
| 113 for pos in chain(xrange(slot_off, end, 8), | |
| 114 xrange(start, slot_off, 8)): | |
| 115 rec_h, rec_pos = read_2_le4(self.data[pos:pos+8]) | |
| 116 | |
| 117 if not rec_h: | |
| 118 break | |
| 119 elif rec_h == h: | |
| 120 klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8]) | |
| 121 rec_pos += 8 | |
| 122 | |
| 123 if self.data[rec_pos:rec_pos+klen] == key: | |
| 124 rec_pos += klen | |
| 125 yield self.data[rec_pos:rec_pos+dlen] | |
| 126 | |
| 127 def get(self, key, default=None): | |
| 128 '''Get the first value for key, returning default if missing.''' | |
| 129 # Avoid exception catch when handling default case; much faster. | |
| 130 return chain(self.gets(key), (default,)).next() | |
| 131 | |
| 132 def getint(self, key, default=None, base=0): | |
| 133 '''Get the first value for key converted it to an int, returning | |
| 134 default if missing.''' | |
| 135 value = self.get(key, default) | |
| 136 if value is not default: | |
| 137 return int(value, base) | |
| 138 return value | |
| 139 | |
| 140 def getints(self, key, base=0): | |
| 141 '''Yield values for key in insertion order after converting to int.''' | |
| 142 return (int(v, base) for v in self.gets(key)) | |
| 143 | |
| 144 def getstring(self, key, default=None, encoding='utf-8'): | |
| 145 '''Get the first value for key decoded as unicode, returning default if | |
| 146 not found.''' | |
| 147 value = self.get(key, default) | |
| 148 if value is not default: | |
| 149 return value.decode(encoding) | |
| 150 return value | |
| 151 | |
| 152 def getstrings(self, key, encoding='utf-8'): | |
| 153 '''Yield values for key in insertion order after decoding as | |
| 154 unicode.''' | |
| 155 return (v.decode(encoding) for v in self.gets(key)) | |
| 156 | |
| 157 | |
| 158 class Writer(object): | |
| 159 '''Object for building new Constant Databases, and writing them to a | |
| 160 seekable file-like object.''' | |
| 161 | |
| 162 def __init__(self, fp, hashfn=djb_hash): | |
| 163 '''Create an instance writing to a file-like object, using hashfn to | |
| 164 hash keys.''' | |
| 165 self.fp = fp | |
| 166 self.hashfn = hashfn | |
| 167 | |
| 168 fp.write('\x00' * 2048) | |
| 169 self._unordered = [[] for i in xrange(256)] | |
| 170 | |
| 171 def put(self, key, value=''): | |
| 172 '''Write a string key/value pair to the output file.''' | |
| 173 assert type(key) is str and type(value) is str | |
| 174 | |
| 175 pos = self.fp.tell() | |
| 176 self.fp.write(write_2_le4(len(key), len(value))) | |
| 177 self.fp.write(key) | |
| 178 self.fp.write(value) | |
| 179 | |
| 180 h = self.hashfn(key) & 0xffffffff | |
| 181 self._unordered[h & 0xff].append((h, pos)) | |
| 182 | |
| 183 def puts(self, key, values): | |
| 184 '''Write more than one value for the same key to the output file. | |
| 185 Equivalent to calling put() in a loop.''' | |
| 186 for value in values: | |
| 187 self.put(key, value) | |
| 188 | |
| 189 def putint(self, key, value): | |
| 190 '''Write an integer as a base-10 string associated with the given key | |
| 191 to the output file.''' | |
| 192 self.put(key, str(value)) | |
| 193 | |
| 194 def putints(self, key, values): | |
| 195 '''Write zero or more integers for the same key to the output file. | |
| 196 Equivalent to calling putint() in a loop.''' | |
| 197 self.puts(key, (str(value) for value in values)) | |
| 198 | |
| 199 def putstring(self, key, value, encoding='utf-8'): | |
| 200 '''Write a unicode string associated with the given key to the output | |
| 201 file after encoding it as UTF-8 or the given encoding.''' | |
| 202 self.put(key, unicode.encode(value, encoding)) | |
| 203 | |
| 204 def putstrings(self, key, values, encoding='utf-8'): | |
| 205 '''Write zero or more unicode strings to the output file. Equivalent to | |
| 206 calling putstring() in a loop.''' | |
| 207 self.puts(key, (unicode.encode(value, encoding) for value in values)) | |
| 208 | |
| 209 def finalize(self): | |
| 210 '''Write the final hash tables to the output file, and write out its | |
| 211 index. The output file remains open upon return.''' | |
| 212 index = [] | |
| 213 for tbl in self._unordered: | |
| 214 length = len(tbl) << 1 | |
| 215 ordered = [(0, 0)] * length | |
| 216 for pair in tbl: | |
| 217 where = (pair[0] >> 8) % length | |
| 218 for i in chain(xrange(where, length), xrange(0, where)): | |
| 219 if not ordered[i][0]: | |
| 220 ordered[i] = pair | |
| 221 break | |
| 222 | |
| 223 index.append((self.fp.tell(), length)) | |
| 224 for pair in ordered: | |
| 225 self.fp.write(write_2_le4(*pair)) | |
| 226 | |
| 227 self.fp.seek(0) | |
| 228 for pair in index: | |
| 229 self.fp.write(write_2_le4(*pair)) | |
| 230 self.fp = None # prevent double finalize() |
