annotate cdblib.py @ 0:32f8fbdd754c default tip

Uploaded
author rico
date Fri, 06 Apr 2012 13:59:29 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
32f8fbdd754c Uploaded
rico
parents:
diff changeset
1 #!/usr/bin/env python2.5
32f8fbdd754c Uploaded
rico
parents:
diff changeset
2
32f8fbdd754c Uploaded
rico
parents:
diff changeset
3 '''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
4 Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables
32f8fbdd754c Uploaded
rico
parents:
diff changeset
5 that efficiently handle many keys, while remaining space-efficient.
32f8fbdd754c Uploaded
rico
parents:
diff changeset
6
32f8fbdd754c Uploaded
rico
parents:
diff changeset
7 http://cr.yp.to/cdb.html
32f8fbdd754c Uploaded
rico
parents:
diff changeset
8
32f8fbdd754c Uploaded
rico
parents:
diff changeset
9 When generated databases are only used with Python code, consider using hash()
32f8fbdd754c Uploaded
rico
parents:
diff changeset
10 rather than djb_hash() for a tidy speedup.
32f8fbdd754c Uploaded
rico
parents:
diff changeset
11 '''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
12
32f8fbdd754c Uploaded
rico
parents:
diff changeset
13 from _struct import Struct
32f8fbdd754c Uploaded
rico
parents:
diff changeset
14 from itertools import chain
32f8fbdd754c Uploaded
rico
parents:
diff changeset
15
32f8fbdd754c Uploaded
rico
parents:
diff changeset
16
32f8fbdd754c Uploaded
rico
parents:
diff changeset
17 def py_djb_hash(s):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
18 '''Return the value of DJB's hash function for the given 8-bit string.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
19 h = 5381
32f8fbdd754c Uploaded
rico
parents:
diff changeset
20 for c in s:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
21 h = (((h << 5) + h) ^ ord(c)) & 0xffffffff
32f8fbdd754c Uploaded
rico
parents:
diff changeset
22 return h
32f8fbdd754c Uploaded
rico
parents:
diff changeset
23
32f8fbdd754c Uploaded
rico
parents:
diff changeset
24 try:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
25 from _cdblib import djb_hash
32f8fbdd754c Uploaded
rico
parents:
diff changeset
26 except ImportError:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
27 djb_hash = py_djb_hash
32f8fbdd754c Uploaded
rico
parents:
diff changeset
28
32f8fbdd754c Uploaded
rico
parents:
diff changeset
29 read_2_le4 = Struct('<LL').unpack
32f8fbdd754c Uploaded
rico
parents:
diff changeset
30 write_2_le4 = Struct('<LL').pack
32f8fbdd754c Uploaded
rico
parents:
diff changeset
31
32f8fbdd754c Uploaded
rico
parents:
diff changeset
32
32f8fbdd754c Uploaded
rico
parents:
diff changeset
33 class Reader(object):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
34 '''A dictionary-like object for reading a Constant Database accessed
32f8fbdd754c Uploaded
rico
parents:
diff changeset
35 through a string or string-like sequence, such as mmap.mmap().'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
36
32f8fbdd754c Uploaded
rico
parents:
diff changeset
37 def __init__(self, data, hashfn=djb_hash):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
38 '''Create an instance reading from a sequence and using hashfn to hash
32f8fbdd754c Uploaded
rico
parents:
diff changeset
39 keys.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
40 if len(data) < 2048:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
41 raise IOError('CDB too small')
32f8fbdd754c Uploaded
rico
parents:
diff changeset
42
32f8fbdd754c Uploaded
rico
parents:
diff changeset
43 self.data = data
32f8fbdd754c Uploaded
rico
parents:
diff changeset
44 self.hashfn = hashfn
32f8fbdd754c Uploaded
rico
parents:
diff changeset
45
32f8fbdd754c Uploaded
rico
parents:
diff changeset
46 self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
47 self.table_start = min(p[0] for p in self.index)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
48 # Assume load load factor is 0.5 like official CDB.
32f8fbdd754c Uploaded
rico
parents:
diff changeset
49 self.length = sum(p[1] >> 1 for p in self.index)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
50
32f8fbdd754c Uploaded
rico
parents:
diff changeset
51 def iteritems(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
52 '''Like dict.iteritems(). Items are returned in insertion order.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
53 pos = 2048
32f8fbdd754c Uploaded
rico
parents:
diff changeset
54 while pos < self.table_start:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
55 klen, dlen = read_2_le4(self.data[pos:pos+8])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
56 pos += 8
32f8fbdd754c Uploaded
rico
parents:
diff changeset
57
32f8fbdd754c Uploaded
rico
parents:
diff changeset
58 key = self.data[pos:pos+klen]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
59 pos += klen
32f8fbdd754c Uploaded
rico
parents:
diff changeset
60
32f8fbdd754c Uploaded
rico
parents:
diff changeset
61 data = self.data[pos:pos+dlen]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
62 pos += dlen
32f8fbdd754c Uploaded
rico
parents:
diff changeset
63
32f8fbdd754c Uploaded
rico
parents:
diff changeset
64 yield key, data
32f8fbdd754c Uploaded
rico
parents:
diff changeset
65
32f8fbdd754c Uploaded
rico
parents:
diff changeset
66 def items(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
67 '''Like dict.items().'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
68 return list(self.iteritems())
32f8fbdd754c Uploaded
rico
parents:
diff changeset
69
32f8fbdd754c Uploaded
rico
parents:
diff changeset
70 def iterkeys(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
71 '''Like dict.iterkeys().'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
72 return (p[0] for p in self.iteritems())
32f8fbdd754c Uploaded
rico
parents:
diff changeset
73 __iter__ = iterkeys
32f8fbdd754c Uploaded
rico
parents:
diff changeset
74
32f8fbdd754c Uploaded
rico
parents:
diff changeset
75 def itervalues(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
76 '''Like dict.itervalues().'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
77 return (p[1] for p in self.iteritems())
32f8fbdd754c Uploaded
rico
parents:
diff changeset
78
32f8fbdd754c Uploaded
rico
parents:
diff changeset
79 def keys(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
80 '''Like dict.keys().'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
81 return [p[0] for p in self.iteritems()]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
82
32f8fbdd754c Uploaded
rico
parents:
diff changeset
83 def values(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
84 '''Like dict.values().'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
85 return [p[1] for p in self.iteritems()]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
86
32f8fbdd754c Uploaded
rico
parents:
diff changeset
87 def __getitem__(self, key):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
88 '''Like dict.__getitem__().'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
89 value = self.get(key)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
90 if value is None:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
91 raise KeyError(key)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
92 return value
32f8fbdd754c Uploaded
rico
parents:
diff changeset
93
32f8fbdd754c Uploaded
rico
parents:
diff changeset
94 def has_key(self, key):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
95 '''Return True if key exists in the database.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
96 return self.get(key) is not None
32f8fbdd754c Uploaded
rico
parents:
diff changeset
97 __contains__ = has_key
32f8fbdd754c Uploaded
rico
parents:
diff changeset
98
32f8fbdd754c Uploaded
rico
parents:
diff changeset
99 def __len__(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
100 '''Return the number of records in the database.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
101 return self.length
32f8fbdd754c Uploaded
rico
parents:
diff changeset
102
32f8fbdd754c Uploaded
rico
parents:
diff changeset
103 def gets(self, key):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
104 '''Yield values for key in insertion order.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
105 # Truncate to 32 bits and remove sign.
32f8fbdd754c Uploaded
rico
parents:
diff changeset
106 h = self.hashfn(key) & 0xffffffff
32f8fbdd754c Uploaded
rico
parents:
diff changeset
107 start, nslots = self.index[h & 0xff]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
108
32f8fbdd754c Uploaded
rico
parents:
diff changeset
109 if nslots:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
110 end = start + (nslots << 3)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
111 slot_off = start + (((h >> 8) % nslots) << 3)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
112
32f8fbdd754c Uploaded
rico
parents:
diff changeset
113 for pos in chain(xrange(slot_off, end, 8),
32f8fbdd754c Uploaded
rico
parents:
diff changeset
114 xrange(start, slot_off, 8)):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
115 rec_h, rec_pos = read_2_le4(self.data[pos:pos+8])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
116
32f8fbdd754c Uploaded
rico
parents:
diff changeset
117 if not rec_h:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
118 break
32f8fbdd754c Uploaded
rico
parents:
diff changeset
119 elif rec_h == h:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
120 klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8])
32f8fbdd754c Uploaded
rico
parents:
diff changeset
121 rec_pos += 8
32f8fbdd754c Uploaded
rico
parents:
diff changeset
122
32f8fbdd754c Uploaded
rico
parents:
diff changeset
123 if self.data[rec_pos:rec_pos+klen] == key:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
124 rec_pos += klen
32f8fbdd754c Uploaded
rico
parents:
diff changeset
125 yield self.data[rec_pos:rec_pos+dlen]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
126
32f8fbdd754c Uploaded
rico
parents:
diff changeset
127 def get(self, key, default=None):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
128 '''Get the first value for key, returning default if missing.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
129 # Avoid exception catch when handling default case; much faster.
32f8fbdd754c Uploaded
rico
parents:
diff changeset
130 return chain(self.gets(key), (default,)).next()
32f8fbdd754c Uploaded
rico
parents:
diff changeset
131
32f8fbdd754c Uploaded
rico
parents:
diff changeset
132 def getint(self, key, default=None, base=0):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
133 '''Get the first value for key converted it to an int, returning
32f8fbdd754c Uploaded
rico
parents:
diff changeset
134 default if missing.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
135 value = self.get(key, default)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
136 if value is not default:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
137 return int(value, base)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
138 return value
32f8fbdd754c Uploaded
rico
parents:
diff changeset
139
32f8fbdd754c Uploaded
rico
parents:
diff changeset
140 def getints(self, key, base=0):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
141 '''Yield values for key in insertion order after converting to int.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
142 return (int(v, base) for v in self.gets(key))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
143
32f8fbdd754c Uploaded
rico
parents:
diff changeset
144 def getstring(self, key, default=None, encoding='utf-8'):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
145 '''Get the first value for key decoded as unicode, returning default if
32f8fbdd754c Uploaded
rico
parents:
diff changeset
146 not found.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
147 value = self.get(key, default)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
148 if value is not default:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
149 return value.decode(encoding)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
150 return value
32f8fbdd754c Uploaded
rico
parents:
diff changeset
151
32f8fbdd754c Uploaded
rico
parents:
diff changeset
152 def getstrings(self, key, encoding='utf-8'):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
153 '''Yield values for key in insertion order after decoding as
32f8fbdd754c Uploaded
rico
parents:
diff changeset
154 unicode.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
155 return (v.decode(encoding) for v in self.gets(key))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
156
32f8fbdd754c Uploaded
rico
parents:
diff changeset
157
32f8fbdd754c Uploaded
rico
parents:
diff changeset
158 class Writer(object):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
159 '''Object for building new Constant Databases, and writing them to a
32f8fbdd754c Uploaded
rico
parents:
diff changeset
160 seekable file-like object.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
161
32f8fbdd754c Uploaded
rico
parents:
diff changeset
162 def __init__(self, fp, hashfn=djb_hash):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
163 '''Create an instance writing to a file-like object, using hashfn to
32f8fbdd754c Uploaded
rico
parents:
diff changeset
164 hash keys.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
165 self.fp = fp
32f8fbdd754c Uploaded
rico
parents:
diff changeset
166 self.hashfn = hashfn
32f8fbdd754c Uploaded
rico
parents:
diff changeset
167
32f8fbdd754c Uploaded
rico
parents:
diff changeset
168 fp.write('\x00' * 2048)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
169 self._unordered = [[] for i in xrange(256)]
32f8fbdd754c Uploaded
rico
parents:
diff changeset
170
32f8fbdd754c Uploaded
rico
parents:
diff changeset
171 def put(self, key, value=''):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
172 '''Write a string key/value pair to the output file.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
173 assert type(key) is str and type(value) is str
32f8fbdd754c Uploaded
rico
parents:
diff changeset
174
32f8fbdd754c Uploaded
rico
parents:
diff changeset
175 pos = self.fp.tell()
32f8fbdd754c Uploaded
rico
parents:
diff changeset
176 self.fp.write(write_2_le4(len(key), len(value)))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
177 self.fp.write(key)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
178 self.fp.write(value)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
179
32f8fbdd754c Uploaded
rico
parents:
diff changeset
180 h = self.hashfn(key) & 0xffffffff
32f8fbdd754c Uploaded
rico
parents:
diff changeset
181 self._unordered[h & 0xff].append((h, pos))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
182
32f8fbdd754c Uploaded
rico
parents:
diff changeset
183 def puts(self, key, values):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
184 '''Write more than one value for the same key to the output file.
32f8fbdd754c Uploaded
rico
parents:
diff changeset
185 Equivalent to calling put() in a loop.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
186 for value in values:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
187 self.put(key, value)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
188
32f8fbdd754c Uploaded
rico
parents:
diff changeset
189 def putint(self, key, value):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
190 '''Write an integer as a base-10 string associated with the given key
32f8fbdd754c Uploaded
rico
parents:
diff changeset
191 to the output file.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
192 self.put(key, str(value))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
193
32f8fbdd754c Uploaded
rico
parents:
diff changeset
194 def putints(self, key, values):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
195 '''Write zero or more integers for the same key to the output file.
32f8fbdd754c Uploaded
rico
parents:
diff changeset
196 Equivalent to calling putint() in a loop.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
197 self.puts(key, (str(value) for value in values))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
198
32f8fbdd754c Uploaded
rico
parents:
diff changeset
199 def putstring(self, key, value, encoding='utf-8'):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
200 '''Write a unicode string associated with the given key to the output
32f8fbdd754c Uploaded
rico
parents:
diff changeset
201 file after encoding it as UTF-8 or the given encoding.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
202 self.put(key, unicode.encode(value, encoding))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
203
32f8fbdd754c Uploaded
rico
parents:
diff changeset
204 def putstrings(self, key, values, encoding='utf-8'):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
205 '''Write zero or more unicode strings to the output file. Equivalent to
32f8fbdd754c Uploaded
rico
parents:
diff changeset
206 calling putstring() in a loop.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
207 self.puts(key, (unicode.encode(value, encoding) for value in values))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
208
32f8fbdd754c Uploaded
rico
parents:
diff changeset
209 def finalize(self):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
210 '''Write the final hash tables to the output file, and write out its
32f8fbdd754c Uploaded
rico
parents:
diff changeset
211 index. The output file remains open upon return.'''
32f8fbdd754c Uploaded
rico
parents:
diff changeset
212 index = []
32f8fbdd754c Uploaded
rico
parents:
diff changeset
213 for tbl in self._unordered:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
214 length = len(tbl) << 1
32f8fbdd754c Uploaded
rico
parents:
diff changeset
215 ordered = [(0, 0)] * length
32f8fbdd754c Uploaded
rico
parents:
diff changeset
216 for pair in tbl:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
217 where = (pair[0] >> 8) % length
32f8fbdd754c Uploaded
rico
parents:
diff changeset
218 for i in chain(xrange(where, length), xrange(0, where)):
32f8fbdd754c Uploaded
rico
parents:
diff changeset
219 if not ordered[i][0]:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
220 ordered[i] = pair
32f8fbdd754c Uploaded
rico
parents:
diff changeset
221 break
32f8fbdd754c Uploaded
rico
parents:
diff changeset
222
32f8fbdd754c Uploaded
rico
parents:
diff changeset
223 index.append((self.fp.tell(), length))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
224 for pair in ordered:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
225 self.fp.write(write_2_le4(*pair))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
226
32f8fbdd754c Uploaded
rico
parents:
diff changeset
227 self.fp.seek(0)
32f8fbdd754c Uploaded
rico
parents:
diff changeset
228 for pair in index:
32f8fbdd754c Uploaded
rico
parents:
diff changeset
229 self.fp.write(write_2_le4(*pair))
32f8fbdd754c Uploaded
rico
parents:
diff changeset
230 self.fp = None # prevent double finalize()