import array
import collections
import itertools
# Placeholder constants
FREE = -1
DUMMY = -2
class Dict(collections.MutableMapping):
'Space efficient dictionary with fast iteration and cheap resizes.'
@staticmethod
def _gen_probes(hashvalue, mask):
'Same sequence of probes used in the current dictionary design'
PERTURB_SHIFT = 5
if hashvalue < 0:
hashvalue = -hashvalue
i = hashvalue & mask
yield i
perturb = hashvalue
while True:
i = (5 * i + perturb + 1)
yield i & mask
perturb >>= PERTURB_SHIFT
def _lookup(self, key, hashvalue):
'Same lookup logic as currently used in real dicts'
assert self.filled < len(self.indices) # At least one open slot
freeslot = None
for i in self._gen_probes(hashvalue, len(self.indices)-1):
index = self.indices[i]
if index == FREE:
return (FREE, i) if freeslot is None else (DUMMY, freeslot)
elif index == DUMMY:
freeslot = i
elif (self.keylist[index] is key or
self.hashlist[index] == hashvalue
and self.keylist[index] == key):
return (index, i)
@staticmethod
def _make_index(n):
'New sequence of indices using the smallest possible datatype'
if n <= 2**7: return array.array('b', [FREE]) * n # signed char
if n <= 2**15: return array.array('h', [FREE]) * n # signed short
if n <= 2**31: return array.array('l', [FREE]) * n # signed long
return [FREE] * n # python integers
def _resize(self, n):
'''Reindex the existing hash/key/value entries.
Entries do not get moved, they only get new indices.
No calls are made to hash() or __eq__().
'''
n = 2 ** n.bit_length() # round-up to power-of-two
self.indices = self._make_index(n)
for index, hashvalue in enumerate(self.hashlist):
for i in Dict._gen_probes(hashvalue, n-1):
if self.indices[i] == FREE:
break
self.indices[i] = index
self.filled = self.used
def clear(self):
self.indices = self._make_index(8)
self.hashlist = []
self.keylist = []
self.valuelist = []
self.used = 0
self.filled = 0 # used + dummies
def __getitem__(self, key):
hashvalue = hash(key)
index, i = self._lookup(key, hashvalue)
if index < 0:
raise KeyError(key)
return self.valuelist[index]
def __setitem__(self, key, value):
hashvalue = hash(key)
index, i = self._lookup(key, hashvalue)
if index < 0:
self.indices[i] = self.used
self.hashlist.append(hashvalue)
self.keylist.append(key)
self.valuelist.append(value)
self.used += 1
if index == FREE:
self.filled += 1
if self.filled * 3 > len(self.indices) * 2:
self._resize(4 * len(self))
else:
self.valuelist[index] = value
def __delitem__(self, key):
hashvalue = hash(key)
index, i = self._lookup(key, hashvalue)
if index < 0:
raise KeyError(key)
self.indices[i] = DUMMY
self.used -= 1
# If needed, swap with the lastmost entry to avoid leaving a "hole"
if index != self.used:
lasthash = self.hashlist[-1]
lastkey = self.keylist[-1]
lastvalue = self.valuelist[-1]
lastindex, j = self._lookup(lastkey, lasthash)
assert lastindex >= 0 and i != j
self.indices[j] = index
self.hashlist[index] = lasthash
self.keylist[index] = lastkey
self.valuelist[index] = lastvalue
# Remove the lastmost entry
self.hashlist.pop()
self.keylist.pop()
self.valuelist.pop()
def __init__(self, *args, **kwds):
if not hasattr(self, 'keylist'):
self.clear()
self.update(*args, **kwds)
def __len__(self):
return self.used
def __iter__(self):
return iter(self.keylist)
def iterkeys(self):
return iter(self.keylist)
def keys(self):
return list(self.keylist)
def itervalues(self):
return iter(self.valuelist)
def values(self):
return list(self.valuelist)
def iteritems(self):
return itertools.izip(self.keylist, self.valuelist)
def items(self):
return zip(self.keylist, self.valuelist)
def __contains__(self, key):
index, i = self._lookup(key, hash(key))
return index >= 0
def get(self, key, default=None):
'D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.'
index, i = self._lookup(key, hash(key))
return self.valuelist[index] if index >= 0 else default
def popitem(self):
''' D.popitem() -> (k, v), remove and return some (key, value) pair as a
2-tuple; but raise KeyError if D is empty.
'''
try:
key = self.keylist[-1]
value = self.valuelist[-1]
except IndexError:
raise KeyError( 'popitem(): dictionary is empty')
del self[key]
return key, value
def __repr__(self):
return 'Dict(%r)' % self.items()
def show_structure(self):
'Diagnostic method. Not part of the API.'
print '=' * 50
print self
print 'Indices:', self.indices
for i, row in enumerate(zip(self.hashlist, self.keylist, self.valuelist)):
print i, row
print '-' * 50
if __name__ == '__main__':
d = Dict([('timmy', 'red'), ('barry', 'green'), ('guido', 'blue')])
d.show_structure()
Diff to Previous Revision
--- revision 17 2012-12-31 06:40:04
+++ revision 18 2013-01-04 18:14:19
@@ -10,24 +10,24 @@
'Space efficient dictionary with fast iteration and cheap resizes.'
@staticmethod
- def _gen_probes(hashvalue, n):
+ def _gen_probes(hashvalue, mask):
'Same sequence of probes used in the current dictionary design'
PERTURB_SHIFT = 5
if hashvalue < 0:
hashvalue = -hashvalue
- i = hashvalue & (n-1)
+ i = hashvalue & mask
yield i
perturb = hashvalue
while True:
- i = 5 * i + perturb + 1
- yield i & (n-1)
+ i = (5 * i + perturb + 1)
+ yield i & mask
perturb >>= PERTURB_SHIFT
def _lookup(self, key, hashvalue):
'Same lookup logic as currently used in real dicts'
assert self.filled < len(self.indices) # At least one open slot
freeslot = None
- for i in self._gen_probes(hashvalue, len(self.indices)):
+ for i in self._gen_probes(hashvalue, len(self.indices)-1):
index = self.indices[i]
if index == FREE:
return (FREE, i) if freeslot is None else (DUMMY, freeslot)
@@ -55,7 +55,7 @@
n = 2 ** n.bit_length() # round-up to power-of-two
self.indices = self._make_index(n)
for index, hashvalue in enumerate(self.hashlist):
- for i in Dict._gen_probes(hashvalue, n):
+ for i in Dict._gen_probes(hashvalue, n-1):
if self.indices[i] == FREE:
break
self.indices[i] = index
@@ -68,13 +68,6 @@
self.valuelist = []
self.used = 0
self.filled = 0 # used + dummies
-
- def __init__(self, *args, **kwds):
- try:
- self.keylist
- except AttributeError:
- self.clear()
- self.update(*args, **kwds)
def __getitem__(self, key):
hashvalue = hash(key)
@@ -121,6 +114,11 @@
self.hashlist.pop()
self.keylist.pop()
self.valuelist.pop()
+
+ def __init__(self, *args, **kwds):
+ if not hasattr(self, 'keylist'):
+ self.clear()
+ self.update(*args, **kwds)
def __len__(self):
return self.used