import array
import collections
import itertools
import pprint
# Placeholder constants
FREE = -1
DUMMY = -2
def round_upto_powtwo(n):
'Round-up to the next power of two'
result = 1
while result <= n:
result <<= 1
return result
class Dict(collections.MutableMapping):
'''Space efficient dictionary with fast iteration
and cheap resizes (minimum data movement)
'''
@staticmethod
def _gen_probes(hashvalue, n):
'Same sequence of probes used in the current dictionary design'
PERTURB_SHIFT = 5
if hashvalue < 0:
hashvalue = int(2 ** 64 - hashvalue)
i = hashvalue & (n-1)
yield int(i)
perturb = hashvalue
while True:
i = ((i << 2) + i + perturb + 1)
yield int(i & (n-1))
perturb >>= PERTURB_SHIFT
def _lookup(self, key, hashvalue):
'Same lookup logic as currently used in real dicts'
assert len(self.keylist) < len(self.indices) # At least one open slot
freeslot = None
for i in self._gen_probes(hashvalue, len(self.indices)):
if self.indices[i] is FREE:
if freeslot is not None:
i = freeslot
return (False, i)
elif self.indices[i] is DUMMY:
freeslot = i
else:
index = self.indices[i]
if (self.keylist[index] is key or
self.hashlist[index] == hashvalue
and self.keylist[index] == key):
return (True, i)
def _next_open_index(self):
'''New logic: Store hash/key/value entries in a dense list.
Append a new entry to the end of the list. '''
assert self.size == len(self.keylist)
index = len(self.keylist)
self.hashlist.append(None)
self.keylist.append(None)
self.valuelist.append(None)
return index
@staticmethod
def _make_index(n):
'New sequence of indices using the smallest possible datatype'
typecode = 'b' if n <= 128 else 'h' if n <= 65536 else 'l'
return array.array(typecode, itertools.repeat(FREE, n))
def _resize(self, n):
'''Reindex the existing hash/key/value entries.
Entries do not get moved, the only get new indices.
No calls are made to hash() or __eq__().
'''
n = round_upto_powtwo(n)
self.indices = self._make_index(n)
for index, hashvalue in enumerate(self.hashlist):
for i in Dict._gen_probes(hashvalue, n):
if self.indices[i] is FREE:
break
self.indices[i] = index
def __init__(self, *args, **kwds):
self.indices = self._make_index(8)
self.hashlist = []
self.keylist = []
self.valuelist = []
self.size = 0
self.update(*args, **kwds)
def __setitem__(self, key, value):
hashvalue = hash(key)
found, i = self._lookup(key, hashvalue)
if found:
index = self.indices[i]
self.valuelist[index] = value
else:
index = self._next_open_index()
self.indices[i] = index
self.hashlist[index] = hashvalue
self.keylist[index] = key
self.valuelist[index] = value
self.size += 1
if self.indices.count(FREE) * 3 < len(self.indices):
self._resize(2 * len(self))
def __getitem__(self, key):
hashvalue = hash(key)
found, i = self._lookup(key, hashvalue)
if found:
index = self.indices[i]
return self.valuelist[index]
else:
raise KeyError(key)
def __delitem__(self, key):
hashvalue = hash(key)
found, i = self._lookup(key, hashvalue)
if found:
index = self.indices[i]
self.indices[i] = DUMMY
self.size -= 1
if index != self.size:
lasthash = self.hashlist[-1]
lastkey = self.keylist[-1]
lastvalue = self.valuelist[-1]
found, j = self._lookup(lastkey, lasthash)
assert found and i != j
self.indices[j] = index
self.hashlist[index] = lasthash
self.keylist[index] = lastkey
self.valuelist[index] = lastvalue
self.hashlist.pop()
self.keylist.pop()
self.valuelist.pop()
else:
raise KeyError(key)
def __iter__(self):
for key in self.keylist:
yield key
def __len__(self):
return self.size
def __repr__(self):
return 'Dict(%r)' % self.items()
###### Diagnostic methods. Not part of the API ##############
def show_structure(self):
print '=' * 50
print 'Dict:', self
print 'Size in bytes:', self.memory()
print 'Indices:', self.indices
entries = itertools.izip_longest(
xrange(len(self.hashlist)),
self.hashlist, self.keylist, self.valuelist,
fillvalue = ':ERROR:')
pprint.pprint(list(entries))
print '-' * 50
def memory(self):
'Size in bytes assuming a 64-bit build'
return (len(self.indices) * self.indices.itemsize
+ len(self.hashlist) * 8 * 3)
if __name__ == '__main__':
d = Dict([('timmy', 'red'), ('barry', 'green'), ('guido', 'blue')])
d.show_structure()
Diff to Previous Revision
--- revision 4 2012-12-10 04:39:57
+++ revision 5 2012-12-11 04:57:48
@@ -6,7 +6,6 @@
# Placeholder constants
FREE = -1
DUMMY = -2
-UNUSED = ':UNUSED:' # In C, be sure to use a unique object
def round_upto_powtwo(n):
'Round-up to the next power of two'
@@ -54,43 +53,29 @@
def _next_open_index(self):
'''New logic: Store hash/key/value entries in a dense list.
- If an unused entry is available, re-use it; otherwise,
- append a new entry to the end of the list. '''
- if self.size == len(self.keylist):
- index = len(self.keylist)
- self.hashlist.append(None)
- self.keylist.append(None)
- self.valuelist.append(None)
- return index
- # Least common case. Can make fast in C with a pointer chain.
- return self.keylist.index(UNUSED)
+ Append a new entry to the end of the list. '''
+ assert self.size == len(self.keylist)
+ index = len(self.keylist)
+ self.hashlist.append(None)
+ self.keylist.append(None)
+ self.valuelist.append(None)
+ return index
- def _make_index(self, n):
+ @staticmethod
+ def _make_index(n):
'New sequence of indices using the smallest possible datatype'
typecode = 'b' if n <= 128 else 'h' if n <= 65536 else 'l'
return array.array(typecode, itertools.repeat(FREE, n))
def _resize(self, n):
- '''Reindex the existing hash/key/value entries
- and compact the entry list.
-
+ '''Reindex the existing hash/key/value entries.
+ Entries do not get moved, the only get new indices.
No calls are made to hash() or __eq__().
- No hash/key/value entries get moved except
- to fill-in an UNUSED slot in the entry sequence.
'''
n = round_upto_powtwo(n)
self.indices = self._make_index(n)
for index, hashvalue in enumerate(self.hashlist):
- while hashvalue is UNUSED:
- hashvalue = self.hashlist.pop()
- key = self.keylist.pop()
- value = self.valuelist.pop()
- if index >= len(self.hashlist):
- return
- self.hashlist[index] = hashvalue
- self.keylist[index] = key
- self.valuelist[index] = value
for i in Dict._gen_probes(hashvalue, n):
if self.indices[i] is FREE:
break
@@ -129,24 +114,32 @@
else:
raise KeyError(key)
- def __delitem__(self, key, hashvalue=None):
- if hashvalue is None:
- hashvalue = hash(key)
+ def __delitem__(self, key):
+ hashvalue = hash(key)
found, i = self._lookup(key, hashvalue)
if found:
index = self.indices[i]
self.indices[i] = DUMMY
- self.hashlist[index] = UNUSED
- self.keylist[index] = UNUSED
- self.valuelist[index] = UNUSED
self.size -= 1
+ if index != self.size:
+ lasthash = self.hashlist[-1]
+ lastkey = self.keylist[-1]
+ lastvalue = self.valuelist[-1]
+ found, j = self._lookup(lastkey, lasthash)
+ assert found and i != j
+ self.indices[j] = index
+ self.hashlist[index] = lasthash
+ self.keylist[index] = lastkey
+ self.valuelist[index] = lastvalue
+ self.hashlist.pop()
+ self.keylist.pop()
+ self.valuelist.pop()
else:
raise KeyError(key)
def __iter__(self):
for key in self.keylist:
- if key is not UNUSED:
- yield key
+ yield key
def __len__(self):
return self.size