Welcome, guest | Sign In | My Account | Store | Cart
import array
import collections
import itertools
import pprint

# Placeholder constants
FREE = -1
DUMMY = -2

def round_upto_powtwo(n):
    'Round-up to the next power of two'
    result = 1
    while result <= n:
        result <<= 1
    return result

class Dict(collections.MutableMapping):
    '''Space efficient dictionary with fast iteration
       and cheap resizes (minimum data movement)
    '''

    @staticmethod
    def _gen_probes(hashvalue, n):
        'Same sequence of probes used in the current dictionary design'
        PERTURB_SHIFT = 5
        if hashvalue < 0:
            hashvalue = int(2 ** 64 - hashvalue)
        i = hashvalue & (n-1)
        yield int(i)
        perturb = hashvalue
        while True:
            i = ((i << 2) + i + perturb + 1)
            yield int(i & (n-1))
            perturb >>= PERTURB_SHIFT

    def _lookup(self, key, hashvalue):
        'Same lookup logic as currently used in real dicts'
        assert len(self.keylist) < len(self.indices)   # At least one open slot
        freeslot = None
        for i in self._gen_probes(hashvalue, len(self.indices)):
            if self.indices[i] is FREE:
                if freeslot is not None:
                    i = freeslot
                return (False, i)
            elif self.indices[i] is DUMMY:
                freeslot = i
            else:
                index = self.indices[i]
                if (self.keylist[index] is key or
                    self.hashlist[index] == hashvalue
                    and self.keylist[index] == key):
                    return (True, i)

    def _next_open_index(self):
        '''New logic:  Store hash/key/value entries in a dense list.
           Append a new entry to the end of the list. '''
        assert self.size == len(self.keylist)
        index = len(self.keylist)
        self.hashlist.append(None)
        self.keylist.append(None)
        self.valuelist.append(None)
        return index

    @staticmethod
    def _make_index(n):
        'New sequence of indices using the smallest possible datatype'
        typecode = 'b' if n <= 128 else 'h' if n <= 65536 else 'l'
        return array.array(typecode, itertools.repeat(FREE, n))

    def _resize(self, n):
        '''Reindex the existing hash/key/value entries.
           Entries do not get moved, the only get new indices.
           No calls are made to hash() or __eq__().

        '''
        n = round_upto_powtwo(n)
        self.indices = self._make_index(n)
        for index, hashvalue in enumerate(self.hashlist):
            for i in Dict._gen_probes(hashvalue, n):
                if self.indices[i] is FREE:
                    break
            self.indices[i] = index

    def __init__(self, *args, **kwds):
        self.indices = self._make_index(8)
        self.hashlist = []
        self.keylist = []
        self.valuelist = []
        self.size = 0
        self.update(*args, **kwds)

    def __setitem__(self, key, value):
        hashvalue = hash(key)
        found, i = self._lookup(key, hashvalue)
        if found:
            index = self.indices[i]
            self.valuelist[index] = value
        else:
            index = self._next_open_index()
            self.indices[i] = index
            self.hashlist[index] = hashvalue
            self.keylist[index] = key
            self.valuelist[index] = value
            self.size += 1
            if self.indices.count(FREE) * 3 < len(self.indices):
                self._resize(2 * len(self))

    def __getitem__(self, key):
        hashvalue = hash(key)
        found, i = self._lookup(key, hashvalue)
        if found:
            index = self.indices[i]
            return self.valuelist[index]
        else:
            raise KeyError(key)

    def __delitem__(self, key):
        hashvalue = hash(key)
        found, i = self._lookup(key, hashvalue)
        if found:
            index = self.indices[i]
            self.indices[i] = DUMMY
            self.size -= 1
            if index != self.size:
                lasthash = self.hashlist[-1]
                lastkey = self.keylist[-1]
                lastvalue = self.valuelist[-1]
                found, j = self._lookup(lastkey, lasthash)
                assert found and i != j
                self.indices[j] = index
                self.hashlist[index] = lasthash
                self.keylist[index] = lastkey
                self.valuelist[index] = lastvalue
            self.hashlist.pop()
            self.keylist.pop()
            self.valuelist.pop()
        else:
            raise KeyError(key)

    def __iter__(self):
        for key in self.keylist:
            yield key

    def __len__(self):
        return self.size

    def __repr__(self):
        return 'Dict(%r)' % self.items()

    ###### Diagnostic methods.  Not part of the API ##############

    def show_structure(self):
        print '=' * 50
        print 'Dict:', self
        print 'Size in bytes:', self.memory()
        print 'Indices:', self.indices
        entries = itertools.izip_longest(
                        xrange(len(self.hashlist)),
                        self.hashlist, self.keylist, self.valuelist,
                        fillvalue = ':ERROR:')
        pprint.pprint(list(entries))
        print '-' * 50

    def memory(self):
        'Size in bytes assuming a 64-bit build'
        return (len(self.indices) * self.indices.itemsize
                + len(self.hashlist) * 8 * 3)

if __name__ == '__main__':

    d = Dict([('timmy', 'red'), ('barry', 'green'), ('guido', 'blue')])
    d.show_structure()

Diff to Previous Revision

--- revision 4 2012-12-10 04:39:57
+++ revision 5 2012-12-11 04:57:48
@@ -6,7 +6,6 @@
 # Placeholder constants
 FREE = -1
 DUMMY = -2
-UNUSED = ':UNUSED:'          # In C, be sure to use a unique object
 
 def round_upto_powtwo(n):
     'Round-up to the next power of two'
@@ -54,43 +53,29 @@
 
     def _next_open_index(self):
         '''New logic:  Store hash/key/value entries in a dense list.
-           If an unused entry is available, re-use it; otherwise,
-           append a new entry to the end of the list. '''
-        if self.size == len(self.keylist):
-            index = len(self.keylist)
-            self.hashlist.append(None)
-            self.keylist.append(None)
-            self.valuelist.append(None)
-            return index
-        # Least common case.  Can make fast in C with a pointer chain.
-        return self.keylist.index(UNUSED)
+           Append a new entry to the end of the list. '''
+        assert self.size == len(self.keylist)
+        index = len(self.keylist)
+        self.hashlist.append(None)
+        self.keylist.append(None)
+        self.valuelist.append(None)
+        return index
 
-    def _make_index(self, n):
+    @staticmethod
+    def _make_index(n):
         'New sequence of indices using the smallest possible datatype'
         typecode = 'b' if n <= 128 else 'h' if n <= 65536 else 'l'
         return array.array(typecode, itertools.repeat(FREE, n))
 
     def _resize(self, n):
-        '''Reindex the existing hash/key/value entries
-           and compact the entry list.
-
+        '''Reindex the existing hash/key/value entries.
+           Entries do not get moved, the only get new indices.
            No calls are made to hash() or __eq__().
-           No hash/key/value entries get moved except
-           to fill-in an UNUSED slot in the entry sequence.
 
         '''
         n = round_upto_powtwo(n)
         self.indices = self._make_index(n)
         for index, hashvalue in enumerate(self.hashlist):
-            while hashvalue is UNUSED:
-                hashvalue = self.hashlist.pop()
-                key = self.keylist.pop()
-                value = self.valuelist.pop()
-                if index >= len(self.hashlist):
-                    return
-                self.hashlist[index] = hashvalue
-                self.keylist[index] = key
-                self.valuelist[index] = value
             for i in Dict._gen_probes(hashvalue, n):
                 if self.indices[i] is FREE:
                     break
@@ -129,24 +114,32 @@
         else:
             raise KeyError(key)
 
-    def __delitem__(self, key, hashvalue=None):
-        if hashvalue is None:
-            hashvalue = hash(key)
+    def __delitem__(self, key):
+        hashvalue = hash(key)
         found, i = self._lookup(key, hashvalue)
         if found:
             index = self.indices[i]
             self.indices[i] = DUMMY
-            self.hashlist[index] = UNUSED
-            self.keylist[index] = UNUSED
-            self.valuelist[index] = UNUSED
             self.size -= 1
+            if index != self.size:
+                lasthash = self.hashlist[-1]
+                lastkey = self.keylist[-1]
+                lastvalue = self.valuelist[-1]
+                found, j = self._lookup(lastkey, lasthash)
+                assert found and i != j
+                self.indices[j] = index
+                self.hashlist[index] = lasthash
+                self.keylist[index] = lastkey
+                self.valuelist[index] = lastvalue
+            self.hashlist.pop()
+            self.keylist.pop()
+            self.valuelist.pop()
         else:
             raise KeyError(key)
 
     def __iter__(self):
         for key in self.keylist:
-            if key is not UNUSED:
-                yield key
+            yield key
 
     def __len__(self):
         return self.size

History