Save space and improve iteration speed by moving the hash/key/value entries to a densely packed array keeping only a sparse array of indices. This eliminates wasted space without requiring any algorithmic changes.
import array
import collections
import collections.abc
import itertools
# Sentinel values stored in the sparse index table.
FREE = -1    # slot has never held an entry
DUMMY = -2   # slot once held an entry that was later deleted


class Dict(collections.abc.MutableMapping):
    '''Space efficient dictionary with fast iteration and cheap resizes.

    Layout: a dense, append-only set of parallel lists holding the
    (hash, key, value) entries, plus a sparse table of small integer
    indices into those lists.  Only the index table participates in
    probing, so resizes never move entries and iteration runs over the
    dense lists directly.
    '''

    @staticmethod
    def _gen_probes(hashvalue, mask):
        'Same sequence of probes used in the current dictionary design.'
        PERTURB_SHIFT = 5
        if hashvalue < 0:
            hashvalue = -hashvalue
        i = hashvalue & mask
        yield i
        perturb = hashvalue
        while True:
            # Mask to 64 bits to emulate C unsigned wraparound arithmetic.
            i = (5 * i + perturb + 1) & 0xFFFFFFFFFFFFFFFF
            yield i & mask
            perturb >>= PERTURB_SHIFT

    def _lookup(self, key, hashvalue):
        '''Same lookup logic as currently used in real dicts.

        Returns (index, i) where *index* is the position in the dense
        entry lists (or FREE/DUMMY when the key is absent) and *i* is
        the slot in the sparse index table.
        '''
        assert self.filled < len(self.indices)   # At least one open slot
        freeslot = None
        for i in self._gen_probes(hashvalue, len(self.indices) - 1):
            index = self.indices[i]
            if index == FREE:
                # Key absent; prefer reporting an earlier reusable DUMMY slot.
                return (FREE, i) if freeslot is None else (DUMMY, freeslot)
            elif index == DUMMY:
                if freeslot is None:
                    freeslot = i
            elif (self.keylist[index] is key or
                  self.hashlist[index] == hashvalue
                  and self.keylist[index] == key):
                return (index, i)

    @staticmethod
    def _make_index(n):
        'New sequence of indices using the smallest possible datatype.'
        if n <= 2**7:  return array.array('b', [FREE]) * n   # signed char
        if n <= 2**15: return array.array('h', [FREE]) * n   # signed short
        if n <= 2**31: return array.array('l', [FREE]) * n   # signed long
        return [FREE] * n                                    # python integers

    def _resize(self, n):
        '''Reindex the existing hash/key/value entries.
        Entries do not get moved, they only get new indices.
        No calls are made to hash() or __eq__().
        '''
        n = 2 ** n.bit_length()                  # round-up to power-of-two
        self.indices = self._make_index(n)
        for index, hashvalue in enumerate(self.hashlist):
            for i in Dict._gen_probes(hashvalue, n - 1):
                if self.indices[i] == FREE:
                    break
            self.indices[i] = index
        self.filled = self.used                  # reindexing removes dummies

    def clear(self):
        self.indices = self._make_index(8)
        self.hashlist = []
        self.keylist = []
        self.valuelist = []
        self.used = 0
        self.filled = 0                          # used + dummies

    def __getitem__(self, key):
        hashvalue = hash(key)
        index, i = self._lookup(key, hashvalue)
        if index < 0:
            raise KeyError(key)
        return self.valuelist[index]

    def __setitem__(self, key, value):
        hashvalue = hash(key)
        index, i = self._lookup(key, hashvalue)
        if index < 0:
            # New key: append to the dense lists and record its position.
            self.indices[i] = self.used
            self.hashlist.append(hashvalue)
            self.keylist.append(key)
            self.valuelist.append(value)
            self.used += 1
            if index == FREE:
                self.filled += 1
                # Keep the index table no more than two-thirds full.
                if self.filled * 3 > len(self.indices) * 2:
                    self._resize(4 * len(self))
        else:
            self.valuelist[index] = value

    def __delitem__(self, key):
        hashvalue = hash(key)
        index, i = self._lookup(key, hashvalue)
        if index < 0:
            raise KeyError(key)
        self.indices[i] = DUMMY
        self.used -= 1
        # If needed, swap with the lastmost entry to avoid leaving a "hole"
        if index != self.used:
            lasthash = self.hashlist[-1]
            lastkey = self.keylist[-1]
            lastvalue = self.valuelist[-1]
            lastindex, j = self._lookup(lastkey, lasthash)
            assert lastindex >= 0 and i != j
            self.indices[j] = index
            self.hashlist[index] = lasthash
            self.keylist[index] = lastkey
            self.valuelist[index] = lastvalue
        # Remove the lastmost entry
        self.hashlist.pop()
        self.keylist.pop()
        self.valuelist.pop()

    def __init__(self, *args, **kwds):
        # Guard so a subclass calling __init__ twice won't wipe stored data.
        if not hasattr(self, 'keylist'):
            self.clear()
        self.update(*args, **kwds)

    def __len__(self):
        return self.used

    def __iter__(self):
        return iter(self.keylist)

    def iterkeys(self):
        return iter(self.keylist)

    def keys(self):
        return list(self.keylist)

    def itervalues(self):
        return iter(self.valuelist)

    def values(self):
        return list(self.valuelist)

    def iteritems(self):
        # Python 3 zip() is already lazy (replaces itertools.izip).
        return zip(self.keylist, self.valuelist)

    def items(self):
        return list(zip(self.keylist, self.valuelist))

    def __contains__(self, key):
        index, i = self._lookup(key, hash(key))
        return index >= 0

    def get(self, key, default=None):
        index, i = self._lookup(key, hash(key))
        return self.valuelist[index] if index >= 0 else default

    def popitem(self):
        'Remove and return the most recently stored (key, value) pair.'
        if not self.keylist:
            raise KeyError('popitem(): dictionary is empty')
        key = self.keylist[-1]
        value = self.valuelist[-1]
        del self[key]
        return key, value

    def __repr__(self):
        return 'Dict(%r)' % self.items()

    def show_structure(self):
        'Diagnostic method.  Not part of the API.'
        print('=' * 50)
        print(self)
        print('Indices:', self.indices)
        for i, row in enumerate(zip(self.hashlist, self.keylist, self.valuelist)):
            print(i, row)
        print('-' * 50)
if __name__ == '__main__':
    # Quick demo: build a small dict and dump its internal layout.
    demo = Dict([('timmy', 'red'), ('barry', 'green'), ('guido', 'blue')])
    demo.show_structure()
|
The current memory layout for dictionaries is unnecessarily inefficient. It has a sparse table of 24-byte entries containing the hash value, key pointer, and value pointer.
Instead, the 24-byte entries should be stored in a dense table referenced by a sparse table of indices.
For example, the dictionary:
d = {'timmy': 'red', 'barry': 'green', 'guido': 'blue'}
is currently stored as:
entries = [['--', '--', '--'],
[-8522787127447073495, 'barry', 'green'],
['--', '--', '--'],
['--', '--', '--'],
['--', '--', '--'],
[-9092791511155847987, 'timmy', 'red'],
['--', '--', '--'],
[-6480567542315338377, 'guido', 'blue']]
Instead, the data should be organized as follows:
indices = [None, 1, None, None, None, 0, None, 2]
entries = [[-9092791511155847987, 'timmy', 'red'],
[-8522787127447073495, 'barry', 'green'],
[-6480567542315338377, 'guido', 'blue']]
Only the data layout needs to change. The hash table algorithms would stay the same. All of the current optimizations would be kept, including key-sharing dicts and custom lookup functions for string-only dicts. There is no change to the hash functions, the table search order, or collision statistics.
The memory savings are significant (from 30% to 95% compression depending on how full the table is). Small dicts (size 0, 1, or 2) get the most benefit.
For a sparse table of size t with n entries, the sizes are:
curr_size = 24 * t
new_size = 24 * n + sizeof(index) * t
In the above timmy/barry/guido example, the current size is 192 bytes (eight 24-byte entries) and the new size is 80 bytes (three 24-byte entries plus eight 1-byte indices). That gives 58% compression.
Note, the sizeof(index) can be as small as a single byte for small dicts, two bytes for bigger dicts and up to sizeof(Py_ssize_t) for huge dicts.
In addition to space savings, the new memory layout makes iteration faster. Currently, keys(), values(), and items() loop over the sparse table, skipping-over free slots in the hash table. Now, keys/values/items can loop directly over the dense table, using fewer memory accesses.
Another benefit is that resizing is faster and touches fewer pieces of memory. Currently, every hash/key/value entry is moved or copied during a resize. In the new layout, only the indices are updated. For the most part, the hash/key/value entries never move (except for swaps to fill holes left by a deletion).
With the reduced memory footprint, we can also expect better cache utilization.
YMMV: Keep in mind that the above size statistics assume a build with 64-bit Py_ssize_t and 64-bit pointers. The space savings percentages are a bit different on other builds. Also, note that in many applications, the size of the data dominates the size of the container (i.e. the weight of a bucket of water is mostly the water, not the bucket).
The entries list uses regular Python lists which over-allocate in order to append() efficiently. The growth pattern for regular Python lists is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ... For larger lists, this over-allocation is no more than 12.5%.
In summary, the memory and speed performance of the entries table is the same as for regular Python lists -- it has the same iteration speed, growth rate, and append/pop performance.
The performance of the index table is the same as for regular Python dictionaries but is substantially more space efficient (from 3 to 24 times more space efficient than regular dictionaries). The smaller size makes resizing substantially faster and it improves cache performance.
In clear(), why do self.indices = self._make_index(8)? Would the performance hit of allocating the index array only upon the first insertion be significant? The way it is, having many empty dicts carries an extra 8 bytes of overhead per dict for the unused array of indices.
Benchmark results comparing this Dict class against the builtin dict were attached here.
@derek zhou I think you are missing the point. This is not meant to be faster than a dict for insert, update, and delete -- at least not in a pure-Python implementation. It is meant to be more compact and to offer faster looping, and it meets this goal even in pure Python.