A memory efficient implementation of a mapping type for mappings with fixed keys.
__all__ = ['fkdict']
__author__ = 'George Sakkis <george.sakkis AT gmail DOT com>'
import UserDict
from itertools import chain,izip
class fkdict(object):
    '''A dict-like class for mappings with fixed keys.

    The main feature of this class is memory efficiency in the scenario of
    having many dicts with the same keys. Example use cases are rows read
    from a csv.DictReader or constructed out of rows fetched from a database.
    All such rows share the same keys() and therefore each row only has to
    hold the values().

    An additional feature is predictable ordering:
        fkdict.fromkeys('abcd').keys() == list('abcd')

    Currently fkdict does not support the dict operations that change the
    size of the container. Therefore, __delitem__(), pop(), popitem(),
    clear() and setdefault() are not provided. Also __setitem__() raises
    KeyError if the key does not already exist.
    '''
    __slots__ = ['_values']

    def __init__(self, seq=(), **kwds):
        # normalize arguments to a (key,value) iterable
        if hasattr(seq, 'keys'):
            get = seq.__getitem__
            seq = ((k, get(k)) for k in seq.keys())
        if kwds:
            seq = chain(seq, kwds.iteritems())
        # scan the items keeping track of the keys' order
        keys, values = [], []
        keys_set = set()
        for k, v in seq:
            if k not in keys_set:
                keys_set.add(k)
                keys.append(k)
                values.append(v)
            else:
                values[keys.index(k)] = v
        # mutate self to the appropriate subclass
        self.__class__ = self._subcls_factory(*keys)
        self._values = values

    @classmethod
    def fromkeys(cls, keys, default=None):
        self = cls()
        self.__class__ = cls._subcls_factory(*keys)
        self._values = [default] * len(self)
        return self

    def __len__(self):
        return len(self._keys)

    def __contains__(self, key):
        return key in self._key2index

    def has_key(self, key):
        return key in self._key2index

    def __getitem__(self, key):
        return self._values[self._key2index[key]]

    def get(self, key, default=None):
        try: return self[key]
        except KeyError: return default

    def __setitem__(self, key, value):
        self._values[self._key2index[key]] = value

    def update(self, other=None, **kwargs):
        # copied from UserDict.DictMixin
        # Make progressively weaker assumptions about "other"
        if other is None:
            pass
        elif hasattr(other, 'iteritems'):
            for k, v in other.iteritems():
                self[k] = v
        elif hasattr(other, 'keys'):
            for k in other.keys():
                self[k] = other[k]
        else:
            for k, v in other:
                self[k] = v
        for k, v in kwargs.iteritems():
            self[k] = v

    def __iter__(self):
        return iter(self._keys)

    def iterkeys(self):
        return iter(self._keys)

    def itervalues(self):
        return iter(self._values)

    def iteritems(self):
        return izip(self._keys, self._values)

    def keys(self):
        return list(self._keys)

    def values(self):
        return list(self._values)

    def items(self):
        return zip(self._keys, self._values)

    def copy(self):
        return self.__class__(self.iteritems())

    def __cmp__(self, other):
        # copied from UserDict.DictMixin
        if other is None:
            return 1
        if isinstance(other, UserDict.DictMixin):
            other = dict(other.iteritems())
        return cmp(dict(self.iteritems()), other)

    def __repr__(self):
        return '{%s}' % ', '.join('%r: %r' % item for item in self.iteritems())

    @classmethod
    def _subcls_factory(cls, *keys):
        # if cls is hidden, find its first non-hidden ancestor
        cls = (c for c in cls.mro() if not issubclass(c, _Hidden)).next()
        # each (non-hidden) class maintains its own registry
        try: registry = cls.__dict__['_Registry']
        except KeyError:
            registry = cls._Registry = {}
        try: return registry[keys]
        except KeyError:
            cls_name = '%s_%d' % (cls.__name__, abs(hash(keys)))
            cls_dict = {
                '__slots__'  : (),
                '_keys'      : keys,
                '_key2index' : dict((k, i) for i, k in enumerate(keys))
            }
            registry[keys] = sub = type(cls_name, (cls, _Hidden), cls_dict)
            return sub


class _Hidden(object):
    __slots__ = ()
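For illustration, here is a minimal usage sketch with made-up keys and values, assuming the class above is available in the current namespace:
<pre>
row = fkdict([('id', 1), ('name', 'alice'), ('email', 'alice@example.com')])
print row['name']                # 'alice'
print row.keys()                 # ['id', 'name', 'email'] (key order is preserved)
row['name'] = 'bob'              # reassigning an existing key is allowed
print 'id' in row, 'age' in row  # True False
</pre>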
Builtin dicts are usually the most efficient way to hold and manipulate a mapping. A special case where generic dicts are less than ideal in terms of memory consumption is when you have many mappings with the same keys. Use cases include rows read from a csv.DictReader or constructed out of rows fetched from a database. Data of this kind are often represented as tuples (or lists), but it is usually more convenient to be able to index an item by name rather than by position. On the other hand, it is much more expensive to hold tens or hundreds of thousands of rows as dicts than as tuples.
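As a sketch of the csv.DictReader scenario (the file name 'people.csv' is hypothetical), each row produced by the reader can be converted to an fkdict so that all rows share a single key set:
<pre>
import csv

# 'people.csv' is a hypothetical input file; all rows share the header's keys
reader = csv.DictReader(open('people.csv'))
rows = [fkdict(row) for row in reader]  # one cached key set, one values list per row
</pre>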
fkdict combines the convenience of dicts with the lower overhead of tuples/lists. In the snippet below, test_mem(dict) takes up around 246 MB while test_mem(fkdict) takes only 49 MB:
<pre>
def test_mem(maptype):
    d = [(i, str(i)) for i in range(1000)]
    ds = [maptype(d) for i in xrange(10000)]
    raw_input('finished')
</pre>
In this implementation, each fkdict instance stores only its values, as a list. The keys and the mapping of keys to indices are stored in a dynamically generated subclass of fkdict, so that self._keys and self._key2index are also accessible from the instance. The dynamically generated subclasses are cached, so the second time an fkdict with the same keys is created, the cached "hidden" class is reused. Since the keys are determined in fkdict.__init__(), this scheme requires changing self.__class__ to the dynamically generated subclass. Inheritance is also supported, as each (non-hidden) class has its own separate cache.
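The caching can be observed directly: two instances built with the same keys end up with the same hidden subclass. A small check, assuming the code above:
<pre>
a = fkdict.fromkeys('abcd')
b = fkdict(zip('abcd', range(4)))
print a.__class__ is b.__class__   # True: same cached hidden subclass
print a.__class__.__name__         # e.g. 'fkdict_...', derived from the hash of the keys
print a.__class__._keys            # ('a', 'b', 'c', 'd')
</pre>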
fkdict supports all dict operations except for those that change the size of the container. Therefore, __delitem__(), pop(), popitem(), clear() and setdefault() are not provided; also, __setitem__() raises KeyError if the key does not already exist. This restriction could be lifted by having the instance change its class to the appropriate hidden cached subclass every time its size changes. However, treating fkdict as a generic dict and allowing arbitrary additions and removals could cause a serious performance overhead, defeating the very reason for its usefulness.
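Concretely, the fixed-key restriction looks like this (a small sketch, assuming the code above):
<pre>
d = fkdict(a=1, b=2)
d['a'] = 10              # fine: 'a' already exists
try:
    d['c'] = 3           # 'c' is not one of the fixed keys
except KeyError:
    print 'cannot add new keys'
print hasattr(d, 'pop'), hasattr(d, 'popitem')  # False False: size-changing methods are absent
</pre>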