Implementation of an abstract, thread-safe cache with minimal locking. Four concrete implementations : a validating file cache, a validating HTTP cache, an experimental Python module cache and a function cache. Plus, an abstract cache with weak references to its values.
# -*- coding: iso-8859-1 -*-
from os import stat
from time import time, mktime
from rfc822 import parsedate
from calendar import timegm
import urllib2
import re
import weakref
import new
try:
from threading import Lock
except ImportError:
from dummy_threading import Lock
# Unique sentinel marking an entry that holds no usable value yet.
NOT_INITIALIZED = object()


class Entry(object):
    """ One slot of the cache: a key, its value and a per-entry lock.

    Mostly an internal object.
    """

    def __init__(self, key):
        object.__init__(self)
        # The lock serialises work done on this particular entry.
        self._lock = Lock()
        self._value = NOT_INITIALIZED
        self._key = key
class Cache(object):
    """ An abstract, multi-threaded cache object.

    Subclasses override check() and build() (and optionally key(),
    commit(), _pack() and _unpack()).  The dictionary and the access list
    are guarded by a cache-wide lock; building or replacing a value is
    guarded by the lock of the entry concerned.
    """

    def __init__(self, max_size=0):
        """ Builds a cache with a limit of max_size entries.
        If this limit is exceeded, the Least Recently Used entry is discarded.
        if max_size==0, the cache is unbounded (no LRU rule is applied).
        """
        object.__init__(self)
        self._maxsize = max_size
        self._dict = {}
        self._lock = Lock()
        if self._maxsize:
            # Header of the access list: a circular doubly linked list in
            # which _head._next is the LRU entry and _head._previous the MRU.
            self._head = Entry(None)
            self._head._previous = self._head
            self._head._next = self._head

    def __setitem__(self, name, value):
        """ Populates the cache with a given name and value. """
        key = self.key(name)
        entry = self._get_entry(key)
        entry._lock.acquire()
        try:
            self._pack(entry, value)
            self.commit()
        finally:
            entry._lock.release()

    def __getitem__(self, name):
        """ Gets a value from the cache, builds it if required. """
        return self._checkitem(name)[2]

    def __delitem__(self, name):
        """ Removes the entry with the given name.

        Raises KeyError if there is no such entry.  In a bounded cache the
        entry is also detached from the access list; otherwise a later
        eviction would try to delete its key a second time.
        """
        self._lock.acquire()
        try:
            key = self.key(name)
            entry = self._dict[key]
            del self._dict[key]
            if self._maxsize:
                self._unlink(entry)
        finally:
            self._lock.release()

    def _get_entry(self, key):
        """ Returns the entry stored under key, creating it if needed.

        In a bounded cache the entry becomes the most recently used one and
        the least recently used entry is evicted if the limit is exceeded.
        """
        self._lock.acquire()
        try:
            entry = self._dict.get(key)
            if entry is None:
                entry = Entry(key)
                self._dict[key] = entry
                if self._maxsize:
                    entry._next = entry._previous = None
                    self._access(entry)
                    self._checklru()
            elif self._maxsize:
                self._access(entry)
            return entry
        finally:
            self._lock.release()

    def _checkitem(self, name):
        """ Gets a value from the cache, builds it if required.
        Returns a tuple is_new, key, value, entry.
        If is_new is True, the result had to be rebuilt.
        """
        key = self.key(name)
        entry = self._get_entry(key)
        entry._lock.acquire()
        try:
            value = self._unpack(entry)
            opened = self.check(key, name, entry)
            # Rebuild when nothing is stored yet, or when check() handed
            # back an opened resource (meaning the stored value is stale).
            is_new = value is NOT_INITIALIZED or opened is not None
            if is_new:
                value = self.build(key, name, opened, entry)
                self._pack(entry, value)
                self.commit()
            return is_new, key, value, entry
        finally:
            entry._lock.release()

    def mru(self):
        """ Returns the Most Recently Used key (None if unbounded or empty) """
        if not self._maxsize:
            return None
        self._lock.acquire()
        try:
            return self._head._previous._key
        finally:
            self._lock.release()

    def lru(self):
        """ Returns the Least Recently Used key (None if unbounded or empty) """
        if not self._maxsize:
            return None
        self._lock.acquire()
        try:
            return self._head._next._key
        finally:
            self._lock.release()

    def key(self, name):
        """ Override this method to extract a key from the name passed to the [] operator """
        return name

    def commit(self):
        """ Override this method if you want to do something each time the underlying dictionary is modified (e.g. make it persistent). """
        pass

    def clear(self):
        """ Clears the cache """
        self._lock.acquire()
        try:
            self._dict.clear()
            if self._maxsize:
                self._head._next = self._head
                self._head._previous = self._head
        finally:
            self._lock.release()

    def check(self, key, name, entry):
        """ Override this method to check whether the entry with the given name is stale. Return None if it is fresh
        or an opened resource if it is stale. The object returned will be passed to the 'build' method as the 'opened' parameter.
        Use the 'entry' parameter to store meta-data if required. Don't worry about multiple threads accessing the same name,
        as this method is properly isolated.
        """
        return None

    def build(self, key, name, opened, entry):
        """ Build the cached value with the given name from the given opened resource. Use entry to obtain or store meta-data if needed.
        Don't worry about multiple threads accessing the same name, as this method is properly isolated.
        """
        raise NotImplementedError()

    def _unlink(self, entry):
        """ Internal use only, must be invoked within a cache lock.
        Detaches the entry from the access list; does nothing if the entry
        is not linked.
        """
        if entry._previous is not None:
            entry._previous._next = entry._next
            entry._next._previous = entry._previous
            entry._next = entry._previous = None

    def _access(self, entry):
        """ Internal use only, must be invoked within a cache lock. Updates the access list. """
        if entry._next is self._head:
            # Already the most recently used entry.
            return
        self._unlink(entry)
        # Insert the entry at the end of the access list.
        tail = self._head._previous
        entry._previous = tail
        tail._next = entry
        entry._next = self._head
        self._head._previous = entry

    def _checklru(self):
        """ Internal use only, must be invoked within a cache lock. Removes the LRU entry if needed. """
        if len(self._dict) > self._maxsize:
            lru = self._head._next
            self._unlink(lru)
            del self._dict[lru._key]

    def _pack(self, entry, value):
        """ Store the value in the entry. """
        entry._value = value

    def _unpack(self, entry):
        """ Recover the value from the entry, returns NOT_INITIALIZED if it is not OK. """
        return entry._value
class WeakCache(Cache):
    """ This cache holds weak references to the values it stores. Whenever a value is no longer
    normally referenced, it is removed from the cache. Useful for sharing the result of long
    computations but letting them go as soon as they are not needed by anybody.

    Values must support weak references; weakref.ref() raises TypeError
    for objects that do not.
    """

    def _pack(self, entry, value):
        """ Store a weak reference to the value in the entry. """
        # Capture the key only: closing over the entry itself would create
        # an entry -> weakref -> callback -> entry reference cycle.
        key = entry._key

        def _forget(ref):
            # The key may already have been deleted or evicted by the time
            # the value dies; that is not an error.
            try:
                self.__delitem__(key)
            except KeyError:
                pass

        entry._value = weakref.ref(value, _forget)

    def _unpack(self, entry):
        """ Return the live value, or NOT_INITIALIZED if it is unset or has been collected. """
        if entry._value is NOT_INITIALIZED:
            return NOT_INITIALIZED
        value = entry._value()
        if value is None:
            return NOT_INITIALIZED
        return value
class FileCache(Cache):
    """ A file cache. Returns the content of the files as a string, given their filename.
    Whenever the files are modified (according to their modification time) the cache is updated.
    Override the build method to obtain more interesting behaviour.
    """

    def __init__(self, max_size=0, mode='rb'):
        """ Builds a file cache of at most max_size entries (0 means unbounded);
        files are opened with the given mode.
        """
        Cache.__init__(self, max_size)
        self.mode = mode

    def check(self, key, name, entry):
        """ Return the opened file if it must be (re)read, None if the cached content is fresh.

        The content is fresh when a value is stored and the modification
        time recorded on the entry equals the file's current one.  Errors
        raised by stat() or open() propagate to the caller.
        """
        timestamp = stat(key).st_mtime
        # An entry filled through __setitem__ has no recorded timestamp;
        # treat it as stale instead of failing with AttributeError.
        recorded = getattr(entry, '_timestamp', None)
        if entry._value is not NOT_INITIALIZED and recorded == timestamp:
            return None
        opened = open(key, self.mode)
        # Record the timestamp only once the file is open, so a failed open
        # cannot leave the old content marked as up to date.
        entry._timestamp = timestamp
        return opened

    def build(self, key, name, opened, entry):
        """ Return the content of the file as a string. Override this for better behaviour. """
        try:
            return opened.read()
        finally:
            opened.close()
def parseRFC822Time(t):
    """ Convert an RFC 822 date string (as used in HTTP headers) to seconds since the epoch.

    The fields returned by parsedate() are interpreted as UTC with
    timegm(); mktime() would read them as local time.  The result is a
    float, as before.  Raises TypeError if the string cannot be parsed.
    """
    parsed = parsedate(t)
    if parsed is None:
        raise TypeError('unparsable RFC 822 date: %r' % (t,))
    return float(timegm(parsed))
# Extracts the number of seconds from a Cache-Control "max-age" directive.
# Raw string: \s and \d are regex escapes, not string escapes.
re_max_age = re.compile(r'max-age\s*=\s*(\d+)', re.I)
class HTTPEntity(object):
    """ An HTTP response body together with its header metadata. """

    def __init__(self, entity, metadata):
        self.metadata = metadata
        self.entity = entity

    def __str__(self):
        # The string form is the entity itself.
        return self.entity

    def __repr__(self):
        return 'HTTPEntity(%r, %s)' % (self.entity, self.metadata)
class HTTPCache(Cache):
""" An HTTP cache. Returns the entity found at the given URL.
Uses Expires, ETag and Last-Modified headers to minimize bandwidth usage.
Partial Cache-Control support (only max-age is supported).
"""
def check(self, key, name, entry):
request = urllib2.Request(key)
try:
if time()<entry._expires:
return None
except AttributeError:
pass
try:
header, value = entry._validator
request.headers[header]=value
except AttributeError:
pass
opened = None
try:
opened = urllib2.urlopen(request)
headers = opened.info()
# expiration handling
expiration = False
try:
match = re_max_age.match(headers['cache-control'])
if match:
entry._expires=time()+int(match.group(1))
expiration = True
except (KeyError, ValueError):
pass
if not expiration:
try:
date = parseRFC822Time(headers['date'])
expires = parseRFC822Time(headers['expires'])
entry._expires = time()+(expires-date)
expiration = True
except KeyError:
pass
# validator handling
validation = False
try:
entry._validator='If-None-Match', headers['etag']
validation = True
except KeyError:
pass
if not validation:
try:
entry._validator='If-Modified-Since', headers['last-modified']
except KeyError:
pass
return opened
except urllib2.HTTPError, error:
if opened: opened.close()
if error.code==304:
return None
else:
raise error
def build(self, key, name, opened, entry):
try:
return HTTPEntity(opened.read(), dict(opened.info()))
finally:
opened.close()
re_not_word = re.compile(r'\W+')
class ModuleCache(FileCache):
""" A module cache. Give it a file name, it returns a module
which results from the execution of the Python script it contains.
This module is not inserted into sys.modules.
"""
def __init__(self, max_size=0):
FileCache.__init__(self, max_size, 'r')
def build(self, key, name, opened, entry):
try:
module = new.module(re_not_word.sub('_',key))
module.__file__ = key
exec opened in module.__dict__
return module
finally:
opened.close()
class HttpModuleCache(HTTPCache):
""" A module cache. Give it an HTTP URL, it returns a module
which results from the execution of the Python script it contains.
This module is not inserted into sys.modules.
"""
def __init__(self, max_size=0):
HTTPCache.__init__(self, max_size)
def build(self, key, name, opened, entry):
try:
module = new.module(re_not_word.sub('_',key))
module.__file__ = key
text = opened.read().replace('\r\n', '\n')
code = compile(text, name, 'exec')
exec code in module.__dict__
return module
finally:
opened.close()
class FunctionCache(Cache):
    """ A function call cache: calling the instance returns the cached result
    of the wrapped function for the given arguments, computing it on the
    first call.  All argument values must be hashable.
    """

    def __init__(self, function, max_size=0):
        """ Wraps function in a cache of at most max_size results (0 means unbounded). """
        Cache.__init__(self, max_size)
        self.function = function

    def __call__(self, *args, **kw):
        """ Return the (possibly cached) result of function(*args, **kw). """
        if kw:
            # a dict is not hashable so we build a tuple of (key, value) pairs,
            # sorted so that the cache key does not depend on the order in
            # which the dict happens to iterate.  Keyword names are unique,
            # so the values themselves are never compared.
            items = list(kw.items())
            items.sort()
            return self[args, tuple(items)]
        return self[args, ()]

    def build(self, key, name, opened, entry):
        """ Call the wrapped function with the arguments encoded in the key. """
        args, kw = key
        return self.function(*args, **dict(kw))
|
Two years ago I was a definite Java fan (writing in this language since the 1.0 beta version of 1995). Now I'm in love with Python. The trouble is that I left behind quite a few useful classes I wrote (thread-safe caches, pools, etc.), so I had to reimplement them in Python.
A cache is a pretty simple dictionary-like object : you provide it an index or a name, it gives you back an object. For example, for an HTTP cache, the index is an URL, the object is the data you can fetch from the URL.
The trick is that the corresponding object can be quite expensive (in CPU, bandwidth, time or memory) to build, so you have to balance between building the object every time you need it, or pre-building all the objects you could require, knowing that the target object can change in time (think of how a URL can point to different data over time). A cache is precisely a way to find a balance between these two extremes.
This recipe provides you with an abstract Cache class, from which you can inherit, overriding the check() and build() methods, and four specialisations. FileCache and HTTPCache are just what their names suggest. ModuleCache is an experimental specialisation of FileCache, which can come in handy when playing with dynamic code, since it allows you to load any arbitrary file as a Python module, and dynamically reload it each time the file is modified. FunctionCache is the good old function call cache (already presented many times in this Cookbook), with thread-safety included thanks to the Cache base class.
Thread-safety of the cache is a must, since the purpose of a cache is to be shared and used by as much code as possible... A multi-threaded application such as a web application server has a strong need for thread-safe cache and pool structures.
For a sample usage of FileCache:
>>> fc = FileCache(10) # 10 files in memory at most
>>> f = open('test.txt','w')
>>> f.write('Hello, world !')
>>> f.close()
>>> fc['test.txt']
'Hello, world !'
>>> fc['test.txt'] # this time the file is checked but not read
'Hello, world !'
>>> f = open('test.txt','w')
>>> f.write('Hello, me !')
>>> f.close()
>>> fc['test.txt'] # this time the file is checked and re-read
'Hello, me !'
A sample usage of HTTPCache :
>>> hc = HTTPCache(1000) # maximum 1000 documents in the cache
>>> hc['http://www.google.com/']
HTTPEntity('<html>[snipped]</html>',...)
>>> hc['http://www.google.com'] # the problem is, Google doesn't want its homepage to be cached, so there is no gain
HTTPEntity('<html>[snipped]</html>',...)
>>> hc['http://www.google.com'].metadata # that's why : no Last-Modified, no ETag, no Expires headers.
{'content-length': '2360', 'set-cookie': 'PREF=[snipped]; expires=Sun, 17-Jan-2038 19:14:07 GMT; path=/; domain=.google.fr', 'server': 'GWS/2.1', 'connection': 'Keep-Alive', 'cache-control': 'private', 'date': 'Wed, 01 Sep 2004 21:21:50 GMT', 'content-type': 'text/html'}
>>> hc['http://diveintomark.org/xml/atom.xml']
HTTPEntity('<?xml version="1.0" encoding="utf-8"?>[snipped]',...)
>>> hc['http://diveintomark.org/xml/atom.xml'] # the second call is much faster, since Mark put some cache hint in order to save his bandwidth
HTTPEntity('<?xml version="1.0" encoding="utf-8"?>[snipped]',...)
>>> hc['http://diveintomark.org/xml/atom.xml'].metadata # here's why : nice Expires, Last-Modified and ETag headers
{'content-length': '9785', 'accept-ranges': 'bytes', 'expires': 'Thu, 02 Sep 2004 01:23:43 GMT', 'vary': '*', 'server':'Apache/1.3.31 (Debian GNU/Linux)', 'last-modified': 'Wed, 01 Sep 2004 03:26:16 GMT', 'connection': 'close', 'etag': '"e80a6-2639-41354158"', 'cache-control': 'max-age=14400', 'date': 'Wed, 01 Sep 2004 21:23:43 GMT', 'content-type': 'application/xml'}
Sample usage of FunctionCache:
>>> from time import sleep
>>> def my_long_function(value):
... sleep(5)
... return value+1
>>> my_long_function(2) # 5 seconds later...
3
>>> cached = FunctionCache(my_long_function,10) # keep the 10 last calls in memory
>>> cached(2) # 5 seconds later...
3
>>> cached(2) # immediate answer
3
Update. Changes :
added a WeakCache abstract class
all short member names such as entry._p, cache._d and so on have been renamed to longer, more explicit names. Hopefully the code is more readable now.
Cache.extract is renamed to Cache._unpack, and must return NOT_INITIALIZED if the entry is invalid. Cache._pack does the opposite of Cache._unpack. See WeakCache for an example use.
ModuleCache now returns Module objects, which are still placeholder classes but with a better repr().
Update. In __setitem__, an entry lock was forgotten when setting the value on an already existing entry.
Update. Updated the code to the latest version I'm using. The API for check() and build() has slightly changed : the "key" parameter has been added. The ModuleCache class now uses real module objects instead of fake ones.
Update. Compatibility improvements : this latest version is compatible with Python 2.2, and with Python versions which do not include thread support.
Performance improvements : FileCache stat() the file to test if it is modified, and open() it only if it is the case. Previously it opened the file each and every time, even if it wasn't modified.
Cleaning up the access list in __delitem__. I think you need to put in the following in the __delitem__ before the del self._dict[key] since otherwise you'll keep leaking your access list.
For those who would prefer a module: the DiskCache library provides a thread-safe cache object. DiskCache is an Apache2 licensed disk and file backed cache library, written in pure-Python, and compatible with Django. The cache benchmarks show lookups are faster than typical solutions like Memcached and Redis.