
Shelved dictionaries are quick and easy until they grow too large and access slows to a crawl. This recipe is a directory-based cache: each entry's filename is the MD5 hash of its key, and the file's contents are the pickled value.
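To illustrate the naming scheme, here is a minimal sketch (the 'test' directory name matches the demo in the listing below; hashlib.md5 is assumed as the hashing call):

import os, hashlib

key = 'cnn.com'
fn = os.path.join('test', hashlib.md5(key).hexdigest())
print fn  # something like test/<32-hex-digit digest>; the pickled value is stored in that file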

Python, 48 lines
#!/usr/bin/env python
# by Brian O. Bush, Thu 24-Jan-2008 06:53 bushbo
from __future__ import with_statement

import os, pickle, hashlib, threading  # hashlib replaces the deprecated md5 module; sys was unused

# This file cache is thread-safe
class FileCache:
    def __init__(self, path):
        self.path = path
        if not os.path.exists(self.path):
            os.makedirs(self.path)  # create the cache directory if it does not exist
        self.gen_key = lambda x: hashlib.md5(x).hexdigest()
        self.lock = threading.Lock()
    def get(self, key, default=None):
        with self.lock:
            retval = default
            fn = os.path.join(self.path, self.gen_key(key))
            try:
                f = open(fn, 'rb')  # binary mode, matching the binary pickle written below
                retval = pickle.load(f)
                f.close()
            except IOError: pass  # no cached entry for this key
            return retval
    def __getitem__(self, key):
        return self.get(key)
    def __setitem__(self, key, value):
        with self.lock:
            fn = os.path.join(self.path, self.gen_key(key))
            f = open(fn, 'wb')
            pickle.dump(value.__dict__, f)  # store the attribute dict so the object can be rebuilt, e.g. Site(**entry)
            f.close()

if __name__=='__main__':
    class Site:
        def __init__(self, name, hits=0):
            self.name = name
            self.hits = hits
        def __str__(self):
            return '%s, %d hits' % (self.name, self.hits)
    cache = FileCache('test')
    sites = [Site('cnn.com'), Site('kd7yhr.org', 1), Site('asdf.com', 3)]
    # We will use the site url as the key for our cache
    # Comment out the next two lines to test cache reading
    for site in sites:    
        cache[site.name] = site
    entry = cache.get('cnn.com')
    if entry: print Site(**entry)

The need for this came up when I was reading and storing thousands of RSS feeds for my own toy aggregator. No databases, please. Shelving didn't work: the shelf quickly grew to hundreds of megabytes and access slowed noticeably.

The locking is optional (my app had multiple threads reading and updating the cache). If you don't need locking, simply remove the "with self.lock" statements and drop the self.lock object from the cache, as in the sketch below.
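For illustration only, a minimal lock-free variant might look like this (same on-disk layout as the recipe; the class and helper names are made up for the sketch):

import os, pickle, hashlib

class SimpleFileCache:
    # Lock-free sketch of the recipe above; safe only for single-threaded use.
    def __init__(self, path):
        self.path = path
        if not os.path.exists(self.path):
            os.makedirs(self.path)
    def _fn(self, key):
        # same scheme as the recipe: the MD5 digest of the key names the file
        return os.path.join(self.path, hashlib.md5(key).hexdigest())
    def get(self, key, default=None):
        try:
            f = open(self._fn(key), 'rb')
            try:
                return pickle.load(f)
            finally:
                f.close()
        except IOError:
            return default  # no cached entry for this key
    def __setitem__(self, key, value):
        f = open(self._fn(key), 'wb')
        pickle.dump(value.__dict__, f)  # store the attribute dict, as in the recipe
        f.close()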