A class for persistent queues.
import os, sys, marshal, glob, thread
# Filename used for index files, must not contain numbers
INDEX_FILENAME = 'index'


class Empty(Exception):
    """Raised when get() is called on an empty queue."""
    pass


class PersistentQueue:
    """
    A FIFO queue whose contents live on disk, so the queue survives
    process restarts and may grow beyond available memory.

    On-disk layout (one directory per queue):
        <name>/index   -- text file "head tail": segment pointers
        <name>/<N>     -- numbered segment files, each a marshalled
                          list of up to cache_size items

    Only the head (dequeue) and tail (enqueue) segments are held in
    memory; sync() flushes both plus the index.  Safe for multiple
    threads within one process; NOT safe across processes.
    """

    def __init__(self, name, cache_size=512, marshal=marshal):
        """
        Create a persistent FIFO queue named by the 'name' argument.

        The number of cached queue items at the head and tail of the queue
        is determined by the optional 'cache_size' parameter.  By default
        the marshal module is used to (de)serialize queue items, but you
        may specify an alternative serialize module/instance with the
        optional 'marshal' argument (e.g. pickle).

        Raises ValueError if cache_size is not positive.
        """
        # Validate with a real exception, not assert (stripped under -O).
        if cache_size <= 0:
            raise ValueError('Cache size must be larger than 0')
        self.name = name
        self.cache_size = cache_size
        self.marshal = marshal
        self.index_file = os.path.join(name, INDEX_FILENAME)
        self.temp_file = os.path.join(name, 'tempfile')
        self.mutex = thread.allocate_lock()
        self._init_index()

    def _init_index(self):
        # Create the queue directory on first use, then restore the
        # head/tail pointers and the two cache segments from disk.
        if not os.path.exists(self.name):
            os.mkdir(self.name)
        if os.path.exists(self.index_file):
            index_file = open(self.index_file)
            try:
                self.head, self.tail = [int(x) for x in
                                        index_file.read().split(' ')]
            finally:
                index_file.close()
        else:
            # Fresh queue: head segment 0, tail segment 1.
            self.head, self.tail = 0, 1

        def _load_cache(cache, num):
            # Load segment file 'num' into attribute 'cache'; an absent
            # or empty segment yields an empty list.
            name = os.path.join(self.name, str(num))
            mode = 'rb+' if os.path.exists(name) else 'wb+'
            cachefile = open(name, mode)
            try:
                try:
                    setattr(self, cache, self.marshal.load(cachefile))
                except EOFError:
                    setattr(self, cache, [])
            finally:
                cachefile.close()

        _load_cache('put_cache', self.tail)
        _load_cache('get_cache', self.head)
        assert self.head < self.tail, 'Head not less than tail'

    def _sync_index(self):
        # Persist the head/tail pointers via write-temp-then-rename so a
        # crash can never leave a torn index file.
        assert self.head < self.tail, 'Head not less than tail'
        index_file = open(self.temp_file, 'w')
        try:
            index_file.write('%d %d' % (self.head, self.tail))
        finally:
            index_file.close()
        if os.path.exists(self.index_file):
            os.remove(self.index_file)
        os.rename(self.temp_file, self.index_file)

    def _dump_cache(self, cache, num):
        # Write an item list to segment file 'num' using the same
        # temp-file-plus-rename discipline as the index.  Shared by
        # _split() and _sync(), which previously duplicated this code.
        seg_file = os.path.join(self.name, str(num))
        temp_file = open(self.temp_file, 'wb')
        try:
            self.marshal.dump(cache, temp_file)
        finally:
            temp_file.close()
        if os.path.exists(seg_file):
            os.remove(seg_file)
        os.rename(self.temp_file, seg_file)

    def _split(self):
        # The enqueue cache is full: persist it as the tail segment and
        # start a fresh one.  The entire cache was just written to disk,
        # so it must be cleared completely -- retaining a slice of it
        # (as the old code did when len > cache_size) would duplicate
        # those items.  put() only calls us at exactly cache_size items,
        # where clearing matches the old behavior.
        self._dump_cache(self.put_cache, self.tail)
        self.tail += 1
        self.put_cache = []
        self._sync_index()

    def _join(self):
        # The dequeue cache is empty: advance the head pointer and load
        # the next segment, or steal the in-memory enqueue cache when
        # head and tail meet.
        current = self.head + 1
        if current == self.tail:
            self.get_cache = self.put_cache
            self.put_cache = []
        else:
            get_file = open(os.path.join(self.name, str(current)), 'rb')
            try:
                self.get_cache = self.marshal.load(get_file)
            finally:
                get_file.close()
        # Best-effort removal of the consumed head segment; it may never
        # have been written to disk, so only missing-file errors are
        # tolerated (the old bare except also hid KeyboardInterrupt).
        try:
            os.remove(os.path.join(self.name, str(self.head)))
        except OSError:
            pass
        self.head = current
        if self.head == self.tail:
            # Preserve the head < tail invariant.
            self.head = self.tail - 1
        self._sync_index()

    def _sync(self):
        # Flush the index and both cache segments to disk.
        self._sync_index()
        self._dump_cache(self.get_cache, self.head)
        self._dump_cache(self.put_cache, self.tail)

    def __len__(self):
        """
        Return number of items in queue.
        """
        with self.mutex:
            # Every segment strictly between head and tail is full.
            return (((self.tail - self.head) - 1) * self.cache_size) + \
                   len(self.put_cache) + len(self.get_cache)

    def sync(self):
        """
        Synchronize memory caches to disk.
        """
        with self.mutex:
            self._sync()

    def put(self, obj):
        """
        Put the item 'obj' on the queue.
        """
        with self.mutex:
            self.put_cache.append(obj)
            if len(self.put_cache) >= self.cache_size:
                self._split()

    def get(self):
        """
        Get an item from the queue.
        Throws Empty exception if the queue is empty.
        """
        with self.mutex:
            if len(self.get_cache) > 0:
                return self.get_cache.pop(0)
            self._join()
            if len(self.get_cache) > 0:
                return self.get_cache.pop(0)
            raise Empty

    def close(self):
        """
        Close the queue. Implicitly synchronizes memory caches to disk.
        No further accesses should be made through this queue instance.
        """
        with self.mutex:
            self._sync()
            # Best-effort cleanup of the scratch file.
            try:
                os.remove(self.temp_file)
            except OSError:
                pass
## Tests
if __name__ == "__main__":
    ELEMENTS = 1000
    p = PersistentQueue('test', 10)
    # Single-argument print with %-formatting produces identical output
    # under Python 2 (print statement) and Python 3 (print function);
    # the old multi-argument print statements were Python-2-only.
    print('Enqueueing %d items, cache size = %d' % (ELEMENTS,
                                                    p.cache_size))
    for a in range(ELEMENTS):
        p.put(str(a))
    p.sync()
    print('Queue length (using __len__): %d' % len(p))
    # // makes the integer division explicit (same value on Python 2).
    print('Dequeueing %d items' % (ELEMENTS // 2))
    for a in range(ELEMENTS // 2):
        p.get()
    print('Queue length (using __len__): %d' % len(p))
    print('Dequeueing %d items' % (ELEMENTS // 2))
    for a in range(ELEMENTS // 2):
        p.get()
    print('Queue length (using __len__): %d' % len(p))
    p.sync()
    p.close()
A persistent queue is useful when the size of the queue or the items prohibits the entire queue from residing in memory.
Persistent queues operate with a directory structured as follows:
<queue directory>/ containing the files: index, 0, 1, ..., N
0 .. N are files that contain items, the number of items per file is configured in the queue constructor with the cache_size parameter.
The 'index' file contains pointers to the current item file for dequeue (or head of the queue) and the item file for enqueue (or tail of the queue).
Whenever the queue cache for dequeue is empty, the file containing the head of the queue, pointed to by the head pointer, is loaded into the cache. Whenever the queue cache for enqueue is full, the queue cache for enqueue is stored in the file pointed to by the tail pointer. The enqueue and dequeue caches can be explicitly synchronized onto stable storage by calling the sync() method.
By default, items enqueued on the queue must be serializable with the marshal module in the standard python distribution. However, you may pass an alternative module to the 'marshal' argument in the queue constructor, e.g. pickle.
The PersistentQueue does not use fsync() to flush the disk cache onto disk, but this would be a straightforward modification through subclassing.
Multiple threads can safely use the queue in the same process, but there is no external synchronization between processes. Thus, if two processes open the same queue, the queue will ultimately be corrupted if/when a process causes the queue data to be written to disk without the other process knowing.
shelve approach. Why not store the queue as consecutively numbered entries in a shelve, deleting low numbered entries as you extract them from the queue. Easy-peasy.
shelve. Funny you should mention shelve, as one of the main motivations for making this class was trouble with an approach which used shelve :)
During periods of heavy load, a shutdown of the application which used the shelve approach caused the on-disk database to be corrupted, with the result of irrecoverable queue data. This was a couple of years ago, so the problems with bsddb (and gdbm for that matter) may have been fixed later.
Still, the PersistentQueue class does not depend on any 3rdparty libraries for storage, which I believe to be an advantage. Also, if there is a crash where the enqueue and dequeue caches are not properly synched to disk, you may still be able to recover other queue segments and not lose all data.
Problem if cache size is larger than number of items added in a session. It's possible to lose data under certain circumstances if your cache size is larger than the number of items you add in a given session, as demonstrated in the code at the end of this post. Or am I using PersistentQueue incorrectly?
Here is the output from two runs of the test program:
New version. There was a small bug in the index initialization which caused the problem revealed by your test program. I've updated the code with a fix.
Thanks for reporting!
Kjetil, thank you for creating this example. It appears to be in the public domain. Can you confirm that interpretation?
Thanks!