Welcome, guest | Sign In | My Account | Store | Cart

A class for persistent queues.

Python, 198 lines
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import glob
import marshal
import os
import sys
import thread
import threading

# Name of the index file stored inside each queue directory.  The same
# directory holds the segment files, which are named by plain integers
# (str(segment_number)), so this name must not contain numbers.
INDEX_FILENAME = 'index'

class Empty(Exception):
    """Signals that get() was called on a queue holding no items."""

class PersistentQueue:
    """
    A FIFO queue whose items are stored in a directory of segment files.

    The directory contains an index file holding two integers, the head
    (dequeue) and tail (enqueue) segment numbers, and one file per segment
    named by its number.  Only the head and tail segments are kept in
    memory, as 'get_cache' and 'put_cache'.  All public methods are
    serialized with a lock; nothing coordinates separate processes that
    open the same directory.

    NOTE: segment files are read back with the configured serializer's
    load().  If pickle is used, only open queue directories you trust.
    """

    def __init__(self, name, cache_size=512, marshal=marshal):
        """
        Create a persistent FIFO queue named by the 'name' argument.

        The number of cached queue items at the head and tail of the queue
        is determined by the optional 'cache_size' parameter.  By default
        the marshal module is used to (de)serialize queue items, but you
        may specify an alternative serialize module/instance with the
        optional 'marshal' argument (e.g. pickle).

        Raises ValueError if 'cache_size' is not larger than 0.
        """
        # An explicit check rather than an assert: put() relies on a
        # positive cache size to terminate, and asserts vanish under -O.
        if cache_size <= 0:
            raise ValueError('Cache size must be larger than 0')
        self.name = name
        self.cache_size = cache_size
        self.marshal = marshal
        self.index_file = os.path.join(name, INDEX_FILENAME)
        self.temp_file = os.path.join(name, 'tempfile')
        self.mutex = threading.Lock()
        self._init_index()

    def _segment_path(self, num):
        """Return the path of the segment file numbered 'num'."""
        return os.path.join(self.name, str(num))

    def _load_segment(self, num):
        """Return the items stored in segment 'num', or [] if it is
        missing or empty."""
        path = self._segment_path(num)
        if not os.path.exists(path):
            return []
        with open(path, 'rb') as segment:
            try:
                return self.marshal.load(segment)
            except EOFError:
                # A zero-length segment file holds no items.
                return []

    def _replace_with_temp(self, path):
        """Move the freshly written temp file over 'path'."""
        try:
            # Where rename replaces an existing file this is a single
            # step, so 'path' never disappears between old and new content.
            os.rename(self.temp_file, path)
        except OSError:
            # Windows refuses to rename onto an existing file; only there
            # fall back to the remove-then-rename sequence.
            if os.name != 'nt' or not os.path.exists(path):
                raise
            os.remove(path)
            os.rename(self.temp_file, path)

    def _dump_segment(self, num, items):
        """Serialize 'items' into segment 'num' via the temp file."""
        with open(self.temp_file, 'wb') as temp_file:
            self.marshal.dump(items, temp_file)
        self._replace_with_temp(self._segment_path(num))

    def _init_index(self):
        """Create the queue directory if needed and load the index and
        both caches from disk."""
        if not os.path.exists(self.name):
            os.mkdir(self.name)
        if os.path.exists(self.index_file):
            with open(self.index_file) as index_file:
                head, tail = index_file.read().split()
            self.head, self.tail = int(head), int(tail)
        else:
            self.head, self.tail = 0, 1
        assert self.head < self.tail, 'Head not less than tail'
        self.put_cache = self._load_segment(self.tail)
        self.get_cache = self._load_segment(self.head)

    def _sync_index(self):
        """Write the current head and tail segment numbers to the index."""
        assert self.head < self.tail, 'Head not less than tail'
        with open(self.temp_file, 'w') as temp_file:
            temp_file.write('%d %d' % (self.head, self.tail))
        self._replace_with_temp(self.index_file)

    def _split(self):
        """Persist one full segment taken from the front of the put cache
        and advance the tail."""
        # Write exactly cache_size items and keep the remainder in memory.
        # Keeping the written items instead would hand them out twice, and
        # __len__ counts every closed segment as cache_size items.
        self._dump_segment(self.tail, self.put_cache[:self.cache_size])
        self.put_cache = self.put_cache[self.cache_size:]
        self.tail += 1
        self._sync_index()

    def _join(self):
        """Refill the get cache from the next segment, or from the put
        cache when no closed segment remains."""
        current = self.head + 1
        if current == self.tail:
            # Nothing on disk between head and tail: hand over the put
            # cache.  head and tail are unchanged, so the index on disk
            # is still accurate and is not rewritten.
            self.get_cache = self.put_cache
            self.put_cache = []
            return
        # A closed segment must exist; let a missing file raise instead of
        # silently skipping its items.
        with open(self._segment_path(current), 'rb') as segment:
            self.get_cache = self.marshal.load(segment)
        try:
            os.remove(self._segment_path(self.head))
        except OSError:
            # Best effort: the old head segment may never have been written.
            pass
        self.head = current
        self._sync_index()

    def _sync(self):
        """Write the index and both caches to disk (caller holds the lock)."""
        self._sync_index()
        self._dump_segment(self.head, self.get_cache)
        self._dump_segment(self.tail, self.put_cache)

    def __len__(self):
        """
        Return number of items in queue.
        """
        with self.mutex:
            # Segments strictly between head and tail hold cache_size items.
            closed_segments = self.tail - self.head - 1
            return (closed_segments * self.cache_size
                    + len(self.put_cache) + len(self.get_cache))

    def sync(self):
        """
        Synchronize memory caches to disk.
        """
        with self.mutex:
            self._sync()

    def put(self, obj):
        """
        Put the item 'obj' on the queue.
        """
        with self.mutex:
            self.put_cache.append(obj)
            # Normally at most one split is needed; several are needed when
            # a put cache persisted with a larger cache_size was reloaded.
            while len(self.put_cache) >= self.cache_size:
                self._split()

    def get(self):
        """
        Get an item from the queue.
        Throws Empty exception if the queue is empty.
        """
        with self.mutex:
            if not self.get_cache:
                self._join()
            if not self.get_cache:
                raise Empty
            return self.get_cache.pop(0)

    def close(self):
        """
        Close the queue.  Implicitly synchronizes memory caches to disk.
        No further accesses should be made through this queue instance.
        """
        with self.mutex:
            self._sync()
            try:
                os.remove(self.temp_file)
            except OSError:
                # Usually already gone: every write renames it away.
                pass

## Tests
if __name__ == "__main__":
    # Smoke test: fill a queue in the 'test' directory, then drain it in
    # two halves, reporting the length after each step.  The reported
    # lengths assume the 'test' directory starts out empty or absent.
    # print is called with a single pre-formatted string and the halves
    # use floor division, so the script behaves the same on Python 2
    # and Python 3.
    ELEMENTS = 1000
    HALF = ELEMENTS // 2
    p = PersistentQueue('test', 10)
    print('Enqueueing %d items, cache size = %d' % (ELEMENTS,
                                                    p.cache_size))
    for a in range(ELEMENTS):
        p.put(str(a))
    p.sync()
    print('Queue length (using __len__): %d' % len(p))
    print('Dequeueing %d items' % HALF)
    for a in range(HALF):
        p.get()
    print('Queue length (using __len__): %d' % len(p))
    print('Dequeueing %d items' % HALF)
    for a in range(HALF):
        p.get()
    print('Queue length (using __len__): %d' % len(p))
    p.sync()
    p.close()

A persistent queue is useful when the size of the queue or the items prohibits the entire queue from residing in memory.

Persistent queues operate with a directory structured as follows:

/ index 0 1 . . N

0 .. N are files that contain items; the number of items per file is configured in the queue constructor with the cache_size parameter.

The 'index' file contains pointers to the current item file for dequeue (or head of the queue) and the item file for enqueue (or tail of the queue).

Whenever the queue cache for dequeue is empty, the file containing the head of the queue, as identified by the head pointer, is loaded into the cache. Whenever the queue cache for enqueue is full, the queue cache for enqueue is stored in the file pointed to by the tail pointer. The enqueue and dequeue caches can be explicitly synchronized onto stable storage by calling the sync() method.

By default, items enqueued on the queue must be serializable with the marshal module in the standard python distribution. However, you may pass an alternative module to the 'marshal' argument in the queue constructor, e.g. pickle.

The PersistentQueue does not use fsync() to flush the disk cache onto disk, but this would be a straightforward modification through subclassing.

Multiple threads can safely use the queue in the same process, but there is no external synchronization between processes. Thus, if two processes open the same queue, the queue will ultimately be corrupted if/when a process causes the queue data to be written to disk without the other process knowing.

5 comments

D Torpey 15 years, 4 months ago  # | flag

shelve approach. Why not store the queue as consecutively numbered entries in a shelve, deleting low numbered entries as you extract them from the queue. Easy-peasy.

Kjetil Jacobsen (author) 15 years, 3 months ago  # | flag

shelve. Funny you should mention shelve, as one of the main motivations for making this class was trouble with an approach which used shelve :)

During periods of heavy load, a shutdown of the application which used the shelve approach caused the on-disk database to be corrupted, with the result of irrecoverable queue data. This was a couple of years ago, so the problems with bsddb (and gdbm for that matter) may have been fixed later.

Still, the PersistentQueue class does not depend on any third-party libraries for storage, which I believe to be an advantage. Also, if there is a crash where the enqueue and dequeue caches are not properly synced to disk, you may still be able to recover other queue segments and not lose all data.

Art Peel 15 years, 1 month ago  # | flag

Problem if cache size is larger than number of items added in a session. It's possible to lose data under certain circumstances if your cache size is larger than the number of items you add in a given session, as demonstrated in the code at the end of this post. Or am I using PersistentQueue incorrectly?

Here is the output from two runs of the test program:

material:~/dev art$ python queue-tester.py
At startup: Queue length: 0
Enqueueing 5 items, cache size = 15
At close: queue length: 5
At startup: Queue length: 5
Enqueueing 5 items, cache size = 15
At close: queue length: 10
At startup: Queue length: 5

                        *** EXPECTED 10 but got 5
Enqueueing 5 items, cache size = 15
At close: queue length: 10
material:~/dev art$ python queue-tester.py
At startup: Queue length: 0

                        *** EXPECTED 5 but got 0
Enqueueing 5 items, cache size = 15
At close: queue length: 5
At startup: Queue length: 5
Enqueueing 5 items, cache size = 15
At close: queue length: 10
At startup: Queue length: 5

                        *** EXPECTED 10 but got 5
Enqueueing 5 items, cache size = 15
At close: queue length: 10
material:~/dev art$



from PersistentQueue import PersistentQueue

def testLargeCacheImpl(expected_size_at_start, name, ELEMENTS, CACHE_SIZE):
    """
    Run one open/enqueue/close session against the queue 'name'.

    Opens the queue with cache size CACHE_SIZE, reports its length and
    flags a mismatch with 'expected_size_at_start', enqueues ELEMENTS
    string items, syncs, and closes the queue.

    Returns the queue length measured just before closing.
    """
    p = PersistentQueue(name, CACHE_SIZE)
    # Read the length once so the reported and compared values agree.
    size_at_start = len(p)
    # Single pre-formatted arguments keep the output identical under the
    # Python 2 print statement and the Python 3 print function.
    print('At startup: Queue length: %d' % size_at_start)
    if size_at_start != expected_size_at_start:
        print('\n\t\t\t*** EXPECTED %d but got %d'
              % (expected_size_at_start, size_at_start))
    print('Enqueueing %d items, cache size = %d' % (ELEMENTS,
                                                    p.cache_size))
    for a in range(ELEMENTS):
        p.put(str(a))
    p.sync()
    size_at_close = len(p)
    print('At close: queue length: %d' % size_at_close)
    p.close()
    return size_at_close

def testLargeCache():
    """
    Reproduce the large-cache scenario: three sessions that each enqueue
    fewer items than the cache size into the 'cache_size_issue' queue.

    The queue is first opened once just to learn its current length; each
    session's closing length is then the expectation for the next one.
    """
    queue_dir = 'cache_size_issue'
    items_per_session = 5
    cache_size = 3 * items_per_session

    probe = PersistentQueue(queue_dir, cache_size)
    carried_over = len(probe)
    probe.sync()
    probe.close()

    for _session in range(3):
        carried_over = testLargeCacheImpl(carried_over, queue_dir,
                                          items_per_session, cache_size)

# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    testLargeCache()

New version. There was a small bug in the index initialization which caused the problem revealed by your test program. I've updated the code with a fix.

Thanks for reporting!

ps 12 years, 6 months ago  # | flag

Kjetil, thank you for creating this example. It appears to be in the public domain. Can you confirm that interpretation?

Thanks!