Recipe 577684 revision 7 « ActiveState Code

from random import Random

class BloomFilter:
    # http://en.wikipedia.org/wiki/Bloom_filter

    def __init__(self, num_bytes, num_probes):
        self.array = bytearray([0]) * num_bytes
        self.num_probes = num_probes

    def get_probes(self, key):
        random = Random(key).random
        num_bits = len(self.array) * 8
        for _ in range(self.num_probes):
            yield int(random() * num_bits)

    def add(self, key):
        for i in self.get_probes(key):
            self.array[i>>3] |= 1 << (i&7)

    def __contains__(self, key):
        return all(self.array[i>>3] & (1<<(i&7)) for i in self.get_probes(key))


class SpellChecker(BloomFilter):

    def __init__(self, wordlistfiles, estimated_number_of_words=125000):
        num_probes = 14           # set higher for fewer false positives
        num_bytes = estimated_number_of_words * num_probes * 3 // 2 // 8
        BloomFilter.__init__(self, num_bytes, num_probes)
        for filename in wordlistfiles:
            for line in open(filename):
                self.add(line.rstrip())

    def spellcheck(self, text):
        print('Misspellings:')
        for word in text.lower().split():
            if word not in self:
                print(word)


if __name__ == '__main__':

    ## Compute effectiveness statistics for a 125 byte filter with 50 entries

    from random import sample
    from string import ascii_letters

    states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
        Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
        Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
        Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
        NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
        Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
        Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()

    bf = BloomFilter(num_bytes=125, num_probes=14)
    for state in states:
        bf.add(state)

    m = sum(state in bf for state in states)
    print('%d true positives and 0 false negatives out of %d trials' % (m, len(states)))

    trials = 100000
    m = sum(''.join(sample(ascii_letters, 8)) in bf for i in range(trials))
    print('%d true negatives and %d false positives out of %d trials'
          % (trials-m, m, trials))

    c = ''.join(bin(x).replace('Ob', '') for x in bf.array)
    print('Bit density:', c.count('1') / float(len(c)))


    ## Demonstrate a simple spell checker using a 125,000 word English wordlist

    from glob import glob

    # Use the GNU ispell wordlist found at http://bit.ly/english_dictionary
    checker = SpellChecker(glob('/Users/raymondhettinger/dictionary/english.?'))
    checker.spellcheck('''
        All the werldz a stage
        And all the mehn and wwomen merrely players
        They have their exits and their entrances
        And one man in his thaim pllays many parts
        His actts being sevven ages
    ''')

Diff to Previous Revision

--- revision 6 2011-05-08 23:41:47
+++ revision 7 2011-05-10 17:03:00
@@ -4,22 +4,21 @@
     # http://en.wikipedia.org/wiki/Bloom_filter
 
     def __init__(self, num_bytes, num_probes):
-        self.arr = bytearray([0]) * num_bytes
+        self.array = bytearray([0]) * num_bytes
         self.num_probes = num_probes
 
     def get_probes(self, key):
         random = Random(key).random
+        num_bits = len(self.array) * 8
         for _ in range(self.num_probes):
-            array_index = int(random() * len(self.arr))
-            bit_index = int(random() * 8)
-            yield array_index, 1 << bit_index
+            yield int(random() * num_bits)
 
     def add(self, key):
-        for i, mask in self.get_probes(key):
-            self.arr[i] |= mask
+        for i in self.get_probes(key):
+            self.array[i>>3] |= 1 << (i&7)
 
     def __contains__(self, key):
-        return all(self.arr[i] & mask for i, mask in self.get_probes(key))
+        return all(self.array[i>>3] & (1<<(i&7)) for i in self.get_probes(key))
 
 
 class SpellChecker(BloomFilter):
@@ -66,7 +65,7 @@
     print('%d true negatives and %d false positives out of %d trials'
           % (trials-m, m, trials))
 
-    c = ''.join(bin(x).replace('Ob', '') for x in bf.arr)
+    c = ''.join(bin(x).replace('Ob', '') for x in bf.array)
     print('Bit density:', c.count('1') / float(len(c)))

Recipe 577684 revision 7

Diff to Previous Revision

History

Accounts

Code Recipes

Feedback & Information

ActiveState