from random import Random
class BloomFilter:
# http://en.wikipedia.org/wiki/Bloom_filter
def __init__(self, num_bytes, num_probes, iterable=()):
self.array = bytearray([0]) * num_bytes
self.num_probes = num_probes
self.update(iterable)
def get_probes(self, key):
random = Random(key).random
num_bits = len(self.array) * 8
return [int(random() * num_bits) for _ in range(self.num_probes)]
def update(self, keys):
for key in keys:
for i in self.get_probes(key):
self.array[i>>3] |= 1 << (i&7)
def __contains__(self, key):
return all(self.array[i>>3] & (1<<(i&7)) for i in self.get_probes(key))
class SpellChecker(BloomFilter):
def __init__(self, wordlistfiles, estimated_number_of_words=125000):
num_probes = 14 # set higher for fewer false positives
num_bytes = estimated_number_of_words * num_probes * 3 // 2 // 8
wordlist = (w.strip() for f in wordlistfiles for w in open(f))
BloomFilter.__init__(self, num_bytes, num_probes, wordlist)
def spellcheck(self, text):
print('Misspellings:')
for word in text.lower().split():
if word not in self:
print(word)
if __name__ == '__main__':
## Compute effectiveness statistics for a 125 byte filter with 50 entries
from random import sample
from string import ascii_letters
states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
bf = BloomFilter(num_bytes=125, num_probes=14, iterable=states)
m = sum(state in bf for state in states)
print('%d true positives and 0 false negatives out of %d positive trials' % (m, len(states)))
trials = 100000
m = sum(''.join(sample(ascii_letters, 8)) in bf for i in range(trials))
print('%d true negatives and %d false positives out of %d negative trials'
% (trials-m, m, trials))
c = ''.join(format(x, '08b') for x in bf.array)
print('Bit density:', c.count('1') / float(len(c)))
## Demonstrate a simple spell checker using a 125,000 word English wordlist
from glob import glob
# Use the GNU ispell wordlist found at http://bit.ly/english_dictionary
checker = SpellChecker(glob('/Users/raymondhettinger/dictionary/english.?'))
checker.spellcheck('''
All the werldz a stage
And all the mehn and wwomen merrely players
They have their exits and their entrances
And one man in his thaim pllays many parts
His actts being sevven ages
''')
Diff to Previous Revision
--- revision 9 2011-05-10 19:53:28
+++ revision 10 2011-05-11 01:32:54
@@ -27,9 +27,8 @@
def __init__(self, wordlistfiles, estimated_number_of_words=125000):
num_probes = 14 # set higher for fewer false positives
num_bytes = estimated_number_of_words * num_probes * 3 // 2 // 8
- BloomFilter.__init__(self, num_bytes, num_probes)
- for filename in wordlistfiles:
- self.update(line.rstrip() for line in open(filename))
+ wordlist = (w.strip() for f in wordlistfiles for w in open(f))
+ BloomFilter.__init__(self, num_bytes, num_probes, wordlist)
def spellcheck(self, text):
print('Misspellings:')
@@ -56,14 +55,14 @@
bf = BloomFilter(num_bytes=125, num_probes=14, iterable=states)
m = sum(state in bf for state in states)
- print('%d true positives and 0 false negatives out of %d trials' % (m, len(states)))
+ print('%d true positives and 0 false negatives out of %d positive trials' % (m, len(states)))
trials = 100000
m = sum(''.join(sample(ascii_letters, 8)) in bf for i in range(trials))
- print('%d true negatives and %d false positives out of %d trials'
+ print('%d true negatives and %d false positives out of %d negative trials'
% (trials-m, m, trials))
- c = ''.join(bin(x).replace('Ob', '') for x in bf.array)
+ c = ''.join(format(x, '08b') for x in bf.array)
print('Bit density:', c.count('1') / float(len(c)))