#!/usr/bin/env python # encoding: utf-8 """ Bloomfilter.py Created by You XU on 2007-06-08. Just type this file in according to http://lists.canonical.org/pipermail/kragen-hacks/2006-August/000431.html For the purpose of learning the Bloom filter and python, with some comments added. All rights are not reserved Thanks to Kragen Sitaker """ import sys import os import sha def nbits_required(n): """Bits required to represent any integer in [0, n)""" n -=1 rv =0 while n: n>>=1 rv+=1 return rv class Bloomfilter: """Bloom filter is a compact hash table for membership tests with false positive (e.g. error words in)""" # Since python don't provide bit-wise computation, I just use integer to store it. def __init__(self, size, nhashes, bucketbits=256): """size: number of bots. Should be power of 2. nhashes: nummber of separate hashes to use Making nhashes larger will make it slower. There are also tradeoffs between, size, performance, and false-positive rate. Actually we can find a Bloom filter calculator. Based on the false positive rate, the space requirements, the k, m and n to get the suitable configuration. """ self.bucketbits = bucketbits self.filter = [0L] * int((size + bucketbits-1)/bucketbits) self.size = size self.nhashes = nhashes self.hashbits = nbits_required(size) assert self.hashbits * nhashes <=160 # 160 is all we get with SHA1 # Here we can see the nice design. # sha provides 160 bits and we use nhashes segments as different hash functions def add(self, astr): """Add a string to the membership of the filter.""" for offset in self._hashes(astr): bucket, bit = divmod(offset, self.bucketbits) self.filter[bucket] |= (1L <>=self.hashbits return rv def test_bloom(): """Very basic sanity test for bloom filter implementation""" def ok(a, b): assert a==b, (a, b) ok(map(nbits_required, range(1, 10)), [0, 1, 2, 2, 3, 3, 3, 3, 4]) # the nbits works for small numbers. ok(nbits_required(131072), 17) # 2^17 ok(nbits_required(131073), 18) # 2^17 +1 b = Bloomfilter(1024, 5) assert 'asdf' not in b assert 'fdsa' not in b b.add('asdf') assert 'asdf' in b assert 'fdsa' not in b # false positives (depends on the hash function): x = Bloomfilter(8, 3) x.add('asdf') # about 5% chance of flase positives assert 'asdf' in x assert 'fdsa' not in x ok (filter(x.__contains__, ['foo%d' % ii for ii in range(25)]), ['foo22']) test_bloom() def misspelling(infile): """Demo: spell check""" import re, cPickle, sys try: bf = cPickle.load(file('./dict.pck', 'rb')) # cPickle stores the binary object. except IOError: # /usr/share/dict/words has 234937 words on the Mac and it is about 2.4M sys.stderr.write("reading dictionary...\n") words = file('/usr/share/dict/words') bf = Bloomfilter(4194304, 5) lineno=0 for line in words: bf.add(line[:-1].lower()) lineno +=1 if not lineno % 100: sys.stderr.write('%s%s\r' % (lineno, ' ' * 40)) sys.stderr.flush() sys.stderr.write("Done with reading dictionary;\n") try: cPickle.dump(bf, file('./dict.pck', 'wb'), 2) except: pass def candicate(word): """Words you might find in the dictionary in English""" yield word for suffix in ['s', 'ing', 'ed', 'es', 'er', 'ly']: if word.endswith(suffix): yield word[:-len(suffix)] for suffix, repl in [('ed', 'e'), ('er', 'e'), ('ing', 'e'), ('ies', 'y'), ('ide', 'y')]: if word.endswith(suffix): yield word[:-len(suffix)] + repl # Nice part in dealing with the suffix. for line in infile: for word in re.findall(r"['\w]+", line): # we drop the "'" because our dictionary has didnt but not didn't for chance in candicate(word.replace("'", '').lower()): if chance in bf: break else: sys.stdout.write(word + ' ') sys.stdout.flush() # to prevent repeating it.... bf.add(word) sys.stdout.write('\n') if __name__ == '__main__': import sys misspelling(file(sys.argv[1]))