#!/usr/bin/env python
# encoding: utf-8
"""
Bloomfilter.py

Created by You XU on 2007-06-08.
Just type this file in according to 
http://lists.canonical.org/pipermail/kragen-hacks/2006-August/000431.html
For the purpose of learning the Bloom filter and python, with some comments added.
All rights are not reserved
Thanks to Kragen Sitaker

"""

import sys
import os
import sha

def nbits_required(n):
	"""Bits required to represent any integer in [0, n)"""
	n -=1
	rv =0
	while n:
		n>>=1
		rv+=1
	return rv

class Bloomfilter:
	"""Bloom filter is a compact hash table for membership tests with false positive (e.g. error words in)"""
	# Since python don't provide bit-wise computation, I just use integer to store it.
	def __init__(self, size, nhashes, bucketbits=256):
		"""size: number of bots. Should be power of 2.
		nhashes: nummber of separate hashes to use
		
		Making nhashes larger will make it slower. There are also tradeoffs between, size, 
		performance, and false-positive rate. 
		Actually we can find a Bloom filter calculator. Based on the false positive rate, the 
		space requirements, the k, m and n to get the suitable configuration.
		"""
		
		self.bucketbits = bucketbits
		self.filter = [0L] * int((size + bucketbits-1)/bucketbits)
		self.size = size
		self.nhashes = nhashes
		self.hashbits = nbits_required(size)
		assert self.hashbits * nhashes <=160 # 160 is all we get with SHA1
		# Here we can see the nice design. 
		# sha provides 160 bits and we use nhashes segments as different hash functions

	def add(self, astr):
		"""Add a string to the membership of the filter."""
		for offset in self._hashes(astr):
			bucket, bit = divmod(offset, self.bucketbits)
			self.filter[bucket] |= (1L <<bit)
			# if not self.filter[bucket] & (1L << bit): return 0
			
	def __contains__(self, astr):
		"""Returns true of the string is in the filter or it feels like it(well, possible false positive)."""
		for offset in self._hashes(astr):
			bucket, bit = divmod(offset, self.bucketbits)
			if not self.filter[bucket] & (1L <<bit): return 0
		return 1
		
	def _hashes(self, astr):
		"""The hashes of a particular string."""
		digest = sha.sha(astr).digest()
		# how to convert a byte string to a long?
		hashlong = 0L
		for ch in digest: hashlong = (hashlong<<8) |ord(ch)
		rv = []
		mask = (1L <<self.hashbits) -1
		for ii in range(self.nhashes):
			# note that this will give substantically nonuniform results of self.size iis not a power of 2, 
			# Inorder to avoid wasting hash bits or doing long division.
			rv.append((hashlong & mask) % self.size)
			hashlong >>=self.hashbits
		return rv
	
def test_bloom():
	"""Very basic sanity test for bloom filter implementation"""
	def ok(a, b): assert a==b, (a, b)
	ok(map(nbits_required, range(1, 10)), [0, 1, 2, 2, 3, 3, 3, 3, 4])
	# the nbits works for small numbers.
	ok(nbits_required(131072), 17) # 2^17
	ok(nbits_required(131073), 18) # 2^17 +1 
	
	b = Bloomfilter(1024, 5)
	assert 'asdf' not in b
	assert 'fdsa' not in b
	b.add('asdf')
	assert 'asdf' in b
	assert 'fdsa' not in b
	
	# false positives (depends on the hash function):
	x = Bloomfilter(8, 3)
	x.add('asdf') # about 5% chance of flase positives
	assert 'asdf' in x
	assert 'fdsa' not in x
	ok (filter(x.__contains__, ['foo%d' % ii for ii in range(25)]), ['foo22'])

test_bloom()

def misspelling(infile):
	"""Demo: spell check"""
	import re, cPickle, sys
	try: bf = cPickle.load(file('./dict.pck', 'rb'))
	# cPickle stores the binary object. 
	except IOError:
		# /usr/share/dict/words has 234937 words on the Mac and it is about 2.4M
		sys.stderr.write("reading dictionary...\n")
		words = file('/usr/share/dict/words')
		bf = Bloomfilter(4194304, 5)
		lineno=0
		for line in words:
			bf.add(line[:-1].lower())
			lineno +=1
			if not lineno % 100:
				sys.stderr.write('%s%s\r' % (lineno, ' ' * 40))
				sys.stderr.flush()
		sys.stderr.write("Done with reading dictionary;\n")
		try: cPickle.dump(bf, file('./dict.pck', 'wb'), 2)
		except: pass
		
	def candicate(word):
		"""Words you might find in the dictionary in English"""
		yield word
		for suffix in ['s', 'ing', 'ed', 'es', 'er', 'ly']:
			if word.endswith(suffix): yield word[:-len(suffix)]
		for suffix, repl in [('ed', 'e'), ('er', 'e'), ('ing', 'e'), ('ies', 'y'), ('ide', 'y')]:
			if word.endswith(suffix): yield word[:-len(suffix)] + repl
		# Nice part in dealing with the suffix. 
		
	for line in infile:
		for word in re.findall(r"['\w]+", line):
			# we drop the "'" because our dictionary has didnt but not didn't
			for chance in candicate(word.replace("'", '').lower()):
				if chance in bf: break
			else:
				sys.stdout.write(word + ' ')
				sys.stdout.flush()
				# to prevent repeating it....
				bf.add(word)
		sys.stdout.write('\n')
	
if __name__ == '__main__':
	import sys
	misspelling(file(sys.argv[1]))


