An implementation of the noisy channel model
A simple implementation of the noisy channel model to correct non-word and real-word spelling errors in a sentence
Method
I learned this approach from Chapter 5 of the book Speech and Language Processing by Daniel Jurafsky & James H. Martin.
The algorithm takes an input sentence X = {x1, x2, …, xn}, which may contain spelling errors, generates a large set of candidate correction sentences C(X), and finally picks the candidate with the highest combined channel-model and language-model probability.
To generate the candidate correction sentences, I start by generating a set of candidate words for each input word xi, using the Python package PyEnchant to find the candidates. Each proposed sentence W is then scored by the noisy channel model: Ŵ = argmax_W P(X|W) · P(W), where X is the observed sentence and W ranges over the candidate sentences. For the language model P(W), I use a bigram stupid backoff with Laplace smoothing. For the channel model P(X|W), a candidate word that matches the observed word keeps probability 0.95, and the remaining 0.05 is distributed evenly over the other candidate corrections of that word; for example, if a word has 5 candidate corrections, the identity candidate gets 0.95 and each of the other 4 gets 0.05/4 = 0.0125.
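As a quick check of the candidate generator, PyEnchant's Dict.check and Dict.suggest can be called directly; the exact suggestion list depends on the en_US dictionary installed on your system:

import enchant

d = enchant.Dict("en_US")
d.check("spallin")    # False: not a dictionary word
d.suggest("spallin")  # ranked correction candidates from the dictionary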
The training data I used is from Peter Norvig; the file is available at http://norvig.com/big.txt
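If big.txt is not already available locally, one way to fetch it is with the standard library; the target path ../data/big.txt matches the usage example below:

import urllib.request

# Download Peter Norvig's training corpus once and reuse it locally.
urllib.request.urlretrieve('http://norvig.com/big.txt', '../data/big.txt')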
Python Code
import collections
import itertools
import math
import re

import enchant
import nltk.data
class Sentence_Corrector:
    def __init__(self, training_file):
        self.laplaceUnigramCounts = collections.defaultdict(int)
        self.laplaceBigramCounts = collections.defaultdict(int)
        self.total = 0
        self.sentences = []
        # Optional set of domain keywords; candidate_word prefers completions from this set.
        self.importantKeywords = set()
        self.d = enchant.Dict("en_US")
        self.tokenize_file(training_file)
        self.train()
    def tokenize_file(self, file):
        """Read the file, split it into sentences and tokenize each one."""
        # Requires the NLTK 'punkt' model (nltk.download('punkt')).
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        with open(file) as f:
            content = f.read()
        for sentence in tokenizer.tokenize(content):
            sentence_clean = [i.lower() for i in re.split('[^a-zA-Z]+', sentence) if i]
            self.sentences.append(sentence_clean)
    def train(self):
        """Collect unigram and bigram counts over the training sentences."""
        for sentence in self.sentences:
            sentence.insert(0, '<s>')
            sentence.append('</s>')
            for i in range(len(sentence) - 1):
                token1 = sentence[i]
                token2 = sentence[i + 1]
                self.laplaceUnigramCounts[token1] += 1
                self.laplaceBigramCounts[(token1, token2)] += 1
                self.total += 1
            # Count the final '</s>' token as well.
            self.total += 1
            self.laplaceUnigramCounts[sentence[-1]] += 1
    def candidate_word(self, word):
        """Generate candidate corrections for a given word."""
        suggests = []
        for candidate in self.importantKeywords:
            if candidate.startswith(word):
                suggests.append(candidate)
        suggests.append(word)
        # If no keyword matched, fall back to the top PyEnchant suggestions.
        if len(suggests) == 1:
            suggests = [suggest.lower() for suggest in self.d.suggest(word)][:4]
            suggests.append(word)
        suggests = list(set(suggests))
        return suggests, len(suggests)
    def candidate_sentence(self, sentence):
        """Return every candidate sentence for the input, plus a dict mapping
        each input word to its number of candidate corrections."""
        candidate_words = []
        words_count = {}
        for word in sentence:
            suggests, count = self.candidate_word(word)
            candidate_words.append(suggests)
            words_count[word] = count
        candidate_sentences = list(itertools.product(*candidate_words))
        return candidate_sentences, words_count
    def correction_score(self, words_count, old_sentence, new_sentence):
        """Channel model P(X|W): a candidate word identical to the observed
        word gets probability 0.95; the remaining 0.05 is split evenly over
        that observed word's other candidate corrections."""
        score = 1.0
        for i in range(len(new_sentence)):
            if new_sentence[i] == old_sentence[i]:
                score *= 0.95
            else:
                score *= 0.05 / (words_count[old_sentence[i]] - 1)
        return math.log(score)
    def score(self, sentence):
        """Return the log-probability of a token list under a bigram stupid
        backoff language model, with Laplace smoothing on the unigram backoff
        so unseen words never get zero probability."""
        score = 0.0
        for i in range(len(sentence) - 1):
            bigram_count = self.laplaceBigramCounts.get((sentence[i], sentence[i + 1]), 0)
            if bigram_count > 0:
                score += math.log(bigram_count)
                score -= math.log(self.laplaceUnigramCounts[sentence[i]])
            else:
                # Back off to the Laplace-smoothed unigram, discounted by 0.4.
                score += math.log(self.laplaceUnigramCounts.get(sentence[i + 1], 0) + 1) + math.log(0.4)
                score -= math.log(self.total + len(self.laplaceUnigramCounts))
        return score
    def return_best_sentence(self, old_sentence):
        """Generate all candidate sentences, score each one with the channel
        model P(X|W) plus the stupid backoff language model P(W) (both in log
        space), and return the highest-scoring candidate with its score."""
        bestScore = float('-inf')
        bestSentence = []
        old_sentence = [word.lower() for word in old_sentence.split()]
        sentences, words_count = self.candidate_sentence(old_sentence)
        for new_sentence in sentences:
            new_sentence = list(new_sentence)
            score = self.correction_score(words_count, old_sentence, new_sentence)
            new_sentence.insert(0, '<s>')
            new_sentence.append('</s>')
            score += self.score(new_sentence)
            if score >= bestScore:
                bestScore = score
                bestSentence = new_sentence
        bestSentence = ' '.join(bestSentence[1:-1])
        return bestSentence, bestScore
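To make the backoff in score concrete: for a bigram seen in training, each step contributes log(count(w1, w2) / count(w1)); for an unseen bigram, it backs off to log(0.4 · (count(w2) + 1) / (N + V)), where N is the total number of training tokens and V is the vocabulary size.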
Usage Examples
corrector = Sentence_Corrector('../data/big.txt')
corrector.return_best_sentence('this is wron spallin word')
corrector.return_best_sentence('aoccdrning to a resarch at cmabridge university')
corrector.return_best_sentence('it does not mttaer in waht oredr the ltteers')
corrector.return_best_sentence('the olny important tihng is taht')
corrector.return_best_sentence('Can they leav him my messages')
corrector.return_best_sentence('This used to belong to thew queen')
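Since return_best_sentence returns a (corrected sentence, log score) pair, the calls above can also be wrapped in a small loop that prints both; a minimal sketch:

tests = ['this is wron spallin word',
         'This used to belong to thew queen']
for sentence in tests:
    corrected, log_score = corrector.return_best_sentence(sentence)
    print(f'{sentence} -> {corrected} ({log_score:.2f})')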