# -*- coding: iso-8859-1 -* #!/usr/bin/python """Spelling Corrector. Copyright 2007 Peter Norvig. Open source code under MIT license: http://www.opensource.org/licenses/mit-license.php Improved by Peter Bengtsson, 2007. * added support for Unicode, * changed how candidates are created * made model a dict * made compatible with python 2.4 * added train() and save() version 0.1.5 * feature: ability to ignore the language files corpus * feature: faster importing since it no longer loads the languages twice version 0.1.6 * feature: added count_trained_words() to inspect how many words are trained on * feature: added __str__() method for basic inspection * bug: when training new words, it's misspelled alternatives could damage already existing trained words. * bug: _train() expects a list or tuple but would still explode the words internally with the _word() function. Eg. _train(['peter@domain.com']) would actually train the words ['peter','domain','com'] version 0.1.7 * feature: loading your own training file is cached with a pickle file unless you create the Spellcorrector() with use_pickle_cache=False. MUCH faster consecutive instansiations. version 0.1.8 * feature: If you don't load the language files, then when you train it doesn't load variants of trained words which makes training MUCH faster. version 0.2.0 * feature: the big language files are not loaded by default. * feature: new parameter: @load_language_file= If it is a valid file path string that can be the loaded language files. Eg. Spellcorrector('en', load_language_file='/home/peterbe/en.txt') """ import cPickle import re, os, sys import codecs import time import stat LANGUAGE_FILES_BASEPATH = os.path.join(os.path.dirname(__file__), 'languagefiles') __all__ = ['supported_languages','Spellcorrector'] __version__ = '0.2.0' def _words(text): return re.findall(u'[a-z\xe5\xe4\xf6]+', text.lower()) def _train(features): model ={} for f in features: model[f] = model.get(f,0) + 1 return model NWORDS = {} ALPHABETS = {} for lang in ('en','sv'): if lang == 'sv': ALPHABETS[lang] = u'abcdefghijklmnopqrstuvwxyz\xe5\xe4\xf6' else: ALPHABETS[lang] = u'abcdefghijklmnopqrstuvwxyz' def supported_languages(): return ALPHABETS.keys() class Spellcorrector(object): """ A class for correcting a word or for finding spelling suggestions using Peter Norvig's algorithm from http://www.norvig.com/spell-correct.html You instanciate this class with one of the supported languages. If you don't want to load the big heavy language files but instead rely completely on your won training file, then you can pass load_language_files=False when you create the Spellcorrector object. """ def __init__(self, lang='en', own_training_file=None, load_language_files=False, use_pickle_cache=True, load_language_file=None): assert lang in ('en','sv'), "Unsupported language %s" % lang self.lang = lang self.own_training_file = own_training_file self._use_pickle_cache = use_pickle_cache self._load_language_files = load_language_files self._load_language_file = load_language_file self._init() def _init(self, encoding='latin1'): self.alphabet = ALPHABETS[self.lang] if self._load_language_file: _txt_filepath = self._load_language_file _nwords_filepath = _txt_filepath.replace(os.path.splitext(_txt_filepath)[1], '.pickle') elif self._load_language_files: _nwords_filepath = os.path.join(LANGUAGE_FILES_BASEPATH, 'nwords_%s.pickle' % self.lang) _txt_filepath = os.path.join(LANGUAGE_FILES_BASEPATH, '%s.txt' % self.lang) else: _txt_filepath = _nwords_filepath = None if _txt_filepath and _nwords_filepath: try: if self._use_pickle_cache: nwords = cPickle.load(file(_nwords_filepath,'rb')) else: words = _words(unicode(file(_txt_filepath).read(), encoding)) nwords = _train(words) except IOError: words = _words(unicode(file(_txt_filepath).read(), encoding)) nwords = _train(words) cPickle.dump(nwords, file(_nwords_filepath,'wb'), protocol=-1) else: nwords = {} self.nwords = nwords if self.own_training_file: if os.path.isfile(self.own_training_file): if self._use_pickle_cache and self._hasOwnTrainingCache(self.own_training_file): self._loadOwnTrainingCache(self.own_training_file) else: own_words = unicode(file(self.own_training_file).read(), encoding) # self._train() will change self's 'nwords' dict self._train(own_words.split(u'\n')) if self._use_pickle_cache: self._saveOwnTrainingCache(self.own_training_file) else: print >>sys.stderr, "%s does not exist. Starting fresh." % self.own_training_file def _hasOwnTrainingCache(self, txt_filename): """ return true if there is a cached pickle file that older or same age as the txt file where the own training words are in. """ pickle_filename = self._getOwnTrainingPickleFilname(txt_filename) if os.path.isfile(pickle_filename): return os.stat(txt_filename)[stat.ST_MTIME] <= os.stat(pickle_filename)[stat.ST_MTIME] return False def _getOwnTrainingPickleFilname(self, txt_filename): """ if the txt_filename is '/home/peterbe/foo.txt' then return '/home/peterbe/foo.pickle' If the txt_filename is not ending in .txt just return the @txt_filename plus '.pickle' """ if txt_filename.endswith('.txt'): filename, __ = os.path.splitext(txt_filename) return filename + '.pickle' else: return txt_filename + '.pickle' def _saveOwnTrainingCache(self, txt_filename): """ dump self.nwords into this file (or pickle equivalent) """ pickle_filename = self._getOwnTrainingPickleFilname(txt_filename) # dump self.nwords cPickle.dump(self.nwords, file(pickle_filename,'wb'), protocol=-1) def _loadOwnTrainingCache(self, txt_filename): """ return a tuple of (words [dict], timestamp [int/long], pickle_filename[str]) """ pickle_filename = self._getOwnTrainingPickleFilname(txt_filename) self.nwords = cPickle.load(file(pickle_filename,'rb')) def _edits1(self, word): n = len(word) return set(# deletion [word[0:i]+word[i+1:] for i in range(n)] + # transposition [word[0:i]+word[i+1]+word[i]+word[i+2:] for i in range(n-1)] + # alteration [word[0:i]+c+word[i+1:] for i in range(n) for c in self.alphabet] + # insertion [word[0:i]+c+word[i:] for i in range(n+1) for c in self.alphabet]) def _known_edits2(self, word): return set(e2 for e1 in self._edits1(word) for e2 in self._edits1(e1) if e2 in self.nwords) def known(self, words): return set(w for w in words if w in self.nwords) def _candidates(self, word): word = word.lower() if len(word) > 10: # if the word is this big, edits2() returns a HUUUUGE # set which kills the CPU and takes forever. return self.known([word]) | self.known(self._edits1(word)) or [word] else: return self.known([word]) | self.known(self._edits1(word)) or \ self._known_edits2(word) or [word] def _train(self, words, unicode_encoding='latin1'): def safe_unicode(s, encoding=unicode_encoding): if isinstance(s, str): return unicode(s, encoding) return s words = [safe_unicode(x).lower() for x in words] _trained_words = getattr(self, '_trained_words', []) new_words = words for word in new_words: # if we've already trained a variant of this word, e.g 'mike', # that would have added the misspellt alternatives too (set to -1) # like 'mqke','mbke','make' etc. Perhaps now the second word # we train on is 'make' which was a misspelled alternative of 'mike' # entered in the previous loop. # When we train on words, we never want to add a word with a bad start # (e.g. -1) so that's why we use this max() function here so that # the previous (that we +1 to) never is -1) if self._load_language_files: # since there are other word already in there like this, make sure # our trained word gets higher no matter what. p = max(self.nwords.get(self.correct(word), 0), 0) + 1 else: # if we haven't loaded the language file, there's no need to find # an alternative that is higher than this. p = max(self.nwords.get(word.lower(), 0), 0) + 1 self.nwords[word] = p # Right, now that we've added this new strange word we also # add all the possible misspellings of this new word and # assign them the lowest possible score, -1. # The reason for doing is that only words that exist in the # nwords variable can be corrected. That means that you can # do spelling corrections on the trained new word. # For example, if you add your own word 'Laphroaig' and assume # that's correct the you want to be able to spellcorrect if # someone queries the word 'Laproaig' or 'Laphroaigh' # If you read the comment a couple of lines above about the necessary # use of the max() function you'll understand the thinkin here. # Training can be done in separate batches. # For example, in the first batch to train on the words: # ['mike','peter'] # in the second batch to train on # ['make','george'] # Thanks to the max() function above, that make word end up with # a count of 0. # No, suppose you train on these words: # ['mike','sivia','make'] # Then, in the first element, you're expected to add # misspelled alternatives of the word 'mike' (e.g 'mbke') # but thanks to this limiting list compression here below we can # make sure we're not going to bother with the alternative 'make' # because that's in the list of training words. if self._load_language_files: new_word_alternatives = [x for x in self._edits1(word) if x not in new_words] for variant in new_word_alternatives: if variant not in self.nwords: self.nwords[variant] = -1 _trained_words.append(word) self._trained_words = _trained_words def train(self, words): if not isinstance(words, (tuple, list)): words = [words] self._train(words) def save(self, own_training_file=None, encoding='latin1'): """ save the trained words """ if own_training_file: self.own_training_file = own_training_file if not hasattr(self, '_trained_words'): raise ValueError, "No trained words to save" codecs.open(self.own_training_file,'w', encoding).write( u'\n'.join(self._trained_words) ) def correct(self, word): candidates = self._candidates(word) if word in getattr(self, '_trained_words', []): # this test is important because if you've trained a word, specifcally, # we can be pretty certain that it's correct and doesn't need to # be corrected. Without this if statement the max() function below # might "break" because the two top candidates have the same score # Suppose...: # s = Spellcorrector('en') # s.train('peter') # s.train('petter') # (at this point self.nwords is {'peter':1, 'petter':1} # s.correct('peter') --> 'petter' # With this if statement, the word 'peter' which shares the max # score position won't be considered incorrect. return word try: return max(candidates, key=lambda w: self.nwords.get(w,1)) except TypeError: # old Python 2.4 suggestions = list([(self.nwords.get(x,1), x) for x in candidates]) suggestions.sort() suggestions.reverse() return suggestions[0][1] def suggestions(self, word, detailed=False): candidates = self._candidates(word) suggestions = list([(self.nwords.get(x,0), x) for x in candidates]) suggestions = [(a,b) for (a,b) in suggestions if a > 0] suggestions.sort() suggestions.reverse() if detailed: total = sum([a for (a,b) in suggestions]) return [{'word':b, 'count':a, 'percentage': 100.0*a/total} for (a,b) in suggestions] else: return [b for (a,b) in suggestions] def count_trained_words(self): return len(self.nwords) def __str__(self): msg = 'Spellcorrector - %s different words counted %s times' return msg % (self.count_trained_words(), sum(self.nwords.values())) def test_spellcorrector(): sc = Spellcorrector('en') sc.train('peter') assert sc.correct('peter') == 'peter' assert sc.correct('petter') == 'peter' sc.train('petter') assert sc.correct('peter') == 'peter' assert sc.correct('petter') == 'petter' assert sc.suggestions('peterr') == ['peter'] assert sc.suggestions('petterr') == ['petter'] def test_spellcorrector_with_own_language_file(): from tempfile import gettempdir lfile = os.path.join(gettempdir(), 'test_spellcorrector.txt') pfile = os.path.join(gettempdir(), 'test_spellcorrector.pickle') try: os.remove(lfile) except OSError: pass try: os.remove(pfile) except OSError: pass file(lfile, 'w').write('einstein, planck') sc = Spellcorrector('en', load_language_file=lfile) assert sc.correct('einsteim') == 'einstein' assert sc.correct('Plank') == 'planck' sc.train('plank') # this takes presedence over the loaded file assert sc.correct('plank') == 'plank' os.remove(lfile) os.remove(pfile) if __name__ == '__main__': import sys from time import time lang = sys.argv[1] spellcorrector = Spellcorrector(lang, load_language_files=True) correct = spellcorrector.correct suggestions = spellcorrector.suggestions for arg in sys.argv[2:]: arg = unicode(arg, 'utf8') t0=time() corrected_arg = correct(arg) t1=time() print t1-t0 print arg, '-->', if arg.lower() == corrected_arg.lower(): print "Fine!" print suggestions(arg) else: print corrected_arg _suggestions = suggestions(arg) print _suggestions print