# Source: shibing624/pycorrector (mirror of https://github.com/shibing624/pycorrector.git)

 
			
							# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: english correction
refer: http://norvig.com/spell-correct.html
"""

import gzip
import json
import operator
import os
from codecs import open
from typing import List

from loguru import logger

from pycorrector.utils.text_utils import is_alphabet_string
from pycorrector.utils.tokenizer import split_text_into_sentences_by_symbol

# Absolute path of the directory that contains this module.
pwd_path = os.path.dirname(os.path.abspath(__file__))
# Gzip-compressed JSON file of English word frequencies used for spelling correction.
default_en_dict_path = os.path.join(pwd_path, 'data/en.json.gz')


class EnSpellCorrector:
    """
    Frequency-based English spelling corrector.

    Candidates are generated by applying one or two single-character edits to
    the input word and keeping those found in ``word_freq_dict``; the candidate
    with the highest relative frequency wins. Entries in
    ``custom_confusion_dict`` take precedence over the statistical correction.
    """

    def __init__(self, word_freq_dict: dict = None, custom_confusion_dict: dict = None, en_dict_path: str = None):
        """
        Init english spell corrector
        Args:
            word_freq_dict: Word freq dict, k=word, v=int(freq). When given
                (non-empty), no dictionary file is loaded.
            custom_confusion_dict: Mapping {wrong_word: right_word} that is
                consulted before the statistical correction.
            en_dict_path: Path of a gzip-compressed JSON file mapping
                word -> frequency. Falls back to ``default_en_dict_path`` when
                neither a non-empty ``word_freq_dict`` nor a path is given.
        Raises:
            ValueError: if both a non-empty ``word_freq_dict`` and
                ``en_dict_path`` are supplied.
        """
        if word_freq_dict and en_dict_path:
            raise ValueError('word_freq_dict and en_dict_path can not be set at the same time.')
        if word_freq_dict is None:
            word_freq_dict = {}
        if custom_confusion_dict is None:
            custom_confusion_dict = {}
        if not word_freq_dict and en_dict_path is None:
            en_dict_path = default_en_dict_path
        self.word_freq_dict = word_freq_dict
        self.custom_confusion_dict = custom_confusion_dict
        if en_dict_path and os.path.exists(en_dict_path):
            # Read and parse while the file is open; filter afterwards.
            with gzip.open(en_dict_path, "rb") as f:
                all_word_freq_dict = json.loads(f.read())
            # Keep only words whose frequency is above 400 (per the original
            # note, this leaves roughly the 30k most common English words).
            self.word_freq_dict = {k: v for k, v in all_word_freq_dict.items() if v > 400}
            logger.debug("load en spell data: %s, size: %d" % (
                en_dict_path, len(self.word_freq_dict)))
        elif en_dict_path:
            # Reaching here means no usable word_freq_dict was supplied either,
            # so the corrector will return every word unchanged.
            logger.warning("en spell data not found: %s, word_freq_dict is empty" % en_dict_path)

    @staticmethod
    def edits1(word):
        """
        All strings that are exactly one edit away from ``word``.

        An edit is a single-character deletion, a transposition of two
        adjacent characters, a replacement by a lowercase ASCII letter, or an
        insertion of a lowercase ASCII letter.
        :param word: str
        :return: set of str
        """
        letters = 'abcdefghijklmnopqrstuvwxyz'
        # Every way of cutting the word into a (left, right) pair.
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """
        All strings that are two edits away from ``word``.
        :param word: str
        :return: generator of str (lazy; may yield duplicates)
        """
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

    def known(self, word_freq_dict):
        """
        The subset of the given words that appear in ``self.word_freq_dict``.
        :param word_freq_dict: iterable of candidate words (despite its name,
            any iterable of str is accepted, e.g. a list, set or generator)
        :return: set of str
        """
        return {w for w in word_freq_dict if w in self.word_freq_dict}

    def probability(self, word):
        """
        Relative frequency of ``word`` in ``self.word_freq_dict``.
        :param word: str
        :return: float, 0.0 for unknown words or when the total frequency of
            the dictionary is zero (e.g. an empty dictionary)
        """
        total = sum(self.word_freq_dict.values())
        if not total:
            # Avoid ZeroDivisionError when no dictionary could be loaded.
            return 0.0
        return self.word_freq_dict.get(word, 0) / total

    def candidates(self, word):
        """
        Generate possible spelling corrections for ``word``.

        Returns the first non-empty set among: the word itself if known, known
        words one edit away, known words two edits away, and finally the word
        itself even though it is unknown.
        :param word: str
        :return: set of str
        """
        return self.known([word]) or self.known(self.edits1(word)) or self.known(self.edits2(word)) or {word}

    def correct_word(self, word):
        """
        Most probable spelling correction for ``word``.
        :param word: str
        :return: str, the candidate with the highest probability
        """
        candi_prob = {i: self.probability(i) for i in self.candidates(word)}
        # Ascending sort; the best candidate is the last item.
        sort_candi_prob = sorted(candi_prob.items(), key=operator.itemgetter(1))
        return sort_candi_prob[-1][0]

    @staticmethod
    def _get_custom_confusion_dict(path):
        """
        Load a custom confusion set from a text file.

        Lines starting with '#' and lines with fewer than two
        whitespace-separated terms are skipped; on the remaining lines the
        first term is the wrong word and the second is its correction.
        :param path: file path; a falsy or non-existent path yields an empty dict
        :return: dict, {variant: origin}, eg: {"teh": "the"}
        """
        confusion = {}
        if not path or not os.path.exists(path):
            return confusion
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'):
                    continue
                terms = line.split()
                if len(terms) < 2:
                    continue
                confusion[terms[0]] = terms[1]
        return confusion

    def set_en_custom_confusion_dict(self, path):
        """
        Replace the custom confusion dict with the one loaded from ``path``.
        :param path: file path of the confusion set
        :return: None
        """
        self.custom_confusion_dict = self._get_custom_confusion_dict(path)
        logger.debug('Loaded en spell confusion path: %s, size: %d' % (path, len(self.custom_confusion_dict)))

    def correct(self, sentence, include_symbol=True):
        """
        Most probable spelling correction for text.
        :param sentence: input query
        :param include_symbol: True, default; forwarded to
            split_text_into_sentences_by_symbol
        :return: {'source': 'src', 'target': 'trg', 'errors': [(error_word, correct_word, position), ...]}
            where position is the index the splitter reported for the block
        example (illustrative, actual output depends on the dictionary):
            'cann you speling it?' -> errors [('cann', 'can', idx), ('speling', 'spelling', idx)]
        """
        pieces = []
        details = []
        blocks = split_text_into_sentences_by_symbol(sentence, include_symbol=include_symbol)
        for w, idx in blocks:
            # Only alphabetic words longer than one character are corrected.
            if len(w) > 1 and is_alphabet_string(w):
                if w in self.custom_confusion_dict:
                    corrected_item = self.custom_confusion_dict[w]
                else:
                    corrected_item = self.correct_word(w)
                if corrected_item != w:
                    details.append((w, corrected_item, idx))
                    w = corrected_item
            pieces.append(w)
        # Order the errors by their position in the input.
        details.sort(key=operator.itemgetter(2))
        return {'source': sentence, 'target': ''.join(pieces), 'errors': details}

    def correct_batch(self, sentences: List[str], **kwargs):
        """
        Correct a batch of sentences.
        :param sentences: list of sentence texts
        :param kwargs: extra keyword arguments forwarded to ``correct``
        :return: list of {'source': 'src', 'target': 'trg', 'errors': [(error_word, correct_word, position), ...]}
        """
        return [self.correct(s, **kwargs) for s in sentences]