# shibing624/pycorrector
# mirror of https://github.com/shibing624/pycorrector.git

# -*- coding: utf-8 -*-
"""
@author:XuMing(xuming624@qq.com)
@description: Unit tests for the English spelling corrector (EnSpellCorrector) and the whitespace tokenizer.
"""

import sys
import unittest

sys.path.append('..')
from pycorrector import EnSpellCorrector
from pycorrector.utils.tokenizer import whitespace_tokenize

# Single corrector instance, created once at import time and shared by every test in this module.
spell = EnSpellCorrector()


class EnTestCase(unittest.TestCase):
    """Tests for the English spelling corrector and the whitespace tokenizer.

    Every test uses the module-level ``spell`` corrector instance.
    """

    def test_en_correct(self):
        """Test English spelling correction on single words."""
        en_correct = spell.correct
        print(
            en_correct('spelling speling correctud gorrectud bycycle inconvient arrainged peotrry word quintessential'))
        print(en_correct('spelling')['target'])
        # (input word, expected correction)
        cases = [
            ('spelling', 'spelling'),  # no error
            ('speling', 'spelling'),  # insert
            ('correctud', 'corrected'),  # replace 1
            ('gorrectud', 'corrected'),  # replace 2
            ('bycycle', 'bicycle'),  # replace
            ('inconvient', 'inconvenient'),  # insert 2
            ('arrainged', 'arranged'),  # delete
            ('peotrry', 'poetry'),  # transpose + delete
            ('word', 'word'),  # known word
            ('quintessential', 'quintessential'),  # unknown word
        ]
        for wrong, expected in cases:
            # subTest reports every failing word instead of stopping at the first one.
            with self.subTest(word=wrong):
                self.assertEqual(en_correct(wrong)['target'], expected)
        # Deliberately no return value: unittest deprecates test methods that return non-None.

    def test_tokenizer(self):
        """Test English whitespace tokenization."""
        sent = "test is it."
        white_split = whitespace_tokenize(sent)
        print(white_split)
        self.assertEqual(white_split, ['test', 'is', 'it', '.'])  # segment
        res = ['This', 'is', 'a', 'test', 'of', 'the', 'word', 'parser', '.', 'It', 'should', 'work', 'correctly',
               '!!!']
        self.assertEqual(whitespace_tokenize('This is a test of the word parser. It should work correctly!!!'), res)

    @staticmethod
    def spell_t(tests):
        """
        Run the corrector on every (right, wrong) pair and print a summary report.

        The report contains the accuracy, the number of pairs, the share of
        miscorrected pairs whose expected word is missing from the corrector's
        frequency dictionary ("unknown"), and the throughput in words per second.

        :param tests: sequence of (right, wrong) pairs, e.g. the output of ``get_set``
        :return: None (the report is printed)
        """
        import time
        n = len(tests)
        if n == 0:
            # Nothing to evaluate; avoid dividing by zero below.
            print('acc: n/a, total num: 0 (empty test set)')
            return
        good, unknown = 0, 0
        en_correct = spell.correct
        start = time.perf_counter()
        for right, wrong in tests:
            w = en_correct(wrong)['target']
            if w == right:
                good += 1
            elif right not in spell.word_freq_dict:
                # The expected word is not in the dictionary, so the corrector could not have produced it.
                unknown += 1
        all_time = time.perf_counter() - start
        # Guard against a zero interval on very small inputs.
        speed = n / all_time if all_time > 0 else float('inf')
        print('acc: {:.0%}, total num: {}, ({:.0%} unknown), speed: {:.0f} '
              'words per second'.format(good / n, n, unknown / n, speed))

    @staticmethod
    def get_set(lines):
        """
        Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs.

        :param lines: iterable of strings, each holding the correct word, a colon,
            and one or more whitespace-separated misspellings; blank lines are skipped
        :return: list of (right, wrong) tuples
        :raises ValueError: if a non-blank line does not contain exactly one ':'
        """
        pairs = []
        for line in lines:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline at end of file
            right, wrongs = line.split(':')
            right = right.strip()
            pairs.extend((right, wrong) for wrong in wrongs.split())
        return pairs

    def test_spell1(self):
        """Test English text correction on the dev set (disabled).

        To enable, place ``spell-testset1.txt`` next to this file and call
        ``self.spell_t(self.get_set(f))`` with ``f`` opened via ``with open(...)``.
        """
        self.skipTest('disabled: requires ./spell-testset1.txt (dev set)')

    def test_spell2(self):
        """Test English text correction on the final test set (disabled).

        To enable, place ``spell-testset2.txt`` next to this file and call
        ``self.spell_t(self.get_set(f))`` with ``f`` opened via ``with open(...)``.
        """
        self.skipTest('disabled: requires ./spell-testset2.txt (final test set)')

    def test_en_bug_correct1(self):
        """Regression test: correcting a sentence must not break later single-word correction."""
        r = spell.correct('folder payroll connectivity website')
        print(r)
        self.assertEqual(spell.correct('spelling')['target'], 'spelling')  # no error

    def test_en_bug_correct2(self):
        """Regression test: 'whould' must be corrected to 'would'."""
        # Debug output showing the dictionary entry, the candidates and the single-word result.
        print(spell.word_freq_dict.get('whould'))
        print(spell.candidates('whould'))

        a = spell.correct_word('whould')
        print(a)
        r = spell.correct('contend proble poety adress whould niether  quaties')
        print(r)
        self.assertEqual(spell.correct('whould')['target'], 'would')  # misspelling is corrected

# Run this module's tests with unittest's command-line runner when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()