#!/usr/bin/env python
# coding=UTF-8
"""
@Author: WEN Hao
@LastEditors: WEN Hao
@Description:
@Date: 2021-08-12
@LastEditTime: 2022-04-15

Operations specific to Chinese strings: word segmentation, POS tagging, etc.
"""

import re
from string import punctuation

import stanza
import jieba
import jieba.posseg as pseg
from zhon.hanzi import punctuation as zh_punctuation

from .strings import normalize_pos_tag, normalize_ner_tag

# Issue: jieba in paddle mode currently mis-segments text, attaching
# punctuation marks to the following word, so paddle mode stays disabled.
# try:
#     jieba.enable_paddle()
#     _jieba_use_paddle = True
# except Exception:
#     _jieba_use_paddle = False
_jieba_use_paddle = False


__all__ = [
    "words_from_text_cn",
    "jieba_tag",
    "stanza_tag",
]


def words_from_text_cn(s: str, words_to_ignore: list = [], tol: int = 1) -> list:
    """Split a Chinese string into words.

    Splitting on whitespace is accepted, but only if it does not deviate from
    the `jieba.lcut` segmentation by more than ``tol`` words; otherwise the
    `jieba.lcut` segmentation is used.

    Example (segmentation of an adversarial sample, followed by the
    segmentation of the original sample):
    -------
    >>> words_from_text_cn("以后 还会 继续 在 杨晓卫 上 购物")
    ['以后', '还会', '继续', '在', '杨晓卫', '上', '购物']  # 7 words
    >>> words_from_text_cn("以后还会继续在杨晓卫上购物")
    ['以后', '还会', '继续', '在', '杨晓卫上', '购物']  # 6 words
    >>> words_from_text_cn("以后还会继续在京东上购物")  # original sample
    ['以后', '还会', '继续', '在', '京东', '上', '购物']  # 7 words

    """
    pseudo_words = s.split(" ")
    words = jieba.lcut(
        re.sub("[\ufeff\\s]", "", s.strip()), use_paddle=_jieba_use_paddle
    )
    # words = list(filter(lambda w: re.search("\\w+", w), words))  # ignore empty words
    if abs(len(words) - len(pseudo_words)) <= tol:
        words = pseudo_words
    # drop tokens consisting only of punctuation/whitespace, as well as ignored
    # words; ``re.escape`` keeps special characters in ``words_to_ignore`` from
    # breaking the pattern
    pattern = f"[{punctuation + zh_punctuation}\\s]+"
    if words_to_ignore:
        pattern = f"({pattern})|{'|'.join(re.escape(w) for w in words_to_ignore)}"
    purifier = lambda w: len(re.sub(pattern, "", w.strip())) > 0  # noqa: E731
    words = list(filter(purifier, words))
    return words
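
# A quick illustration of ``tol`` (hypothetical, reusing the docstring example;
# actual jieba output may vary across versions): with ``tol=0`` the 7-token
# space split differs from jieba's 6-word segmentation by more than the
# tolerance, so the jieba segmentation wins:
#
#     words_from_text_cn("以后 还会 继续 在 杨晓卫 上 购物", tol=0)
#     # -> ['以后', '还会', '继续', '在', '杨晓卫上', '购物']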


# fmt: off
# maps jieba's default-mode flags (nr, ns, nt, t) and paddle-mode flags
# (PER, LOC, ORG, TIME, nw) to coarse NER categories
_jieba_ner_mapping = {
    "nr": "PER", "PER": "PER",    # person name
    "ns": "LOC", "LOC": "LOC",    # location
    "nt": "ORG", "ORG": "ORG",    # organization
    "t": "TIME", "TIME": "TIME",  # time
    "nw": "WORK",                 # title of a work
}
# fmt: on
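
# NOTE: flags absent from the mapping (e.g. "n" for a plain noun) fall back
# to "OTHER" in ``jieba_tag`` below, via
#
#     _jieba_ner_mapping.get(item.flag, "OTHER")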


def jieba_tag(s: str, tag_type: str = "pos") -> dict:
    """Segment ``s`` with jieba and tag each word.

    ``tag_type`` is "pos" for part-of-speech tags or "ner" for named-entity
    tags; returns a dict with keys "words" and "tags".
    """
    assert tag_type.lower() in [
        "pos",
        "ner",
    ], f"""tag_type must be "pos" or "ner", but got {tag_type}"""
    purifier = (
        lambda w: len(re.sub(f"[{punctuation+zh_punctuation}\\s]+", "", w)) > 0
    )  # noqa: E731
    words, tags = [], []
    for item in pseg.cut(s.strip().replace("\ufeff", ""), use_paddle=_jieba_use_paddle):
        if not purifier(item.word):
            continue
        words.append(item.word)
        if tag_type.lower() == "ner":
            tags.append(
                normalize_ner_tag(
                    _jieba_ner_mapping.get(item.flag, "OTHER"),
                )
            )
        else:  # pos
            tags.append(normalize_pos_tag(item.flag))
    res = {"words": words, "tags": tags}
    return res
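
# Example usage (illustrative; jieba's segmentation and flags may vary by
# version and by whether paddle mode is enabled):
#
#     res = jieba_tag("以后还会继续在京东上购物", tag_type="pos")
#     res["words"], res["tags"]  # parallel lists of words and their POS tags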


def stanza_tag(s: str, tag_type: str = "pos") -> dict:
    """Segment and tag ``s`` with a stanza pipeline (not implemented yet)."""
    raise NotImplementedError(
        "stanza requires downloading a large model, hence not used for now"
    )
    # unreachable placeholder sketching the intended pipeline
    stanza.Pipeline(
        lang="zh",
        processors="tokenize,pos",
        tokenize_pretokenized=True,
    )
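
# A possible completion of ``stanza_tag`` (an untested sketch, not the
# author's implementation; assumes the Chinese model was fetched beforehand
# with ``stanza.download("zh")``):
#
#     nlp = stanza.Pipeline(lang="zh", processors="tokenize,pos")
#     doc = nlp(s)
#     words = [w.text for sent in doc.sentences for w in sent.words]
#     tags = [normalize_pos_tag(w.upos) for sent in doc.sentences for w in sent.words]
#     return {"words": words, "tags": tags}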