- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
- # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import copy
- import io
- import json
- import os
- import six
- import unicodedata
-
- from utils.tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation, PretrainedTokenizer
-
- __all__ = ['BasicTokenizer', 'BertTokenizer', 'WordpieceTokenizer']
-
-
- class BasicTokenizer(object):
- """
- Runs basic tokenization (punctuation splitting, lower casing, etc.).
- Args:
- do_lower_case (bool): Whether to lowercase the input text and strip
- accents. If you use a BERT pretrained model, this is set to
- False for cased models and to True for uncased models.
- Default: True.
- """
-
- def __init__(self, do_lower_case=True):
- """Constructs a BasicTokenizer."""
-
- self.do_lower_case = do_lower_case
-
- def tokenize(self, text):
- """
- Tokenizes a piece of text using basic tokenizer.
- Args:
- text (str): A piece of text.
- Returns:
- list(str): A list of tokens.
- """
- text = convert_to_unicode(text)
- text = self._clean_text(text)
- text = self._tokenize_chinese_chars(text)
-
- orig_tokens = whitespace_tokenize(text)
- split_tokens = []
- for token in orig_tokens:
- if self.do_lower_case:
- token = token.lower()
- token = self._run_strip_accents(token)
- split_tokens.extend(self._run_split_on_punc(token))
-
- output_tokens = whitespace_tokenize(" ".join(split_tokens))
- return output_tokens
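-
- # Illustrative sketch of the method above (assuming the default
- # do_lower_case=True): lowercasing, accent stripping and punctuation
- # splitting give, for example,
- #   BasicTokenizer().tokenize(u"Héllo, World!") -> ["hello", ",", "world", "!"]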
-
- def _run_strip_accents(self, text):
- """
- Strips accents from a piece of text.
- """
- text = unicodedata.normalize("NFD", text)
- output = []
- for char in text:
- cat = unicodedata.category(char)
- if cat == "Mn":
- continue
- output.append(char)
- return "".join(output)
-
- def _run_split_on_punc(self, text):
- """
- Splits punctuation on a piece of text.
- """
- chars = list(text)
- i = 0
- start_new_word = True
- output = []
- while i < len(chars):
- char = chars[i]
- if _is_punctuation(char):
- output.append([char])
- start_new_word = True
- else:
- if start_new_word:
- output.append([])
- start_new_word = False
- output[-1].append(char)
- i += 1
-
- return ["".join(x) for x in output]
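-
- # Illustrative sketch of the punctuation splitting above, e.g.
- #   self._run_split_on_punc("hello!world") -> ["hello", "!", "world"]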
-
- def _tokenize_chinese_chars(self, text):
- """
- Adds whitespace around any CJK character.
- """
- output = []
- for char in text:
- cp = ord(char)
- if self._is_chinese_char(cp):
- output.append(" ")
- output.append(char)
- output.append(" ")
- else:
- output.append(char)
- return "".join(output)
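-
- # Illustrative sketch of the CJK handling above: each CJK character is
- # padded with spaces so it becomes its own whitespace token, e.g.
- #   self._tokenize_chinese_chars(u"中文ab") -> " 中  文 ab"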
-
- def _is_chinese_char(self, cp):
- """
- Checks whether CP is the codepoint of a CJK character.
- """
-
- # This defines a "chinese character" as anything in the CJK Unicode block:
- # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
- #
- # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
- # despite its name. The modern Korean Hangul alphabet is a different block,
- # as is Japanese Hiragana and Katakana. Those alphabets are used to write
- # space-separated words, so they are not treated specially and handled
- # like all of the other languages.
- if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
- (cp >= 0x3400 and cp <= 0x4DBF) or #
- (cp >= 0x20000 and cp <= 0x2A6DF) or #
- (cp >= 0x2A700 and cp <= 0x2B73F) or #
- (cp >= 0x2B740 and cp <= 0x2B81F) or #
- (cp >= 0x2B820 and cp <= 0x2CEAF) or
- (cp >= 0xF900 and cp <= 0xFAFF) or #
- (cp >= 0x2F800 and cp <= 0x2FA1F)): #
- return True
-
- return False
-
- def _clean_text(self, text):
- """
- Performs invalid character removal and whitespace cleanup on text.
- """
- output = []
- for char in text:
- cp = ord(char)
- if cp == 0 or cp == 0xfffd or _is_control(char):
- continue
- if _is_whitespace(char):
- output.append(" ")
- else:
- output.append(char)
- return "".join(output)
-
-
- class WordpieceTokenizer(object):
- """
- Runs WordPiece tokenization.
- Args:
- vocab (Vocab|dict): Vocab of the word piece tokenizer.
- unk_token (str): The specific token used to replace all unknown tokens.
- max_input_chars_per_word (int): If a word's length is greater than
- max_input_chars_per_word, it will be treated as an unknown word.
- Default: 100.
- """
-
- def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
- self.vocab = vocab
- self.unk_token = unk_token
- self.max_input_chars_per_word = max_input_chars_per_word
-
- def tokenize(self, text):
- """
- Tokenizes a piece of text into its word pieces.
- This uses a greedy longest-match-first algorithm to perform tokenization
- using the given vocabulary.
- Args:
- text (str): A single token or whitespace-separated tokens. This should have
- already been passed through `BasicTokenizer`.
- Returns:
- list (str): A list of wordpiece tokens.
- Example:
- input = "unaffable"
- output = ["un", "##aff", "##able"]
- """
-
- output_tokens = []
- for token in whitespace_tokenize(text):
- chars = list(token)
- if len(chars) > self.max_input_chars_per_word:
- output_tokens.append(self.unk_token)
- continue
-
- is_bad = False
- start = 0
- sub_tokens = []
- while start < len(chars):
- end = len(chars)
- cur_substr = None
- while start < end:
- substr = "".join(chars[start:end])
- if start > 0:
- substr = "##" + substr
- if substr in self.vocab:
- cur_substr = substr
- break
- end -= 1
- if cur_substr is None:
- is_bad = True
- break
- sub_tokens.append(cur_substr)
- start = end
-
- if is_bad:
- output_tokens.append(self.unk_token)
- else:
- output_tokens.extend(sub_tokens)
- return output_tokens
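-
- # Illustrative sketch of the greedy longest-match-first loop above, with a
- # hypothetical toy vocab {"un", "##aff", "##able", "[UNK]"}:
- #   WordpieceTokenizer(vocab, "[UNK]").tokenize("unaffable") -> ["un", "##aff", "##able"]
- # A word that cannot be fully segmented falls back to the unk_token:
- #   WordpieceTokenizer(vocab, "[UNK]").tokenize("puppeteer") -> ["[UNK]"]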
-
-
- class BertTokenizer(PretrainedTokenizer):
- """
- Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation
- splitting, lower casing and so on, followed by a WordPiece tokenizer that
- splits the resulting tokens into subwords.
- Args:
- vocab_file (str): File path of the vocabulary.
- do_lower_case (bool): Whether to lowercase the input text and strip
- accents. If you use a BERT pretrained model, this is set to
- False for cased models and to True for uncased models.
- Default: True.
- unk_token (str): The special token for unknown words. Default: "[UNK]".
- sep_token (str): The special separator token. Default: "[SEP]".
- pad_token (str): The special token for padding. Default: "[PAD]".
- cls_token (str): The special classification token. Default: "[CLS]".
- mask_token (str): The special masking token. Default: "[MASK]".
-
- Examples:
- .. code-block:: python
- from paddle.hapi.text import BertTokenizer
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- # the following line gets: ['he', 'was', 'a', 'puppet', '##eer']
- tokens = tokenizer('He was a puppeteer')
- # the following line gets: 'he was a puppeteer'
- tokenizer.convert_tokens_to_string(tokens)
- """
- resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained
- pretrained_resource_files_map = {
- "vocab_file": {
- "bert-base-uncased":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-uncased-vocab.txt",
- "bert-large-uncased":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-uncased-vocab.txt",
- "bert-base-cased":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-cased-vocab.txt",
- "bert-large-cased":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-large-cased-vocab.txt",
- "bert-base-multilingual-uncased":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-uncased-vocab.txt",
- "bert-base-multilingual-cased":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-multilingual-cased-vocab.txt",
- "bert-base-chinese":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
- "bert-wwm-chinese":
- "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-chinese-vocab.txt",
- "bert-wwm-ext-chinese":
- "http://paddlenlp.bj.bcebos.com/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt",
- "macbert-large-chinese":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
- "macbert-base-chinese":
- "https://paddle-hapi.bj.bcebos.com/models/bert/bert-base-chinese-vocab.txt",
- "simbert-base-chinese":
- "https://paddlenlp.bj.bcebos.com/models/transformers/simbert/vocab.txt",
- }
- }
- pretrained_init_configuration = {
- "bert-base-uncased": {
- "do_lower_case": True
- },
- "bert-large-uncased": {
- "do_lower_case": True
- },
- "bert-base-cased": {
- "do_lower_case": False
- },
- "bert-large-cased": {
- "do_lower_case": False
- },
- "bert-base-multilingual-uncased": {
- "do_lower_case": True
- },
- "bert-base-multilingual-cased": {
- "do_lower_case": False
- },
- "bert-base-chinese": {
- "do_lower_case": False
- },
- "bert-wwm-chinese": {
- "do_lower_case": False
- },
- "bert-wwm-ext-chinese": {
- "do_lower_case": False
- },
- "macbert-large-chinese": {
- "do_lower_case": False
- },
- "macbert-base-chinese": {
- "do_lower_case": False
- },
- "simbert-base-chinese":{
- "do_lower_case": True
- },
- }
- padding_side = 'right'
-
- def __init__(self,
- vocab_file,
- do_lower_case=True,
- unk_token="[UNK]",
- sep_token="[SEP]",
- pad_token="[PAD]",
- cls_token="[CLS]",
- mask_token="[MASK]"):
-
- if not os.path.isfile(vocab_file):
- raise ValueError(
- "Can't find a vocabulary file at path '{}'. To load the "
- "vocabulary from a pretrained model please use "
- "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
- .format(vocab_file))
- self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token)
- self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
- self.wordpiece_tokenizer = WordpieceTokenizer(
- vocab=self.vocab, unk_token=unk_token)
-
- @property
- def vocab_size(self):
- """
- Returns the size of the vocabulary.
- Returns:
- int: The size of the vocabulary.
- """
- return len(self.vocab)
-
- def _tokenize(self, text):
- """
- End-to-end tokenization for BERT models.
- Args:
- text (str): The text to be tokenized.
-
- Returns:
- list: A list of strings representing the converted tokens.
- """
- split_tokens = []
- for token in self.basic_tokenizer.tokenize(text):
- for sub_token in self.wordpiece_tokenizer.tokenize(token):
- split_tokens.append(sub_token)
- return split_tokens
-
- def tokenize(self, text):
- """
- End-to-end tokenization for BERT models.
- Args:
- text (str): The text to be tokenized.
-
- Returns:
- list: A list of strings representing the converted tokens.
- """
- return self._tokenize(text)
-
- def convert_tokens_to_string(self, tokens):
- """
- Converts a sequence of tokens (a list of strings) into a single string. Since
- WordPiece introduces `##` to mark subwords, `##` is also removed when
- converting.
- Args:
- tokens (list): A list of strings representing the tokens to be converted.
- Returns:
- str: Converted string from tokens.
- """
- out_string = " ".join(tokens).replace(" ##", "").strip()
- return out_string
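-
- # Illustrative sketch of the detokenization above, e.g.
- #   tokenizer.convert_tokens_to_string(['he', 'was', 'a', 'puppet', '##eer']) -> 'he was a puppeteer'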
-
- def num_special_tokens_to_add(self, pair=False):
- """
- Returns the number of added tokens when encoding a sequence with special tokens.
-
- Note:
- This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
- inside your training loop.
-
- Args:
- pair (bool): If set to True, returns the number of tokens added for a
- sequence pair; if set to False, returns the number added for a single sequence.
-
- Returns:
- int: Number of special tokens added to sequences.
- """
- token_ids_0 = []
- token_ids_1 = []
- return len(
- self.build_inputs_with_special_tokens(
- token_ids_0, token_ids_1 if pair else None))
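-
- # Illustrative note on the counts above: with the BERT scheme built by
- # build_inputs_with_special_tokens, this returns 2 for a single sequence
- # ([CLS] ... [SEP]) and 3 for a pair ([CLS] ... [SEP] ... [SEP]), e.g.
- #   tokenizer.num_special_tokens_to_add(pair=True) -> 3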
-
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
- """
- Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
- adding special tokens.
-
- A BERT sequence has the following format:
- ::
- - single sequence: ``[CLS] X [SEP]``
- - pair of sequences: ``[CLS] A [SEP] B [SEP]``
-
- Args:
- token_ids_0 (:obj:`List[int]`):
- List of IDs to which the special tokens will be added.
- token_ids_1 (:obj:`List[int]`, `optional`):
- Optional second list of IDs for sequence pairs.
-
- Returns:
- :obj:`List[int]`: List of input IDs with the appropriate special tokens.
- """
- if token_ids_1 is None:
- return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
- _cls = [self.cls_token_id]
- _sep = [self.sep_token_id]
- return _cls + token_ids_0 + _sep + token_ids_1 + _sep
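-
- # Illustrative sketch of the format above (token ids are hypothetical; cls_id
- # and sep_id stand for self.cls_token_id and self.sep_token_id):
- #   tokenizer.build_inputs_with_special_tokens([7, 8]) -> [cls_id, 7, 8, sep_id]
- #   tokenizer.build_inputs_with_special_tokens([7, 8], [9]) -> [cls_id, 7, 8, sep_id, 9, sep_id]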
-
- def build_offset_mapping_with_special_tokens(self,
- offset_mapping_0,
- offset_mapping_1=None):
- """
- Build an offset map from a pair of offset maps by concatenating them and adding the offsets of special tokens.
-
- A BERT offset_mapping has the following format:
- ::
- - single sequence: ``(0,0) X (0,0)``
- - pair of sequences: ``(0,0) A (0,0) B (0,0)``
-
- Args:
- offset_mapping_0 (:obj:`List[tuple]`):
- List of char offsets to which the special tokens will be added.
- offset_mapping_1 (:obj:`List[tuple]`, `optional`):
- Optional second list of char offsets for offset mapping pairs.
-
- Returns:
- :obj:`List[tuple]`: List of char offsets with the appropriate offsets of special tokens.
- """
- if offset_mapping_1 is None:
- return [(0, 0)] + offset_mapping_0 + [(0, 0)]
-
- return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)]
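-
- # Illustrative sketch of the offset handling above, e.g.
- #   tokenizer.build_offset_mapping_with_special_tokens([(0, 2), (3, 7)]) -> [(0, 0), (0, 2), (3, 7), (0, 0)]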
-
- def create_token_type_ids_from_sequences(self,
- token_ids_0,
- token_ids_1=None):
- """
- Create a mask from the two sequences passed to be used in a sequence-pair classification task.
-
- A BERT sequence pair mask has the following format:
- ::
-
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
- | first sequence | second sequence |
-
- If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
-
- Args:
- token_ids_0 (:obj:`List[int]`):
- List of IDs.
- token_ids_1 (:obj:`List[int]`, `optional`):
- Optional second list of IDs for sequence pairs.
-
- Returns:
- :obj:`List[int]`: List of token_type_id according to the given sequence(s).
- """
- _sep = [self.sep_token_id]
- _cls = [self.cls_token_id]
- if token_ids_1 is None:
- return len(_cls + token_ids_0 + _sep) * [0]
- return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1]
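-
- # Illustrative sketch of the token type ids above (token ids are hypothetical):
- #   tokenizer.create_token_type_ids_from_sequences([7, 8], [9]) -> [0, 0, 0, 0, 1, 1]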
-
- def get_special_tokens_mask(self,
- token_ids_0,
- token_ids_1=None,
- already_has_special_tokens=False):
- """
- Builds a mask identifying the positions of special tokens in the sequence(s). This method is called when adding
- special tokens using the tokenizer ``encode`` methods.
-
- Args:
- token_ids_0 (List[int]): List of ids of the first sequence.
- token_ids_1 (List[int], optional): List of ids of the second sequence.
- already_has_special_tokens (bool, optional): Whether or not the token list is already
- formatted with special tokens for the model. Defaults to False.
-
- Returns:
- results (List[int]): The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
- """
-
- if already_has_special_tokens:
- if token_ids_1 is not None:
- raise ValueError(
- "You should not supply a second sequence if the provided sequence of "
- "ids is already formatted with special tokens for the model."
- )
- return list(
- map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0,
- token_ids_0))
-
- if token_ids_1 is not None:
- return [1] + ([0] * len(token_ids_0)) + [1] + (
- [0] * len(token_ids_1)) + [1]
- return [1] + ([0] * len(token_ids_0)) + [1]
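-
- # Illustrative sketch of the mask above (token ids are hypothetical):
- #   tokenizer.get_special_tokens_mask([7, 8], [9]) -> [1, 0, 0, 1, 0, 1]
- #   tokenizer.get_special_tokens_mask([7, 8]) -> [1, 0, 0, 1]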