|
- import os
- import numpy as np
- from mindnlp.configs import DEFAULT_ROOT
- from mindnlp.utils.download import cache_file
- from tokenizers import Tokenizer, models
-
# Remote locations of the pre-trained ``tokenizer.json`` for each GPT-2 variant.
# All checkpoints share one URL pattern on the HuggingFace hub.
URL = {
    size: f"https://huggingface.co/{size}/resolve/main/tokenizer.json"
    for size in ("gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "distilgpt2")
}
-
class GPT2Tokenizer():
    """GPT-2 tokenizer backed by a HuggingFace ``tokenizers`` fast tokenizer.

    Wraps a pre-trained ``tokenizer.json`` (BPE) file and exposes
    ``encode``/``decode`` plus an eager ``__call__`` entry point that
    accepts either a plain string or a NumPy string array.
    """

    def __init__(self, tokenizer_file=None):
        """Build a tokenizer.

        Args:
            tokenizer_file: Path to a serialized ``tokenizer.json``. When
                ``None`` the instance is created uninitialized (as done by
                ``from_pretrained`` before the file is downloaded... here the
                attribute is simply left as ``None``).
        """
        # Fix: always define the attribute so a tokenizer built without a
        # file fails with a clear error later instead of AttributeError.
        self._tokenizer = None
        # Fix: compare against None with `is not None` (PEP 8).
        # Fix: `Tokenizer.from_file` is a static factory; the original built
        # a throwaway `Tokenizer(models.BPE())` instance just to call it.
        if tokenizer_file is not None:
            self._tokenizer = Tokenizer.from_file(tokenizer_file)

    def __call__(self, text_input):
        """Tokenize *text_input* eagerly.

        Accepts a Python ``str`` or a NumPy string array; anything else
        raises ``TypeError`` (same contract as the original).
        """
        if isinstance(text_input, str):
            text_input = np.array(text_input)
        elif not isinstance(text_input, np.ndarray):
            raise TypeError(
                f"Input should be a text line in 1-D NumPy format, got {type(text_input)}.")
        # Fix: the original called super().__call__(), but this class has no
        # base class implementing __call__, so every valid call raised
        # AttributeError. Delegate to encode() instead.
        if text_input.ndim == 0:
            # Unwrap a 0-d string array back to a plain Python str,
            # which is what the underlying fast tokenizer expects.
            text_input = text_input.item()
        return self.encode(text_input)

    @classmethod
    def from_pretrained(cls, size: str):
        """Download (with local caching) the tokenizer file for *size*
        and return a ready-to-use ``GPT2Tokenizer``.

        Args:
            size: One of the keys of ``URL`` (e.g. ``"gpt2"``,
                ``"gpt2-medium"``). Raises ``KeyError`` for unknown sizes.
        """
        cache_dir = os.path.join(DEFAULT_ROOT, "tokenizers", size)
        # cache_file returns (path, ...); only the local path is needed here.
        path, _ = cache_file(None, url=URL[size], cache_dir=cache_dir)
        return cls(tokenizer_file=str(path))

    def encode(self, text_input):
        """Encode *text_input* with the underlying fast tokenizer and
        return its ``Encoding`` result."""
        return self._tokenizer.encode(text_input)

    def decode(self, ids: list):
        """Convert a list of token ids back into a string.

        Fix: the original called ``self.decode(ids)`` — unconditional
        infinite recursion. Delegate to the wrapped tokenizer instead.
        """
        return self._tokenizer.decode(ids)
|