# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Tokenizer."""
import os
from collections import defaultdict
from functools import partial

import sacremoses
import subword_nmt.apply_bpe


class Tokenizer:
    """
    Tokenizer that applies Moses tokenization followed by BPE segmentation
    and converts tokens to and from vocabulary indexes.

    Args:
        vocab_address: path to the vocabulary file (one token per line).
        bpe_code_address: path to the file with BPE codes.
        src_en: source language code for the Moses tokenizer (default 'en').
        tgt_fr: target language code for the Moses detokenizer (default 'fr').
        vocab_pad: pads the vocabulary to a multiple of 'vocab_pad' tokens.
        isolator: BPE subword separator to strip during detokenization.
    """

    def __init__(self, vocab_address=None, bpe_code_address=None,
                 src_en='en', tgt_fr='fr', vocab_pad=8, isolator='@@'):
        self.padding_index = 0
        self.unk_index = 1
        self.bos_index = 2
        self.eos_index = 3
        self.pad_word = '<pad>'
        self.unk_word = '<unk>'
        self.bos_word = '<s>'
        self.eos_word = '</s>'
        self.isolator = isolator
        self.init_bpe(bpe_code_address)
        self.vocab_establish(vocab_address, vocab_pad)
        # Moses tokenizer for the source language (English by default) and
        # detokenizer for the target language (French by default).
        self.sacremoses_tokenizer = sacremoses.MosesTokenizer(src_en)
        self.sacremoses_detokenizer = sacremoses.MosesDetokenizer(tgt_fr)

    def init_bpe(self, bpe_code_address):
        """Initialize the BPE segmenter from the codes file."""
        self.bpe = None
        if (bpe_code_address is not None) and os.path.exists(bpe_code_address):
            with open(bpe_code_address, 'r', encoding='utf-8') as f1:
                self.bpe = subword_nmt.apply_bpe.BPE(f1)

    def vocab_establish(self, vocab_address, vocab_pad):
        """Establish the vocabulary and the token<->index mappings."""
        if (vocab_address is None) or (not os.path.exists(vocab_address)):
            return
        vocab_words = [self.pad_word, self.unk_word, self.bos_word, self.eos_word]
        with open(vocab_address, encoding='utf-8') as f1:
            for line in f1:
                vocab_words.append(line.strip())
        vocab_size = len(vocab_words)
        padded_vocab_size = (vocab_size + vocab_pad - 1) // vocab_pad * vocab_pad
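        # Worked example of the ceil-division above (figures are illustrative,
        # not taken from any real vocabulary): with vocab_size = 33003 and
        # vocab_pad = 8, (33003 + 8 - 1) // 8 * 8 = 33008, so the loop below
        # appends five 'filled' placeholder tokens.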
        for idx in range(padded_vocab_size - vocab_size):
            fil_token = f'filled{idx:04d}'
            vocab_words.append(fil_token)
        self.vocab_size = len(vocab_words)
        # Unknown tokens default to the <unk> index.
        self.tok2idx = defaultdict(partial(int, self.unk_index))
        for idx, token in enumerate(vocab_words):
            self.tok2idx[token] = idx
        # Unknown indexes default to ','.
        self.idx2tok = defaultdict(partial(str, ','))
        for token, idx in self.tok2idx.items():
            self.idx2tok[idx] = token

    def tokenize(self, sentence):
        """Tokenize a sentence into a list of vocabulary indexes."""
        # Moses tokenization, then BPE segmentation, then index lookup,
        # framed by the BOS and EOS indexes.
        tokenized = self.sacremoses_tokenizer.tokenize(sentence, return_str=True)
        bpe = self.bpe.process_line(tokenized)
        sentence = bpe.strip().split()
        inputs = [self.tok2idx[i] for i in sentence]
        inputs = [self.bos_index] + inputs + [self.eos_index]
        return inputs

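    # Illustrative note for detokenize below: BPE marks subword splits with
    # the isolator, e.g. the (made-up) token sequence 'econ@@ omy' joins back
    # to 'economy' once the '@@ ' separators are removed.
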
    def detokenize(self, indexes, gap=' '):
        """Detokenize a single sentence and remove token isolator characters."""
        reconstruction_bpe = gap.join([self.idx2tok[idx] for idx in indexes])
        # Strip the BPE subword separators, then the special tokens.
        reconstruction_bpe = reconstruction_bpe.replace(self.isolator + ' ', '')
        reconstruction_bpe = reconstruction_bpe.replace(self.isolator, '')
        for special in (self.bos_word, self.eos_word, self.unk_word, self.pad_word):
            reconstruction_bpe = reconstruction_bpe.replace(special, '')
        reconstruction_bpe = reconstruction_bpe.strip()
        reconstruction_words = self.sacremoses_detokenizer.detokenize(reconstruction_bpe.split())
        return reconstruction_words
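

if __name__ == '__main__':
    # Minimal round-trip smoke test, a sketch only: 'vocab.bpe.32000' and
    # 'bpe.32000' are placeholder paths for the vocabulary and BPE-codes
    # files produced by the data-preparation step, not files shipped here.
    tokenizer = Tokenizer(vocab_address='vocab.bpe.32000',
                          bpe_code_address='bpe.32000')
    ids = tokenizer.tokenize('Hello world!')
    print(ids)
    print(tokenizer.detokenize(ids))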