PaddlePaddle
/
Paddle
mirror of https://github.com/PaddlePaddle/Paddle

 
			
#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

# Fixed seed for the RandomState in get_data_iter that draws the per-sentence
# seeds, so the generated fake dataset is identical on every run.
SEED = 2020


def build_fake_sentence(seed):
    """Generate a reproducible list of pseudo-random token ids.

    A private ``np.random.RandomState`` seeded with ``seed`` first draws a
    nominal sentence length in ``[5, 15)``; one fewer token than that length
    is then drawn, each id in ``[0, 1000)``.

    Args:
        seed: Seed for the private random generator.

    Returns:
        A list of token ids; the same ``seed`` always gives the same list.
    """
    rng = np.random.RandomState(seed)
    # The sentence holds one token fewer than the drawn length.
    num_tokens = rng.randint(5, 15) - 1
    sentence = []
    for _ in range(num_tokens):
        sentence.append(rng.randint(0, 1000))
    return sentence


def get_data_iter(batch_size, mode='train', cache_num=20):
    """Yield padded batches of randomly generated (source, target) sentences.

    1000 sentence pairs are produced one at a time from a RandomState seeded
    with ``SEED``. Pairs are buffered until ``batch_size * cache_num`` of them
    are available; the buffer is then cut into ``cache_num`` batches which are
    yielded in turn. Unless ``mode`` is ``'infer'`` the buffer is first sorted
    by source sentence length.

    Args:
        batch_size: Number of sentence pairs per yielded batch.
        mode: ``'train'`` buffers ``cache_num`` batches at a time; any other
            value buffers a single batch. ``'infer'`` additionally skips the
            length sort and also emits whatever is left in the buffer once
            all pairs are generated, even if the buffer is not full.
        cache_num: Number of batches buffered together (only used when
            ``mode`` is ``'train'``).

    Yields:
        Tuples ``(src_ids, src_mask, tar_ids, tar_mask)``. The ``*_ids``
        arrays are int64 matrices padded with the value 2; ``src_mask`` holds
        each source length and ``tar_mask`` each target length minus one,
        both as int32 vectors.
    """
    pair_rng = np.random.RandomState(SEED)
    total_pairs = 1000
    # Only training buffers several batches; other modes go batch by batch.
    num_cached = cache_num if mode == "train" else 1
    capacity = batch_size * num_cached

    def to_pad_np(data, source=False):
        """Pad ``data`` into an id matrix and build the per-row length vector."""
        rows = min(batch_size, len(data))
        widest = max((len(seq) for seq in data), default=0)

        # 2 is the padding value for every slot not covered by a sentence.
        ids = np.full((rows, widest), 2, dtype='int64')
        mask = np.zeros((rows), dtype='int32')

        # Target rows record their length minus one; source rows the length.
        trim = 0 if source else 1
        for row, seq in enumerate(data):
            ids[row, : len(seq)] = seq
            mask[row] = len(seq) - trim

        return ids, mask

    def emit_batches(pairs):
        """Split the buffered pairs into ``num_cached`` padded batches."""
        if mode == 'infer':
            ordered = pairs
        else:
            ordered = sorted(pairs, key=lambda pair: len(pair[0]))

        for idx in range(num_cached):
            # Slicing clamps at the end of the list, so a short final chunk
            # (or an empty one) is handled without explicit bounds checks.
            chunk = ordered[idx * batch_size : (idx + 1) * batch_size]
            sources = [pair[0] for pair in chunk]
            targets = [pair[1] for pair in chunk]
            src_ids, src_mask = to_pad_np(sources, source=True)
            tar_ids, tar_mask = to_pad_np(targets)
            yield (src_ids, src_mask, tar_ids, tar_mask)

    pending = []
    for _ in range(total_pairs):
        if len(pending) == capacity:
            yield from emit_batches(pending)
            pending = []

        # Source seed is drawn before the target seed; the order matters for
        # reproducing the same data stream.
        src_seed = pair_rng.randint(0, total_pairs)
        tar_seed = pair_rng.randint(0, total_pairs)
        src_sentence = build_fake_sentence(src_seed)
        tar_sentence = build_fake_sentence(tar_seed)
        pending.append((src_sentence, tar_sentence))

    # A full buffer is always flushed; a partial one only in 'infer' mode.
    if mode == 'infer' or len(pending) == capacity:
        yield from emit_batches(pending)


class Seq2SeqModelHyperParams:
    """Namespace of hyper-parameters for the seq2seq model.

    All values are plain class attributes, so they can be read directly from
    the class without creating an instance.
    """

    # --- model structure ---
    # Use the attention variant of the model when True.
    attention = False

    # --- optimization ---
    # Step size handed to the optimizer.
    learning_rate = 0.01

    # Number of stacked layers in both the encoder and the decoder.
    num_layers = 2

    # Hidden state width of both the encoder and the decoder.
    hidden_size = 8

    # Vocabulary sizes, batch size and number of epochs.
    src_vocab_size = 1000
    tar_vocab_size = 1000
    batch_size = 8
    max_epoch = 12

    # Upper bound on source and target sentence length.
    max_len = 30

    # Dropout probability (0.0 disables dropout).
    dropout = 0.0

    # Scale used when initializing parameters.
    init_scale = 0.1

    # Threshold for global-norm gradient clipping.
    max_grad_norm = 5.0

    # --- persistence ---
    # Locations where the base and the attention model are saved.

    base_model_path = "dy2stat/model/base_seq2seq"
    attn_model_path = "dy2stat/model/attn_seq2seq"

    # Checkpoint that is reloaded for inference.
    reload_model = "model/epoch_0.pdparams"

    # --- decoding ---
    # NOTE(review): presumably the beam width used by beam-search decoding;
    # confirm against the model code.
    beam_size = 4

    # NOTE(review): presumably a cap on decoded sequence length; confirm
    # against the model code.
    max_seq_len = 3