# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import parl
from parl import layers
from paddle import fluid
from parl.utils import logger


class MujocoAgent(parl.Agent):
    """Agent for PPO on Mujoco continuous-control tasks.

    Supports two policy losses: the clipped surrogate objective ('CLIP')
    and the adaptive KL-penalty objective ('KLPEN').
    """

    def __init__(self,
                 algorithm,
                 obs_dim,
                 act_dim,
                 kl_targ,
                 loss_type,
                 beta=1.0,
                 epsilon=0.2,
                 policy_learn_times=20,
                 value_learn_times=10,
                 value_batch_size=256):
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        assert loss_type == 'CLIP' or loss_type == 'KLPEN'
        self.loss_type = loss_type
        # parl.Agent.__init__ builds the fluid programs, so the attributes
        # used in build_program must be set before this call.
        super(MujocoAgent, self).__init__(algorithm)

        self.policy_learn_times = policy_learn_times
        # Adaptive KL penalty coefficient (only used with the 'KLPEN' loss)
        self.beta = beta
        self.kl_targ = kl_targ

        self.value_learn_times = value_learn_times
        self.value_batch_size = value_batch_size
        self.value_learn_buffer = None

    def build_program(self):
        """Build separate fluid programs for policy sampling, policy
        prediction, policy learning, value prediction and value learning."""
        self.policy_predict_program = fluid.Program()
        self.policy_sample_program = fluid.Program()
        self.policy_learn_program = fluid.Program()
        self.value_predict_program = fluid.Program()
        self.value_learn_program = fluid.Program()

        with fluid.program_guard(self.policy_sample_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            sampled_act = self.alg.sample(obs)
            self.policy_sample_output = [sampled_act]

        with fluid.program_guard(self.policy_predict_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            means = self.alg.predict(obs)
            self.policy_predict_output = [means]

        with fluid.program_guard(self.policy_learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            actions = layers.data(
                name='actions', shape=[self.act_dim], dtype='float32')
            advantages = layers.data(
                name='advantages', shape=[1], dtype='float32')
            if self.loss_type == 'KLPEN':
                beta = layers.data(name='beta', shape=[], dtype='float32')
                loss, kl = self.alg.policy_learn(obs, actions, advantages,
                                                 beta)
            else:
                loss, kl = self.alg.policy_learn(obs, actions, advantages)

            self.policy_learn_output = [loss, kl]

        with fluid.program_guard(self.value_predict_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            value = self.alg.value_predict(obs)
            self.value_predict_output = [value]

        with fluid.program_guard(self.value_learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            val = layers.data(name='val', shape=[], dtype='float32')
            value_loss = self.alg.value_learn(obs, val)
            self.value_learn_output = [value_loss]

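    # fluid's layers.data declares per-sample shapes only (the batch dimension
    # is implicit), so the runner methods below feed numpy float32 arrays keyed
    # by the names declared above, e.g. 'obs' of shape (batch_size, obs_dim)
    # and 'actions' of shape (batch_size, act_dim).
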
    def policy_sample(self, obs):
        feed = {'obs': obs}
        sampled_act = self.fluid_executor.run(
            self.policy_sample_program,
            feed=feed,
            fetch_list=self.policy_sample_output)[0]
        return sampled_act

    def policy_predict(self, obs):
        feed = {'obs': obs}
        means = self.fluid_executor.run(
            self.policy_predict_program,
            feed=feed,
            fetch_list=self.policy_predict_output)[0]
        return means

    def value_predict(self, obs):
        feed = {'obs': obs}
        value = self.fluid_executor.run(
            self.value_predict_program,
            feed=feed,
            fetch_list=self.value_predict_output)[0]
        return value

    def _batch_policy_learn(self, obs, actions, advantages):
        if self.loss_type == 'KLPEN':
            feed = {
                'obs': obs,
                'actions': actions,
                'advantages': advantages,
                'beta': self.beta
            }
        else:
            feed = {'obs': obs, 'actions': actions, 'advantages': advantages}
        [loss, kl] = self.fluid_executor.run(
            self.policy_learn_program,
            feed=feed,
            fetch_list=self.policy_learn_output)
        return loss, kl

    def _batch_value_learn(self, obs, val):
        feed = {'obs': obs, 'val': val}
        value_loss = self.fluid_executor.run(
            self.value_learn_program,
            feed=feed,
            fetch_list=self.value_learn_output)[0]
        return value_loss

    def policy_learn(self, obs, actions, advantages):
        """ Learn the policy:

        1. Sync the parameters of the policy model to the old policy model
        2. Fix the old policy model and update the policy model multiple times
        3. If the KLPEN loss is used, adjust the KL penalty coefficient beta
        """
        self.alg.sync_old_policy()

        all_loss, all_kl = [], []
        for _ in range(self.policy_learn_times):
            loss, kl = self._batch_policy_learn(obs, actions, advantages)
            all_loss.append(loss)
            all_kl.append(kl)

        if self.loss_type == 'KLPEN':
            # Adaptive KL penalty coefficient: scale beta up or down depending
            # on how far the last observed KL divergence is from the target.
            if kl > self.kl_targ * 2:
                self.beta = 1.5 * self.beta
            elif kl < self.kl_targ / 2:
                self.beta = self.beta / 1.5

        return np.mean(all_loss), np.mean(all_kl)

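    # Note on the adaptive KL penalty in policy_learn above. The numbers here
    # are illustrative only (kl_targ is supplied by the caller): with
    # kl_targ = 0.003, an observed kl = 0.01 exceeds 2 * kl_targ = 0.006, so
    # beta grows by a factor of 1.5 and the next round of updates penalizes
    # policy divergence more heavily; an observed kl = 0.001 falls below
    # kl_targ / 2 = 0.0015, so beta shrinks by the same factor. If kl stays
    # within [kl_targ / 2, 2 * kl_targ], beta is left unchanged.
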
    def value_learn(self, obs, value):
        """ Fit the value model to the current data batch together with the
        previous data batch.
        """
        data_size = obs.shape[0]

        if self.value_learn_buffer is None:
            obs_train, value_train = obs, value
        else:
            obs_train = np.concatenate([obs, self.value_learn_buffer[0]])
            value_train = np.concatenate([value, self.value_learn_buffer[1]])
        self.value_learn_buffer = (obs, value)

        all_loss = []
        for _ in range(self.value_learn_times):
            random_ids = np.arange(obs_train.shape[0])
            np.random.shuffle(random_ids)
            shuffle_obs_train = obs_train[random_ids]
            shuffle_value_train = value_train[random_ids]
            start = 0
            while start < data_size:
                end = start + self.value_batch_size
                value_loss = self._batch_value_learn(
                    shuffle_obs_train[start:end, :],
                    shuffle_value_train[start:end])
                all_loss.append(value_loss)
                start += self.value_batch_size
        return np.mean(all_loss)
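

# Usage sketch (illustrative only; the model class and the PPO constructor
# arguments below are assumptions, not taken from this file). A training
# script would typically wire the agent up roughly as follows:
#
#     model = MujocoModel(obs_dim, act_dim)          # hypothetical model class
#     alg = parl.algorithms.PPO(model, act_dim=act_dim)
#     agent = MujocoAgent(
#         alg, obs_dim, act_dim, kl_targ=0.003, loss_type='CLIP')
#
#     act = agent.policy_sample(obs_batch)           # act during rollouts
#     loss, kl = agent.policy_learn(obs_batch, act_batch, adv_batch)
#     value_loss = agent.value_learn(obs_batch, return_batch)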