# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import parl

import numpy as np

from MCTS import MCTS
from Arena import Arena
from parl.utils import logger
from utils import win_loss_draw
from alphazero_agent import create_agent


@parl.remote_class(wait=False)
class Actor(object):
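    """Remote actor for AlphaZero-style training on an xparl cluster.

    A minimal usage sketch (hedged: `Connect4Game`, `args` and the xparl
    cluster address are assumptions taken from the surrounding training
    scripts, not defined in this file)::

        parl.connect('localhost:8010')
        actor = Actor(Connect4Game(), args, seed=0)
        # with wait=False, remote method calls return future objects
        future = actor.self_play(current_weights, game_num=10)
        train_examples = future.get()
    """
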
    def __init__(self, game, args, seed):
        np.random.seed(seed)
        # restrict each remote actor to a single OpenMP thread so that
        # parallel actors do not oversubscribe the CPU cores
        os.environ['OMP_NUM_THREADS'] = "1"
        self.game = game
        self.args = args

        # neural network of previous generation
        self.previous_agent = create_agent(self.game)
        # neural network of current generation
        self.current_agent = create_agent(self.game)

        # MCTS of previous generation
        self.previous_mcts = MCTS(
            self.game, self.previous_agent, self.args, dirichlet_noise=True)
        # MCTS of current generation
        self.current_mcts = MCTS(
            self.game, self.current_agent, self.args, dirichlet_noise=True)

    def self_play(self, current_weights, game_num):
        """Collect training data by self-play.

        Args:
            current_weights (numpy.array): latest weights of the neural network
            game_num (int): number of self-play games to run

        Returns:
            train_examples (list): examples of the form (canonicalBoard, pi, v)
        """

        # update weights of the current neural network with the latest weights
        self.current_agent.set_weights(current_weights)

        train_examples = []
        for i in range(game_num):
            logger.info('self play iteration #{}'.format(i))
            # reset the MCTS search tree before each game
            self.current_mcts = MCTS(
                self.game, self.current_agent, self.args, dirichlet_noise=True)
            train_examples.extend(self._executeEpisode())
        return train_examples
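
    # Fan-out sketch for `self_play` (hedged): with wait=False each remote
    # call returns a future object immediately, so a master process could
    # gather data from several actors in parallel (`actors` and `weights`
    # are assumptions, not defined in this file):
    #
    #     futures = [actor.self_play(weights, game_num=10) for actor in actors]
    #     train_examples = []
    #     for future in futures:
    #         train_examples.extend(future.get())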

    def pitting(self, previous_weights, current_weights, games_num):
        """Pit the previous-generation agent against the current-generation agent.

        Args:
            previous_weights (numpy.array): weights of the previous-generation neural network
            current_weights (numpy.array): weights of the current-generation neural network
            games_num (int): number of games to play

        Returns:
            tuple of (number of games the previous agent won, number of games
            the current agent won, number of draws)
        """
        # update the weights of the previous and current neural networks
        self.previous_agent.set_weights(previous_weights)
        self.current_agent.set_weights(current_weights)

        # reset the MCTS search trees (no Dirichlet noise during evaluation)
        self.previous_mcts = MCTS(self.game, self.previous_agent, self.args)
        self.current_mcts = MCTS(self.game, self.current_agent, self.args)

        # temp=0 makes both players pick the most-visited action greedily
        arena = Arena(
            lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)),
            lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)),
            self.game)
        previous_wins, current_wins, draws = arena.playGames(games_num)

        return (previous_wins, current_wins, draws)
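
    # Usage sketch for `pitting` (hedged): a training loop would typically
    # accept the new weights only if the current agent wins often enough.
    # `args.updateThreshold` is an assumption modeled on common AlphaZero
    # reimplementations, not something defined in this file:
    #
    #     previous_wins, current_wins, draws = future.get()
    #     decided = previous_wins + current_wins
    #     if decided > 0 and current_wins / decided > args.updateThreshold:
    #         best_weights = current_weights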

    def evaluate_test_dataset(self, current_weights, test_dataset):
        """Evaluate the performance of the latest neural network.

        Args:
            current_weights (numpy.array): latest weights of the neural network
            test_dataset (list): test examples, each a dict with 'board',
                'player' and 'move_score' entries

        Returns:
            tuple of (number of perfect moves, number of good moves)
        """
        # update weights of the current neural network with the latest weights
        self.current_agent.set_weights(current_weights)

        perfect_move_count, good_move_count = 0, 0
        for i, data in enumerate(test_dataset):
            # reset the MCTS search tree for every test position
            self.current_mcts = MCTS(self.game, self.current_agent, self.args)

            x = self.game.getCanonicalForm(data['board'], data['player'])
            agent_move = int(
                np.argmax(self.current_mcts.getActionProb(x, temp=0)))

            moves = data['move_score']
            perfect_score = max(moves)
            perfect_moves = [
                a for a in range(len(moves)) if moves[a] == perfect_score
            ]

            # a perfect move achieves the best possible score; a good move
            # at least preserves the game-theoretic outcome (win/loss/draw)
            if agent_move in perfect_moves:
                perfect_move_count += 1
            if win_loss_draw(
                    moves[agent_move]) == win_loss_draw(perfect_score):
                good_move_count += 1
            if i % 100 == 0:
                logger.info(
                    'test dataset testing {}, perfect move count {}, good move count {}'
                    .format(i, perfect_move_count, good_move_count))

        return (perfect_move_count, good_move_count)
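
    # Data-format sketch for `evaluate_test_dataset` (hedged): judging from
    # the accesses above, each entry is expected to look roughly like the
    # following (for Connect4, `move_score` would hold one solver score per
    # column; the concrete values here are an illustration, not real data):
    #
    #     {'board': board, 'player': 1, 'move_score': [2, -1, 0, 5, 0, -1, 2]}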

    def _executeEpisode(self):
        """Execute one episode of self-play, starting with player 1.

        As the game goes on, each turn is added as a training example to
        trainExamples. The game is played until it ends. After the game
        ends, the outcome of the game is used to assign values to each
        example in trainExamples.

        It uses temp=1 if episodeStep < tempThresholdStep, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                pi is the MCTS-informed policy vector, v is +1 if
                the player eventually won the game, else -1.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            # temp=1 keeps exploration early in the game; temp=0 is greedy
            temp = int(episodeStep < self.args.tempThresholdStep)

            pi = self.current_mcts.getActionProb(canonicalBoard, temp=temp)
            # augment the data with board symmetries of the same position
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:  # board, pi
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            board, self.curPlayer = self.game.getNextState(
                board, self.curPlayer, action)

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                # flip the sign of the result for examples that were recorded
                # from the other player's perspective
                return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
                        for x in trainExamples]
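
    # Worked example of the sign flip above (illustration only): if player 1
    # makes the winning move, self.curPlayer has already been switched to -1
    # by getNextState, so r = getGameEnded(board, -1) = -1 (a loss from
    # player -1's perspective). Examples recorded by player -1 then get
    # v = r * (-1)**False = -1, while examples recorded by player 1 get
    # v = r * (-1)**True = +1, matching the docstring's definition of v.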