# PaddlePaddle / Paddle
# mirror of https://github.com/PaddlePaddle/Paddle

 
			
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import subprocess
import sys
import tempfile
import unittest


class TestCustomOpReluModelStaticMultiDevice(unittest.TestCase):
    """Compare a custom-op model run against a reference run on many devices.

    The test launches ``custom_op_multidevice_model_train.py`` through
    ``paddle.distributed.launch`` twice per phase (with and without
    ``--use_custom_op``) and requires the arrays each device saved to be
    exactly equal between the two runs.
    """

    # Array names read from every result archive and compared pairwise.
    _COMPARED_KEYS = ('losses', 'relu_out1_list', 'relu_out2_list')

    @staticmethod
    def _python():
        """Return the interpreter used to spawn child processes.

        Using the running interpreter keeps the children in the same
        environment as the test; ``"python"`` is only a fallback for the
        case where ``sys.executable`` is empty.
        """
        return sys.executable or "python"

    def _run_checked(self, cmds, what):
        """Run ``cmds`` and fail the test if the exit status is non-zero.

        Args:
            cmds: Argument list handed to ``subprocess.run``.
            what: Short label used as the prefix of the failure message.
        """
        p = subprocess.run(cmds)
        # assertEqual (unlike a bare ``assert``) is still checked under -O.
        self.assertEqual(p.returncode, 0, f"{what}: Failed: {p}")

    def _launch(self, use_custom_op, train_mode):
        """Run the model script via the distributed launcher.

        Args:
            use_custom_op: Append ``--use_custom_op`` when true.
            train_mode: Append ``--train_mode`` when true; otherwise the
                script is started without it (the eval phase of this test).
        """
        cmds = [
            self._python(),
            "-m",
            "paddle.distributed.launch",
            "--log_dir",
            self.fleet_log_dir.name,
            "custom_op_multidevice_model_train.py",
            "--output_dir",
            self.output_log_dir.name,
            "--model_dir",
            self.model_dir.name,
        ]
        if use_custom_op:
            cmds.append("--use_custom_op")
        if train_mode:
            cmds.append("--train_mode")
        self._run_checked(cmds, "Fleet train" if train_mode else "Fleet eval")

    def _device_count(self):
        """Return the CUDA or XPU device count, or 0 if built with neither."""
        # Imported lazily, as in the original test body, so paddle is only
        # loaded in this process once the first launches have finished.
        import paddle

        core = paddle.framework.core
        if core.is_compiled_with_cuda():
            return core.get_cuda_device_count()
        if core.is_compiled_with_xpu():
            return core.get_xpu_device_count()
        return 0

    def _assert_outputs_match(self, phase, device_count):
        """Require identical saved arrays for the custom and reference runs.

        For every index in ``range(device_count)`` this loads
        ``{phase}_{index}_True.npz`` and ``{phase}_{index}_False.npz`` from
        the output directory and compares each array in ``_COMPARED_KEYS``.

        Args:
            phase: File-name prefix, ``'train'`` or ``'eval'``.
            device_count: Number of per-device archive pairs to check.
        """
        import numpy as np

        out_dir = self.output_log_dir.name
        for device_id in range(device_count):
            custom_path = os.path.join(
                out_dir, f'{phase}_{device_id}_True.npz'
            )
            origin_path = os.path.join(
                out_dir, f'{phase}_{device_id}_False.npz'
            )
            # Archives keep a file handle open; close them deterministically
            # so the temporary directory can be removed afterwards.
            with np.load(custom_path) as custom, np.load(
                origin_path
            ) as origin:
                for key in self._COMPARED_KEYS:
                    np.testing.assert_array_equal(
                        custom[key],
                        origin[key],
                        err_msg=(
                            f"{phase} output '{key}' differs on "
                            f"device {device_id}"
                        ),
                    )

    def install_custom_op(self):
        """Install the custom op by running its setup script."""
        self._run_checked(
            [
                self._python(),
                "setup_for_static_multidevice_test.py",
                "install",
            ],
            "Install Custom Op",
        )

    def setUp(self):
        """Create the temporary directories and install the custom op."""
        # tearDown is skipped when setUp raises, so register the cleanups
        # up front; otherwise a failed install would leak the directories.
        self.fleet_log_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.fleet_log_dir.cleanup)
        self.model_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.model_dir.cleanup)
        self.output_log_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self.output_log_dir.cleanup)
        self.install_custom_op()

    def train(self, use_custom_op: bool = True):
        """Launch the model script with ``--train_mode``.

        Args:
            use_custom_op: Whether to pass ``--use_custom_op`` to the script.
        """
        self._launch(use_custom_op, train_mode=True)

    def eval(self, use_custom_op: bool = True):
        """Launch the model script without ``--train_mode``.

        Args:
            use_custom_op: Whether to pass ``--use_custom_op`` to the script.
        """
        self._launch(use_custom_op, train_mode=False)

    def tearDown(self):
        """Remove the temporary directories created in ``setUp``."""
        # The same cleanups are also registered with addCleanup;
        # TemporaryDirectory.cleanup() tolerates being called twice.
        self.fleet_log_dir.cleanup()
        self.model_dir.cleanup()
        self.output_log_dir.cleanup()

    def test_train_and_eval(self):
        """Train then evaluate both variants and compare per-device outputs."""
        self.train(use_custom_op=True)
        self.train(use_custom_op=False)

        count = self._device_count()
        self.assertGreater(
            count,
            1,
            "TestCustomOpReluModelStaticMultiDevice needs at least two devices",
        )

        self._assert_outputs_match('train', count)

        self.eval(use_custom_op=True)
        self.eval(use_custom_op=False)
        self._assert_outputs_match('eval', count)


# Run the test case with the unittest runner when executed as a script.
if __name__ == "__main__":
    unittest.main()