# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np
from simple_nets import simple_fc_net_with_inputs

import paddle
from paddle.base.dygraph.base import switch_to_static_graph
from paddle.device.cuda.graphs import CUDAGraph


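# CUDA Graph capture is an NVIDIA-specific feature. Note that Paddle's ROCm
# (HIP) builds also report paddle.is_compiled_with_cuda() as True, so ROCm
# has to be ruled out explicitly.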
def can_use_cuda_graph():
    return paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm()


def build_program(main, startup, batch_size, class_num):
    image_shape = [batch_size, 784]
    label_shape = [batch_size, 1]
    with paddle.static.program_guard(main, startup):
        image = paddle.static.data(
            name="image", shape=image_shape, dtype='float32'
        )
        label = paddle.static.data(
            name="label", shape=label_shape, dtype='int64'
        )
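        # Persistable variables survive across executor runs in the same
        # scope, so the test can write inputs into (and read the loss out of)
        # their underlying tensors directly. Presumably this also keeps their
        # device addresses fixed, which graph replay depends on because the
        # captured kernels bake in the pointers seen at capture time.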
        image.persistable = True
        label.persistable = True
        loss = simple_fc_net_with_inputs(image, label, class_num)
        loss.persistable = True
        lr = paddle.optimizer.lr.PiecewiseDecay(
            boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04]
        )
        optimizer = paddle.optimizer.SGD(learning_rate=lr)
        optimizer.minimize(loss)
    return image, label, loss, lr


@unittest.skipIf(
    not paddle.is_compiled_with_cuda() or float(paddle.version.cuda()) < 11.0,
    "CUDA Graph requires CUDA >= 11.0",
)
class TestCUDAGraphInStaticMode(unittest.TestCase):
    def setUp(self):
        if can_use_cuda_graph():
            # The behavior of `FLAGS_use_stream_safe_cuda_allocator` in
            # static mode is inconsistent with that in dygraph mode: it must
            # be True in static mode but False in dygraph mode. The two cases
            # need separate unittests, because the allocator may only be
            # initialized once and the flag only takes effect during that
            # initialization.
            paddle.set_flags(
                {
                    'FLAGS_allocator_strategy': 'auto_growth',
                    'FLAGS_sync_nccl_allreduce': False,
                    'FLAGS_cudnn_deterministic': True,
                    'FLAGS_use_stream_safe_cuda_allocator': True,
                }
            )

    @switch_to_static_graph
    def test_cuda_graph_static_graph(self):
        if not can_use_cuda_graph():
            return

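        # Train with and without graph capture under identical seeds; with
        # FLAGS_cudnn_deterministic set, replaying the captured iteration is
        # expected to reproduce the ordinary run bit for bit.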
        seed = 100
        loss_cuda_graph = self.cuda_graph_static_graph_main(
            seed, use_cuda_graph=True
        )
        loss_no_cuda_graph = self.cuda_graph_static_graph_main(
            seed, use_cuda_graph=False
        )
        self.assertEqual(loss_cuda_graph, loss_no_cuda_graph)

    def cuda_graph_static_graph_main(self, seed, use_cuda_graph):
        batch_size = 1
        class_num = 10
        image_shape = [batch_size, 784]
        label_shape = [batch_size, 1]

        paddle.seed(seed)
        np.random.seed(seed)
        startup = paddle.static.Program()
        main = paddle.static.Program()

        image, label, loss, lr = build_program(
            main, startup, batch_size, class_num
        )

        place = paddle.CUDAPlace(0)
        exe = paddle.static.Executor(place)
        scope = paddle.static.Scope()
        with paddle.static.scope_guard(scope):
            exe.run(startup)
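            # Build-strategy settings needed for capture: the capture flag
            # opts the compiled program into CUDA graph mode, while a fixed
            # op run order and fused optimizer ops presumably keep the kernel
            # launch sequence identical from one iteration to the next, so a
            # single captured iteration can stand in for all later ones.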
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.allow_cuda_graph_capture = True
            build_strategy.fix_op_run_order = True
            build_strategy.fuse_all_optimizer_ops = True
            compiled_program = paddle.static.CompiledProgram(
                main, build_strategy=build_strategy
            )
            image_t = scope.var(image.name).get_tensor()
            label_t = scope.var(label.name).get_tensor()
            loss_t = scope.var(loss.name).get_tensor()
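            # PiecewiseDecay materializes the learning rate as a persistable
            # variable in the program (its name comes from the private
            # attribute `lr._var_name`); grab its tensor so it can be
            # overwritten in place during replay.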
            lr_var = main.global_block().var(lr._var_name)
            self.assertTrue(lr_var.persistable)
            lr_t = scope.var(lr_var.name).get_tensor()
            cuda_graph = None
            for batch_id in range(20):
                image_np = np.random.rand(*image_shape).astype('float32')
                label_np = np.random.randint(
                    low=0, high=class_num, size=label_shape, dtype='int64'
                )
                image_t.set(image_np, place)
                label_t.set(label_np, place)

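                # Batch 0 runs normally as a warm-up (memory pools, cuDNN
                # algorithm selection, and so on); batch 1 is executed once
                # between capture_begin() and capture_end() to record the
                # graph, and every later batch replays that recording.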
                if batch_id == 1 and use_cuda_graph:
                    cuda_graph = CUDAGraph(place, mode="global")
                    cuda_graph.capture_begin()
                    exe.run(compiled_program)
                    cuda_graph.capture_end()

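                # Replay re-launches only the captured GPU kernels. The
                # Python-side LR scheduler never runs inside the graph, so the
                # current rate has to be copied into the persistable lr tensor
                # by hand before every replay.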
                if cuda_graph:
                    lr_t.set(np.array([lr()], dtype='float32'), place)
                    cuda_graph.replay()
                else:
                    exe.run(compiled_program)
                lr.step()
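            # Destroy the captured graph and free its resources before the
            # scope goes away.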
            if cuda_graph:
                cuda_graph.reset()
        return np.array(loss_t)


if __name__ == "__main__":
    unittest.main()