# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from functools import partial

import numpy as np
from get_test_cover_info import (
    XPUOpTestWrapper,
    create_test_class,
    get_xpu_op_support_types,
)
from op_test import convert_float_to_uint16
from op_test_xpu import XPUOpTest

import paddle
from paddle import base


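# NumPy reference for a single AdamW update, used by the tests below.
# With decoupled weight decay the parameter is first shrunk,
#     param <- param * (1 - lr * coeff)
# and then the usual bias-corrected Adam step is applied:
#     m_t = beta1 * m + (1 - beta1) * grad
#     v_t = beta2 * v + (1 - beta2) * grad^2
#     param <- param - lr / (1 - beta1_pow) * m_t / (sqrt(v_t / (1 - beta2_pow)) + eps)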
def adamw_step(inputs, attributes):
    param = inputs['Param']
    grad = inputs['Grad']
    moment1 = inputs['Moment1']
    moment2 = inputs['Moment2']
    lr = inputs['LearningRate']
    beta1_pow = inputs['Beta1Pow']
    beta2_pow = inputs['Beta2Pow']

    epsilon = attributes['epsilon']

    if 'lr_ratio' in attributes:
        lr = lr * attributes['lr_ratio']

    if attributes["with_decay"]:
        coeff = attributes["coeff"]
        decay = 1.0 - lr * coeff
        param2 = param * decay
        param = param2.copy()

    if 'beta1' in attributes:
        beta1 = attributes['beta1']
    else:
        beta1 = inputs['Beta1Tensor'][0]
    if 'beta2' in attributes:
        beta2 = attributes['beta2']
    else:
        beta2 = inputs['Beta2Tensor'][0]

    moment1_out = beta1 * moment1 + (1 - beta1) * grad
    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
    denom = (np.sqrt(moment2_out) / np.sqrt(1.0 - beta2_pow)) + epsilon
    param_out = param + ((moment1_out / denom) * (-(lr / (1.0 - beta1_pow))))
    return param_out, moment1_out, moment2_out


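# Layer-wise learning-rate decay helper: the depth of a parameter is parsed
# from its name and the returned ratio is decay_rate ** (n_layers + 2 - depth).
# For example, with decay_rate=0.8 and n_layers=2, "linear_1.w_0" maps to
# 0.8**3 = 0.512 and "linear_2.w_0" maps to 0.8**2 = 0.64.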
def simple_lr_setting(param, decay_rate, n_layers):
    if "fc_0" in param.name or "linear_1" in param.name:
        depth = int(param.name.split("_")[2]) + 1
    elif "fc_1" in param.name or "linear_2" in param.name:
        depth = int(param.name.split("_")[2]) + 2
    else:
        depth = 0

    return decay_rate ** (n_layers + 2 - depth)


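# Op-level tests: run the raw "adamw" op through XPUOpTest and compare its
# outputs (ParamOut, Moment1Out, Moment2Out, Beta1PowOut, Beta2PowOut) with
# the NumPy reference above; bfloat16 inputs are packed via
# convert_float_to_uint16.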
class XPUTestAdamwOp1(XPUOpTestWrapper):
    def __init__(self):
        self.op_name = 'adamw'
        self.use_dynamic_create_class = False

    class TestAdamW(XPUOpTest):
        def setUp(self):
            # Test AdamW Op with supplied attributes
            self.op_type = "adamw"
            self.init_shape()
            self.dtype = self.in_type
            param = np.random.uniform(-1, 1, self.shape)
            grad = np.random.uniform(-1, 1, self.shape)
            moment1 = np.random.uniform(-1, 1, self.shape).astype("float32")
            # The second moment is positive
            moment2 = np.random.random(self.shape).astype("float32")

            learning_rate = 0.004
            beta1 = 0.78
            beta2 = 0.836
            epsilon = 1e-4
            beta1_pow = beta1**10
            beta2_pow = beta2**10
            if self.dtype != np.uint16:
                param = param.astype(self.dtype)
                grad = grad.astype(self.dtype)
            self.inputs = {
                'Param': param,
                'Grad': grad,
                'Moment1': moment1,
                'Moment2': moment2,
                'LearningRate': np.array([learning_rate]).astype("float32"),
                'Beta1Pow': np.array([beta1_pow]).astype("float32"),
                'Beta2Pow': np.array([beta2_pow]).astype("float32"),
            }

            self.attrs = {
                'epsilon': epsilon,
                'beta1': beta1,
                'beta2': beta2,
                "coeff": 0.5,
                "with_decay": True,
            }

            param_out, moment1_out, moment2_out = adamw_step(
                self.inputs, self.attrs
            )

            self.outputs = {
                'Moment1Out': moment1_out,
                'Moment2Out': moment2_out,
                'ParamOut': param_out,
                'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
                'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2,
            }

            if self.dtype == np.uint16:
                self.inputs['Param'] = convert_float_to_uint16(
                    self.inputs['Param']
                )
                self.inputs['Grad'] = convert_float_to_uint16(
                    self.inputs['Grad']
                )
                self.outputs['ParamOut'] = convert_float_to_uint16(param_out)

        def init_shape(self):
            self.shape = [102, 105]

        def test_check_output(self):
            paddle.enable_static()
            self.check_output_with_place(place=paddle.XPUPlace(0))

        def infer_dtype_from_inputs_outputs(self, inputs, outputs):
            self.__class__.dtype = self.dtype
            self.output_dtype = self.dtype

    class TestAdamW2(TestAdamW):
        def init_shape(self):
            self.shape = [
                1000,
            ]

    class TestAdamW3(TestAdamW):
        def init_shape(self):
            self.shape = [200, 3000]


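# API-level tests: exercise paddle.optimizer.AdamW in dygraph and static
# graph modes (parameter groups, per-group attributes, layer-wise lr_ratio,
# and argument validation).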
class XPUTestAdamwOp2(XPUOpTestWrapper):
    def __init__(self):
        self.op_name = 'adamw'
        self.use_dynamic_create_class = False

    class TestAdamWOp(unittest.TestCase):
        def test_adamw_op_dygraph(self):
            paddle.disable_static()
            value = np.arange(26).reshape(2, 13).astype(self.in_type_str)
            a = paddle.to_tensor(value)
            linear = paddle.nn.Linear(13, 5)
            adam = paddle.optimizer.AdamW(
                learning_rate=0.01,
                parameters=linear.parameters(),
                apply_decay_param_fun=lambda name: True,
                weight_decay=0.01,
            )

            for _ in range(2):
                out = linear(a)
                out.backward()
                adam.step()
                adam.clear_gradients()

        def test_adamw_op_coverage(self):
            paddle.disable_static()
            value = np.arange(26).reshape(2, 13).astype(self.in_type_str)
            a = paddle.to_tensor(value)
            linear = paddle.nn.Linear(13, 5)
            adam = paddle.optimizer.AdamW(
                learning_rate=0.0,
                parameters=linear.parameters(),
                apply_decay_param_fun=lambda name: True,
                weight_decay=0.01,
            )
            assert adam.__str__() is not None

        def test_adamw_op(self):
            paddle.enable_static()
            place = base.XPUPlace(0)
            shape = [2, 3, 8, 8]
            exe = base.Executor(place)
            train_prog = base.Program()
            startup = base.Program()
            with base.program_guard(train_prog, startup):
                with base.unique_name.guard():
                    data = paddle.static.data(name="data", shape=shape)
                    conv = paddle.static.nn.conv2d(data, 8, 3)
                    loss = paddle.mean(conv)

                    beta1 = paddle.static.create_global_var(
                        shape=[1],
                        value=0.85,
                        dtype=self.in_type_str,
                        persistable=True,
                    )
                    beta2 = paddle.static.create_global_var(
                        shape=[1],
                        value=0.95,
                        dtype=self.in_type_str,
                        persistable=True,
                    )
                    betas = [beta1, beta2]
                    opt = paddle.optimizer.AdamW(
                        learning_rate=1e-5,
                        beta1=beta1,
                        beta2=beta2,
                        weight_decay=0.01,
                        epsilon=1e-8,
                    )
                    opt.minimize(loss)

            exe.run(startup)
            data_np = np.random.random(shape).astype(self.in_type_str)
            rets = exe.run(
                train_prog, feed={"data": data_np}, fetch_list=[loss]
            )
            assert rets[0] is not None
            paddle.disable_static()

        def test_adamw_op_invalid_input(self):
            paddle.disable_static()
            linear = paddle.nn.Linear(10, 10)
            with self.assertRaises(ValueError):
                adam = paddle.optimizer.AdamW(
                    0.1, beta1=-1, parameters=linear.parameters()
                )
            with self.assertRaises(ValueError):
                adam = paddle.optimizer.AdamW(
                    0.1, beta2=-1, parameters=linear.parameters()
                )
            with self.assertRaises(ValueError):
                adam = paddle.optimizer.AdamW(
                    0.1, epsilon=-1, parameters=linear.parameters()
                )

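    # Variants of the dygraph test that pass parameter groups: the second
    # group overrides weight_decay, and TestAdamWOpGroupWithLR additionally
    # sets a per-group learning_rate together with a PiecewiseDecay scheduler.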
    class TestAdamWOpGroup(TestAdamWOp):
        def test_adamw_op_dygraph(self):
            paddle.disable_static()
            value = np.arange(26).reshape(2, 13).astype(self.in_type_str)
            a = paddle.to_tensor(value)
            linear_1 = paddle.nn.Linear(13, 5)
            linear_2 = paddle.nn.Linear(5, 3)
            adam = paddle.optimizer.AdamW(
                learning_rate=0.01,
                parameters=[
                    {'params': linear_1.parameters()},
                    {'params': linear_2.parameters(), 'weight_decay': 0.001},
                ],
                apply_decay_param_fun=lambda name: True,
                weight_decay=0.01,
            )

            for _ in range(2):
                out = linear_1(a)
                out = linear_2(out)
                out.backward()
                adam.step()
                adam.clear_gradients()

    class TestAdamWOpGroupWithLR(TestAdamWOp):
        def test_adamw_op_dygraph(self):
            paddle.disable_static()
            value = np.arange(26).reshape(2, 13).astype(self.in_type_str)
            a = paddle.to_tensor(value)
            linear_1 = paddle.nn.Linear(13, 5)
            linear_2 = paddle.nn.Linear(5, 3)
            adam = paddle.optimizer.AdamW(
                learning_rate=paddle.optimizer.lr.PiecewiseDecay(
                    boundaries=[3, 6], values=[0.1, 0.2, 0.3]
                ),
                parameters=[
                    {
                        'params': linear_1.parameters(),
                        'learning_rate': 0.1,
                    },
                    {
                        'params': linear_2.parameters(),
                        'weight_decay': 0.001,
                    },
                ],
                apply_decay_param_fun=lambda name: True,
                weight_decay=0.01,
            )

            for _ in range(2):
                out = linear_1(a)
                out = linear_2(out)
                out.backward()
                adam.step()
                adam.clear_gradients()

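    # Layer-wise learning-rate test: lr_ratio=simple_lr_fun scales the step
    # of every parameter by its depth-dependent ratio, and the resulting
    # weights are checked against the adamw_step NumPy reference after every
    # iteration.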
    class TestAdamWOpLayerwiseLR(TestAdamWOp):
        def setUp(self):
            np.random.seed(2022)
            paddle.seed(2022)

        def test_adamw_op_dygraph(self):
            paddle.disable_static()
            linear1 = paddle.nn.Linear(
                13, 8, bias_attr=paddle.nn.initializer.Constant(value=1.0)
            )
            linear2 = paddle.nn.Linear(
                8, 5, bias_attr=paddle.nn.initializer.Constant(value=1.0)
            )

            # Fix the layer names; simple_lr_setting infers depth from them.
            linear1.weight.name = "linear_1.w_0"
            linear1.bias.name = "linear_1.b_0"
            linear2.weight.name = "linear_2.w_0"
            linear2.bias.name = "linear_2.b_0"

            fc1_w = np.array(linear1.weight)
            fc1_w_mon1 = np.zeros_like(fc1_w)
            fc1_w_mon2 = np.zeros_like(fc1_w)
            fc1_b = np.array(linear1.bias)
            fc1_b_mon1 = np.zeros_like(fc1_b)
            fc1_b_mon2 = np.zeros_like(fc1_b)

            fc2_w = np.array(linear2.weight)
            fc2_w_mon1 = np.zeros_like(fc2_w)
            fc2_w_mon2 = np.zeros_like(fc2_w)
            fc2_b = np.array(linear2.bias)
            fc2_b_mon1 = np.zeros_like(fc2_b)
            fc2_b_mon2 = np.zeros_like(fc2_b)

            simple_lr_fun = partial(
                simple_lr_setting, decay_rate=0.8, n_layers=2
            )
            learning_rate = 0.001
            weight_decay = 0.01
            beta1 = 0.9
            beta2 = 0.999

            opt = paddle.optimizer.AdamW(
                learning_rate=learning_rate,
                parameters=[
                    {'params': linear1.parameters()},
                    {
                        'params': linear2.parameters(),
                    },
                ],
                apply_decay_param_fun=lambda name: True,
                weight_decay=weight_decay,
                lr_ratio=simple_lr_fun,
            )

            def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
                np_inputs = {
                    'Param': param,
                    'Grad': grad,
                    'Moment1': moment1,
                    'Moment2': moment2,
                    'LearningRate': np.array([learning_rate]).astype("float32"),
                    'Beta1Pow': np.array([beta1**t]).astype("float32"),
                    'Beta2Pow': np.array([beta2**t]).astype("float32"),
                }

                np_attrs = {
                    'epsilon': 1e-8,
                    'beta1': beta1,
                    'beta2': beta2,
                    "lr_ratio": lr_ratio,
                    "coeff": weight_decay,
                    "with_decay": True,
                }
                param_out, moment1_out, moment2_out = adamw_step(
                    np_inputs, np_attrs
                )
                return param_out, moment1_out, moment2_out

            for i in range(5):
                a = paddle.to_tensor(
                    np.random.uniform(-1, 1, (2, 13)).astype("float32")
                )
                a1 = linear1(a)
                out = linear2(a1)
                out = paddle.mean(out)
                out.backward()

                fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output(
                    fc1_w,
                    np.array(linear1.weight.grad),
                    fc1_w_mon1,
                    fc1_w_mon2,
                    simple_lr_fun(linear1.weight),
                    i + 1,
                )
                fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output(
                    fc1_b,
                    np.array(linear1.bias.grad),
                    fc1_b_mon1,
                    fc1_b_mon2,
                    simple_lr_fun(linear1.bias),
                    i + 1,
                )
                fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output(
                    fc2_w,
                    np.array(linear2.weight.grad),
                    fc2_w_mon1,
                    fc2_w_mon2,
                    simple_lr_fun(linear2.weight),
                    i + 1,
                )
                fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output(
                    fc2_b,
                    np.array(linear2.bias.grad),
                    fc2_b_mon1,
                    fc2_b_mon2,
                    simple_lr_fun(linear2.bias),
                    i + 1,
                )

                opt.step()
                opt.clear_gradients()

                np.testing.assert_allclose(
                    linear1.weight.numpy(), fc1_w, rtol=1e-5, atol=1e-5
                )
                np.testing.assert_allclose(
                    linear1.bias.numpy(), fc1_b, rtol=1e-5, atol=1e-5
                )
                np.testing.assert_allclose(
                    linear2.weight.numpy(), fc2_w, rtol=1e-5, atol=1e-5
                )
                np.testing.assert_allclose(
                    linear2.bias.numpy(), fc2_b, rtol=1e-5, atol=1e-5
                )

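        # Static-graph counterpart: the program is run for several steps and
        # the fetched parameters/gradients are compared with the NumPy
        # reference, again using lr_ratio=simple_lr_fun.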
        def test_adamw_op(self):
            paddle.enable_static()
            place = base.XPUPlace(0)

            learning_rate = 0.0001
            beta1 = 0.85
            beta2 = 0.95
            weight_decay = 0.01
            epsilon = 1e-8

            train_prog = base.Program()
            startup = base.Program()
            with base.program_guard(train_prog, startup):
                with base.unique_name.guard():
                    x = paddle.static.data(
                        name='x', shape=[None, 10], dtype='float32'
                    )
                    y = paddle.static.data(
                        name='y', shape=[None, 1], dtype='float32'
                    )

                    weight_attr1 = paddle.framework.ParamAttr(
                        name="linear_0.w_0"
                    )
                    bias_attr1 = paddle.framework.ParamAttr(
                        name="linear_0.b_0",
                        initializer=paddle.nn.initializer.Constant(value=1.0),
                    )
                    weight_attr2 = paddle.framework.ParamAttr(
                        name="linear_1.w_0"
                    )
                    bias_attr2 = paddle.framework.ParamAttr(
                        name="linear_1.b_0",
                        initializer=paddle.nn.initializer.Constant(value=1.0),
                    )
                    linear1 = paddle.nn.Linear(
                        10, 32, weight_attr=weight_attr1, bias_attr=bias_attr1
                    )
                    linear2 = paddle.nn.Linear(
                        32, 1, weight_attr=weight_attr2, bias_attr=bias_attr2
                    )

                    out = linear1(x)
                    out = linear2(out)

                    fc1_w_mon1 = np.zeros(linear1.weight.shape).astype(
                        "float32"
                    )
                    fc1_w_mon2 = np.zeros(linear1.weight.shape).astype(
                        "float32"
                    )
                    fc1_b_mon1 = np.zeros(linear1.bias.shape).astype("float32")
                    fc1_b_mon2 = np.zeros(linear1.bias.shape).astype("float32")
                    fc2_w_mon1 = np.zeros(linear2.weight.shape).astype(
                        "float32"
                    )
                    fc2_w_mon2 = np.zeros(linear2.weight.shape).astype(
                        "float32"
                    )
                    fc2_b_mon1 = np.zeros(linear2.bias.shape).astype("float32")
                    fc2_b_mon2 = np.zeros(linear2.bias.shape).astype("float32")

                    cost = paddle.nn.functional.square_error_cost(
                        input=out, label=y
                    )
                    avg_cost = paddle.mean(cost)

                    simple_lr_fun = partial(
                        simple_lr_setting, decay_rate=0.8, n_layers=2
                    )

                    opt = paddle.optimizer.AdamW(
                        learning_rate=learning_rate,
                        beta1=beta1,
                        beta2=beta2,
                        weight_decay=weight_decay,
                        epsilon=epsilon,
                        lr_ratio=simple_lr_fun,
                    )
                    opt.minimize(avg_cost)

            def get_numpy_output(param, grad, moment1, moment2, lr_ratio, t):
                np_inputs = {
                    'Param': param,
                    'Grad': grad,
                    'Moment1': moment1,
                    'Moment2': moment2,
                    'LearningRate': np.array([learning_rate]).astype("float32"),
                    'Beta1Pow': np.array([beta1**t]).astype("float32"),
                    'Beta2Pow': np.array([beta2**t]).astype("float32"),
                }

                np_attrs = {
                    'epsilon': epsilon,
                    'beta1': beta1,
                    'beta2': beta2,
                    "lr_ratio": lr_ratio,
                    "coeff": weight_decay,
                    "with_decay": True,
                }
                param_out, moment1_out, moment2_out = adamw_step(
                    np_inputs, np_attrs
                )
                return param_out, moment1_out, moment2_out

            fetch_list1 = [
                "linear_0.w_0",
                "linear_0.b_0",
                "linear_1.w_0",
                "linear_1.b_0",
            ]
            fetch_list2 = [
                "linear_0.w_0",
                "linear_0.w_0@GRAD",
                "linear_0.b_0",
                "linear_0.b_0@GRAD",
                "linear_1.w_0",
                "linear_1.w_0@GRAD",
                "linear_1.b_0",
                "linear_1.b_0@GRAD",
            ]

            exe = base.Executor(place)
            exe.run(startup)
            test_prog = train_prog.clone(for_test=True)

            for i in range(5):
                inputs = np.random.random(size=[8, 10]).astype('float32')
                outputs = np.random.random(size=[8, 1]).astype('float32')

                param = exe.run(
                    test_prog,
                    feed={"x": inputs, "y": outputs},
                    fetch_list=fetch_list1,
                )
                params_and_grads = exe.run(
                    train_prog,
                    feed={"x": inputs, "y": outputs},
                    fetch_list=fetch_list2,
                )

                fc1_w = param[0]
                fc1_w_grad = params_and_grads[1]
                fc1_b = param[1]
                fc1_b_grad = params_and_grads[3]
                fc2_w = param[2]
                fc2_w_grad = params_and_grads[5]
                fc2_b = param[3]
                fc2_b_grad = params_and_grads[7]

                fc1_w, fc1_w_mon1, fc1_w_mon2 = get_numpy_output(
                    fc1_w,
                    fc1_w_grad,
                    fc1_w_mon1,
                    fc1_w_mon2,
                    simple_lr_fun(linear1.weight),
                    i + 1,
                )
                fc1_b, fc1_b_mon1, fc1_b_mon2 = get_numpy_output(
                    fc1_b,
                    fc1_b_grad,
                    fc1_b_mon1,
                    fc1_b_mon2,
                    simple_lr_fun(linear1.bias),
                    i + 1,
                )
                fc2_w, fc2_w_mon1, fc2_w_mon2 = get_numpy_output(
                    fc2_w,
                    fc2_w_grad,
                    fc2_w_mon1,
                    fc2_w_mon2,
                    simple_lr_fun(linear2.weight),
                    i + 1,
                )
                fc2_b, fc2_b_mon1, fc2_b_mon2 = get_numpy_output(
                    fc2_b,
                    fc2_b_grad,
                    fc2_b_mon1,
                    fc2_b_mon2,
                    simple_lr_fun(linear2.bias),
                    i + 1,
                )

                np.testing.assert_allclose(
                    params_and_grads[0], fc1_w, rtol=1e-5, atol=1e-5
                )
                np.testing.assert_allclose(
                    params_and_grads[2], fc1_b, rtol=1e-5, atol=1e-5
                )
                np.testing.assert_allclose(
                    params_and_grads[4], fc2_w, rtol=1e-5, atol=1e-5
                )
                np.testing.assert_allclose(
                    params_and_grads[6], fc2_b, rtol=1e-5, atol=1e-5
                )

            paddle.disable_static()


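# Multi-precision test with a float32 "main" gradient: the bfloat16 parameter
# update (with a float32 master weight) produced by paddle._C_ops.adamw_ is
# compared against a pure float32 reference run of the same functional op.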
class TestAdamWOpMultiPrecisionWithMainGrad(unittest.TestCase):
    def _test_adamw_op_dygraph_place_amp_with_maingrad(
        self, place, shape, use_main_grad
    ):
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

        found_inf = None

        _weight_decay = 0.1
        with_decay = True
        _lazy_mode = False
        find_master = True

        _epsilon = 1e-8

        _beta1 = 0.9
        _beta2 = 0.99
        lr_ratio_ = 1.0

        lr_rate = 1e-8

        param = paddle.randn(shape).astype(paddle.bfloat16)
        master_weight = param.astype(paddle.float32)
        grad = paddle.randn(shape).astype(paddle.bfloat16)
        main_grad = grad.astype(paddle.float32)
        moment1 = paddle.randn(shape).astype(paddle.float32)
        moment2 = paddle.randn(shape).astype(paddle.float32).abs()
        lr = paddle.zeros([1]).astype(paddle.float32)
        lr[0] = lr_rate
        beta1_pow_acc = paddle.ones([1]).astype(paddle.float32)
        beta1_pow_acc[0] = _beta1**10
        beta2_pow_acc = paddle.ones([1]).astype(paddle.float32)
        beta2_pow_acc[0] = _beta2**10

        ref_param = param.astype(paddle.float32)
        ref_beta1_pow_acc = beta1_pow_acc.astype(paddle.float32)
        ref_beta2_pow_acc = beta2_pow_acc.astype(paddle.float32)
        ref_moment_1 = moment1.astype(paddle.float32)
        ref_moment_2 = moment2.astype(paddle.float32)

        # reference code
        _, _, _, _, _, _ = paddle._C_ops.adamw_(
            ref_param,
            main_grad,
            lr,
            ref_moment_1,
            ref_moment_2,
            ref_beta1_pow_acc,
            ref_beta2_pow_acc,
            master_weight,
            found_inf,
            _beta1,
            _beta2,
            _epsilon,
            lr_ratio_,
            _weight_decay,
            with_decay,
            _lazy_mode,
            1000,
            False,
            False,
        )

        if use_main_grad:
            _, _, _, _, _, _ = paddle._C_ops.adamw_(
                param,
                main_grad,
                lr,
                moment1,
                moment2,
                beta1_pow_acc,
                beta2_pow_acc,
                master_weight,
                found_inf,
                _beta1,
                _beta2,
                _epsilon,
                lr_ratio_,
                _weight_decay,
                with_decay,
                _lazy_mode,
                1000,
                find_master,
                False,
            )
            np.testing.assert_allclose(
                param.astype("float32").numpy(), ref_param.numpy(), rtol=1e-2
            )
            np.testing.assert_allclose(
                master_weight.numpy(), ref_param.numpy(), rtol=1e-6
            )
        else:
            _, _, _, _, _, _ = paddle._C_ops.adamw_(
                param,
                grad,
                lr,
                moment1,
                moment2,
                beta1_pow_acc,
                beta2_pow_acc,
                master_weight,
                found_inf,
                _beta1,
                _beta2,
                _epsilon,
                lr_ratio_,
                _weight_decay,
                with_decay,
                _lazy_mode,
                1000,
                find_master,
                False,
            )
            np.testing.assert_allclose(
                param.astype("float32").numpy(), ref_param.numpy(), rtol=1e-2
            )
            np.testing.assert_allclose(
                master_weight.numpy(), ref_param.numpy(), rtol=1e-6
            )

    def _get_places(self):
        places = []
        if paddle.is_compiled_with_xpu():
            places.append('xpu')
        return places

    def test_main(self):
        for _ in range(1):
            shape = paddle.randint(1, 1024, [2])
            for place in self._get_places():
                use_main_grad_list = [True, False]
                for use_main_grad in use_main_grad_list:
                    self._test_adamw_op_dygraph_place_amp_with_maingrad(
                        place, shape, use_main_grad
                    )


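# Optimizer-level multi-precision test: with use_amp the model is decorated
# for AMP O2 and the loss is scaled with GradScaler before stepping AdamW.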
class TestAdamWOpMultiPrecision(unittest.TestCase):
    def _test_adamw_op_dygraph_place_amp(self, place, use_amp=False):
        paddle.disable_static()
        paddle.seed(10)
        paddle.set_device(place)

        input = paddle.randn((5, 5))

        model = paddle.nn.Linear(5, 5)

        optimizer = paddle.optimizer.AdamW(
            parameters=[
                {
                    'params': model.parameters(),
                    'weight_decay': 0.001,
                    'beta1': 0.1,
                    'beta2': 0.99,
                }
            ],
            multi_precision=use_amp,
        )

        for idx in range(2):
            if place == 'xpu' and use_amp:
                model = paddle.amp.decorate(models=model, level='O2')
                scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

            if place == 'xpu' and use_amp:
                with paddle.amp.auto_cast(level='O2'):
                    output = model(input)
                    loss = paddle.mean(output)
                scaled = scaler.scale(loss)
                scaled.backward()
                scaler.step(optimizer)
                optimizer.clear_grad()
            else:
                output = model(input)
                loss = paddle.mean(output)
                loss.backward()
                optimizer.step()
                optimizer.clear_grad()

    def _get_places(self):
        places = ['cpu']
        if paddle.is_compiled_with_xpu():
            places.append('xpu')
        return places

    def test_main(self):
        for place in self._get_places():
            use_amp_list = [True, False]
            for use_amp in use_amp_list:
                self._test_adamw_op_dygraph_place_amp(place, use_amp)


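# Register the wrapped test classes for every dtype the XPU adamw kernel
# supports; the API-level tests in XPUTestAdamwOp2 are only registered for
# float32.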
support_types = get_xpu_op_support_types('adamw')
for stype in support_types:
    create_test_class(globals(), XPUTestAdamwOp1, stype)
    if stype == "float32":
        create_test_class(globals(), XPUTestAdamwOp2, stype)

if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()