
Commit ecf8b3e122: "loss*20" (pull/8/head)
Taoqiuyu, 2 years ago
7 changed files with 22 additions and 18 deletions
  1. ElmoTrainOne.py                  +4 -4
  2. elmo/model.py                    +1 -1
  3. elmo/modules/highway.py          +2 -0
  4. elmo/modules/loss.py             +4 -2
  5. elmo/nn/rnn_cells.py             +3 -3
  6. elmo/ops/sampled_softmax_loss.py +4 -4
  7. train.py                         +4 -4

ElmoTrainOne.py  (+4 -4)

@@ -80,9 +80,9 @@ class ElmoTrainOnestepWithLoss(nn.Cell):
         # grad reducer on grads
         grads = self.grad_reducer(grads)
         grads = self.hyper_map(F.partial(grad_scale, self.loss_scale*self.degree), grads)
-        grads = P.clip_by_global_norm(grads)
+        grads = P.clip_by_global_norm(grads, 10.0)

-        train_perplexity = P.Exp()(loss)
+        train_perplexity = P.Exp()(loss/20)

         if not self.gpu_target:
             self.get_status(init)
@@ -104,5 +104,5 @@ class ElmoTrainOnestepWithLoss(nn.Cell):
             succ = False
         else:
             succ = self.optimizer(grads)
-        # ret = (train_perplexity, cond, self.loss_scale)
-        return F.depend(train_perplexity, succ)
+        ret = (train_perplexity, cond, self.loss_scale)
+        return F.depend(ret, succ)
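
Two functional changes land in this file: clip_by_global_norm now gets an explicit 10.0 clip threshold, and train_perplexity is computed from the unscaled loss (the /20 undoes the *20 applied in elmo/modules/loss.py below). A minimal numpy sketch of what global-norm clipping does; this is an illustration of the technique, not the MindSpore op itself:

```python
import numpy as np

def clip_by_global_norm(grads, clip_norm=10.0):
    # Scale every gradient by clip_norm / global_norm when the joint
    # L2 norm across all tensors exceeds clip_norm.
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    if global_norm > clip_norm:
        grads = [g * (clip_norm / global_norm) for g in grads]
    return grads
```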

elmo/model.py  (+1 -1)

@@ -49,7 +49,7 @@ class LanguageModel(nn.Cell):
         self.loss = LossCell(projection_dim, n_tokens_vocab, sample_softmax, n_negative_samples_batch, training=training)
         self.cast = P.Cast()

-    @ms_function
+    #@ms_function
     def construct(self, inputs, inputs_backward, next_ids_forward, next_ids_backward):
         """
         args:


elmo/modules/highway.py  (+2 -0)

@@ -3,6 +3,7 @@ import mindspore.nn as nn
 import mindspore.ops as P
 import numpy as np
 from mindspore.common.initializer import Normal, Constant
+from mindspore import ms_function

 class HighWay(nn.Cell):
     """
@@ -36,6 +37,7 @@ class HighWay(nn.Cell):
         self._activation = nn.get_activation(activation)

+    @ms_function
     def construct(self, inputs):
         current_input = inputs
         for layer in self._layers:
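
This hunk graph-compiles HighWay.construct with ms_function, while the same decorator is commented out on the whole LanguageModel in elmo/model.py above, narrowing compilation to the smaller cell. A minimal sketch of the decorator's effect, assuming MindSpore 1.x where ms_function was the JIT entry point (it was renamed jit in 2.x):

```python
import numpy as np
import mindspore.ops as P
from mindspore import Tensor, ms_function

@ms_function  # traces and compiles the function into a static graph on first call
def double_sum(x, y):
    return P.Add()(x, y) * 2

out = double_sum(Tensor(np.ones(3, np.float32)), Tensor(np.ones(3, np.float32)))
```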


elmo/modules/loss.py  (+4 -2)

@@ -2,7 +2,7 @@ import mindspore
 import mindspore.nn as nn
 import mindspore.ops as P
 import numpy as np
-from mindspore import Tensor, Parameter
+from mindspore import Tensor, Parameter, ms_function
 from elmo.ops.sampled_softmax_loss import SampledSoftmaxLoss
 from mindspore.common.initializer import initializer, Normal, Zero
@@ -28,6 +28,8 @@ class LossCell(nn.Cell):
         self.sparse_softmax_cross_entropy_with_logits = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
         self.matmul = nn.MatMul(False, True)
         self.reduce_mean = P.ReduceMean()
+
+    #@ms_function
     def construct(self, lstm_outputs, next_ids):
         total_loss = []
         for lstm_output, next_token_id in zip(lstm_outputs, next_ids):
@@ -42,4 +44,4 @@ class LossCell(nn.Cell):
             loss = self.sparse_softmax_cross_entropy_with_logits(output_scores, next_token_id_flat)
             total_loss.append(self.reduce_mean(loss))
-        return 0.5 * (total_loss[0] + total_loss[1])
+        return 0.5 * (total_loss[0] + total_loss[1]) * 20
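
The *20 here matches the commit title and pairs with the /20 applied before P.Exp() in ElmoTrainOne.py, so the reported train_perplexity is unchanged while the loss handed to the loss-scale machinery is 20x larger. A quick sanity check of that invariant:

```python
import numpy as np

loss_fwd, loss_bwd = 3.7, 3.9               # per-direction mean losses
scaled = 0.5 * (loss_fwd + loss_bwd) * 20   # what LossCell now returns
assert np.isclose(np.exp(scaled / 20),      # perplexity in ElmoTrainOne.py
                  np.exp(0.5 * (loss_fwd + loss_bwd)))
```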

elmo/nn/rnn_cells.py  (+3 -3)

@@ -70,8 +70,8 @@ class RNNCellBase(nn.Cell):
         self.bias = bias

         hidden_size = proj_size if proj_size else cell_size
-        self.weight_ih = Parameter(glorot_uniform(num_chunks * cell_size, input_size))
-        self.weight_hh = Parameter(glorot_uniform(num_chunks * cell_size, hidden_size))
+        self.weight_ih = Parameter(glorot_uniform((num_chunks * cell_size, input_size)))
+        self.weight_hh = Parameter(glorot_uniform((num_chunks * cell_size, hidden_size)))
         if bias:
             self.bias_ih = Parameter(Tensor(np.zeros((num_chunks * cell_size)).astype(np.float32)))
             self.bias_hh = Parameter(Tensor(np.zeros((num_chunks * cell_size)).astype(np.float32)))
@@ -121,7 +121,7 @@ class LSTMCellWithProjection(RNNCellBase):
         self.proj_size = proj_size
         self.proj_clip = proj_clip
         if proj_size is not None:
-            self.proj_weight = Parameter(Tensor(np.random.randn(hidden_size, proj_size).astype(np.float32)))
+            self.proj_weight = Parameter(glorot_uniform((hidden_size, proj_size)))

         self.matmul = P.MatMul()
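
All three initializations now call glorot_uniform with a single shape tuple instead of separate dimension arguments, and the projection weight switches from plain np.random.randn to the same scheme. A minimal sketch of a Glorot/Xavier-uniform helper with that signature; the project's actual glorot_uniform may differ:

```python
import numpy as np
from mindspore import Tensor

def glorot_uniform(shape):
    # Glorot/Xavier uniform: limit = sqrt(6 / (fan_in + fan_out)).
    fan_out, fan_in = shape
    limit = np.sqrt(6.0 / (fan_in + fan_out))
    return Tensor(np.random.uniform(-limit, limit, shape).astype(np.float32))
```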



elmo/ops/sampled_softmax_loss.py  (+4 -4)

@@ -114,7 +114,7 @@ class SampledSoftmaxLoss(_Loss):
             sampled_values=self.sampled_values,
             subtract_log_q=True)

-        labels = ops.stop_gradient(labels)
+        # labels = ops.stop_gradient(labels)
         x = self._softmax_cross_entropy(logits, labels)
         return x
@@ -170,9 +170,9 @@ class SampledSoftmaxLoss(_Loss):
             sampled_values = self.sampler(labels)

         (sampled, true_expected_count, sampled_expected_count) = sampled_values
-        sampled = ops.stop_gradient(sampled)
-        true_expected_count = ops.stop_gradient(true_expected_count)
-        sampled_expected_count = ops.stop_gradient(sampled_expected_count)
+        # sampled = ops.stop_gradient(sampled)
+        # true_expected_count = ops.stop_gradient(true_expected_count)
+        # sampled_expected_count = ops.stop_gradient(sampled_expected_count)

         if not sampled.dtype == mstype.int32:
             sampled = self.cast(sampled, mstype.int32)
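
Commenting out these stop_gradient calls lets gradients flow back through the sampler outputs and the labels, which were previously treated as constants by autodiff. A minimal sketch of what stop_gradient blocks, assuming the functional ops.stop_gradient and ms.grad available in recent MindSpore releases:

```python
import mindspore as ms
import mindspore.ops as ops

def f(x):
    y = x * x
    y = ops.stop_gradient(y)  # y is treated as a constant by autodiff
    return y + x

# Gradient is 1.0 (from the `+ x` term only); without stop_gradient
# it would be 2x + 1 = 7.0 at x = 3.
print(ms.grad(f)(ms.Tensor(3.0, ms.float32)))
```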


train.py  (+4 -4)

@@ -55,17 +55,17 @@ def train():
     dataset = create_elmo_dataset(batch_size=options['batch_size'], data_file_path=args.data_url)

     steps_per_epoch = dataset.get_dataset_size()
-    callback_size = opt.sink_size
-    actual_epoch_num = int(opt.epoch_num * steps_per_epoch / callback_size)
+    #callback_size = opt.sink_size
+    #actual_epoch_num = int(args.epoch_num * steps_per_epoch / callback_size)

     config_ck = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
-    ckpoint_cb = ModelCheckpoint(prefix="elmo", directory=opt.train_url, config=config_ck)
+    ckpoint_cb = ModelCheckpoint(prefix="elmo", directory=args.train_url, config=config_ck)

     callback = [LossCallBack(steps_per_epoch), TimeMonitor(steps_per_epoch), ckpoint_cb]
     update_scale_cell = nn.DynamicLossScaleUpdateCell(loss_scale_value=2**12, scale_factor=2, scale_window=1000)
     train_one_step = ElmoTrainOnestepWithLoss(lm, opt, update_scale_cell)
     model = Model(train_one_step)
-    model.train(actual_epoch_num, dataset, callbacks=callback)
+    model.train(args.epoch_num, dataset, callbacks=callback)

 if __name__=='__main__':
     train()
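
The loop now runs for args.epoch_num epochs directly, dropping the sink-size-derived actual_epoch_num, and the checkpoint directory comes from args.train_url rather than the optimizer variable opt. The DynamicLossScaleUpdateCell configured above follows the usual dynamic policy; a plain-Python sketch of that behavior (start at 2**12, halve on overflow, double after 1000 clean steps), not the MindSpore cell itself:

```python
class DynamicLossScale:
    def __init__(self, init_scale=2**12, scale_factor=2, scale_window=1000):
        self.scale = float(init_scale)
        self.factor = scale_factor
        self.window = scale_window
        self.good_steps = 0

    def update(self, overflow: bool) -> float:
        if overflow:                       # skip the step, shrink the scale
            self.scale = max(self.scale / self.factor, 1.0)
            self.good_steps = 0
        else:                              # grow after a window of clean steps
            self.good_steps += 1
            if self.good_steps >= self.window:
                self.scale *= self.factor
                self.good_steps = 0
        return self.scale
```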
