|
- import os
- import time
- import argparse
- import moxing as mox
- import mindspore as ms
- import mindspore.nn as nn
- from mindspore.train import Model
- from mindspore import Tensor, context
- from mindspore.context import ParallelMode
- from dataset import create_mindrecord, create_EfficientDet_datasets
- from mindspore.communication.management import init
- from backbone import EfficientDetBackbone
- from utils import init_weights, get_lr_cosine
- from efficientdet.detloss import FocalLoss
- from mindspore.train.callback import LossMonitor, TimeMonitor
-
-
- ### Copy single dataset from obs to training image###
- def ObsToEnv(obs_data_url, data_dir):
- try:
- mox.file.copy_parallel(obs_data_url, data_dir)
- print("Successfully Download {} to {}".format(obs_data_url, data_dir))
- except Exception as e:
- print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
- # Set a cache file to determine whether the data has been copied to obs.
- # If this file exists during multi-card training, there is no need to copy the dataset multiple times.
- f = open("/cache/download_input.txt", 'w')
- f.close()
- try:
- if os.path.exists("/cache/download_input.txt"):
- print("download_input succeed")
- except Exception as e:
- print("download_input failed")
- return
-
-
- ### Copy the output to obs###
- def EnvToObs(train_dir, obs_train_url):
- try:
- mox.file.copy_parallel(train_dir, obs_train_url)
- print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
- except Exception as e:
- print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
- return
-
-
def DownloadFromQizhi(obs_data_url, data_dir):
    """Download the dataset and configure the MindSpore context per topology.

    Single-card (RANK_SIZE == 1): copy directly and set the graph-mode context.
    Multi-card: set per-device context, enable data-parallel, init communication,
    let only the first device of each 8-card server copy the data, and make the
    other ranks wait for the marker file written by ObsToEnv.

    Args:
        obs_data_url: OBS source path of the dataset.
        data_dir: local destination directory.
    """
    # Default to single-device when RANK_SIZE is unset (e.g. local debugging);
    # the original int(os.getenv(...)) raised TypeError in that case.
    device_num = int(os.getenv('RANK_SIZE', '1'))
    if device_num == 1:
        ObsToEnv(obs_data_url, data_dir)
        # NOTE(review): relies on the module-level `args` assigned in the
        # __main__ block before this function runs — confirm call order.
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    if device_num > 1:
        # Set device_id and init for multi-card training.
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
                            device_id=int(os.getenv('ASCEND_DEVICE_ID', '0')))
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True, parameter_broadcast=True)
        init()
        # Copying obs data does not need to be executed multiple times;
        # only the first card of each 8-card server copies the dataset.
        local_rank = int(os.getenv('RANK_ID', '0'))
        if local_rank % 8 == 0:
            ObsToEnv(obs_data_url, data_dir)
        # If the marker file does not exist the copy has not completed yet;
        # wait for the 0th card to finish copying.
        while not os.path.exists("/cache/download_input.txt"):
            time.sleep(1)
    return
-
-
def UploadToQizhi(train_dir, obs_train_url):
    """Upload training output to OBS, once per 8-card server.

    Single-card jobs upload directly; in multi-card jobs only ranks that are
    the first device of their server (rank % 8 == 0) upload, so the same
    output is not pushed eight times.

    Args:
        train_dir: local directory holding the training output.
        obs_train_url: OBS destination path.
    """
    # Safe defaults when the scheduler env vars are unset (original raised
    # TypeError on int(None) outside the platform environment).
    device_num = int(os.getenv('RANK_SIZE', '1'))
    local_rank = int(os.getenv('RANK_ID', '0'))
    if device_num == 1:
        EnvToObs(train_dir, obs_train_url)
    if device_num > 1:
        if local_rank % 8 == 0:
            EnvToObs(train_dir, obs_train_url)
    return
-
-
class WithLossCell(nn.Cell):
    """Bundles a detection backbone with its loss into a single cell.

    The forward pass runs the backbone, feeds its regression/classification
    outputs and anchors to the loss, and returns the summed scalar loss.
    """

    def __init__(self, backbone, loss):
        super(WithLossCell, self).__init__()
        self.backbone = backbone
        self.loss = loss

    def construct(self, x, y):
        # Backbone returns (features, regression, classification, anchors);
        # the raw features are not needed for the loss.
        _, regression, classification, anchors = self.backbone(x)
        cls_loss, reg_loss = self.loss(regression, classification, anchors, y)
        total_loss = cls_loss + reg_loss
        return total_loss
-
-
# Command-line interface: dataset/output locations as used on the Qizhi
# platform, plus device target and epoch count.
parser = argparse.ArgumentParser(description='MindSpore')
parser.add_argument('--data_url', default='/cache/data/',
                    help='path to training/inference dataset folder')
parser.add_argument('--train_url', default='/cache/output/',
                    help='output folder to save/load')
parser.add_argument('--device_target', type=str, default="Ascend",
                    choices=['Ascend', 'CPU'],
                    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')
parser.add_argument('--epoch_size', type=int, default=5,
                    help='Training epochs.')
-
-
- if __name__ == "__main__":
- args, unknown = parser.parse_known_args()
- coco_root = '/cache/data'
- train_dir = '/cache/output'
- if not os.path.exists(coco_root):
- os.makedirs(coco_root)
- if not os.path.exists(train_dir):
- os.makedirs(train_dir)
-
- DownloadFromQizhi(args.data_url, coco_root)
-
- if args.device_target != "Ascend":
- context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
- else:
- context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-
- mindrecord = r"efficientdet_ch/MindRecordImg"
- mindrecord_dir = os.path.join(coco_root, mindrecord)
-
- if not os.path.exists(mindrecord_dir):
- create_mindrecord(dataset="coco", prefix="EfficientDet.mindrecord", mindrecord=mindrecord, is_training=True,
- coco_path=coco_root)
- print("create mindrecord file done.")
-
- local_data_path = os.path.join(mindrecord_dir, "EfficientDet.mindrecord0")
- dataset = create_EfficientDet_datasets(local_data_path, repeat_num=1, num_parallel_workers=8, batch_size=16,
- compound_coef=0, device_num=2)
- # device_num=device_num, rank=rank_id
- dataset_size = dataset.get_dataset_size()
- print("Create dataset done!")
-
- net = EfficientDetBackbone(90, 0, False, True)
- net.set_train()
- net.to_float(ms.float32)
-
- init_weights(net) # 初始化权重
-
- loss = FocalLoss()
-
- net_withloss = WithLossCell(net, loss)
-
- epoch_size = args.epoch_size
- loss_scale = 1
-
- lr = Tensor(get_lr_cosine(init_lr=0.012, steps_per_epoch=dataset_size, warmup_epochs=50, max_epoch=epoch_size,
- t_max=epoch_size, eta_min=0.0))
-
- opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, momentum=.9, weight_decay=5e-4,
- loss_scale=loss_scale)
- net_with_grads = nn.TrainOneStepCell(net_withloss, optimizer=opt, sens=loss_scale)
-
- model = Model(net_with_grads, amp_level="O0")
-
- cb = [LossMonitor(), TimeMonitor()]
-
- print("============== Starting Training ==============")
- model.train(epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)
- print("============== End Training ==============")
|