|
- import os
- import time
- import argparse
- import moxing as mox
- import mindspore as ms
- import mindspore.nn as nn
- from mindspore.train import Model
- from mindspore import Tensor, context
- from mindspore.context import ParallelMode
- from dataset import create_mindrecord, create_EfficientDet_datasets
- from mindspore.communication.management import init
- from backbone import EfficientDetBackbone
- from utils import init_weights, get_lr_cosine
- from efficientdet.detloss import FocalLoss
- from mindspore.train.callback import LossMonitor, TimeMonitor
-
-
- ### Copy single dataset from obs to training image###
- def ObsToEnv(obs_data_url, data_dir):
- try:
- mox.file.copy_parallel(obs_data_url, data_dir)
- print("Successfully Download {} to {}".format(obs_data_url, data_dir))
- except Exception as e:
- print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
- # Set a cache file to determine whether the data has been copied to obs.
- # If this file exists during multi-card training, there is no need to copy the dataset multiple times.
- f = open("/cache/download_input.txt", 'w')
- f.close()
- try:
- if os.path.exists("/cache/download_input.txt"):
- print("download_input succeed")
- except Exception as e:
- print("download_input failed")
- return
-
-
- ### Copy the output to obs###
- def EnvToObs(train_dir, obs_train_url):
- try:
- mox.file.copy_parallel(train_dir, obs_train_url)
- print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
- except Exception as e:
- print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
- return
-
-
def DownloadFromQizhi(obs_data_url, data_dir):
    """Download the dataset and configure the MindSpore context per topology.

    Single-card (RANK_SIZE == 1): copy directly and set the graph-mode context.
    Multi-card: set per-device context, enable data-parallel, init communication,
    let only the first device of each 8-card server copy the data, and make the
    other ranks wait for the marker file written by ObsToEnv.

    Args:
        obs_data_url: OBS source path of the dataset.
        data_dir: local destination directory.
    """
    # Default to single-device when RANK_SIZE is unset (e.g. local debugging);
    # the original int(os.getenv(...)) raised TypeError in that case.
    device_num = int(os.getenv('RANK_SIZE', '1'))
    if device_num == 1:
        ObsToEnv(obs_data_url, data_dir)
        # NOTE(review): relies on the module-level `args` assigned in the
        # __main__ block before this function runs — confirm call order.
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
    if device_num > 1:
        # Set device_id and init for multi-card training.
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
                            device_id=int(os.getenv('ASCEND_DEVICE_ID', '0')))
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True, parameter_broadcast=True)
        init()
        # Copying obs data does not need to be executed multiple times;
        # only the first card of each 8-card server copies the dataset.
        local_rank = int(os.getenv('RANK_ID', '0'))
        if local_rank % 8 == 0:
            ObsToEnv(obs_data_url, data_dir)
        # If the marker file does not exist the copy has not completed yet;
        # wait for the 0th card to finish copying.
        while not os.path.exists("/cache/download_input.txt"):
            time.sleep(1)
    return
-
-
def UploadToQizhi(train_dir, obs_train_url):
    """Upload training output to OBS, once per 8-card server.

    Single-card jobs upload directly; in multi-card jobs only ranks that are
    the first device of their server (rank % 8 == 0) upload, so the same
    output is not pushed eight times.

    Args:
        train_dir: local directory holding the training output.
        obs_train_url: OBS destination path.
    """
    # Safe defaults when the scheduler env vars are unset (original raised
    # TypeError on int(None) outside the platform environment).
    device_num = int(os.getenv('RANK_SIZE', '1'))
    local_rank = int(os.getenv('RANK_ID', '0'))
    if device_num == 1:
        EnvToObs(train_dir, obs_train_url)
    if device_num > 1:
        if local_rank % 8 == 0:
            EnvToObs(train_dir, obs_train_url)
    return
-
-
class WithLossCell(nn.Cell):
    """Bundles a detection backbone with its loss into a single cell.

    The forward pass runs the backbone, feeds its regression/classification
    outputs and anchors to the loss, and returns the summed scalar loss.
    """

    def __init__(self, backbone, loss):
        super(WithLossCell, self).__init__()
        self.backbone = backbone
        self.loss = loss

    def construct(self, x, y):
        # Backbone returns (features, regression, classification, anchors);
        # the raw features are not needed for the loss.
        _, regression, classification, anchors = self.backbone(x)
        cls_loss, reg_loss = self.loss(regression, classification, anchors, y)
        total_loss = cls_loss + reg_loss
        return total_loss
-
-
# Command-line interface: dataset/output locations as used on the Qizhi
# platform, plus device target and epoch count.
parser = argparse.ArgumentParser(description='MindSpore')
parser.add_argument('--data_url', default='/cache/data/',
                    help='path to training/inference dataset folder')
parser.add_argument('--train_url', default='/cache/output/',
                    help='output folder to save/load')
parser.add_argument('--device_target', type=str, default="Ascend",
                    choices=['Ascend', 'CPU'],
                    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')
parser.add_argument('--epoch_size', type=int, default=5,
                    help='Training epochs.')
-
-
- if __name__ == "__main__":
- args, unknown = parser.parse_known_args()
- coco_root = '/cache/data'
- train_dir = '/cache/output'
- if not os.path.exists(coco_root):
- os.makedirs(coco_root)
- if not os.path.exists(train_dir):
- os.makedirs(train_dir)
-
- DownloadFromQizhi(args.data_url, coco_root)
-
- if args.device_target != "Ascend":
- context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
- else:
- context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
-
- mindrecord = r"efficientdet_ch/MindRecordImg"
- mindrecord_dir = os.path.join(coco_root, mindrecord)
-
- if not os.path.exists(mindrecord_dir):
- create_mindrecord(dataset="coco", prefix="EfficientDet.mindrecord", mindrecord=mindrecord, is_training=True,
- coco_path=coco_root)
- print("create mindrecord file done.")
-
- local_data_path = os.path.join(mindrecord_dir, "EfficientDet.mindrecord0")
- dataset = create_EfficientDet_datasets(local_data_path, repeat_num=1, num_parallel_workers=8, batch_size=16,
- compound_coef=0, device_num=2)
- # device_num=device_num, rank=rank_id
- dataset_size = dataset.get_dataset_size()
- print("Create dataset done!")
-
- net = EfficientDetBackbone(90, 0, False, True)
- net.set_train()
- net.to_float(ms.float32)
-
- init_weights(net) # 初始化权重
-
- loss = FocalLoss()
-
- net_withloss = WithLossCell(net, loss)
-
- epoch_size = args.epoch_size
- loss_scale = 1
-
- lr = Tensor(get_lr_cosine(init_lr=0.012, steps_per_epoch=dataset_size, warmup_epochs=50, max_epoch=epoch_size,
- t_max=epoch_size, eta_min=0.0))
-
- opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, momentum=.9, weight_decay=5e-4,
- loss_scale=loss_scale)
- net_with_grads = nn.TrainOneStepCell(net_withloss, optimizer=opt, sens=loss_scale)
-
- model = Model(net_with_grads, amp_level="O0")
-
- cb = [LossMonitor(), TimeMonitor()]
-
- print("============== Starting Training ==============")
- model.train(epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)
- print("============== End Training ==============")
|