qingtian
/
VoVNet19_dw_Imagenet

 
			
							# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
#################train vovnet example on imagenet2012########################
python train.py
"""
import os
import argparse
import moxing as mox
from mindspore import context
from mindspore.common import set_seed
from mindspore.nn.optim import Momentum
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train import Model
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore import Tensor
from mindspore.context import ParallelMode
from mindspore.nn.metrics import Accuracy
from mindspore.communication.management import init, get_rank
import mindspore.ops as ops
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
import time
from upload import UploadOutput

from src.VoVNet import _VoVNet19_dw_eSE as vovnet
from src.config import config
from src.dataset import create_dataset
from src.lr_generator import get_linear_lr as get_lr
from src.cross_entropy_smooth import CrossEntropySmooth


### Copy single dataset from obs to training image###
def ObsToEnv(obs_data_url, data_dir):
    try:     
        mox.file.copy_parallel(obs_data_url, data_dir)
        print("Successfully Download {} to {}".format(obs_data_url, data_dir))
    except Exception as e:
        print('moxing download {} to {} failed: '.format(obs_data_url, data_dir) + str(e))
    #Set a cache file to determine whether the data has been copied to obs. 
    #If this file exists during multi-card training, there is no need to copy the dataset multiple times.
    f = open("/cache/download_input.txt", 'w')    
    f.close()
    try:
        if os.path.exists("/cache/download_input.txt"):
            print("download_input succeed")
    except Exception as e:
        print("download_input failed")
    return 
### Copy the output to obs###
def EnvToObs(train_dir, obs_train_url):
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print("Successfully Upload {} to {}".format(train_dir,obs_train_url))
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(train_dir,obs_train_url) + str(e))
    return      
def DownloadFromQizhi(obs_data_url, data_dir):
    device_num = int(os.getenv('RANK_SIZE'))
    if device_num == 1:
        ObsToEnv(obs_data_url,data_dir)
        context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target)
    if device_num > 1:
        # set device_id and init for multi-card training
        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID')))
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True)
        init()
        #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data
        local_rank=int(os.getenv('RANK_ID'))
        if local_rank%8==0:
            ObsToEnv(obs_data_url,data_dir)
        #If the cache file does not exist, it means that the copy data has not been completed,
        #and Wait for 0th card to finish copying data
        while not os.path.exists("/cache/download_input.txt"):
            time.sleep(1)  
    return
def UploadToQizhi(train_dir, obs_train_url):
    device_num = int(os.getenv('RANK_SIZE'))
    local_rank=int(os.getenv('RANK_ID'))
    if device_num == 1:
        EnvToObs(train_dir, obs_train_url)
    if device_num > 1:
        if local_rank%8==0:
            EnvToObs(train_dir, obs_train_url)
    return

### --data_url,--train_url,--device_target,These 3 parameters must be defined first in a single dataset, 
### otherwise an error will be reported.
###There is no need to add these parameters to the running parameters of the Qizhi platform, 
###because they are predefined in the background, you only need to define them in your code.
parser = argparse.ArgumentParser(description='MindSpore Lenet Example')
parser.add_argument('--data_url',
                    help='path to training/inference dataset folder',
                    default= '/cache/data/')

parser.add_argument('--train_url',
                    help='output folder to save/load',
                    default= '/cache/output/')

parser.add_argument(
    '--device_target',
    type=str,
    default="Ascend",
    choices=['Ascend', 'CPU'],
    help='device where the code will be implemented (default: Ascend),if to use the CPU on the Qizhi platform:device_target=CPU')


set_seed(1)

if __name__ == '__main__':

    args, unknown = parser.parse_known_args()
    data_dir = '/cache/data'  
    train_dir = '/cache/output'
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    ###Initialize and copy data to training image
    DownloadFromQizhi(args.data_url, data_dir)
    ###The dataset path is used here:data_dir +"/train" 
      
    device_num = int(os.getenv('RANK_SIZE'))

    # create dataset
    dataset_train = create_dataset(os.path.join(data_dir, "imagenet/train"), do_train=True,
                                   batch_size=config.batch_size)
    step_size = dataset_train.get_dataset_size()

    net = vovnet(num_class=config.class_num)

    for _, cell in net.cells_and_names():
        if isinstance(cell, nn.Conv2d):
            cell.weight.set_data(weight_init.initializer(weight_init.XavierUniform(),
                                                         cell.weight.shape,
                                                         cell.weight.dtype))
        if isinstance(cell, nn.Dense):
            cell.weight.set_data(weight_init.initializer(weight_init.TruncatedNormal(),
                                                         cell.weight.shape,
                                                         cell.weight.dtype))

    # define loss function
    loss = CrossEntropySmooth(sparse=True, reduction="mean",
                              smooth_factor=config.label_smooth_factor,
                              num_classes=config.class_num)
    loss_scale = FixedLossScaleManager(loss_scale=config.loss_scale, drop_overflow_update=False)

    # init lr
    lr = get_lr(lr_init=config.lr_init,
                lr_end=config.lr_end,
                lr_max=config.lr_max,
                warmup_epochs=config.warmup_epochs,
                total_epochs=config.epoch_size,
                steps_per_epoch=step_size)
    lr = Tensor(lr)

    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)

    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]
    opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale)

    # define model
    if args.device_target != "Ascend":
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"accuracy": Accuracy()})
    else:
        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,metrics={'top_1_accuracy', 'top_5_accuracy'},amp_level="O2", keep_batchnorm_fp32=False)

    # define callbacks
    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()

    if device_num == 1:
        outputDirectory = train_dir + "/"
    if device_num > 1:
        outputDirectory = train_dir + "/" + str(get_rank()) + "/"

    if config.save_checkpoint:
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="vovnet", directory=outputDirectory, config=config_ck)

    print("============== Starting Training ==============")
    print('epoch_size is: ', config.epoch_size)
    uploadOutput = UploadOutput(train_dir,args.train_url)
    model.train(epoch=config.epoch_size, train_dataset=dataset_train, callbacks=[time_cb, ckpt_cb,loss_cb, uploadOutput])

    # ###Copy the trained output data from the local running environment back to obs,
    # ###and download it in the training task corresponding to the Qizhi platform
    # #This step is not required if UploadOutput is called    
    # UploadToQizhi(train_dir,args.train_url)