|
- import os
- import argparse
- import moxing as mox
- import time
-
def EnvToObs(train_dir, obs_train_url):
    """Upload a local path to OBS with moxing on a best-effort basis.

    Args:
        train_dir: Local source path passed to ``mox.file.copy_parallel``.
        obs_train_url: Destination URL passed to ``mox.file.copy_parallel``.

    Returns:
        True if the copy call finished without raising, False otherwise.
        Errors are printed instead of raised so that a failed upload never
        aborts the calling script.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
    except Exception as e:
        print('moxing upload {} to {} failed: '.format(train_dir,
                                                      obs_train_url) + str(e))
        return False
    print("Successfully Upload {} to {}".format(train_dir,
                                                obs_train_url))
    return True
-
-
parser = argparse.ArgumentParser(description='PaddleClas VGG16 Training Example')
# data_url and train_url are the fixed arguments used when training on
# ModelArts: the dataset path and the output-model path respectively.
parser.add_argument('--data_url',
                    help='path to training/inference dataset folder',
                    default='./label')

parser.add_argument('--train_url',
                    help='model folder to save/load',
                    default='./model')

parser.add_argument('--save_checkpoint_path',
                    type=str,
                    default="./ckpt",
                    # Adjacent literals instead of a backslash continuation
                    # inside the string, so the help text no longer depends
                    # on the indentation of the continuation line.
                    help='if is test, must provide '
                         'path where the trained ckpt file')
parser.add_argument(
    '--device_target',
    type=str,
    default="Ascend",
    choices=['Ascend', 'GPU', 'CPU'],
    help='device where the code will be implemented (default: Ascend)')
# parse_known_args() tolerates arguments this script does not declare instead
# of exiting with "unrecognized arguments".
# NOTE(review): presumably the training platform may append extra
# command-line arguments of its own -- confirm against the job launcher.
args, _unknown_args = parser.parse_known_args()
if _unknown_args:
    print("ignoring unrecognized arguments: {}".format(_unknown_args))
-
# Device count and this process's rank, read from the environment.  Fall back
# to a single-device setup (1 device, rank 0) when the variables are unset or
# empty, instead of crashing on int(None).
device_num = int(os.getenv('RANK_SIZE') or 1)
local_rank = int(os.getenv('RANK_ID') or 0)
# Local directory that the data_url contents are downloaded into.
data_dir = '/cache/label'
# Local path that is uploaded to train_url at the end of the script.
# NOTE(review): this path is built from the working directory at this point
# of the script, names VGG16 and ends in a '*' wildcard, while the script
# later changes into ./PaddleClas and trains with a ResNet152 config --
# verify that it really matches where the checkpoints are written and that
# the upload call accepts a wildcard.
train_dir = os.getcwd() + "/output/VGG16/best_model*"
-
def _run_commands(commands):
    """Run each shell command with os.system and report whether all succeeded.

    Every command is attempted even if an earlier one failed (best effort).
    A non-zero status is printed together with the command that produced it.

    Args:
        commands: Iterable of shell command strings.

    Returns:
        True if every command returned status 0, False otherwise.
    """
    all_ok = True
    for command in commands:
        status = os.system(command)
        if status != 0:
            print("command failed (status {}): {}".format(status, command))
            all_ok = False
    return all_ok


# NOTE(review): the text this block was recovered from had lost its
# indentation.  All four steps are placed under the rank guard because they
# depend on the chdir into ./PaddleClas performed in step 2 -- confirm this
# matches the intended nesting.
if device_num == 1 or (device_num > 1 and local_rank % 8 == 0):
    # Step 1: update paddle and paddle-custom-device-npu to the latest wheels.
    try:
        paddle_updated = _run_commands([
            "wget https://paddle-device.bj.bcebos.com/develop/cpu/paddlepaddle-0.0.0-cp37-cp37m-linux_aarch64.whl",
            "wget https://paddle-device.bj.bcebos.com/develop/npu/paddle_custom_npu-0.0.0-cp37-cp37m-linux_aarch64.whl",
            "pip install --force-reinstall --no-deps paddlepaddle-0.0.0-cp37-cp37m-linux_aarch64.whl",
            "pip install --force-reinstall paddle_custom_npu-0.0.0-cp37-cp37m-linux_aarch64.whl",
        ])
    except Exception as e:
        print(repr(e))
        paddle_updated = False
    if paddle_updated:
        print("更新最新paddle和paddle-custom成功")
    else:
        print("更新最新paddle和paddle-custom失败")

    # Step 2: download PaddleClas and set up its environment.
    try:
        # An `export` run through os.system only changes a short-lived child
        # shell, so the variable is set on this process's environment, which
        # the commands launched below inherit.
        gomp_lib = "/usr/local/ma/python3.7/lib/python3.7/site-packages/scikit_learn.libs/libgomp-d22c30c5.so.1.0.0"
        if os.path.exists(gomp_lib):
            os.environ["LD_PRELOAD"] = ":".join(
                part for part in (os.environ.get("LD_PRELOAD"), gomp_lib) if part)
        deps_ok = _run_commands([
            "pip install filelock -i https://mirror.baidu.com/pypi/simple",
            "git clone https://openi.pcl.ac.cn/PaddlePaddle/PaddleClas.git",
        ])
        # Everything after this point runs relative to the PaddleClas checkout.
        os.chdir('./PaddleClas')
        deps_ok = _run_commands([
            "pip install --upgrade -r requirements.txt -i https://mirror.baidu.com/pypi/simple",
            "python setup.py install",
        ]) and deps_ok
    except Exception as e:
        print(repr(e))
        deps_ok = False
    if deps_ok:
        print("依赖安装成功")
    else:
        print("依赖修复失败")

    # Step 3: link the dataset and download the label files.
    # The data is copied from data_url (OBS) into the fixed local data_dir.
    try:
        data_ok = _run_commands([
            "ln -s /cache/sfs/data/imagenet/ ./dataset/ILSVRC2012",
        ])
        try:
            os.makedirs(data_dir, exist_ok=True)
        except OSError as e:
            print("failed to create {}: {}".format(data_dir, e))
            data_ok = False

        try:
            mox.file.copy_parallel(args.data_url, data_dir)
            print("Successfully Download {} to {}".format(args.data_url,
                                                          data_dir))
        except Exception as e:
            print('moxing download {} to {} failed: '.format(
                args.data_url, data_dir) + str(e))
            data_ok = False

        # Empty marker file that step 4 waits for before it starts.
        with open("/cache/download_input.txt", 'w'):
            pass
        if os.path.exists("/cache/download_input.txt"):
            print("download_input succeed")
        else:
            print("download_input failed")
            data_ok = False
    except Exception as e:
        print(repr(e))
        data_ok = False
    if data_ok:
        print("数据加载成功")
    else:
        print("数据加载失败")

    # Step 4: run training, then evaluation and inference.
    try:
        # Block until the download marker written in step 3 exists.
        # NOTE(review): this waits forever if the marker was never created
        # (e.g. step 3 failed before writing it) -- consider a timeout.
        while not os.path.exists("/cache/download_input.txt"):
            time.sleep(1)
        run_ok = _run_commands([
            # Training.
            "python -m paddle.distributed.launch --devices=0,1,2,3 tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet152.yaml -o Global.device=npu -o Arch.pretrained=False -o Global.epochs=1 -o DataLoader.Eval.dataset.cls_label_path=/cache/label/val_list.txt -o DataLoader.Train.dataset.cls_label_path=/cache/label/train_list.txt",
            # Evaluation with the pretrained model.
            "python tools/eval.py -c ppcls/configs/ImageNet/ResNet/ResNet152.yaml -o Global.device=npu -o Arch.pretrained=True -o DataLoader.Eval.dataset.cls_label_path=/cache/label/val_list.txt -o DataLoader.Train.dataset.cls_label_path=/cache/label/train_list.txt",
            # Inference with the pretrained model.
            "python tools/infer.py -c ppcls/configs/ImageNet/ResNet/ResNet152.yaml -o Global.device=npu -o Infer.infer_imgs=dataset/ILSVRC2012/val/n01751748/ILSVRC2012_val_00000001.JPEG -o Arch.pretrained=True -o DataLoader.Eval.dataset.cls_label_path=/cache/label/val_list.txt -o DataLoader.Train.dataset.cls_label_path=/cache/label/train_list.txt",
        ])
    except Exception as e:
        print(repr(e))
        run_ok = False
    if run_ok:
        print("运行成功")
    else:
        print("运行失败")
-
######################## Copy the output model to OBS (fixed boilerplate) ########################
# Copy the trained model data from the local runtime environment back to OBS;
# the OpenI platform then offers it for download in the matching training task.
# NOTE(review): train_dir is passed through as-is -- verify it points at the
# directory the checkpoints are actually written to.
_is_upload_rank = device_num == 1 or (device_num > 1 and local_rank % 8 == 0)
if _is_upload_rank:
    EnvToObs(train_dir, args.train_url)
|