|
- import os
- import time
- import argparse
- import moxing as mox
-
# Launch-level options: the target device plus the remote locations of the
# dataset and of the model folder. Parsed with parse_known_args so that any
# other option is left for the next parser.
parser_openi = argparse.ArgumentParser(description='OpenI Launch')
parser_openi.add_argument(
    '--device_target',
    type=str,
    default='Ascend',
    help="type of device",
)
parser_openi.add_argument(
    '--data_url',
    type=str,
    default='s3:///open-data',
    help='path to dataset',
)
parser_openi.add_argument(
    '--train_url',
    type=str,
    default='s3:///open-data',
    help='model folder to save/load',
)
-
# Options of the launched script that this launcher reads itself: the config
# file and the local (in-image) dataset and checkpoint directories.
parser_script = argparse.ArgumentParser(
    description='Arguments Refined for Launched Script',
)
parser_script.add_argument('-c', '--config', type=str, default='')
parser_script.add_argument('--data_dir', type=str, default='/cache/data')
parser_script.add_argument('--ckpt_save_dir', type=str, default="/cache/output")
-
# Marker files used as a simple cross-process signal: the process that performs
# a copy creates the file, and the other processes poll for its existence.
_DOWNLOAD_DONE_FILE = "/cache/download-done.flag"  # dataset copy finished
_UPLOAD_DONE_FILE = "/cache/upload-done.flag"  # output copy finished
-
-
def get_device_num():
    """Return the number of devices taking part in the job.

    The value is read from the ``RANK_SIZE`` environment variable. When the
    variable is unset or empty the job is treated as single-device and 1 is
    returned, instead of failing with ``TypeError``/``ValueError``.

    Raises:
        ValueError: if ``RANK_SIZE`` is set to a non-integer value.
    """
    # `or 1` covers both a missing variable (None) and an empty string.
    return int(os.getenv('RANK_SIZE') or 1)
-
-
def get_local_rank():
    """Return the rank of the current process.

    The value is read from the ``RANK_ID`` environment variable. When the
    variable is unset or empty, rank 0 is returned, instead of failing with
    ``TypeError``/``ValueError``.

    Raises:
        ValueError: if ``RANK_ID`` is set to a non-integer value.
    """
    # `or 0` covers both a missing variable (None) and an empty string.
    return int(os.getenv('RANK_ID') or 0)
-
-
def obs2env(obs_data_url, data_dir):
    """Copy a single dataset from OBS into the training image.

    The copy is best-effort: a failure is printed, not raised. In either case
    the download-done marker file is created afterwards, so that during
    multi-card training the other cards can see the copy attempt is over and
    the dataset does not have to be copied multiple times.

    Args:
        obs_data_url: source location handed to ``mox.file.copy_parallel``.
        data_dir: destination directory inside the training image.
    """
    try:
        mox.file.copy_parallel(obs_data_url, data_dir)
        print(f"Successfully downloaded {obs_data_url} to {data_dir}")
    except Exception as err:
        print(f"Downloading {obs_data_url} to {data_dir} failed: " + str(err))

    # Touch the marker file; only its existence matters, not its content.
    with open(_DOWNLOAD_DONE_FILE, 'w'):
        pass
-
-
def env2obs(train_dir, obs_train_url):
    """Copy the training output from the training image to OBS.

    The copy is best-effort: a failure is printed, not raised. In either case
    the upload-done marker file is created afterwards, so that during
    multi-card training the other cards can see the copy attempt is over and
    the output does not have to be copied multiple times.

    Args:
        train_dir: local directory holding the output to upload.
        obs_train_url: destination location handed to ``mox.file.copy_parallel``.
    """
    try:
        mox.file.copy_parallel(train_dir, obs_train_url)
        print(f"Successfully uploaded {train_dir} to {obs_train_url}")
    except Exception as err:
        print(f"Uploading {train_dir} to {obs_train_url} failed: " + str(err))

    # Touch the marker file; only its existence matters, not its content.
    with open(_UPLOAD_DONE_FILE, 'w'):
        pass
-
-
def download(obs_data_url, data_dir):
    """Bring the dataset into the training image, copying it only once.

    With a single device the dataset is simply copied. With several devices
    only the ranks that are a multiple of 8 perform the copy; every rank then
    blocks until the download-done marker file exists.

    Args:
        obs_data_url: remote dataset location.
        data_dir: local destination directory.
    """
    device_num = get_device_num()
    if device_num <= 1:
        if device_num == 1:
            obs2env(obs_data_url, data_dir)
        return

    # NOTE(review): everything below is taken to be a multi-card-only step
    # (wait, grace period, marker removal) - confirm against the intended flow.
    rank = get_local_rank()
    copies_data = rank % 8 == 0

    # The data does not need to be copied by every card: one copier is enough.
    if copies_data:
        obs2env(obs_data_url, data_dir)

    # A missing marker file means the copy has not completed yet, so keep
    # polling until the copier has created it.
    while not os.path.exists(_DOWNLOAD_DONE_FILE):
        time.sleep(1)

    time.sleep(10)  # wait all process unblock
    print(f"Rank {rank} is alive!")

    # The copier clears the marker once the grace period above has elapsed.
    if copies_data:
        os.remove(_DOWNLOAD_DONE_FILE)
-
-
def upload(train_dir, obs_train_url):
    """Send the training output to OBS, copying it only once.

    With a single device the output is simply copied. With several devices
    only the ranks that are a multiple of 8 perform the copy; every rank then
    blocks until the upload-done marker file exists.

    Args:
        train_dir: local directory holding the output.
        obs_train_url: remote destination location.
    """
    device_num = get_device_num()
    if device_num <= 1:
        if device_num == 1:
            env2obs(train_dir, obs_train_url)
        return

    # NOTE(review): everything below is taken to be a multi-card-only step
    # (wait, grace period, marker removal) - confirm against the intended flow.
    rank = get_local_rank()
    copies_output = rank % 8 == 0

    if copies_output:
        env2obs(train_dir, obs_train_url)

    # Keep polling until the copier has created the marker file.
    while not os.path.exists(_UPLOAD_DONE_FILE):
        time.sleep(1)

    time.sleep(10)  # wait all process unblock
    print(f"Rank {rank} is alive!")

    # The copier clears the marker once the grace period above has elapsed.
    if copies_output:
        os.remove(_UPLOAD_DONE_FILE)
-
-
if __name__ == "__main__":
    # Imported here because only the launcher entry point needs them.
    import subprocess
    import sys

    work_dir = os.path.dirname(os.path.abspath(__file__))

    # Parse in two passes: launch-level options first, then the script options
    # this launcher rewrites. Whatever is left over is forwarded to train.py.
    args_openi, remaining = parser_openi.parse_known_args()
    args_script, remaining = parser_script.parse_known_args(remaining)
    # The config path is given relative to the directory holding this file.
    args_script.config = os.path.join(work_dir, args_script.config)

    # Only the process that copies the data creates the local directories.
    device_num = get_device_num()
    if device_num == 1 or (device_num > 1 and get_local_rank() % 8 == 0):
        os.makedirs(args_script.data_dir, exist_ok=True)
        os.makedirs(args_script.ckpt_save_dir, exist_ok=True)

    # Copy data to training image
    download(args_openi.data_url, args_script.data_dir)

    # Launch train script, once per model
    model_names = [
        'regnet_x_200mf', 'regnet_y_200mf',
        'regnet_x_400mf', 'regnet_y_400mf',
        'regnet_x_600mf', 'regnet_y_600mf',
        'regnet_x_800mf', 'regnet_y_800mf',
        'regnet_x_1_6gf', 'regnet_y_1_6gf',
        'regnet_x_3_2gf', 'regnet_y_3_2gf',
        'regnet_x_4_0gf', 'regnet_y_4_0gf',
        'regnet_x_6_4gf', 'regnet_y_6_4gf',
        'regnet_x_8_0gf', 'regnet_y_8_0gf',
        'regnet_x_12gf', 'regnet_y_12gf',
        'regnet_x_16gf', 'regnet_y_16gf',
        'regnet_x_32gf', 'regnet_y_32gf',
    ]
    train_script = os.path.join(work_dir, 'train.py')
    # Run train.py with the interpreter that runs this launcher; fall back to
    # a PATH lookup only if the interpreter path is unknown.
    python_exe = sys.executable or "python"

    for model_name in model_names:
        ckpt_save_dir = os.path.join(args_script.ckpt_save_dir, model_name)
        args_dict = {
            "config": args_script.config,
            "data_dir": os.path.join(args_script.data_dir, "imagenet"),
            "ckpt_save_dir": ckpt_save_dir,
            "model": model_name,
        }
        args_list = [f"--{k}={v}" for k, v in args_dict.items()]

        # An argument list (no shell) keeps forwarded arguments intact even
        # when they contain spaces or shell metacharacters.
        launch_cmd = [python_exe, train_script] + remaining + args_list
        print(f"Launch command: {' '.join(launch_cmd)}", flush=True)
        completed = subprocess.run(launch_cmd)
        if completed.returncode != 0:
            # Best-effort: report the failure but still upload whatever was
            # produced and carry on with the next model.
            print(f"Training of {model_name} exited with code {completed.returncode}")

        # Copy the trained output data from the local running environment back to obs,
        # and download it in the training task corresponding to the QiZhi platform
        # This step is not required if UploadOutput is called
        train_url = os.path.join(args_openi.train_url, model_name)
        upload(ckpt_save_dir, train_url)
|