|
# Install pcl_pangu at job start-up.  The import order below is deliberate:
# the pip install must run BEFORE the pcl_pangu imports succeed.
import json
import os
os.system('python -m pip install pcl_pangu -i https://pypi.python.org/simple/')
import argparse
from pcl_pangu.context import set_context
from pcl_pangu.model import alpha
import moxing as mox
import time
# Active working directory; filled in by WorkEnvironment() as a side effect.
WORKROOT = None
-
### Defines whether the task is a training environment or a debugging environment ###
def WorkEnvironment(environment):
    """Return the working root directory for the given environment.

    Args:
        environment: 'train' (cloud training job, /home/work/user-job-dir)
            or 'debug' (interactive session, /home/ma-user/work).

    Returns:
        The absolute workroot path for that environment.

    Raises:
        ValueError: if *environment* is neither 'train' nor 'debug'.
            (The original code left ``workroot`` unbound in that case and
            crashed with UnboundLocalError at the print below.)

    Side effect: updates the module-level WORKROOT global.
    """
    global WORKROOT
    if environment == 'train':
        workroot = '/home/work/user-job-dir'
    elif environment == 'debug':
        workroot = '/home/ma-user/work'
    else:
        raise ValueError(
            "environment must be 'train' or 'debug', got: {!r}".format(environment))
    print('current work mode:' + environment + ', workroot:' + workroot)
    WORKROOT = workroot
    return workroot
-
def MultiObsToEnv(multi_data_url, workroot):
    """Copy every OBS url in *multi_data_url* into *workroot* via moxing.

    Best-effort: a failed copy is logged and the remaining urls are still
    attempted.  Returns nothing.
    """
    for src_url in multi_data_url:
        try:
            mox.file.copy_parallel(src_url, workroot)
        except Exception as e:
            print('moxing download {} to {} failed: '.format(
                src_url, workroot) + str(e))
        else:
            print("Successfully Download {} to {}".format(src_url, workroot))
-
def setting_openi_cache_env_fromOBS(cache_ckpt_saving_dir='/cache/ckpts',
                                    cache_strategy_saving_dir='/cache/strategy_ckpt',
                                    cache_dataset_saving_dir='/cache/dataset',
                                    multi_data_url=None):
    """Download pretrained ckpts, strategy files and the dataset from OBS.

    Classifies every entry of the ``multi_data_url`` JSON list by its
    ``dataset_name`` ('mindrecord' substring -> dataset, 'strategy' substring
    -> strategy ckpt, anything else -> pretrained ckpt), copies each group
    into its local cache directory, canonicalises the embedding ``.npy``
    filenames, and finally writes a marker file that the waiting ranks poll.

    Args:
        cache_ckpt_saving_dir: local dir for pretrained checkpoint files.
        cache_strategy_saving_dir: local dir for parallel-strategy ckpts.
        cache_dataset_saving_dir: local dir for the mindrecord dataset.
        multi_data_url: JSON string — a list of dicts with 'dataset_name'
            and 'dataset_url' keys.  Defaults to the module-level
            ``multi_data_url`` global for backward compatibility (the
            original implementation read the global directly, which raised
            NameError when the function was called before __main__ set it).

    Raises:
        AssertionError: if more than one mindrecord dataset is supplied.
    """
    if multi_data_url is None:
        # Legacy behavior: fall back to the global set by the __main__ section.
        multi_data_url = globals()['multi_data_url']

    mindrecord_file_num = 0
    ckpt_url_list = []
    dataset_url_list = []
    strategy_url_list = []

    # makedirs(exist_ok=True) also creates missing parents such as /cache and
    # never fails on an already-existing directory (os.mkdir did neither).
    os.makedirs(cache_ckpt_saving_dir, exist_ok=True)
    os.makedirs(cache_dataset_saving_dir, exist_ok=True)
    os.makedirs(cache_strategy_saving_dir, exist_ok=True)

    multi_data_url_json = json.loads(multi_data_url)
    for item in multi_data_url_json:
        if 'mindrecord' in item['dataset_name']:
            mindrecord_file_num += 1
            assert mindrecord_file_num <= 1, "Multi Mindrecord dataset not support now! please raise issue in 'https://git.openi.org.cn/PCL-Platform.Intelligence/pcl_pangu'!"
            dataset_url_list.append(item['dataset_url'])
        elif 'strategy' in item['dataset_name']:
            strategy_url_list.append(item['dataset_url'])
        else:
            ckpt_url_list.append(item['dataset_url'])

    ## unzip or tar the pretrain_ckpt_files
    # step 1: copying obs_files to the local cache dirs
    MultiObsToEnv(ckpt_url_list, cache_ckpt_saving_dir)
    MultiObsToEnv(dataset_url_list, cache_dataset_saving_dir)
    MultiObsToEnv(strategy_url_list, cache_strategy_saving_dir)

    print(os.listdir(cache_ckpt_saving_dir))
    print(os.listdir(cache_dataset_saving_dir))
    print(os.listdir(cache_strategy_saving_dir))

    ####################### rename the embedding.npys ##########################
    # Any downloaded file whose name contains one of the canonical embedding
    # names is renamed to exactly that name.  os.replace instead of shelling
    # out to `mv`: no subprocess, and failures raise instead of being ignored.
    rename_npys_list = ['position_embedding.npy',
                        'top_query_embedding.npy',
                        'word_embedding.npy']
    for item in os.listdir(cache_ckpt_saving_dir):
        this_file = os.path.join(cache_ckpt_saving_dir, item)
        for rename_npy in rename_npys_list:
            if rename_npy in item:
                os.replace(this_file,
                           os.path.join(cache_ckpt_saving_dir, rename_npy))
    #############################################################################
    print("setting env success.")
    # After the downloads finish, write a marker file so the ranks waiting in
    # the __main__ poll loop know the data is ready.
    with open("/cache/download_ckpt.txt", 'w'):
        pass
-
-
if __name__ == '__main__':
    # ---- CLI arguments (defaults match the OpenI / ModelArts job layout) ----
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='2B6',
                        type=str, choices=['350M', '2B6', '13B'],
                        help="setting model size from ['350M', '2B6', '13B'], \n"
                        "'350M' mp=1;\n"
                        "'2B6' mp=2\n"
                        "'13B' mp=8")
    parser.add_argument('--device_target', default='Ascend',
                        type=str, choices=['Ascend'],
                        help="setting device_type with ['Ascend']")
    parser.add_argument('--data_url', default='/tmp/dataset/text_document',
                        type=str,
                        help="setting mindrecord dataset from: '/cache/dataset' or from os.env('data_url').")
    # NOTE(review): the default here is the local workroot path returned by
    # WorkEnvironment('train'), not a JSON list of OBS urls — presumably the
    # platform always passes --multi_data_url explicitly; verify against the
    # job config.  Calling WorkEnvironment here also sets the WORKROOT global
    # as a side effect, which os.path.join below depends on.
    parser.add_argument('--multi_data_url',
                        help='path to multi dataset',
                        default=WorkEnvironment('train'))
    parser.add_argument('--load', default='/tmp/dataset/',
                        type=str,
                        help="loading pretrained model ckpt, from: '/tmp/dataset'.")
    parser.add_argument('--train_url', default='/tmp/output/',
                        type=str,
                        help="save your model to: '/tmp/output' or to os.env('train_url').")
    args = parser.parse_args()
    set_context(backend='mindspore')
    #print(args)

    # Local layout: model output under the workroot, all downloads under /cache.
    local_model_saving_dir = os.path.join(WORKROOT, 'model')
    multi_data_url = args.multi_data_url
    cache_ckpt_saving_dir = '/cache/ckpts'
    cache_strategy_saving_dir = '/cache/strategy_ckpt'
    cache_dataset_saving_dir = '/cache/dataset'
    ###################### using one-node to do FilesCopying #############################
    # Extract the numeric tail after the last '-' of RANK_ID.
    rank_id_str = os.getenv('RANK_ID', '0')
    rank_id = int(
        rank_id_str[rank_id_str.rfind('-') +
                    1:])  # 'RANK_ID': 'job24535502-job-facereidtome-hn-0/1'
    local_rank = rank_id
    # Only the first rank of each 8-card node downloads from OBS; every rank
    # then waits for the marker file written at the end of the download.
    # NOTE(review): if the downloading rank dies before writing the marker,
    # the other ranks spin here forever — consider adding a timeout.
    if local_rank % 8 == 0:
        setting_openi_cache_env_fromOBS(cache_ckpt_saving_dir=cache_ckpt_saving_dir,
                                        cache_strategy_saving_dir=cache_strategy_saving_dir,
                                        cache_dataset_saving_dir=cache_dataset_saving_dir)
    while not os.path.exists("/cache/download_ckpt.txt"):
        time.sleep(1)
    ########################################################################################

    model = args.model
    load = cache_ckpt_saving_dir
    data_path = cache_dataset_saving_dir
    save = local_model_saving_dir
    # Use the downloaded parallel-strategy checkpoint only when exactly one
    # file was delivered; otherwise let pcl_pangu run without one.
    tmp = os.listdir(cache_strategy_saving_dir)
    if len(tmp) == 1:
        strategy_file_path = os.path.join(cache_strategy_saving_dir, tmp[0])
    else:
        strategy_file_path = None
    #### Training or fine-tuning the 350M model needs at least 1 card;
    # the 2B6 model needs at least 2 cards; 13B needs 4/8 cards.  When
    # model='2B6' is selected, pcl_pangu presets the model-parallel size (mp)
    # and uses multiple compute cards.
    config = alpha.model_config_npu(model=model,
                                    load=load,
                                    save=save,
                                    strategy_load_ckpt_path=strategy_file_path,
                                    data_path=data_path,
                                    finetune=True)
    alpha.fine_tune(config)
    pass
|