|
- # Copyright 2021 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """
- Unsupervised Training 多卡
- """
- import os
- import argparse
- import ast
- import time
- import numpy as np
- #import mindcv
- from mindspore import context, nn, Tensor, ops, save_checkpoint
- from mindspore.communication.management import init, get_group_size, get_rank
- from mindspore.context import ParallelMode
- from src.logger import get_logger
- from src.dataset import create_dataset_moco
- from src.config import config
- from src.resnet import resnet50
- from src.builder import MoCo, MoCoTrainOneStepCell, WithLossCell, MoCoEvalOneStepCell, ParamUpdate_k
- from src.warmup_cosine_decay_lr import warmup_cosine_lr
-
# Command-line interface. `args` is a module-level global read by main().
parser = argparse.ArgumentParser(description='Unsupervised Training')

# Device / run-mode options.
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
parser.add_argument('--distribute', type=ast.literal_eval, default=False)
parser.add_argument('--traindir', type=str)
parser.add_argument('--train_url', type=str)
parser.add_argument('--data_url', type=str)
parser.add_argument('--checkpoint_dir', type=str, default='./checkpoints',
                    help='Path to directory for saving checkpoints.')
parser.add_argument('--isModelArts', type=ast.literal_eval, default=False)
parser.add_argument('--device_id', type=int, default=7, help='device_id')

# Optimizer / schedule options.
parser.add_argument('--lr', default=0.03, type=float,
                    help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float,
                    help='momentum of SGD solver')
parser.add_argument('--weight-decay', default=1e-4, type=float,
                    help='weight decay (default: 1e-4)')
parser.add_argument('--epochs', default=200, type=int,
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int,
                    help='manual epoch number (useful on restarts)')

# MoCo-specific configs:
parser.add_argument('--moco_dim', default=128, type=int,
                    help='feature dimension (default: 128)')
parser.add_argument('--moco_k', default=65536, type=int,
                    help='queue size; number of negative keys (default: 65536)')
parser.add_argument('--moco_m', default=0.999, type=float,
                    help='moco momentum of updating key encoder (default: 0.999)')
# Fix: help text previously said "default: 0.07" (the MoCo v1 value) while the
# actual default here is 0.2 (the MoCo v2 value).
parser.add_argument('--moco_t', default=0.2, type=float,
                    help='softmax temperature (default: 0.2)')

# Options for MoCo v2:
parser.add_argument('--mlp', action='store_true',
                    help='use mlp head')
parser.add_argument('--aug-plus', action='store_true',
                    help='use moco v2 data augmentation')
parser.add_argument('--cos', action='store_true',
                    help='use cosine lr schedule')

args = parser.parse_args()
-
-
def _set_context(target):
    """Configure the MindSpore execution context; return this process's rank id.

    In distributed mode the device target is forced to "Ascend" (as in the
    original script) and data-parallel training is configured; otherwise a
    single device is used.
    """
    if args.distribute:
        # Fix: init() was previously called twice (once before get_rank and
        # once after set_auto_parallel_context). One call, before any
        # communication query, is required.
        init()
        rank = get_rank()
        device_num = get_group_size()
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True, parameter_broadcast=True)
    else:
        rank = 0
        context.set_context(mode=context.GRAPH_MODE,
                            device_target=target,
                            save_graphs=False)
        if target == "Ascend":
            context.set_context(device_id=args.device_id)
    return rank


def _create_train_dataset(target):
    """Build the two-crop MoCo training dataset.

    On ModelArts the dataset is first copied from OBS into the local cache;
    otherwise a fixed local ImageNet path is used.

    Note: the original code also built a second, non-distributed copy of the
    dataset (``train_datasetv1``) that was never consumed; it has been removed
    to avoid doubling dataset-construction cost.
    """
    if args.isModelArts:
        import moxing as mox
        cache_dir = '/cache/dataset/device_' + os.getenv('DEVICE_ID')
        # Download dataset from OBS to the local cache.
        mox.file.copy_parallel(src_url=args.data_url, dst_url=cache_dir)
        train_dataset_path = cache_dir + args.traindir
    else:
        train_dataset_path = "/mass_store/dataset/imagenet/train"
    return create_dataset_moco(dataset_path=train_dataset_path, aug_plus=config.aug_plus,
                               repeat_num=1, batch_size=config.batch_size,
                               target=target, distribute=args.distribute)


def _build_encoders():
    """Create query/key ResNet-50 encoders; copy q weights into k and freeze k.

    The key encoder is updated by momentum (ParamUpdate_k), never by gradient.
    """
    encoder_q = resnet50(num_classes=128)
    encoder_k = resnet50(num_classes=128)

    if config.mlp:
        # MoCo v2: replace the linear classifier with a 2-layer MLP head.
        dim_mlp = encoder_q.classifier.weight.shape[1]
        encoder_q.classifier = nn.SequentialCell(
            [nn.Dense(dim_mlp, dim_mlp, weight_init='uniform', bias_init='uniform'), nn.ReLU(), encoder_q.classifier])
        encoder_k.classifier = nn.SequentialCell(
            [nn.Dense(dim_mlp, dim_mlp, weight_init='uniform', bias_init='uniform'), nn.ReLU(), encoder_k.classifier])

    for par_q, par_k in zip(encoder_q.get_parameters(), encoder_k.get_parameters()):
        par_k.set_data(par_q.data)      # initialize k from q
        par_k.requires_grad = False     # k is not updated by gradient
    return encoder_q, encoder_k


def _checkpoint_dir(target):
    """Return (and create if necessary) the directory checkpoints are saved to."""
    if args.isModelArts:
        path = '/cache/outputs/device_' + os.getenv('DEVICE_ID') + '/'
        # Fix: os.mkdir fails when '/cache/outputs' does not exist yet;
        # makedirs(..., exist_ok=True) creates the whole chain idempotently.
        os.makedirs(path, exist_ok=True)
        return path
    if target == "GPU" and args.distribute:
        return os.path.join(args.checkpoint_dir, 'ckpt_' + str(get_rank()) + '/')
    return args.checkpoint_dir


def main():
    """Entry point: MoCo v2 unsupervised pre-training (single- or multi-device).

    Reads configuration from the module-level ``args``/``config`` globals,
    trains ``--epochs`` epochs and saves one checkpoint per epoch (rank 0
    only in distributed mode).
    """
    target = args.device_target

    if args.checkpoint_dir and not os.path.exists(args.checkpoint_dir):
        print('Making new checkpoint directory', args.checkpoint_dir)
        os.makedirs(args.checkpoint_dir)

    rank = _set_context(target)
    logger = get_logger("./", rank)

    train_dataset = _create_train_dataset(target)
    step_size_train = train_dataset.get_dataset_size()
    if step_size_train == 0:
        raise ValueError("Please check dataset_v2 size > 0 and batch_size <= dataset size")

    encoder_q, encoder_k = _build_encoders()
    paramq = list(encoder_q.get_parameters())
    paramk = list(encoder_k.get_parameters())
    paramupdatek = ParamUpdate_k(paramq, paramk, args.moco_m)
    model = MoCo(
        encoder_k, encoder_q, args.moco_dim, args.moco_k, args.moco_m, args.moco_t, args.mlp)

    # Fix: schedule length previously hard-coded to 200; follow --epochs so a
    # shorter/longer run still anneals over the whole training.
    lr = warmup_cosine_lr(args.lr, step_size_train, args.epochs)

    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    net_with_loss = WithLossCell(model, loss_fn)
    optim = nn.SGD(model.trainable_params(), learning_rate=lr,
                   momentum=args.momentum, weight_decay=args.weight_decay)
    train_net = MoCoTrainOneStepCell(paramupdatek, paramq, paramk, net_with_loss, optim)
    train_net.set_train()

    # Negative-key queue: (moco_dim, moco_k), L2-normalized along the feature axis.
    queue = ops.StandardNormal()((args.moco_dim, args.moco_k))
    queue = ops.L2Normalize(axis=0, epsilon=1e-12)(queue)

    t_end = time.time()
    logger.info(step_size_train)
    for epoch in range(args.start_epoch, args.epochs):
        for step, data in enumerate(train_dataset.create_dict_iterator()):
            loss, kt, ptr = train_net(im_q=data["im_q"], im_k=data["im_k"], queue=queue)
            loss = np.mean(loss.asnumpy())
            # Enqueue the new keys at the position reported by the train cell.
            ptr = int(ptr)
            queue[:, ptr:ptr + config.batch_size] = kt
            per_step_time = (time.time() - t_end) * 1000
            logger.info('epoch %d , step %d, loss %f, per step time: %f ms',
                        epoch + 1, step + 1, loss, per_step_time)
            t_end = time.time()

        save_checkpoint_path = _checkpoint_dir(target)
        # In distributed mode only rank 0 writes checkpoints.
        if not args.distribute or get_rank() == 0:
            ckpt_name = os.path.join(save_checkpoint_path,
                                     "mocov2_{}.ckpt".format(epoch + 1))
            save_checkpoint(train_net, ckpt_name)

    logger.info("\n\n========================")
    logger.info("batch_size %d", config.batch_size)
    logger.info("dataset_v2 size %d", step_size_train)
    logger.info("=======Training end========")

    if args.isModelArts:
        # Upload everything produced in the local cache back to OBS.
        import moxing as mox
        mox.file.copy_parallel(src_url='/cache/outputs', dst_url=args.train_url)
-
-
# Script entry point: run training only when executed directly, not on import.
if __name__ == '__main__':
    main()
|