@@ -13,8 +13,10 @@ import (
"time"
"code.gitea.io/gitea/services/cloudbrain/resource"
"code.gitea.io/gitea/services/reward/point/account"
"code.gitea.io/gitea/modules/auth"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/grampus"
@@ -45,7 +47,7 @@ const (
)
func GrampusTrainJobGPUNew(ctx *context.Context) {
ctx.Data["datasetType"] = models.TypeCloudBrainOn e
ctx.Data["IsCreate"] = tru e
err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
if err != nil {
ctx.ServerError("get new train-job info failed", err)
@@ -56,7 +58,7 @@ func GrampusTrainJobGPUNew(ctx *context.Context) {
}
func GrampusTrainJobNPUNew(ctx *context.Context) {
ctx.Data["datasetType"] = models.TypeCloudBrainTwo
ctx.Data["IsCreate"] = true
err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
if err != nil {
ctx.ServerError("get new train-job info failed", err)
@@ -138,9 +140,56 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err
ctx.Data["WaitCount"] = waitCount
}
if ctx.Cloudbrain != nil {
ctx.Data["attachment"] = ctx.Cloudbrain.Uuid
ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile
ctx.Data["image_id"] = ctx.Cloudbrain.ImageID
ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters
ctx.Data["description"] = ctx.Cloudbrain.Description
ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName
ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName
ctx.Data["WorkServerNumber"] = ctx.Cloudbrain.WorkServerNumber
if ctx.Cloudbrain.Image != "" {
ctx.Data["image"] = ctx.Cloudbrain.Image
} else {
ctx.Data["image"] = ctx.Cloudbrain.EngineName
}
ctx.Data["dataset_name"] = ctx.Cloudbrain.DatasetName
ctx.Data["model_name"] = ctx.Cloudbrain.ModelName
ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion
ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName
ctx.Data["label_names"] = ctx.Cloudbrain.LabelName
ctx.Data["PreTrainModelUrl"] = ctx.Cloudbrain.PreTrainModelUrl
spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID)
if spec != nil {
ctx.Data["spec_id"] = spec.ID
}
}
return nil
}
func GrampusTrainJobVersionNew(ctx *context.Context) {
task := ctx.Cloudbrain
ctx.Data["IsCreate"] = false
if task.ComputeResource == models.GPUResource {
err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
if err != nil {
ctx.ServerError("get new train-job version info failed", err)
return
}
ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
} else if task.ComputeResource == models.NPUResource {
err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
if err != nil {
ctx.ServerError("get new train-job version info failed", err)
return
}
ctx.HTML(200, tplGrampusTrainJobNPUNew)
}
}
func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) {
noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
@@ -205,6 +254,7 @@ func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error
}
func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
ctx.Data["IsCreate"] = true
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
uuid := form.Attachment
@@ -214,9 +264,9 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
dataMinioPath := setting.Attachment.Minio.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid
branchName := form.BranchName
image := strings.TrimSpace(form.Image)
tpl := tplGrampusTrainJobGPUNew
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
isOk, err := lock.Lock(models.CloudbrainKeyDuration)
@@ -230,7 +280,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if !jobNamePattern.MatchString(displayJobName) {
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
return
}
@@ -238,7 +288,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
return
}
@@ -247,13 +297,13 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr("system error", tpl, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
return
}
}
@@ -262,7 +312,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := grampusParamCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
@@ -272,14 +322,14 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr("the job name did already exist", tpl, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr("system error", tpl, &form)
return
}
}
@@ -292,7 +342,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
})
if err != nil || spec == nil {
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
@@ -304,11 +354,12 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
//check dataset
attachment, err := models.GetAttachmentByUUID(uuid)
datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
if err != nil {
log.Error("GetAttachmentByUUID failed:", err.Error() , ctx.Data["MsgID"])
log.Error("GetDatasetInfo failed: %v", err , ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl , &form)
return
}
@@ -321,7 +372,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
@@ -330,7 +381,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
@@ -338,7 +389,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := mkModelPath(modelPath); err != nil {
log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
@@ -346,52 +397,102 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
var datasetRemotePath, allFileName string
for _, datasetInfo := range datasetInfos {
if datasetRemotePath == "" {
datasetRemotePath = datasetInfo.DataLocalPath
allFileName = datasetInfo.FullName
} else {
datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
allFileName = allFileName + ";" + datasetInfo.FullName
}
}
//prepare command
command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", dataMinioPath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", attachment.Name)
preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName)
if err != nil {
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
return
}
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
req := &grampus.GenerateTrainJobReq{
JobName: jobName,
DisplayJobName: displayJobName,
ComputeResource: models.GPUResource,
ProcessType: grampus.ProcessorTypeGPU,
Command: command,
ImageUrl: image,
Description: description,
BootFile: bootFile,
Uuid: uuid,
CommitID: commitID,
BranchName: branchName,
Params: form.Params,
EngineName: image,
DatasetName: attachment.Name,
JobName: jobName,
DisplayJobName: displayJobName,
ComputeResource: models.GPUResource,
ProcessType: grampus.ProcessorTypeGPU,
Command: command,
ImageUrl: image,
Description: description,
BootFile: bootFile,
Uuid: uuid,
CommitID: commitID,
BranchName: branchName,
Params: form.Params,
EngineName: image,
DatasetNames: datasetNames,
DatasetInfos: datasetInfos,
IsLatestVersion: modelarts.IsLatestVersion,
VersionCount: modelarts.VersionCountOne,
WorkServerNumber: 1,
Spec: spec,
}
if form.ModelName != "" { //使用预训练模型训练
req.ModelName = form.ModelName
req.LabelName = form.LabelName
req.CkptName = form.CkptName
req.ModelVersion = form.ModelVersion
req.PreTrainModelUrl = form.PreTrainModelUrl
}
err = grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}
func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
index := strings.Index(pretrainModelDir, "/")
if index > 0 {
filterBucket := pretrainModelDir[index+1:]
return filterBucket + fileName
} else {
return ""
}
}
func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
ctx.Data["IsCreate"] = false
computeResource := ctx.Query("compute_resource")
if computeResource == models.GPUResource {
GrampusTrainJobGpuCreate(ctx, form)
} else if computeResource == models.NPUResource {
GrampusTrainJobNpuCreate(ctx, form)
} else {
ctx.ServerError("resource error", errors.New("compute resource is not support"))
return
}
}
func checkSpecialPool(ctx *context.Context, resourceType string) string {
grampus.InitSpecialPool()
if grampus.SpecialPools != nil {
@@ -415,6 +516,7 @@ func checkSpecialPool(ctx *context.Context, resourceType string) string {
}
func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
ctx.Data["IsCreate"] = true
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
uuid := form.Attachment
@@ -424,11 +526,12 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
// dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
branchName := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
versionCount := modelarts.VersionCountOne
engineName := form.EngineName
tpl := tplGrampusTrainJobNPUNew
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
isOk, err := lock.Lock(models.CloudbrainKeyDuration)
@@ -442,7 +545,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if !jobNamePattern.MatchString(displayJobName) {
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
return
}
@@ -450,7 +553,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
return
}
@@ -459,13 +562,13 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr("system error", tpl, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
return
}
}
@@ -474,7 +577,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := grampusParamCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
@@ -484,14 +587,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr("the job name did already exist", tpl, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr("system error", tpl, &form)
return
}
}
@@ -504,7 +607,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
})
if err != nil || spec == nil {
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("Resource specification not available", tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
@@ -515,11 +618,11 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}
//check dataset
attachment, err := models.GetAttachmentByUUID(uuid )
datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU )
if err != nil {
log.Error("GetAttachmentByUUID failed:", err.Error() , ctx.Data["MsgID"])
log.Error("GetDatasetInfo failed: %v", err , ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl , &form)
return
}
@@ -532,7 +635,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
@@ -540,23 +643,36 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew , &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
var datasetRemotePath, allFileName string
for _, datasetInfo := range datasetInfos {
if datasetRemotePath == "" {
datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
allFileName = datasetInfo.FullName
} else {
datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
allFileName = allFileName + ";" + datasetInfo.FullName
}
}
//prepare command
command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+"'"+attachment.Name+"'", bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name)
preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName)
if err != nil {
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobNPUNew, &form)
ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
return
}
@@ -569,7 +685,6 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
ProcessType: grampus.ProcessorTypeNPU,
Command: command,
ImageId: form.ImageID,
DataUrl: dataObsPath,
Description: description,
CodeObsPath: codeObsPath,
BootFileUrl: codeObsPath + bootFile,
@@ -583,15 +698,24 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
EngineName: engineName,
VersionCount: versionCount,
TotalVersionCount: modelarts.TotalVersionCount,
DatasetName: attachment.Name,
DatasetNames: datasetNames,
DatasetInfos: datasetInfos,
Spec: spec,
}
if form.ModelName != "" { //使用预训练模型训练
req.ModelName = form.ModelName
req.LabelName = form.LabelName
req.CkptName = form.CkptName
req.ModelVersion = form.ModelVersion
req.PreTrainModelUrl = form.PreTrainModelUrl
}
err = grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
@@ -802,7 +926,7 @@ func GrampusGetLog(ctx *context.Context) {
return
}
func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName string) (string, error) {
func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName string) (string, error) {
var command string
workDir := grampus.NpuWorkDir
@@ -810,22 +934,22 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo
workDir = grampus.GpuWorkDir
}
command += "pwd;cd " + workDir + grampus.CommandPrepareScript
command += "pwd;cd " + workDir + fmt.Sprintf( grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
//download code & dataset
if processorType == grampus.ProcessorTypeNPU {
commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';"
commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
command += commandDownload
} else if processorType == grampus.ProcessorTypeGPU {
commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " '" + datasetName + "';"
commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
command += commandDownload
}
//unzip code & dataset
toolUnzip := "unzip -q '"
if strings.HasSuffix(datasetName, ".tar.gz") {
toolUnzip = "tar -zxvf '"
}
commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + toolUnzip + datasetName + "';"
unZipDatasetCommand := generateDatasetUnzipCommand(datasetName)
commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
command += commandUnzip
command += "echo \"unzip finished;start to exec code;\";"
@@ -856,6 +980,10 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo
}
}
if pretrainModelFileName != "" {
paramCode += " --pretrainmodelname" + "=" + pretrainModelFileName
}
var commandCode string
if processorType == grampus.ProcessorTypeNPU {
commandCode = "/bin/bash /home/work/run_train_for_openi.sh " + workDir + "code/" + strings.ToLower(repoName) + "/" + bootFile + " /tmp/log/train.log" + paramCode + ";"
@@ -871,10 +999,10 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo
//upload models
if processorType == grampus.ProcessorTypeNPU {
commandUpload := "cd " + workDir + "script_for_grampus /;./uploader_for_npu " + setting.Bucket + " " + outputRemotePath + " " + workDir + "output/;"
commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + " /;./uploader_for_npu " + setting.Bucket + " " + outputRemotePath + " " + workDir + "output/;"
command += commandUpload
} else if processorType == grampus.ProcessorTypeGPU {
commandUpload := "cd " + workDir + "script_for_grampus /;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + " /;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
command += commandUpload
}
@@ -885,6 +1013,38 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo
return command, nil
}
func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
commandDownloadTemp := commandDownload
if pretrainModelPath != "" {
commandDownloadTemp += " '" + pretrainModelPath + "' '" + pretrainModelFileName + "'"
}
commandDownloadTemp += ";"
return commandDownloadTemp
}
func generateDatasetUnzipCommand(datasetName string) string {
var unZipDatasetCommand string
datasetNameArray := strings.Split(datasetName, ";")
if len(datasetNameArray) == 1 { //单数据集
unZipDatasetCommand = "unzip -q '" + datasetName + "';"
if strings.HasSuffix(datasetName, ".tar.gz") {
unZipDatasetCommand = "tar --strip-components=1 -zxvf '" + datasetName + "';"
}
} else { //多数据集
for _, datasetNameTemp := range datasetNameArray {
if strings.HasSuffix(datasetName, ".tar.gz") {
unZipDatasetCommand = unZipDatasetCommand + "tar -zxvf '" + datasetName + "';"
} else {
unZipDatasetCommand = unZipDatasetCommand + "unzip -q '" + datasetNameTemp + "' -d './" + strings.TrimSuffix(datasetNameTemp, ".zip") + "';"
}
}
}
return unZipDatasetCommand
}
func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
archiveType := git.ZIP
archivePath := codePath