#2661 fix-2629 新建gpu调试任务时应检测代码分支 (推理和训练也有类似问题,一并修改。 也优化智算网络任务加载代码错误时的错误信息)

Merged
lewis merged 2 commits from fix-2591 into V20220815 1 year ago
  1. +1
    -0
      options/locale/locale_en-US.ini
  2. +1
    -0
      options/locale/locale_zh-CN.ini
  3. +37
    -16
      routers/repo/cloudbrain.go
  4. +7
    -7
      routers/repo/grampus.go

+ 1
- 0
options/locale/locale_en-US.ini View File

@@ -3141,5 +3141,6 @@ Not_Stopped=The job is not stopped, can not be deleted.
Already_stopped=The job is already stopped.
Stopped_failed=Fail to stop the job, please try again later.
Stopped_success_update_status_fail=Succeed in stopping th job, but failed to update the job status and duration time.
load_code_failed=Fail to load code, please check if the right branch is selected.

error.dataset_select = dataset select error:the count exceed the limit or has same name

+ 1
- 0
options/locale/locale_zh-CN.ini View File

@@ -3156,6 +3156,7 @@ Not_Stopped=任务还未终止,不能删除。
Already_stopped=任务已停止。
Stopped_failed=任务停止失败,请稍后再试。
Stopped_success_update_status_fail=任务停止成功,状态及运行时间更新失败。
load_code_failed=代码加载失败,请确认选择了正确的分支。


error.dataset_select = 数据集选择错误:数量超过限制或者有同名数据集

+ 37
- 16
routers/repo/cloudbrain.go View File

@@ -328,12 +328,12 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
if branchName == "" {
branchName = cloudbrain.DefaultBranchName
}
downloadCode(repo, codePath, branchName)
uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
mkModelPath(modelPath)
uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/")
errStr = loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath)
if errStr != "" {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form)
return
}

commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)

@@ -378,6 +378,30 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
}

func loadCodeAndMakeModelPath(repo *models.Repository, codePath string, branchName string, jobName string, resultPath string) string {
err := downloadCode(repo, codePath, branchName)
if err != nil {
return "cloudbrain.load_code_failed"
}

err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
if err != nil {
return "cloudbrain.load_code_failed"
}

modelPath := setting.JobPath + jobName + resultPath + "/"
err = mkModelPath(modelPath)
if err != nil {
return "cloudbrain.load_code_failed"
}
err = uploadCodeToMinio(modelPath, jobName, resultPath+"/")
if err != nil {
return "cloudbrain.load_code_failed"
}

return ""
}

func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBrainInferencForm) {
ctx.Data["PageIsCloudBrain"] = true
displayJobName := form.DisplayJobName
@@ -444,11 +468,12 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
if branchName == "" {
branchName = cloudbrain.DefaultBranchName
}
downloadCode(repo, codePath, branchName)
uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
resultPath := setting.JobPath + jobName + cloudbrain.ResultPath + "/"
mkResultPath(resultPath)
uploadCodeToMinio(resultPath, jobName, cloudbrain.ResultPath+"/")
errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ResultPath)
if errStr != "" {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form)
return
}

commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)

@@ -1664,11 +1689,7 @@ func uploadCodeToMinio(codePath, jobName, parentDir string) error {
}

func mkModelPath(modelPath string) error {
return mkPathAndReadMeFile(modelPath, "You can put the model file into this directory and download it by the web page.")
}

func mkResultPath(resultPath string) error {
return mkPathAndReadMeFile(resultPath, "You can put the result file into this directory and download it by the web page.")
return mkPathAndReadMeFile(modelPath, "You can put the files into this directory and download the files by the web page.")
}

func mkPathAndReadMeFile(path string, text string) error {


+ 7
- 7
routers/repo/grampus.go View File

@@ -281,7 +281,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form)
return
}

@@ -290,7 +290,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form)
return
}

@@ -298,7 +298,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := mkModelPath(modelPath); err != nil {
log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form)
return
}

@@ -306,7 +306,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobGPUNew, &form)
return
}

@@ -465,7 +465,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew, &form)
return
}

@@ -473,14 +473,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew, &form)
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tplGrampusTrainJobNPUNew, &form)
return
}



Loading…
Cancel
Save