@@ -44,14 +44,37 @@ import (
const (
tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
tplGrampusNotebookShow base.TplName = "repo/grampus/notebook/show"
//GPU
tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new"
tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
//NPU
tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new"
tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
)
func GrampusNotebookNew(ctx *context.Context) {
ctx.Data["IsCreate"] = true
notebookType := ctx.QueryInt("type")
processType := grampus.ProcessorTypeGPU
if notebookType == 1 {
processType = grampus.ProcessorTypeNPU
}
err := grampusNotebookNewDataPrepare(ctx, processType)
if err != nil {
ctx.ServerError("get new notebook-job info failed", err)
return
}
if processType == grampus.ProcessorTypeGPU {
ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
} else {
ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
}
}
func GrampusTrainJobGPUNew(ctx *context.Context) {
ctx.Data["IsCreate"] = true
err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
@@ -72,57 +95,262 @@ func GrampusTrainJobNPUNew(ctx *context.Context) {
}
ctx.HTML(200, tplGrampusTrainJobNPUNew)
}
func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) {
ctx.Data["IsCreate"] = true
displayJobName := form.DisplayJobName
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
uuid := form.Attachment
description := form.Description
repo := ctx.Repo.Repository
branchName := form.BranchName
image := strings.TrimSpace(form.Image)
func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
codeStoragePath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
tpl := tplGrampusNotebookGPUNew
processType := grampus.ProcessorTypeGPU
computeSource := models.GPUResource
computeSourceSimple := models.GPU
if form.Type == 1 {
tpl = tplGrampusNotebookNPUNew
processType = grampus.ProcessorTypeNPU
computeSource = models.NPUResource
computeSourceSimple = models.NPU
codeStoragePath = grampus.JobPath + jobName + modelarts.CodePath
}
lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeDebug), displayJobName))
defer lock.UnLock()
isOk, err := lock.Lock(models.CloudbrainKeyDuration)
if !isOk {
log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form)
return
}
if !jobNamePattern.MatchString(displayJobName) {
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
return
}
//check count limit
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource)
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr("system error", tpl, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
return
}
}
//check whether the task name in the project is duplicated
tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr("the job name did already exist", tpl, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr("system error", tpl, &form)
return
}
}
//check specification
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeDebug,
ComputeResource: computeSourceSimple,
Cluster: models.C2NetCluster,
})
if err != nil || spec == nil {
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
return
}
var datasetInfos map[string]models.DatasetInfo
var datasetNames string
//var
if uuid != "" {
datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, computeSourceSimple)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
}
}
//prepare code and out path
codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
os.RemoveAll(codeLocalPath)
}
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
if processType == grampus.ProcessorTypeGPU {
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
} else {
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
}
commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
req := &grampus.GenerateNotebookJobReq{
JobName: jobName,
DisplayJobName: displayJobName,
ComputeResource: computeSource,
ProcessType: processType,
ImageUrl: image,
ImageId: form.ImageID,
Description: description,
Uuid: uuid,
CommitID: commitID,
BranchName: branchName,
DatasetNames: datasetNames,
DatasetInfos: datasetInfos,
Spec: spec,
CodeStoragePath: codeStoragePath,
CodeName: strings.ToLower(repo.Name),
}
if form.ModelName != "" { //使用预训练模型训练
_, err := models.QueryModelByPath(form.PreTrainModelUrl)
if err != nil {
log.Error("Can not find model", err)
grampusNotebookNewDataPrepare(ctx, processType)
ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form)
return
}
req.ModelName = form.ModelName
req.LabelName = form.LabelName
req.CkptName = form.CkptName
req.ModelVersion = form.ModelVersion
req.PreTrainModelUrl = form.PreTrainModelUrl
req.PreTrainModelPath = getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
}
_, err = grampus.GenerateNotebookJob(ctx, req)
if err != nil {
log.Error("GenerateNotebookJob failed:%v", err.Error(), ctx.Data["MsgID"])
grampusTrainJobNewDataPrepare(ctx, processType)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
}
func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error {
ctx.Data["PageIsCloudBrain"] = true
var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
ctx.Data["display_job_name"] = displayJobName
//get valid images
images, err := grampus.GetImages(processType)
if processType == grampus.ProcessorTypeNPU {
images, err := grampus.GetImages(processType, string(models.JobTypeDebug))
if err != nil {
log.Error("GetImages failed:", err.Error())
} else {
ctx.Data["images"] = images.Infos
}
}
//prepare available specs
computeResourceSimple := models.GPU
datasetType := models.TypeCloudBrainOne
computeResource := models.GPUResource
if processType == grampus.ProcessorTypeNPU {
computeResourceSimple = models.NPU
datasetType = models.TypeCloudBrainTwo
computeResource = models.NPUResource
}
prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug)
//get branches
branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
if err != nil {
log.Error("GetImages failed:", err.Error())
log.Error("GetBranches error :", err.Error())
} else {
ctx.Data["images"] = images.Infos
ctx.Data["branches"] = branche s
}
grampus.InitSpecialPool()
ctx.Data["branchName"] = ctx.Repo.BranchName
ctx.Data["GPUEnabled"] = true
ctx.Data["NPUEnabled"] = true
includeCenters := make(map[string]struct{})
excludeCenters := make(map[string]struct{})
if grampus.SpecialPools != nil {
for _, pool := range grampus.SpecialPools.Pools {
if pool.IsExclusive {
if !IsUserInOrgPool(ctx.User.ID, pool) {
ctx.Data[pool.Type+"Enabled"] = false
}
} else {
if strings.Contains(strings.ToLower(processType), strings.ToLower(pool.Type)) {
if IsUserInOrgPool(ctx.User.ID, pool) {
for _, center := range pool.Pool {
includeCenters[center.Queue] = struct{}{}
}
} else {
for _, center := range pool.Pool {
excludeCenters[center.Queue] = struct{}{}
}
ctx.Data["datasetType"] = datasetType
waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug)
ctx.Data["WaitCount"] = waitCount
NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource)
ctx.Data["NotStopTaskCount"] = NotStopTaskCount
}
ctx.Data["code_path"] = cloudbrain.CodeMountPath
ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
ctx.Data["model_path"] = cloudbrain.ModelMountPath
}
return nil
}
}
func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
ctx.Data["PageIsCloudBrain"] = true
var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
ctx.Data["display_job_name"] = displayJobName
//get valid images
if processType == grampus.ProcessorTypeNPU {
images, err := grampus.GetImages(processType, string(models.JobTypeTrain))
if err != nil {
log.Error("GetImages failed:", err.Error())
} else {
ctx.Data["images"] = images.Infos
}
}
//prepare available specs
if processType == grampus.ProcessorTypeNPU {
prepareGrampusTrainSpecs(ctx, models.NPU)
prepareGrampusSpecs(ctx, models.NPU)
} else if processType == grampus.ProcessorTypeGPU {
prepareGrampusTrainSpecs(ctx, models.GPU)
prepareGrampusSpecs(ctx, models.GPU)
}
//get branches
@@ -201,55 +429,19 @@ func GrampusTrainJobVersionNew(ctx *context.Context) {
}
}
func prepareGrampusTrainSpecs(ctx *context.Context, computeResource string) {
func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) {
tempJobType := models.JobTypeTrain
if len(jobType) > 0 {
tempJobType = jobType[0]
}
noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
JobType: tempJobType ,
ComputeResource: computeResource,
Cluster: models.C2NetCluster,
})
ctx.Data["Specs"] = noteBookSpecs
}
func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec {
if len(includeCenters) == 0 && len(excludeCenters) == 0 {
return specs.Infos
}
var grampusSpecs []models.GrampusSpec
for _, info := range specs.Infos {
if isInIncludeCenters(info, includeCenters) || (len(excludeCenters) != 0 && isNotAllInExcludeCenters(info, excludeCenters)) {
grampusSpecs = append(grampusSpecs, info)
}
}
return grampusSpecs
}
func isInIncludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool {
for _, center := range grampusSpec.Centers {
if _, ok := centers[center.ID]; ok {
return true
}
}
return false
}
func isNotAllInExcludeCenters(grampusSpec models.GrampusSpec, centers map[string]struct{}) bool {
for _, center := range grampusSpec.Centers {
if _, ok := centers[center.ID]; !ok {
return true
}
}
return false
}
func IsUserInOrgPool(userId int64, pool *models.SpecialPool) bool {
org, _ := models.GetOrgByName(pool.Org)
if org != nil {
isOrgMember, _ := models.IsOrganizationMember(org.ID, userId)
return isOrgMember
}
return false
}
func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
log.Error("the boot file(%s) must be a python file", form.BootFile)
@@ -721,30 +913,64 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}
func GetGrampusNotebook(ctx *context.APIContext) {
var (
err error
)
ID := ctx.Params(":id")
job, err := models.GetCloudbrainByID(ID)
if err != nil {
ctx.NotFound("", err)
log.Error("GetCloudbrainByID failed:", err)
return
}
jobAfter, err := cloudbrainTask.SyncGrampusNotebookStatus(job)
aiCenterName := cloudbrainService.GetAiCenterShow(jobAfter.AiCenter, ctx.Context)
if err != nil {
ctx.NotFound(err)
log.Error("Sync cloud brain one status failed:", err)
return
}
ctx.JSON(http.StatusOK, map[string]interface{}{
"ID": ID,
"JobName": jobAfter.JobName,
"JobStatus": jobAfter.Status,
"AiCenter": aiCenterName,
"CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
"CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
"JobDuration": jobAfter.TrainJobDuration,
})
}
func GrampusStopJob(ctx *context.Context) {
var ID = ctx.Params(":jobid")
var ID = ctx.Params(":id")
var resultCode = "0"
var errorMsg = ""
var status = ""
task := ctx.Cloudbrain
for {
if task.Status == string(models.GrampusStatusStopped) || task.Status == string(models.GrampusStatusFailed) || task.Status == string(models.GrampusStatusSucceeded) {
if task.Status == models.GrampusStatusStopped || task.Status == models.GrampusStatusFailed || task.Status == models.GrampusStatusSucceeded {
log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
resultCode = "-1"
errorMsg = "system error"
errorMsg = ctx.Tr("cloudbrain.Already_stopped")
break
}
res, err := grampus.StopJob(task.JobID)
res, err := grampus.StopJob(task.JobID, task.JobType )
if err != nil {
log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
resultCode = strconv.Itoa(res.ErrorCode)
errorMsg = res.ErrorMsg
errorMsg = ctx.Tr("cloudbrain.Stopped_failed")
break
}
oldStatus := task.Status
task.Status = string(models.GrampusStatusStopped)
task.Status = getStopJobResponseStatus(res )
if task.EndTime == 0 {
task.EndTime = timeutil.TimeStampNow()
}
@@ -773,6 +999,33 @@ func GrampusStopJob(ctx *context.Context) {
})
}
func getStopJobResponseStatus(res *models.GrampusStopJobResponse) string {
newStatus := models.GrampusStatusStopping
if res.Status != "" {
newStatus = grampus.TransTrainJobStatus(res.Status)
}
return newStatus
}
func GrampusNotebookDel(ctx *context.Context) {
var listType = ctx.Query("listType")
if err := deleteGrampusJob(ctx); err != nil {
log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
ctx.ServerError(err.Error(), err)
return
}
var isAdminPage = ctx.Query("isadminpage")
var isHomePage = ctx.Query("ishomepage")
if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
} else if isHomePage == "true" {
ctx.Redirect(setting.AppSubURL + "/cloudbrains")
} else {
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
}
}
func GrampusTrainJobDel(ctx *context.Context) {
var listType = ctx.Query("listType")
if err := deleteGrampusJob(ctx); err != nil {
@@ -795,9 +1048,9 @@ func GrampusTrainJobDel(ctx *context.Context) {
func deleteGrampusJob(ctx *context.Context) error {
task := ctx.Cloudbrain
if task.Status != string( models.GrampusStatusStopped) && task.Status != string( models.GrampusStatusSucceeded) && task.Status != string( models.GrampusStatusFailed) {
if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
return errors.New("the job has not been stopped" )
return errors.New(ctx.Tr("cloudbrain.Not_Stopped") )
}
err := models.DeleteJob(task)
@@ -815,6 +1068,166 @@ func deleteGrampusJob(ctx *context.Context) error {
return nil
}
type NotebookDataset struct {
DatasetUrl string `json:"dataset_url"`
}
func GrampusNotebookShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true
var task *models.Cloudbrain
task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
if err != nil {
log.Error("GetCloudbrainByID failed:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
task.ContainerIp = ""
if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
result, err := grampus.GetNotebookJob(task.JobID)
if err != nil {
log.Error("GetJob failed:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
if result != nil {
if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
}
oldStatus := task.Status
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
task.Duration = result.JobInfo.RunSec
if task.Duration < 0 {
task.Duration = 0
}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
}
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
if len(result.JobInfo.Tasks[0].CenterID) == 1 {
urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
}
}
}
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob failed:" + err.Error())
}
}
}
if len(task.Parameters) > 0 {
var parameters models.Parameters
err := json.Unmarshal([]byte(task.Parameters), ¶meters)
if err != nil {
log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
ctx.ServerError("system error", err)
return
}
if len(parameters.Parameter) > 0 {
paramTemp := ""
for _, Parameter := range parameters.Parameter {
param := Parameter.Label + " = " + Parameter.Value + "; "
paramTemp = paramTemp + param
}
task.Parameters = paramTemp[:len(paramTemp)-2]
} else {
task.Parameters = ""
}
}
user, err := models.GetUserByID(task.UserID)
if err == nil {
task.User = user
}
prepareSpec4Show(ctx, task)
ctx.Data["task"] = task
ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
ctx.Data["code_path"] = cloudbrain.CodeMountPath
ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
ctx.Data["model_path"] = cloudbrain.ModelMountPath
ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
}
func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
datasetDownload := make([]*models.DatasetDownload, 0)
if ctx.IsSigned {
if task.Uuid != "" && task.UserID == ctx.User.ID {
if task.IsGPUTask() {
return GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
} else {
datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
datasetObsUrlList := make([]NotebookDataset, 0)
_ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
for _, datasetInfo := range datasetDownload {
for _, datasetObs := range datasetObsUrlList {
log.Info("datasetObsUrl:" + datasetObs.DatasetUrl + "datasetName:" + datasetInfo.DatasetName)
if strings.Contains(datasetObs.DatasetUrl, datasetInfo.DatasetName) {
datasetInfo.DatasetDownloadLink = datasetObs.DatasetUrl
break
}
}
}
}
}
}
return datasetDownload
}
func getModelDownloadInfo(ctx *context.Context, task *models.Cloudbrain) *models.ModelDownload {
var modelDownload models.ModelDownload
if ctx.IsSigned {
if task.ModelName != "" && task.UserID == ctx.User.ID {
if task.IsNPUTask() {
modelDownload = models.ModelDownload{
Name: task.CkptName,
DownloadLink: "",
IsDelete: false,
}
if !HasModelFile(task) {
modelDownload.IsDelete = true
}
datasetObsUrlList := make([]NotebookDataset, 0)
_ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
for _, datasetObs := range datasetObsUrlList {
if strings.Contains(datasetObs.DatasetUrl, task.CkptName) {
modelDownload.DownloadLink = datasetObs.DatasetUrl
break
}
}
}
}
}
return &modelDownload
}
func GrampusTrainJobShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true
@@ -1158,3 +1571,172 @@ func HandleTaskWithAiCenter(ctx *context.Context) {
r["updateCounts"] = updateCounts
ctx.JSON(http.StatusOK, response.SuccessWithData(r))
}
func GrampusNotebookDebug(ctx *context.Context) {
result, err := grampus.GetNotebookJob(ctx.Cloudbrain.JobID)
if err != nil {
ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
return
}
if len(result.JobInfo.Tasks) > 0 {
ctx.Redirect(result.JobInfo.Tasks[0].Url + "?token=" + result.JobInfo.Tasks[0].Token)
return
}
ctx.NotFound("Can not find the job.", nil)
}
func GrampusNotebookRestart(ctx *context.Context) {
var id = ctx.Params(":id")
var resultCode = "-1"
var errorMsg = ""
var status = ""
var spec *models.Specification
task := ctx.Cloudbrain
if ctx.Written() {
return
}
for {
if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
errorMsg = "the job is not stopped"
break
}
count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), task.ComputeResource)
if err != nil {
log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
errorMsg = "system error"
break
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
resultCode = "2"
errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob")
break
}
}
oldSpec, err := resource.GetCloudbrainSpec(task.ID)
if err != nil || oldSpec == nil {
log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
errorMsg = "Resource specification not available"
break
}
computeSourceSimple := models.GPU
action := models.ActionCreateGrampusGPUDebugTask
if task.ComputeResource == models.NPUResource {
computeSourceSimple = models.NPU
action = models.ActionCreateGrampusNPUDebugTask
}
spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
JobType: models.JobType(task.JobType),
ComputeResource: computeSourceSimple,
Cluster: models.C2NetCluster,
})
if err != nil || spec == nil {
log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
errorMsg = "Resource specification not support any more"
break
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
errorMsg = ctx.Tr("points.insufficient_points_balance")
break
}
if task.IsGPUTask() {
if _, err := os.Stat(getOldJobPath(task)); err != nil {
log.Error("Can not find job minio path", err)
resultCode = "-1"
errorMsg = ctx.Tr("cloudbrain.result_cleared")
break
}
}
if !HasModelFile(task) { //使用预训练模型训练
errorMsg = ctx.Tr("repo.debug.manage.model_not_exist")
break
}
if hasDatasetDeleted(task) {
errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist")
break
}
createTime := timeutil.TimeStampNow()
res, err := grampus.RestartNotebookJob(task.JobID)
if err != nil {
log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
errorMsg = ctx.Tr("repo.debug_again_fail")
break
}
if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
errorMsg = ctx.Tr("repo.debug_again_fail")
break
}
newTask := &models.Cloudbrain{
Status: res.Status,
UserID: task.UserID,
RepoID: task.RepoID,
JobID: res.NewId,
JobName: task.JobName,
DisplayJobName: task.DisplayJobName,
JobType: task.JobType,
Type: task.Type,
Uuid: task.Uuid,
Image: task.Image,
ImageID: task.ImageID,
EngineID: task.EngineID,
CommitID: task.CommitID,
EngineName: task.EngineName,
IsLatestVersion: "1",
BranchName: task.BranchName,
DatasetName: task.DatasetName,
ComputeResource: task.ComputeResource,
Description: task.Description,
CreatedUnix: createTime,
UpdatedUnix: createTime,
Spec: spec,
ModelName: task.ModelName,
ModelVersion: task.ModelVersion,
LabelName: task.LabelName,
PreTrainModelUrl: task.PreTrainModelUrl,
CkptName: task.CkptName,
WorkServerNumber: 1,
}
err = models.RestartCloudbrain(task, newTask)
if err != nil {
log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
errorMsg = "system error"
break
}
id = strconv.FormatInt(newTask.ID, 10)
status = res.Status
resultCode = "0"
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, action)
break
}
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": id,
})
}