From c6bb0fa6d1ca7899e2bed9def211b48f5e0c3d06 Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 23 Mar 2023 15:01:51 +0800 Subject: [PATCH 01/11] =?UTF-8?q?#3812=20=E8=BF=90=E8=A1=8C=E7=AE=80?= =?UTF-8?q?=E5=86=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/grampus/resty.go | 38 ++++++++++++++++++++++++++++++++++---- routers/api/v1/api.go | 3 ++- routers/repo/grampus.go | 28 ++++++++++++++++++++++++---- 3 files changed, 60 insertions(+), 9 deletions(-) diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 576635ae0e..911a057c4c 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -425,7 +425,7 @@ sendjob: return &result, nil } -func GetJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) { +func GetTrainJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) { checkSetting() client := getRestyClient() var result models.GetGrampusJobEventsResponse @@ -438,7 +438,7 @@ sendjob: SetResult(&result). Get(HOST + urlTrainJob + "/" + jobID + "/events") if err != nil { - return nil, fmt.Errorf("resty GetJobEvents: %v", err) + return nil, fmt.Errorf("resty GetTrainJobEvents: %v", err) } if result.ErrorCode == errorIllegalToken && retry < 1 { @@ -449,8 +449,38 @@ sendjob: } if result.ErrorCode != 0 { - log.Error("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) - return nil, fmt.Errorf("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) + log.Error("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return nil, fmt.Errorf("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} +func GetDebugJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) { + checkSetting() + client := getRestyClient() + var result models.GetGrampusJobEventsResponse + + retry := 0 + +sendjob: + _, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + urlNotebookJob + "/" + jobID + "/events") + if err != nil { + return nil, fmt.Errorf("resty GetDebugJobEvents: %v", err) + } + + if result.ErrorCode == errorIllegalToken && retry < 1 { + retry++ + log.Info("retry get token") + _ = getToken() + goto sendjob + } + + if result.ErrorCode != 0 { + log.Error("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return nil, fmt.Errorf("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) } return &result, nil diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index af37e9251a..8ad99b5402 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -1122,6 +1122,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/grampus", func() { m.Group("/notebook", func() { m.Get("/:id", repo_ext.GetGrampusNotebook) + m.Get("/job_event", repo_ext.GrampusDebugJobEvents) }) m.Group("/train-job", func() { m.Group("/:jobid", func() { @@ -1131,7 +1132,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/metrics", repo_ext.GrampusMetrics) m.Get("/download_multi_model", cloudbrain.AdminOrJobCreaterRightForTrain, repo.MultiModelDownload) m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog) - m.Get("/job_event", repo_ext.GrampusJobEvents) + m.Get("/job_event", repo_ext.GrampusTrainJobEvents) }) }) }, reqRepoReader(models.UnitTypeCloudBrain)) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 3cc30d161c..c72dc97f03 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1690,8 +1690,28 @@ func GrampusMetrics(ctx *context.Context) { return } +func GrampusDebugJobEvents(ctx *context.Context) { + jobID := ctx.Params(":jobid") + job, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return + } + + result, err := grampus.GetDebugJobEvents(job.JobID) + if err != nil { + log.Error("GetJobEvents failed: %v", err, ctx.Data["MsgID"]) + } + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "JobEvents": result.JobEvents, + }) + + return +} -func GrampusJobEvents(ctx *context.Context) { +func GrampusTrainJobEvents(ctx *context.Context) { jobID := ctx.Params(":jobid") job, err := models.GetCloudbrainByJobID(jobID) if err != nil { @@ -1700,7 +1720,7 @@ func GrampusJobEvents(ctx *context.Context) { return } - result, err := grampus.GetJobEvents(job.JobID) + result, err := grampus.GetTrainJobEvents(job.JobID) if err != nil { log.Error("GetJobEvents failed: %v", err, ctx.Data["MsgID"]) } @@ -1775,12 +1795,12 @@ func generateCommand(repoName, processorType, bootFile, paramSrc, outputRemotePa commandCode = "source /home/ma-user/.bashrc;python /home/ma-user/davinci/train/davincirun.py python /home/ma-user/openi.py " + paramCode + ";" } else if processorType == grampus.ProcessorTypeGPU { if pretrainModelFileName != "" { - paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName+"'" + paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName + "'" } commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";" } else if processorType == grampus.ProcessorTypeGCU { if pretrainModelFileName != "" { - paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName+"'" + paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName + "'" } commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python3 " + bootFile + paramCode + ";" } -- 2.34.1 From 8e94d2ee631dd62e615bb28f22b2fcc4c27c9cc9 Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 23 Mar 2023 15:10:42 +0800 Subject: [PATCH 02/11] update --- routers/api/v1/api.go | 2 +- routers/repo/grampus.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index 8ad99b5402..0615e28d80 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -1122,7 +1122,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/grampus", func() { m.Group("/notebook", func() { m.Get("/:id", repo_ext.GetGrampusNotebook) - m.Get("/job_event", repo_ext.GrampusDebugJobEvents) + m.Get("/:id/job_event", repo_ext.GrampusDebugJobEvents) }) m.Group("/train-job", func() { m.Group("/:jobid", func() { diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index c72dc97f03..2692030140 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1691,8 +1691,8 @@ func GrampusMetrics(ctx *context.Context) { return } func GrampusDebugJobEvents(ctx *context.Context) { - jobID := ctx.Params(":jobid") - job, err := models.GetCloudbrainByJobID(jobID) + ID := ctx.Params(":id") + job, err := models.GetCloudbrainByID(ID) if err != nil { log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"]) ctx.ServerError(err.Error(), err) -- 2.34.1 From 14c6ca8fbc5dcce4b68ebc3ec5412cc8d6643746 Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 23 Mar 2023 15:15:21 +0800 Subject: [PATCH 03/11] update --- routers/repo/grampus.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 2692030140..111d1c11dc 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1704,7 +1704,7 @@ func GrampusDebugJobEvents(ctx *context.Context) { log.Error("GetJobEvents failed: %v", err, ctx.Data["MsgID"]) } ctx.JSON(http.StatusOK, map[string]interface{}{ - "JobID": jobID, + "JobID": ID, "JobEvents": result.JobEvents, }) -- 2.34.1 From 4cf7567b12203437c33ea3591fcd201e89f2beb7 Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 23 Mar 2023 15:19:19 +0800 Subject: [PATCH 04/11] update --- models/cloudbrain.go | 5 +++++ modules/grampus/resty.go | 4 ++-- routers/repo/grampus.go | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 86999d1b6b..6307a0152e 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -1677,6 +1677,11 @@ type GetGrampusJobEventsResponse struct { JobEvents []GrampusJobEvents `json:"jobEvents"` TotalSize int `json:"totalSize"` } +type GetGrampusDebugJobEventsResponse struct { + GrampusResult + NotebookEvents []GrampusJobEvents `json:"notebookEvents"` + TotalSize int `json:"totalSize"` +} type GrampusTasks struct { Command string `json:"command"` diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 911a057c4c..3edcc15fab 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -455,10 +455,10 @@ sendjob: return &result, nil } -func GetDebugJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) { +func GetDebugJobEvents(jobID string) (*models.GetGrampusDebugJobEventsResponse, error) { checkSetting() client := getRestyClient() - var result models.GetGrampusJobEventsResponse + var result models.GetGrampusDebugJobEventsResponse retry := 0 diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 111d1c11dc..11748f0478 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1705,7 +1705,7 @@ func GrampusDebugJobEvents(ctx *context.Context) { } ctx.JSON(http.StatusOK, map[string]interface{}{ "JobID": ID, - "JobEvents": result.JobEvents, + "JobEvents": result.NotebookEvents, }) return -- 2.34.1 From 82d23e0509a9b8773810361526e0cd914b87bb3f Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Thu, 23 Mar 2023 15:29:56 +0800 Subject: [PATCH 05/11] add grampus GPU run info --- templates/repo/grampus/notebook/show.tmpl | 18 ++++++++++++++++++ templates/repo/grampus/trainjob/show.tmpl | 4 +++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/templates/repo/grampus/notebook/show.tmpl b/templates/repo/grampus/notebook/show.tmpl index 4b744511b2..a590a61056 100644 --- a/templates/repo/grampus/notebook/show.tmpl +++ b/templates/repo/grampus/notebook/show.tmpl @@ -52,6 +52,9 @@
@@ -391,7 +394,21 @@
+
+
+
+
+
+
+ + + +
+
+ +
@@ -424,6 +441,7 @@ {{template "base/footer" .}}