#4204 fix-4166

Merged
zouap merged 3 commits from fix-4166 into V20230531 11 months ago
  1. +11
    -3
      modules/grampus/resty.go
  2. +3
    -0
      routers/api/v1/api.go
  3. +37
    -3
      routers/repo/grampus.go

+ 11
- 3
modules/grampus/resty.go View File

@@ -295,15 +295,20 @@ sendjob:
return &result, nil
}

-func GetTrainJobLog(jobID string) (string, error) {
+func GetTrainJobLog(jobID string, nodeId ...int) (string, error) {
checkSetting()
client := getRestyClient()
var logContent string

+url := HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log"
+if len(nodeId) > 0 {
+url = HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log/node/" + strconv.Itoa(nodeId[0])
+}
+
res, err := client.R().
SetAuthToken(TOKEN).
SetResult(&logContent).
-Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log")
+Get(url)

if err != nil {
return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err)
@@ -324,11 +329,14 @@ func GetTrainJobLog(jobID string) (string, error) {
return logContent, nil
}

-func GetGrampusMetrics(jobID string, startTime int64, endTime int64) (models.NewModelArtsMetricStatisticResult, error) {
+func GetGrampusMetrics(jobID string, startTime int64, endTime int64, nodeId ...int) (models.NewModelArtsMetricStatisticResult, error) {
checkSetting()
client := getRestyClient()
var result models.NewModelArtsMetricStatisticResult
url := HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics"
+if len(nodeId) > 0 {
+url = HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics/node/" + strconv.Itoa(nodeId[0])
+}
if startTime > 0 {
var step int64 = 60



+ 3
- 0
routers/api/v1/api.go View File

@@ -1244,8 +1244,11 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob)
m.Get("/log", repo_ext.GrampusGetLog)
m.Get("/metrics", repo_ext.GrampusMetrics)
+m.Get("/metrics/:nodeId", repo_ext.GrampusMetrics)
+m.Get("/log/:nodeId", repo_ext.GrampusGetLog)
m.Get("/download_multi_model", cloudbrain.AdminOrJobCreaterRightForTrain, repo.MultiModelDownload)
m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
+m.Get("/download_log/:nodeId", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
m.Get("/job_event", repo_ext.GrampusTrainJobEvents)
})
})


+ 37
- 3
routers/repo/grampus.go View File

@@ -8,6 +8,7 @@ import (
"net/http"
"os"
"path"
+"strconv"
"strings"
"time"
"unicode/utf8"
@@ -1660,7 +1661,18 @@ func GrampusDownloadLog(ctx *context.Context) {
return
}

-content, err := grampus.GetTrainJobLog(job.JobID)
+nodeIdStr := ctx.Params(":nodeId")
+var content string
+if nodeIdStr != "" {
+nodeId, _ := strconv.Atoi(nodeIdStr)
+if job.WorkServerNumber < 2 || nodeId > job.WorkServerNumber-1 {
+ctx.NotFound("query parameter is wrong", nil)
+return
+}
+content, err = grampus.GetTrainJobLog(job.JobID, nodeId)
+} else {
+content, err = grampus.GetTrainJobLog(job.JobID)
+}
if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
content = ""
@@ -1696,7 +1708,19 @@ func GrampusGetLog(ctx *context.Context) {
exitDiagnostics = result.ExitDiagnostics
}

-content, err := grampus.GetTrainJobLog(job.JobID)
+nodeIdStr := ctx.Params(":nodeId")
+var content string
+if nodeIdStr != "" {
+nodeId, _ := strconv.Atoi(nodeIdStr)
+if job.WorkServerNumber < 2 || nodeId > job.WorkServerNumber-1 {
+ctx.NotFound("query parameter is wrong", nil)
+return
+}
+content, err = grampus.GetTrainJobLog(job.JobID, nodeId)
+} else {
+content, err = grampus.GetTrainJobLog(job.JobID)
+}
+
if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, map[string]interface{}{
@@ -1734,7 +1758,17 @@ func GrampusMetrics(ctx *context.Context) {
}
var result models.NewModelArtsMetricStatisticResult
if job.IsNPUTask() {
-result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0)
+nodeIdStr := ctx.Params(":nodeId")
+if nodeIdStr != "" {
+nodeId, _ := strconv.Atoi(nodeIdStr)
+if job.WorkServerNumber < 2 || nodeId > job.WorkServerNumber-1 {
+ctx.NotFound("query parameter is wrong", nil)
+return
+}
+result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0, nodeId)
+} else {
+result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0)
+}
} else if job.IsGPUTask() {
startTime := int64(job.StartTime)
if startTime == 0 {


Loading…
Cancel
Save