#3908 fix-3812

Merged
ychao_1983 merged 14 commits from fix-3812 into V20230322 1 year ago
  1. +8
    -0
      models/cloudbrain.go
  2. +36
    -4
      modules/grampus/resty.go
  3. +2
    -1
      routers/api/v1/api.go
  4. +24
    -4
      routers/repo/grampus.go
  5. +1
    -1
      templates/repo/grampus/notebook/gcu/new.tmpl
  6. +18
    -0
      templates/repo/grampus/notebook/show.tmpl
  7. +1
    -1
      templates/repo/grampus/trainjob/gcu/new.tmpl
  8. +4
    -2
      templates/repo/grampus/trainjob/show.tmpl

+ 8
- 0
models/cloudbrain.go View File

@@ -1677,6 +1677,11 @@ type GetGrampusJobEventsResponse struct {
JobEvents []GrampusJobEvents `json:"jobEvents"`
TotalSize int `json:"totalSize"`
}
// GetGrampusDebugJobEventsResponse is the payload that GetDebugJobEvents
// decodes from the notebook job "/events" endpoint.
type GetGrampusDebugJobEventsResponse struct {
// Embedded common result; presumably supplies the ErrorCode/ErrorMsg fields
// that callers inspect - confirm against the GrampusResult definition.
GrampusResult
// NotebookEvents is decoded from the "notebookEvents" JSON key.
NotebookEvents []GrampusJobEvents `json:"notebookEvents"`
// TotalSize is decoded from the "totalSize" JSON key; presumably the number
// of events available - TODO confirm.
TotalSize int `json:"totalSize"`
}

type GrampusTasks struct {
Command string `json:"command"`
@@ -1947,6 +1952,9 @@ func QueryModelTrainJobList(repoId int64) ([]*Cloudbrain, int, error) {
// cond = cond.And(
// builder.In("type", 0, 1),
// )
cond = cond.And(
builder.In("compute_resource", "NPU", "CPU/GPU"),
)

cloudbrains := make([]*Cloudbrain, 0)
if err := sess.Select("*").Table(&Cloudbrain{}).Where(cond).OrderBy("created_unix DESC").


+ 36
- 4
modules/grampus/resty.go View File

@@ -425,7 +425,39 @@ sendjob:
return &result, nil
}

func GetJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) {
// GetDebugJobEvents fetches the event list of a Grampus notebook (debug) job.
//
// It sends a GET to HOST + urlNotebookJob + "/" + jobID + "/events" with the
// current TOKEN. When the response reports errorIllegalToken, getToken is
// called once and the request is sent again. A transport error, or a non-zero
// ErrorCode in the final response, is returned as an error with a nil result;
// otherwise the decoded response is returned.
func GetDebugJobEvents(jobID string) (*models.GetGrampusDebugJobEventsResponse, error) {
	checkSetting()
	restyClient := getRestyClient()

	var resp models.GetGrampusDebugJobEventsResponse
	tokenRefreshed := false

	for {
		// TOKEN and the URL are read on every attempt so that a refreshed
		// token is picked up by the second request.
		request := restyClient.R().
			SetAuthToken(TOKEN).
			SetResult(&resp)
		if _, err := request.Get(HOST + urlNotebookJob + "/" + jobID + "/events"); err != nil {
			return nil, fmt.Errorf("resty GetDebugJobEvents: %v", err)
		}

		// Only an illegal-token answer triggers a retry, and only once.
		if resp.ErrorCode != errorIllegalToken || tokenRefreshed {
			break
		}
		tokenRefreshed = true
		log.Info("retry get token")
		// The outcome of the refresh is discarded here; the retried request
		// decides whether the call succeeds.
		_ = getToken()
	}

	if resp.ErrorCode != 0 {
		log.Error("GetDebugJobEvents failed(%d): %s", resp.ErrorCode, resp.ErrorMsg)
		return nil, fmt.Errorf("GetDebugJobEvents failed(%d): %s", resp.ErrorCode, resp.ErrorMsg)
	}

	return &resp, nil
}

func GetTrainJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) {
checkSetting()
client := getRestyClient()
var result models.GetGrampusJobEventsResponse
@@ -438,7 +470,7 @@ sendjob:
SetResult(&result).
Get(HOST + urlTrainJob + "/" + jobID + "/events")
if err != nil {
return nil, fmt.Errorf("resty GetJobEvents: %v", err)
return nil, fmt.Errorf("resty GetTrainJobEvents: %v", err)
}

if result.ErrorCode == errorIllegalToken && retry < 1 {
@@ -449,8 +481,8 @@ sendjob:
}

if result.ErrorCode != 0 {
log.Error("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
return nil, fmt.Errorf("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
log.Error("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
return nil, fmt.Errorf("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}

return &result, nil


+ 2
- 1
routers/api/v1/api.go View File

@@ -1122,6 +1122,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Group("/grampus", func() {
m.Group("/notebook", func() {
m.Get("/:id", repo_ext.GetGrampusNotebook)
m.Get("/:id/job_event", repo_ext.GrampusDebugJobEvents)
})
m.Group("/train-job", func() {
m.Group("/:jobid", func() {
@@ -1131,7 +1132,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/metrics", repo_ext.GrampusMetrics)
m.Get("/download_multi_model", cloudbrain.AdminOrJobCreaterRightForTrain, repo.MultiModelDownload)
m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
m.Get("/job_event", repo_ext.GrampusJobEvents)
m.Get("/job_event", repo_ext.GrampusTrainJobEvents)
})
})
}, reqRepoReader(models.UnitTypeCloudBrain))


+ 24
- 4
routers/repo/grampus.go View File

@@ -1690,8 +1690,28 @@ func GrampusMetrics(ctx *context.Context) {

return
}
// GrampusDebugJobEvents responds with the events of a Grampus notebook
// (debug) job as JSON.
//
// The ":id" route parameter is resolved through models.GetCloudbrainByID and
// the record's JobID is passed to grampus.GetDebugJobEvents. On success the
// body is {"JobID": <":id" parameter>, "JobEvents": <notebook events>} with
// status 200. If either lookup fails, the error is logged and reported through
// ctx.ServerError.
func GrampusDebugJobEvents(ctx *context.Context) {
	ID := ctx.Params(":id")
	job, err := models.GetCloudbrainByID(ID)
	if err != nil {
		log.Error("GetCloudbrainByID failed: %v", err, ctx.Data["MsgID"])
		ctx.ServerError(err.Error(), err)
		return
	}

	result, err := grampus.GetDebugJobEvents(job.JobID)
	if err != nil {
		log.Error("GetDebugJobEvents failed: %v", err, ctx.Data["MsgID"])
		// GetDebugJobEvents returns a nil result together with any error, so
		// reading result.NotebookEvents below would dereference a nil pointer.
		ctx.ServerError(err.Error(), err)
		return
	}

	ctx.JSON(http.StatusOK, map[string]interface{}{
		"JobID":     ID,
		"JobEvents": result.NotebookEvents,
	})
}

func GrampusJobEvents(ctx *context.Context) {
func GrampusTrainJobEvents(ctx *context.Context) {
jobID := ctx.Params(":jobid")
job, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
@@ -1700,7 +1720,7 @@ func GrampusJobEvents(ctx *context.Context) {
return
}

result, err := grampus.GetJobEvents(job.JobID)
result, err := grampus.GetTrainJobEvents(job.JobID)
if err != nil {
log.Error("GetJobEvents failed: %v", err, ctx.Data["MsgID"])
}
@@ -1775,12 +1795,12 @@ func generateCommand(repoName, processorType, bootFile, paramSrc, outputRemotePa
commandCode = "source /home/ma-user/.bashrc;python /home/ma-user/davinci/train/davincirun.py python /home/ma-user/openi.py " + paramCode + ";"
} else if processorType == grampus.ProcessorTypeGPU {
if pretrainModelFileName != "" {
paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName+"'"
paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName + "'"
}
commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
} else if processorType == grampus.ProcessorTypeGCU {
if pretrainModelFileName != "" {
paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName+"'"
paramCode += " --ckpt_url" + "='" + workDir + "pretrainmodel/" + pretrainModelFileName + "'"
}
commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python3 " + bootFile + paramCode + ";"
}


+ 1
- 1
templates/repo/grampus/notebook/gcu/new.tmpl View File

@@ -99,7 +99,7 @@
{{end}}
</select>
</div>
{{template "custom/select_model" .}}
<!-- {{template "custom/select_model" .}} -->
<div class="inline min_title required field">
<label class="label-fix-width" style="font-weight: normal;">{{.i18n.Tr "cloudbrain.mirror"}}</label>
<select class="ui search dropdown cloudbrain_image width48" placeholder="{{.i18n.Tr "cloudbrain.choose_mirror"}}" style='width:385px' name="image_id">


+ 18
- 0
templates/repo/grampus/notebook/show.tmpl View File

@@ -52,6 +52,9 @@
<div class="content-pad">
<div class="ui pointing secondary menu" style="border-bottom: 1px solid rgba(34,36,38,.15);">
<a class="active item" data-tab="first">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
{{if eq .ComputeResource "CPU/GPU"}}
<a class="item run_info" data-tab="five" data-version="{{.VersionName}}">{{$.i18n.Tr "repo.cloudbrain.runinfo"}}</a>
{{end}}
</div>
<div class="ui tab active" data-tab="first">
<div style="padding-top: 10px;">
@@ -391,7 +394,21 @@
</div>
</div>

<div class="ui tab" data-tab="five">
<div style="position: relative;border: 1px solid rgba(0,0,0,.2);padding: 0 10px;margin-top: 10px;">
<div class="ui attached info" id="info{{.VersionName}}"
style="height: 300px !important; overflow: auto;">
<div class="ui inverted active dimmer">
<div class="ui loader"></div>
</div>
<span class="info_text">
</span>
</div>

</div>

</div>
</div>
</div>
</div>
@@ -424,6 +441,7 @@
{{template "base/footer" .}}
<script src="{{StaticUrlPrefix}}/js/specsuse.js?v={{MD5 AppVer}}" type="text/javascript"></script>
<script>
$('.menu .item').tab()
;(function() {
var SPEC = {{ .Spec }};
var showPoint = false;


+ 1
- 1
templates/repo/grampus/trainjob/gcu/new.tmpl View File

@@ -141,7 +141,7 @@
{{end}}
</select>
</div>
{{template "custom/select_model" .}}
<!-- {{template "custom/select_model" .}} -->
<div class="required min_title inline field" id="engine_name">
<label class="label-fix-width" style="font-weight: normal;">{{.i18n.Tr "cloudbrain.mirror"}}</label>
<select class="ui dropdown cloudbrain_image width81" id="trainjob_images" name="image_id">


+ 4
- 2
templates/repo/grampus/trainjob/show.tmpl View File

@@ -70,7 +70,7 @@
</span>
</div>
<div style="float: right;">
{{if and ($.canDownload) (ne .Status "WAITING") ($.Permission.CanWrite $.UnitTypeModelManage) }}
{{if and ($.canDownload) (ne .Status "WAITING") ($.Permission.CanWrite $.UnitTypeModelManage) (ne .ComputeResource "GCU")}}
<a class="ti-action-menu-item" id="{{.VersionName}}-create-model"
onclick="showcreate({DisplayJobName:{{.DisplayJobName}},JobName:{{.JobName}},JobID:{{.JobID}},VersionName:{{.VersionName}},EngineName:{{.EngineName}},ComputeResource:{{.ComputeResource}},Type:{{.Type}}})">{{$.i18n.Tr "repo.modelarts.create_model"}}</a>
{{else}}
@@ -89,7 +89,9 @@
<a class="active item" data-tab="first{{$k}}">{{$.i18n.Tr "repo.modelarts.train_job.config"}}</a>
<a class="item log_bottom" data-tab="second{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}" data-path="{{$.RepoRelPath}}/grampus/train-job/{{.JobID}}/metrics">{{$.i18n.Tr "cloudbrain.resource_use"}}</a>

{{if eq .ComputeResource "CPU/GPU"}}
<a class="item run_info" data-tab="five{{$k}}" data-version="{{.VersionName}}">{{$.i18n.Tr "repo.cloudbrain.runinfo"}}</a>
{{end}}
<a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>
</div>
<div class="ui tab active" data-tab="first{{$k}}">


Loading…
Cancel
Save