From e704c7d583104aeaad5f98bc11cc7d9b7a723c75 Mon Sep 17 00:00:00 2001 From: liuzx Date: Fri, 17 Mar 2023 11:22:04 +0800 Subject: [PATCH 01/13] =?UTF-8?q?#3812=20=E5=A2=9E=E5=8A=A0=E6=99=BA?= =?UTF-8?q?=E7=AE=97gpu=E8=BF=90=E8=A1=8C=E7=AE=80=E5=86=B5=E4=BF=A1?= =?UTF-8?q?=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 13 +++++++++++++ modules/grampus/resty.go | 31 +++++++++++++++++++++++++++++++ routers/api/v1/api.go | 1 + routers/repo/grampus.go | 21 +++++++++++++++++++++ 4 files changed, 66 insertions(+) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 798cdefa53..0549630e7d 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -1610,6 +1610,13 @@ type SpecInfo struct { MemorySize string `json:"memorySize"` } +type GrampusJobEvents struct { + Message string `json:"message"` + Name string `json:"name"` + Reason string `json:"reason"` + Timestamp string `json:"timestamp"` +} + type GetGrampusResourceSpecsResult struct { GrampusResult Infos []GrampusSpec `json:"resourceSpecs"` @@ -1664,6 +1671,12 @@ type GrampusStopJobResponse struct { Status string `json:"status"` } +type GetGrampusJobEventsResponse struct { + GrampusResult + JobEvents []GrampusJobEvents `json:"jobEvents"` + TotalSize int `json:"totalSize"` +} + type GrampusTasks struct { Command string `json:"command"` Name string `json:"name"` diff --git a/modules/grampus/resty.go b/modules/grampus/resty.go index 65d195318f..a8ea84746c 100755 --- a/modules/grampus/resty.go +++ b/modules/grampus/resty.go @@ -408,6 +408,37 @@ sendjob: return &result, nil } +func GetJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) { + checkSetting() + client := getRestyClient() + var result models.GetGrampusJobEventsResponse + + retry := 0 + +sendjob: + _, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). + Get(HOST + urlTrainJob + "/" + jobID + "/events") + if err != nil { + return nil, fmt.Errorf("resty GetJobEvents: %v", err) + } + + if result.ErrorCode == errorIllegalToken && retry < 1 { + retry++ + log.Info("retry get token") + _ = getToken() + goto sendjob + } + + if result.ErrorCode != 0 { + log.Error("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) + return nil, fmt.Errorf("GetJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + func RestartNotebookJob(jobID string) (*models.GrampusNotebookRestartResponse, error) { checkSetting() client := getRestyClient() diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index fe5b797d1f..c560556309 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -1128,6 +1128,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/metrics", repo_ext.GrampusMetrics) m.Get("/download_multi_model", cloudbrain.AdminOrJobCreaterRightForTrain, repo.MultiModelDownload) m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog) + m.Get("/job_event", repo_ext.GrampusJobEvents) }) }) }, reqRepoReader(models.UnitTypeCloudBrain)) diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index 3c8da0ae1f..f1eadf34df 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -1414,6 +1414,27 @@ func GrampusMetrics(ctx *context.Context) { return } +func GrampusJobEvents(ctx *context.Context) { + jobID := ctx.Params(":jobid") + job, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"]) + ctx.ServerError(err.Error(), err) + return + } + + result, err := grampus.GetJobEvents(job.JobID) + if err != nil { + log.Error("GetJobEvents failed: %v", err, ctx.Data["MsgID"]) + } + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "JobEvents": result.JobEvents, + }) + + return +} + func generateCommand(repoName, processorType, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelFileName, modelRemoteObsUrl string) (string, error) { var command string -- 2.34.1 From 77f0f28cbab054e22e66d2e0b6507203075ffc8a Mon Sep 17 00:00:00 2001 From: liuzx Date: Mon, 20 Mar 2023 17:09:10 +0800 Subject: [PATCH 02/13] =?UTF-8?q?#3820=20=E5=A2=9E=E5=8A=A0=E7=BB=93?= =?UTF-8?q?=E6=9E=9C=E4=B8=8B=E8=BD=BD=E7=9A=84=E5=87=A0=E7=A7=8D=E7=8A=B6?= =?UTF-8?q?=E6=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- routers/api/v1/repo/modelarts.go | 9 ++++-- .../cloudbrain/cloudbrainTask/sync_status.go | 30 ------------------- 2 files changed, 7 insertions(+), 32 deletions(-) diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 731867aced..72d5875f6d 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -6,7 +6,6 @@ package repo import ( - "code.gitea.io/gitea/routers/response" "encoding/json" "net/http" "path" @@ -15,6 +14,8 @@ import ( "strings" "time" + "code.gitea.io/gitea/routers/response" + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask" @@ -504,7 +505,11 @@ func ModelList(ctx *context.APIContext) { } if task.Type == models.TypeC2Net { - status = cloudbrainTask.GetModelListFileStatus(fileInfos, task) + status, err = cloudbrainTask.GetModelScheduleStatus(task.JobID) + if err != nil { + log.Error("GetModelScheduleStatus(%s) failed:%v", task.JobName, err.Error()) + return + } } ctx.JSON(http.StatusOK, map[string]interface{}{ diff --git a/services/cloudbrain/cloudbrainTask/sync_status.go b/services/cloudbrain/cloudbrainTask/sync_status.go index 9ebe57b2df..29d99c7901 100644 --- a/services/cloudbrain/cloudbrainTask/sync_status.go +++ b/services/cloudbrain/cloudbrainTask/sync_status.go @@ -12,7 +12,6 @@ import ( "code.gitea.io/gitea/modules/modelarts_cd" "code.gitea.io/gitea/modules/notification" "code.gitea.io/gitea/modules/setting" - "code.gitea.io/gitea/modules/storage" "code.gitea.io/gitea/modules/timeutil" ) @@ -170,32 +169,3 @@ func StopDebugJob(task *models.Cloudbrain) error { return err } - -func GetModelListFileStatus(fileInfos []storage.FileInfo, task *models.Cloudbrain) (status int) { - if len(fileInfos) > 0 { - status = models.StorageScheduleSucceed - } else { - if models.IsTrainJobTerminal(task.Status) { - if task.Status == models.GrampusStatusStopped { - status = models.StorageNoFile - } else if task.Status == models.GrampusStatusFailed { - if task.AiCenter == "" { - status = models.StorageNoFile - } - } else { - record, _ := models.GetScheduleRecordByCloudbrainID(task.ID) - if record != nil { - status = record.Status - if status == models.StorageScheduleSucceed { - status = models.StorageNoFile - } - } else { - status = models.StorageScheduleProcessing - } - } - } else { - status = models.StorageScheduleWaiting - } - } - return status -} -- 2.34.1 From 3ac8cfe0d1eff3e7da12a549bea3616bf0ee86c2 Mon Sep 17 00:00:00 2001 From: liuzx Date: Tue, 21 Mar 2023 14:58:17 +0800 Subject: [PATCH 03/13] =?UTF-8?q?#3820=20=E4=BF=AE=E6=94=B94=E7=A7=8D?= =?UTF-8?q?=E8=BF=94=E5=9B=9E=E7=8A=B6=E6=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- web_src/js/features/cloudbrainShow.js | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/web_src/js/features/cloudbrainShow.js b/web_src/js/features/cloudbrainShow.js index 64aa0c4c60..8455934149 100644 --- a/web_src/js/features/cloudbrainShow.js +++ b/web_src/js/features/cloudbrainShow.js @@ -592,21 +592,21 @@ export default async function initCloudrainSow() { ${i18n['file_sync_fail']} `); - } else if (data.StatusOK == 3) { // 无文件 3 + } else if (data.StatusOK == 3) { // 任务未结束 3 $(`#file_breadcrumb${version_name}`).empty(); $(`#dir_list${version_name}`).html(`
- +
- ${i18n['no_file_to_download']} + ${i18n['task_not_finished']}
`); - } else if (data.StatusOK == 4) { // 任务未结束 4 + } else if (data.StatusOK == 4) { // 无文件 4 $(`#file_breadcrumb${version_name}`).empty(); $(`#dir_list${version_name}`).html(`
- +
- ${i18n['task_not_finished']} + ${i18n['no_file_to_download']}
`); } }).fail(function (err) { -- 2.34.1 From b12f071d288a25e0fbdba7ee4065bd35b2b64c67 Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Tue, 21 Mar 2023 16:05:36 +0800 Subject: [PATCH 04/13] add c2net gpu run info --- templates/repo/grampus/trainjob/show.tmpl | 16 +++++++++++ web_src/js/features/cloudbrainShow.js | 34 +++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/templates/repo/grampus/trainjob/show.tmpl b/templates/repo/grampus/trainjob/show.tmpl index 7a15d0f8e8..97e0f67dbc 100755 --- a/templates/repo/grampus/trainjob/show.tmpl +++ b/templates/repo/grampus/trainjob/show.tmpl @@ -88,6 +88,7 @@ {{$.i18n.Tr "repo.modelarts.train_job.config"}} {{$.i18n.Tr "repo.modelarts.log"}} + {{$.i18n.Tr "repo.cloudbrain.runinfo"}} {{ if eq $.Spec.ComputeResource "NPU"}} {{$.i18n.Tr "cloudbrain.resource_use"}} {{end}} @@ -385,6 +386,21 @@ + +
+
+
+
+
+
+ + + +
+ +
+
{ + $(`#info${version_name} .ui.inverted.active.dimmer`).css( + "display", "none", + ); + parseInfo(data,version_name) + }) + }); + function parseInfo(jsonObj,version_name){ + let html = ""; + if (jsonObj != null){ + let podEventArray = jsonObj['JobEvents']; + console.log("podEventArray",podEventArray) + if(podEventArray != null){ + for(var i=0; i < podEventArray.length;i++){ + if (podEventArray[i]["reason"] != "") { + let time = new Date(podEventArray[i]["timestamp"]) + html += `

[${podEventArray[i]["reason"]}] ${time.toLocaleString()}

` + html += `

${podEventArray[i]["message"]}

`; + } + } + } + } + $(`#info${version_name} .info_text`)[0].innerHTML = html + } // $(".log-scroll-max").scroll(); $(".full-log-dialog").click(function () { -- 2.34.1 From 5cb09d04f343129f66f09a73d86a2b8d8f4ce4f3 Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 22 Mar 2023 11:21:24 +0800 Subject: [PATCH 05/13] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=BF=94=E5=9B=9E?= =?UTF-8?q?=E7=BB=93=E6=9E=9C=E5=88=9D=E5=A7=8B=E7=8A=B6=E6=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/urfs_client/urchin/schedule.go | 114 +++++++++++++------------ 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/modules/urfs_client/urchin/schedule.go b/modules/urfs_client/urchin/schedule.go index 261508ffed..447dd1c4d4 100755 --- a/modules/urfs_client/urchin/schedule.go +++ b/modules/urfs_client/urchin/schedule.go @@ -71,16 +71,17 @@ func GetGPUDataBack(cloudbrainID int64, jobName, centerId string) error { log.Error("ScheduleDataToPeerByKey failed info is EndPoint:%s,Bucket:%s,ObjectKey:%s,ProxyServer:%s,TargetObjectKey:%s,error:%v", endpoint, bucket, objectKey, destPeerHost, grampus.GetGPUModelObjectKey(jobName), err) _, err = models.InsertScheduleRecord(&models.ScheduleRecord{ - CloudbrainID: cloudbrainID, - EndPoint: endpoint, - Bucket: bucket, - ObjectKey: objectKey, - ProxyServer: destPeerHost, - Status: models.StorageScheduleFailed, - IsDir: true, - ComputeSource: models.GPUResource, - TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), - Remark: interceptErrorMessages(err), + CloudbrainID: cloudbrainID, + EndPoint: endpoint, + Bucket: bucket, + ObjectKey: objectKey, + ProxyServer: destPeerHost, + Status: models.StorageScheduleFailed, + IsDir: true, + ComputeSource: models.GPUResource, + TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), + Remark: interceptErrorMessages(err), + LocalOperateStatus: models.StorageLocalOperateFailed, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -90,15 +91,16 @@ func GetGPUDataBack(cloudbrainID int64, jobName, centerId string) error { } _, err = models.InsertScheduleRecord(&models.ScheduleRecord{ - CloudbrainID: cloudbrainID, - EndPoint: endpoint, - Bucket: bucket, - ObjectKey: objectKey, - ProxyServer: destPeerHost, - Status: res.StatusCode, - IsDir: true, - ComputeSource: models.GPUResource, - TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), + CloudbrainID: cloudbrainID, + EndPoint: endpoint, + Bucket: bucket, + ObjectKey: objectKey, + ProxyServer: destPeerHost, + Status: res.StatusCode, + IsDir: true, + ComputeSource: models.GPUResource, + TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), + LocalOperateStatus: models.StorageLocalOperateWaiting, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -148,15 +150,16 @@ func GetGCUDataBack(cloudbrainID int64, jobName, centerId string) error { log.Error("ScheduleDataToPeerByKey failed info is EndPoint:%s,Bucket:%s,ObjectKey:%s,ProxyServer:%s,TargetObjectKey:%s,error:%v", endpoint, bucket, objectKey, destPeerHost, grampus.GetGPUModelObjectKey(jobName), err) _, err = models.InsertScheduleRecord(&models.ScheduleRecord{ - CloudbrainID: cloudbrainID, - EndPoint: endpoint, - Bucket: bucket, - ObjectKey: objectKey, - ProxyServer: destPeerHost, - Status: models.StorageScheduleFailed, - IsDir: true, - ComputeSource: models.GCUResource, - TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), + CloudbrainID: cloudbrainID, + EndPoint: endpoint, + Bucket: bucket, + ObjectKey: objectKey, + ProxyServer: destPeerHost, + Status: models.StorageScheduleFailed, + IsDir: true, + ComputeSource: models.GCUResource, + TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), + LocalOperateStatus: models.StorageLocalOperateFailed, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -166,15 +169,16 @@ func GetGCUDataBack(cloudbrainID int64, jobName, centerId string) error { } _, err = models.InsertScheduleRecord(&models.ScheduleRecord{ - CloudbrainID: cloudbrainID, - EndPoint: endpoint, - Bucket: bucket, - ObjectKey: objectKey, - ProxyServer: destPeerHost, - Status: res.StatusCode, - IsDir: true, - ComputeSource: models.GCUResource, - TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), + CloudbrainID: cloudbrainID, + EndPoint: endpoint, + Bucket: bucket, + ObjectKey: objectKey, + ProxyServer: destPeerHost, + Status: res.StatusCode, + IsDir: true, + ComputeSource: models.GCUResource, + TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), + LocalOperateStatus: models.StorageLocalOperateWaiting, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -224,15 +228,16 @@ func GetNPUDataBack(cloudbrainID int64, jobName, centerId string) error { log.Error("ScheduleDataToPeerByKey failed after retrying, errorInfo is EndPoint:%s,Bucket:%s,ObjectKey:%s,ProxyServer:%s,error:%v", endpoint, bucket, objectKey, destPeerHost, err) _, err = models.InsertScheduleRecord(&models.ScheduleRecord{ - CloudbrainID: cloudbrainID, - EndPoint: endpoint, - Bucket: bucket, - ObjectKey: objectKey, - ProxyServer: destPeerHost, - Status: models.StorageScheduleFailed, - IsDir: false, - ComputeSource: models.NPUResource, - Remark: interceptErrorMessages(err), + CloudbrainID: cloudbrainID, + EndPoint: endpoint, + Bucket: bucket, + ObjectKey: objectKey, + ProxyServer: destPeerHost, + Status: models.StorageScheduleFailed, + IsDir: false, + ComputeSource: models.NPUResource, + Remark: interceptErrorMessages(err), + LocalOperateStatus: models.StorageLocalOperateFailed, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -242,14 +247,15 @@ func GetNPUDataBack(cloudbrainID int64, jobName, centerId string) error { } _, err = models.InsertScheduleRecord(&models.ScheduleRecord{ - CloudbrainID: cloudbrainID, - EndPoint: endpoint, - Bucket: bucket, - ObjectKey: objectKey, - ProxyServer: destPeerHost, - Status: res.StatusCode, - IsDir: false, - ComputeSource: models.NPUResource, + CloudbrainID: cloudbrainID, + EndPoint: endpoint, + Bucket: bucket, + ObjectKey: objectKey, + ProxyServer: destPeerHost, + Status: res.StatusCode, + IsDir: false, + ComputeSource: models.NPUResource, + LocalOperateStatus: models.StorageLocalOperateWaiting, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) -- 2.34.1 From b26684b1848130e887df45cb52f76ebe5b1ba7d6 Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 22 Mar 2023 16:03:47 +0800 Subject: [PATCH 06/13] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E8=BF=94?= =?UTF-8?q?=E5=9B=9E=E7=BB=93=E6=9E=9C=E7=8A=B6=E6=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/urfs_client/urchin/schedule.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/urfs_client/urchin/schedule.go b/modules/urfs_client/urchin/schedule.go index 447dd1c4d4..5632cc20ce 100755 --- a/modules/urfs_client/urchin/schedule.go +++ b/modules/urfs_client/urchin/schedule.go @@ -100,7 +100,7 @@ func GetGPUDataBack(cloudbrainID int64, jobName, centerId string) error { IsDir: true, ComputeSource: models.GPUResource, TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), - LocalOperateStatus: models.StorageLocalOperateWaiting, + LocalOperateStatus: models.StorageLocalOperating, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -255,7 +255,7 @@ func GetNPUDataBack(cloudbrainID int64, jobName, centerId string) error { Status: res.StatusCode, IsDir: false, ComputeSource: models.NPUResource, - LocalOperateStatus: models.StorageLocalOperateWaiting, + LocalOperateStatus: models.StorageLocalOperating, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) -- 2.34.1 From c77e92ad795a27c5720c85c1f69f46ba68f04d36 Mon Sep 17 00:00:00 2001 From: chenyifan01 Date: Wed, 22 Mar 2023 17:16:17 +0800 Subject: [PATCH 07/13] update --- modules/urfs_client/urchin/schedule.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/urfs_client/urchin/schedule.go b/modules/urfs_client/urchin/schedule.go index 5632cc20ce..ffd90e9b92 100755 --- a/modules/urfs_client/urchin/schedule.go +++ b/modules/urfs_client/urchin/schedule.go @@ -380,9 +380,11 @@ func handleScheduleResult(record *models.ScheduleRecord, res *PeerResult) error if record.ComputeSource == models.GPUResource || record.ComputeSource == models.GCUResource { err = MoveBucketInOpenIMinio(res.DataPath, record.TargetObjectKey, res.DataRoot, setting.Attachment.Minio.Bucket) if err != nil { + models.UpdateScheduleLocalOperateStatus(record, models.StorageLocalOperateFailed) log.Error("GetBackModel MoveBucketInOpenIMinio err.%v", err) return err } + models.UpdateScheduleLocalOperateStatus(record, models.StorageLocalOperateSucceed) } else { decompress(record.Bucket+"/"+record.ObjectKey, setting.Bucket+"/"+strings.TrimSuffix(record.ObjectKey, models.ModelSuffix)) } -- 2.34.1 From cb42af567932d3b42346859312114e96ef79bb63 Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 22 Mar 2023 17:30:21 +0800 Subject: [PATCH 08/13] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=9D=E5=A7=8B?= =?UTF-8?q?=E7=8A=B6=E6=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/urfs_client/urchin/schedule.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/urfs_client/urchin/schedule.go b/modules/urfs_client/urchin/schedule.go index ffd90e9b92..380edc65d9 100755 --- a/modules/urfs_client/urchin/schedule.go +++ b/modules/urfs_client/urchin/schedule.go @@ -100,7 +100,7 @@ func GetGPUDataBack(cloudbrainID int64, jobName, centerId string) error { IsDir: true, ComputeSource: models.GPUResource, TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), - LocalOperateStatus: models.StorageLocalOperating, + LocalOperateStatus: models.StorageLocalOperateWaiting, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -255,7 +255,7 @@ func GetNPUDataBack(cloudbrainID int64, jobName, centerId string) error { Status: res.StatusCode, IsDir: false, ComputeSource: models.NPUResource, - LocalOperateStatus: models.StorageLocalOperating, + LocalOperateStatus: models.StorageLocalOperateWaiting, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) -- 2.34.1 From d583cd2ce294e5263fac3fb709e524b9b467cf73 Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 22 Mar 2023 17:33:07 +0800 Subject: [PATCH 09/13] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=8F=90=E7=A4=BA?= =?UTF-8?q?=E8=AF=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- web_src/js/features/i18nVue.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web_src/js/features/i18nVue.js b/web_src/js/features/i18nVue.js index d2c1b7c014..891886419d 100644 --- a/web_src/js/features/i18nVue.js +++ b/web_src/js/features/i18nVue.js @@ -72,7 +72,7 @@ export const i18nVue = { model_failed:"模型加载失败", file_sync_ing:"文件同步中,请稍侯", file_sync_fail:"文件同步失败", - no_file_to_download:"没有文件可以下载", + no_file_to_download:"没有文件可以下载,稍后再来看看", task_not_finished:"任务还未结束,稍后再来看看", local:"本地", online:"线上", @@ -212,7 +212,7 @@ export const i18nVue = { model_failed:"Failed", file_sync_ing:"File synchronization in progress, please wait", file_sync_fail:"File synchronization failed", - no_file_to_download:"No files can be downloaded", + no_file_to_download:"No files can be downloaded, please wait", task_not_finished:"Task not finished yet, please wait", local:"Local", online:"Online", -- 2.34.1 From ae03239558d22c8959ef07d96c99653140fb1d04 Mon Sep 17 00:00:00 2001 From: chenyifan01 Date: Wed, 22 Mar 2023 18:40:27 +0800 Subject: [PATCH 10/13] update --- models/schedule_record.go | 32 ++++++---- modules/urfs_client/urchin/schedule.go | 30 ++++----- routers/api/v1/repo/modelarts.go | 2 +- .../cloudbrain/cloudbrainTask/schedule.go | 63 +++++++++++++------ 4 files changed, 81 insertions(+), 46 deletions(-) diff --git a/models/schedule_record.go b/models/schedule_record.go index e352adabfc..84af017421 100755 --- a/models/schedule_record.go +++ b/models/schedule_record.go @@ -1,25 +1,33 @@ package models import ( - "fmt" "time" "code.gitea.io/gitea/modules/timeutil" ) const ( - StorageScheduleSucceed int = iota - StorageScheduleProcessing - StorageScheduleFailed - StorageNoFile - StorageScheduleWaiting + StorageUrchinScheduleSucceed int = iota + StorageUrchinScheduleProcessing + StorageUrchinScheduleFailed + StorageUrchinNoFile + StorageUrchinScheduleWaiting ) const ( - StorageLocalOperateSucceed int = iota - StorageLocalOperating - StorageLocalOperateFailed - StorageLocalOperateWaiting + MoveBucketSucceed int = iota + MoveBucketOperating + MoveBucketFailed + MoveBucketWaiting +) + +type ModelScheduleStatus int + +const ( + ModelScheduleSucceed ModelScheduleStatus = iota + ModelScheduleOperating + ModelScheduleFailed + ModelScheduleWaiting ) const UrchinDefaultBucket = "urchincache" @@ -59,7 +67,7 @@ func UpdateScheduleLocalOperateStatus(record *ScheduleRecord, newLocalOperateSta func GetSchedulingRecord() ([]*ScheduleRecord, error) { records := make([]*ScheduleRecord, 0, 10) return records, x. - Where("status = ?", StorageScheduleProcessing). + Where("status = ?", StorageUrchinScheduleProcessing). Limit(100). Find(&records) } @@ -79,7 +87,7 @@ func getScheduleRecordByPrID(e Engine, cloudbrainId int64) (*ScheduleRecord, err if err != nil { return nil, err } else if !has { - return nil, fmt.Errorf("get record by cloudbrain_id failed(%d)", cloudbrainId) + return nil, ErrRecordNotExist{} } return record, nil } diff --git a/modules/urfs_client/urchin/schedule.go b/modules/urfs_client/urchin/schedule.go index ffd90e9b92..da43d076f0 100755 --- a/modules/urfs_client/urchin/schedule.go +++ b/modules/urfs_client/urchin/schedule.go @@ -76,12 +76,12 @@ func GetGPUDataBack(cloudbrainID int64, jobName, centerId string) error { Bucket: bucket, ObjectKey: objectKey, ProxyServer: destPeerHost, - Status: models.StorageScheduleFailed, + Status: models.StorageUrchinScheduleFailed, IsDir: true, ComputeSource: models.GPUResource, TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), Remark: interceptErrorMessages(err), - LocalOperateStatus: models.StorageLocalOperateFailed, + LocalOperateStatus: models.MoveBucketFailed, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -100,7 +100,7 @@ func GetGPUDataBack(cloudbrainID int64, jobName, centerId string) error { IsDir: true, ComputeSource: models.GPUResource, TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), - LocalOperateStatus: models.StorageLocalOperating, + LocalOperateStatus: models.MoveBucketOperating, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -155,11 +155,11 @@ func GetGCUDataBack(cloudbrainID int64, jobName, centerId string) error { Bucket: bucket, ObjectKey: objectKey, ProxyServer: destPeerHost, - Status: models.StorageScheduleFailed, + Status: models.StorageUrchinScheduleFailed, IsDir: true, ComputeSource: models.GCUResource, TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), - LocalOperateStatus: models.StorageLocalOperateFailed, + LocalOperateStatus: models.MoveBucketFailed, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -178,7 +178,7 @@ func GetGCUDataBack(cloudbrainID int64, jobName, centerId string) error { IsDir: true, ComputeSource: models.GCUResource, TargetObjectKey: grampus.GetGPUModelObjectKey(jobName), - LocalOperateStatus: models.StorageLocalOperateWaiting, + LocalOperateStatus: models.MoveBucketWaiting, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -233,11 +233,11 @@ func GetNPUDataBack(cloudbrainID int64, jobName, centerId string) error { Bucket: bucket, ObjectKey: objectKey, ProxyServer: destPeerHost, - Status: models.StorageScheduleFailed, + Status: models.StorageUrchinScheduleFailed, IsDir: false, ComputeSource: models.NPUResource, Remark: interceptErrorMessages(err), - LocalOperateStatus: models.StorageLocalOperateFailed, + LocalOperateStatus: models.MoveBucketFailed, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -255,7 +255,7 @@ func GetNPUDataBack(cloudbrainID int64, jobName, centerId string) error { Status: res.StatusCode, IsDir: false, ComputeSource: models.NPUResource, - LocalOperateStatus: models.StorageLocalOperating, + LocalOperateStatus: models.MoveBucketOperating, }) if err != nil { log.Error("InsertScheduleRecord failed:%v", err) @@ -374,24 +374,24 @@ func HandleScheduleRecords() error { func handleScheduleResult(record *models.ScheduleRecord, res *PeerResult) error { var err error switch res.StatusCode { - case models.StorageScheduleSucceed: + case models.StorageUrchinScheduleSucceed: log.Info("ScheduleDataToPeerByKey(%s) succeed", record.ObjectKey) - models.UpdateScheduleLocalOperateStatus(record, models.StorageLocalOperating) + models.UpdateScheduleLocalOperateStatus(record, models.MoveBucketOperating) if record.ComputeSource == models.GPUResource || record.ComputeSource == models.GCUResource { err = MoveBucketInOpenIMinio(res.DataPath, record.TargetObjectKey, res.DataRoot, setting.Attachment.Minio.Bucket) if err != nil { - models.UpdateScheduleLocalOperateStatus(record, models.StorageLocalOperateFailed) + models.UpdateScheduleLocalOperateStatus(record, models.MoveBucketFailed) log.Error("GetBackModel MoveBucketInOpenIMinio err.%v", err) return err } - models.UpdateScheduleLocalOperateStatus(record, models.StorageLocalOperateSucceed) + models.UpdateScheduleLocalOperateStatus(record, models.MoveBucketSucceed) } else { decompress(record.Bucket+"/"+record.ObjectKey, setting.Bucket+"/"+strings.TrimSuffix(record.ObjectKey, models.ModelSuffix)) } - case models.StorageScheduleProcessing: + case models.StorageUrchinScheduleProcessing: log.Info("ScheduleDataToPeerByKey(%s) processing", record.ObjectKey) - case models.StorageScheduleFailed: + case models.StorageUrchinScheduleFailed: log.Error("ScheduleDataToPeerByKey(%s) failed:%s", record.ObjectKey, res.StatusMsg) default: diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index b712ec96f3..dcce09af3a 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -466,7 +466,7 @@ func ModelList(ctx *context.APIContext) { return } - status := models.StorageScheduleSucceed + status := models.StorageUrchinScheduleSucceed var fileInfos []storage.FileInfo if task.ComputeResource == models.NPUResource { prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName), "/") diff --git a/services/cloudbrain/cloudbrainTask/schedule.go b/services/cloudbrain/cloudbrainTask/schedule.go index 5f908930c3..9c9f899590 100644 --- a/services/cloudbrain/cloudbrainTask/schedule.go +++ b/services/cloudbrain/cloudbrainTask/schedule.go @@ -10,7 +10,7 @@ import ( "strings" ) -func GetModelScheduleStatus(jobId string) (int, error) { +func GetModelScheduleStatus(jobId string) (models.ModelScheduleStatus, error) { job, err := models.GetCloudbrainByJobID(jobId) if err != nil { log.Error("GetModelScheduleStatus GetCloudbrainByJobID err.jobId=%s err=%v", jobId, err) @@ -18,33 +18,60 @@ func GetModelScheduleStatus(jobId string) (int, error) { } if !job.IsTerminal() { log.Info("GetModelScheduleStatus job is not terminal.jobId=%s", jobId) - return models.StorageLocalOperateWaiting, nil + return models.ModelScheduleWaiting, nil } record, err := models.GetScheduleRecordByCloudbrainID(job.ID) if err != nil { log.Error("GetModelScheduleStatus GetScheduleRecordByCloudbrainID err.jobId=%s err=%v", jobId, err) - return models.StorageScheduleSucceed, nil + if models.IsErrRecordNotExist(err) { + return models.ModelScheduleSucceed, nil + } + return models.ModelScheduleFailed, err } + switch record.Status { - case models.StorageScheduleProcessing, models.StorageScheduleWaiting: - return models.StorageLocalOperating, nil - case models.StorageNoFile, models.StorageScheduleFailed: - return models.StorageLocalOperateFailed, nil - case models.StorageScheduleSucceed: - if record.ComputeSource == models.GPUResource || record.ComputeSource == models.GCUResource { - return record.LocalOperateStatus, nil - } - if record.LocalOperateStatus != models.StorageLocalOperating { - return record.LocalOperateStatus, nil + case models.StorageUrchinScheduleWaiting: + return models.ModelScheduleWaiting, nil + case models.StorageUrchinScheduleProcessing: + return models.ModelScheduleOperating, nil + case models.StorageUrchinScheduleFailed: + return models.ModelScheduleFailed, nil + case models.StorageUrchinNoFile: + return models.ModelScheduleSucceed, nil + case models.StorageUrchinScheduleSucceed: + moveStatus, err := GetMoveBucketStatus(record, job.JobName, job.VersionName) + if err != nil { + log.Error("GetMoveBucketStatus err.%v", err) + return models.ModelScheduleFailed, err } - //由于NPU回传后还有异步的解压,所以对于进行中的状态需要进一步查询是否已解压结束 - //判断方法是查询模型目录是否有文件 - if IsNPUModelDirHasFile(job.JobName, job.VersionName) { - models.UpdateScheduleLocalOperateStatus(record, models.StorageLocalOperateSucceed) - return models.StorageLocalOperateSucceed, nil + switch moveStatus { + case models.MoveBucketSucceed: + return models.ModelScheduleSucceed, nil + case models.MoveBucketOperating: + return models.ModelScheduleOperating, nil + case models.MoveBucketFailed: + return models.ModelScheduleFailed, nil } } + + return models.ModelScheduleFailed, nil +} + +func GetMoveBucketStatus(record *models.ScheduleRecord, jobName, versionName string) (int, error) { + + if record.ComputeSource == models.GPUResource || record.ComputeSource == models.GCUResource { + return record.LocalOperateStatus, nil + } + if record.LocalOperateStatus != models.MoveBucketOperating { + return record.LocalOperateStatus, nil + } + //由于NPU回传后还有异步的解压,所以对于进行中的状态需要进一步查询是否已解压结束 + //判断方法是查询模型目录是否有文件 + if IsNPUModelDirHasFile(jobName, versionName) { + models.UpdateScheduleLocalOperateStatus(record, models.MoveBucketSucceed) + return models.MoveBucketSucceed, nil + } return record.LocalOperateStatus, nil } -- 2.34.1 From cc66ce28437ae8a1309f0a43eca6cd4d8fdba62b Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 23 Mar 2023 10:19:49 +0800 Subject: [PATCH 11/13] #3820 --- models/cloudbrain.go | 1 + routers/api/v1/repo/modelarts.go | 15 ++++++++++----- web_src/js/features/cloudbrainShow.js | 12 ++++++++++-- web_src/js/features/i18nVue.js | 4 +++- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 0549630e7d..86999d1b6b 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -31,6 +31,7 @@ const ( TypeCloudBrainAll = -1 AccCardsNumAll = -1 + JobNoTeminal = -1 ) const ( diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index dcce09af3a..7749aa76fc 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -466,7 +466,7 @@ func ModelList(ctx *context.APIContext) { return } - status := models.StorageUrchinScheduleSucceed + status := models.ModelScheduleSucceed var fileInfos []storage.FileInfo if task.ComputeResource == models.NPUResource { prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, setting.OutPutPath, versionName), "/") @@ -505,10 +505,15 @@ func ModelList(ctx *context.APIContext) { } if task.Type == models.TypeC2Net { - status, err = cloudbrainTask.GetModelScheduleStatus(task.JobID) - if err != nil { - log.Error("GetModelScheduleStatus(%s) failed:%v", task.JobName, err.Error()) - return + if !task.IsTerminal() { + log.Info("GetModelScheduleStatus job is not terminal.jobId=%s", jobID) + status = models.JobNoTeminal + } else { + status, err = cloudbrainTask.GetModelScheduleStatus(task.JobID) + if err != nil { + log.Error("GetModelScheduleStatus(%s) failed:%v", task.JobName, err.Error()) + return + } } } diff --git a/web_src/js/features/cloudbrainShow.js b/web_src/js/features/cloudbrainShow.js index 64d465122d..4e33169f72 100644 --- a/web_src/js/features/cloudbrainShow.js +++ b/web_src/js/features/cloudbrainShow.js @@ -604,7 +604,15 @@ export default async function initCloudrainSow() { gpuFlag ); } - } else if (data.StatusOK == 1) { // 处理中 1 + }else if (data.StatusOK == -1) { // 任务未结束 -1 + $(`#file_breadcrumb${version_name}`).empty(); + $(`#dir_list${version_name}`).html(`
+
+ +
+ ${i18n['task_not_finished']} +
`); + }else if (data.StatusOK == 1) { // 处理中 1 $(`#file_breadcrumb${version_name}`).empty(); $(`#dir_list${version_name}`).html(`