#5033 fix-4948

Merged
zouap merged 7 commits from fix-4948 into V20240109 4 months ago
  1. routers/api/v1/repo/cloudbrain.go (+3, -11)
  2. services/ai_task_service/container_builder/code_builder.go (+12, -2)
  3. services/ai_task_service/task/grampus_notebook_task.go (+20, -0)
  4. services/ai_task_service/task/task_service.go (+9, -8)
  5. services/cloudbrain/cloudbrainTask/notebook.go (+22, -12)

+ 3
- 11
routers/api/v1/repo/cloudbrain.go View File

@@ -166,7 +166,7 @@ func CreateFileNotebookTask(ctx *context.Context, option api.CreateFileNotebookJ
imageUrl := setting.FileNoteBook.ImageGPU
imageId := ""
imageName := imageUrl
cluster := entity.OpenICloudbrainOne
cluster := entity.C2Net

if option.Type == 0 {
specId = setting.FileNoteBook.SpecIdCPU
@@ -283,13 +283,12 @@ func GetFileNoteBookInfo(ctx *context.APIContext) {
}

var specCPU, specGpu, specNPU, specNPUCD *api.SpecificationShow
var specGpuQueueCode string
for _, spec := range specs {
if spec.ID == setting.FileNoteBook.SpecIdCPU {
specCPU = convert.ToSpecification(spec)
} else if spec.ID == setting.FileNoteBook.SpecIdGPU {
specGpu = convert.ToSpecification(spec)
specGpuQueueCode = spec.QueueCode
} else if spec.ID == setting.FileNoteBook.SpecIdNPU {
specNPU = convert.ToSpecification(spec)
} else if spec.ID == setting.FileNoteBook.SpecIdNPUCD {
@@ -299,14 +298,7 @@ func GetFileNoteBookInfo(ctx *context.APIContext) {

waitCountNpu := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")

queuesMap, err := cloudbrain.GetQueuesDetail()
var waitCountGPU int
if err != nil {
log.Error("Fail to query gpu queues waiting count", err)
waitCountGPU = 0
} else {
waitCountGPU = (*queuesMap)[specGpuQueueCode]
}
waitCountGPU := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeDebug)

var a *models.PointAccount
if ctx.User != nil {


+ 12
- 2
services/ai_task_service/container_builder/code_builder.go View File

@@ -1,6 +1,11 @@
package container_builder

import (
"path"
"strings"

"code.gitea.io/gitea/models"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
@@ -8,8 +13,6 @@ import (
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/storage_helper"
"path"
"strings"
)

type CodeBuilder struct {
@@ -57,6 +60,13 @@ func (b *CodeBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerDat
return nil, response.LOAD_CODE_FAILED
}
}
if ctx.Request.IsFileNoteBookRequest && ctx.Request.Cluster == entity.C2Net && ctx.Request.ComputeSource.Name == models.GPU {
err := uploader.MKDIR(remoteDir)
if err != nil {
log.Error("MKDIR err.displayJobName = %s err=%v", ctx.Request.DisplayJobName, err)
return nil, response.LOAD_CODE_FAILED
}
}

var codeArchiveName, objectKey string
if !b.Opts.Uncompressed && !b.Opts.VolumeFolder {


+ 20
- 0
services/ai_task_service/task/grampus_notebook_task.go View File

@@ -118,6 +118,26 @@ func GetGrampusNoteBookConfig(opts entity.AITaskConfigKey) *entity.AITaskBaseCon
},
}
}
//在线运行notebook配置
if opts.IsFileNoteBookRequest {
config = &entity.AITaskBaseConfig{
ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
entity.ContainerFileNoteBookCode: {},
entity.ContainerCode: {
ContainerPath: codePath,
ReadOnly: false,
AcceptStorageType: []entity.StorageType{entity.MINIO},
Uncompressed: true,
},
entity.ContainerDataset: {
ContainerPath: datasetPath,
ReadOnly: true,
AcceptStorageType: []entity.StorageType{entity.MINIO, entity.OBS},
},
},
}

}

switch opts.ComputeSource {
case models.NPU:


+ 9
- 8
services/ai_task_service/task/task_service.go View File

@@ -299,14 +299,14 @@ func UpdateByQueryResponse(res *entity.QueryTaskResponse, task *models.Cloudbrai
var noteBookOKMap = make(map[string]int, 20)
var noteBookFailMap = make(map[string]int, 20)

//if a task notebook url can get successfulCount times, the notebook can browser.
// if a task notebook url can get successfulCount times, the notebook can browser.
const successfulCount = 3
const maxSuccessfulCount = 10

//云脑一存在状态为RUNNING但实际不可用的情况,且存在访问调试链接成功后又失败的情况,因此需要一段时间内多次成功才认为调试可用
//下列代码来源于services/cloudbrain/cloudbrainTask/sync_status.go:118
//func isNoteBookReady(task *models.Cloudbrain) bool
//为了解决循环引用copy了一份到此类,稍有改编
// 云脑一存在状态为RUNNING但实际不可用的情况,且存在访问调试链接成功后又失败的情况,因此需要一段时间内多次成功才认为调试可用
// 下列代码来源于services/cloudbrain/cloudbrainTask/sync_status.go:118
// func isNoteBookReady(task *models.Cloudbrain) bool
// 为了解决循环引用copy了一份到此类,稍有改编
func isCloudbrainOneNotebookReady(jobId string) bool {
url, err := new(cluster.CloudbrainOneClusterAdapter).GetNoteBookUrl(jobId)
if err != nil {
@@ -393,7 +393,7 @@ func StopTask(id int64, stopRemote StopFunc) error {
return nil
}

//jobId string, baseLine int64, lines int64, order int64
// jobId string, baseLine int64, lines int64, order int64
func QueryTaskLog(opts entity.QueryLogOpts, getLogRemote GetLogFunc) (*entity.ClusterLog, error) {
cloudbrain, err := models.GetCloudbrainByCloudbrainID(opts.CloudbrainId)
if err != nil {
@@ -650,7 +650,8 @@ func CreateAITask(form entity.CreateReq, gitRepo *git.Repository, repo *models.R
GitRepo: gitRepo,
Repository: repo,
User: user,
Config: t.GetConfig(entity.AITaskConfigKey{ComputeSource: form.ComputeSourceStr}),
Config: t.GetConfig(entity.AITaskConfigKey{ComputeSource: form.ComputeSourceStr,
IsFileNoteBookRequest: form.IsFileNoteBookRequest}),
})
}

@@ -706,7 +707,7 @@ func GetOperationProfile(id int64, getOperationProfile GetOperationProfileFunc)
return s, err
}

//SyncAITaskStatus 定时更新云脑状态和运行时长
// SyncAITaskStatus 定时更新云脑状态和运行时长
func SyncAITaskStatus() {
defer func() {
if err := recover(); err != nil {


+ 22
- 12
services/cloudbrain/cloudbrainTask/notebook.go View File

@@ -10,6 +10,8 @@ import (
"strconv"
"strings"

"code.gitea.io/gitea/entity"

"code.gitea.io/gitea/modules/storage"

"code.gitea.io/gitea/modules/grampus"
@@ -294,15 +296,6 @@ func FileNotebookCreate(ctx *context.Context, option api.CreateFileNotebookJobOp
return
}

//var imageIdNpu string
//
//if option.Type == NPUType {
// imageIdNpu, err = getNpuImageId(option)
// if err != nil {
// ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.parameter_is_wrong")))
// return
// }
//}
var err error
isNotebookFileExist, _ := isNoteBookFileExist(ctx, option)
if !isNotebookFileExist {
@@ -476,8 +469,22 @@ func FileNotebookStatus(ctx *context.Context, option api.CreateFileNotebookJobOp
func getBaseUrlAndToken(task *models.Cloudbrain) (string, string, error) {
var debugBaseUrl string
var token string
if task.Type == models.TypeCloudBrainOne {
debugBaseUrl = setting.DebugServerHost + "jpylab_" + task.JobID + "_" + task.SubTaskName + "/lab"
if task.Type == models.TypeC2Net {
log.Info("get grampus debug url begin: ", task.JobID)
result, err := grampus.GetNotebookJob(task.JobID)
if err != nil {
return "", "", err
}
if result == nil {
return "", "", fmt.Errorf("can not get job response.")
}
convertRes := entity.ConvertGrampusNotebookResponse(result.JobInfo)
debugBaseUrl = convertRes.Url
token = convertRes.Token
if debugBaseUrl == "" {
log.Error("notebook job not found:"+task.JobID, err)
return "", "", fmt.Errorf("can not get job or job is invalid.")
}

} else {
var result *models.GetNotebook2Result
@@ -496,6 +503,9 @@ func getBaseUrlAndToken(task *models.Cloudbrain) (string, string, error) {
token = result.Token

}
if !strings.HasSuffix(debugBaseUrl, "/") {
debugBaseUrl = debugBaseUrl + "/"
}
return debugBaseUrl, token, nil
}

@@ -695,7 +705,7 @@ func cloudBrainFileNoteBookCreate(ctx *context.Context, option api.CreateFileNot

func getCloudbrainType(optionType int) int {
if optionType <= GPUType {
return models.TypeCloudBrainOne
return models.TypeC2Net
}
if setting.ModelartsCD.Enabled {
return models.TypeCDCenter


Loading…
Cancel
Save