#5206 path_unite_all

Merged
liuzx merged 9 commits from path_unite_all into V20240116 2 months ago
  1. services/ai_task_service/cluster/c2net.go (+1, -1)
  2. services/ai_task_service/container_builder/dataset_builder.go (+17, -16)
  3. services/ai_task_service/container_builder/pre_model_builder.go (+29, -7)

services/ai_task_service/cluster/c2net.go (+1, -1) View File

@@ -129,7 +129,7 @@ func convertNoteBookReq2Grampus(req entity.CreateNoteBookTaskRequest) (models.Cr
var commandGpuDebug = "mkdir -p /tmp/dataset;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='%s' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
command := fmt.Sprintf(commandGpuDebug, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval, codePath)
if models.DCU == req.Tasks[0].Spec.ComputeResource {
command = "cp -r /code /tmp;cp -r /dataset /tmp;cp -r /pretrainmodel /tmp;"
command = ""
}
if models.NPU == req.Tasks[0].Spec.ComputeResource {
command = ""


services/ai_task_service/container_builder/dataset_builder.go (+17, -16) View File

@@ -26,22 +26,6 @@ func (b *DatasetBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
}

func (b *DatasetBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}
uuid := ctx.Request.DatasetUUIDStr
if uuid == "" {
return nil, nil
}
datasetInfos, err := models.GetDatasetInfo4AITask(uuid)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err)
return nil, response.DATASET_SELECT_ERROR
}
if len(datasetInfos) < len(strings.Split(uuid, ";")) {
log.Error("GetDatasetInfo count error.displayJobName=%s jobType=%s cluster=%s", ctx.Request.DisplayJobName, ctx.Request.JobType, ctx.Request.Cluster)
return nil, response.PARTIAL_DATASETS_NOT_AVAILABLE
}
var data []entity.ContainerData

//如果是智算GPU调试任务,需要把dataset文件夹也挂载,这样提交镜像时才不会把dataset下的文件提交到镜像中
@@ -76,6 +60,23 @@ func (b *DatasetBuilder) Build(ctx *context.CreationContext) ([]entity.Container
})
}

if b.Opts.Disable {
return data, nil
}
uuid := ctx.Request.DatasetUUIDStr
if uuid == "" {
return data, nil
}
datasetInfos, err := models.GetDatasetInfo4AITask(uuid)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err)
return nil, response.DATASET_SELECT_ERROR
}
if len(datasetInfos) < len(strings.Split(uuid, ";")) {
log.Error("GetDatasetInfo count error.displayJobName=%s jobType=%s cluster=%s", ctx.Request.DisplayJobName, ctx.Request.JobType, ctx.Request.Cluster)
return nil, response.PARTIAL_DATASETS_NOT_AVAILABLE
}

for _, datasetInfo := range datasetInfos {
var name, objectKey, s3DownloadUrl string
//如果不是压缩包,那么文件名是去掉后缀以后的数据集名称


services/ai_task_service/container_builder/pre_model_builder.go (+29, -7) View File

@@ -1,11 +1,10 @@
package container_builder

import (
"code.gitea.io/gitea/services/ai_model"
"path"
"strings"

"code.gitea.io/gitea/services/ai_model"

"code.gitea.io/gitea/routers/response"

"code.gitea.io/gitea/entity"
@@ -29,17 +28,39 @@ func (b *PretrainModelBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
}

func (b *PretrainModelBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
form := ctx.Request
var preTrainModelEntity []entity.ContainerData
if ctx.Request.Cluster == entity.C2Net && (ctx.Request.JobType == models.JobTypeDebug || ctx.Request.JobType == models.JobTypeTrain) && ctx.Request.ComputeSource.Name == models.GPU {
//挂载一个文件夹保证容器内pretrainmodel目录提交镜像时不被打包
uploader := storage_helper.SelectStorageHelperFromStorageType(entity.OBS)
objectKey := path.Join(uploader.GetJobDefaultObjectKeyPrefix(form.JobName), "pretrain_model_mount")
uploader.MKDIR(objectKey, "pretrain model folder")
preTrainModelEntity = append(preTrainModelEntity, entity.ContainerData{
Name: "pretrain_model_mount",
Bucket: uploader.GetBucket(),
EndPoint: uploader.GetEndpoint(),
ObjectKey: objectKey + "/",
ReadOnly: false,
ContainerPath: b.Opts.ContainerPath,
RealPath: uploader.GetRealPath(objectKey),
S3DownloadUrl: uploader.GetS3DownloadUrl(objectKey),
IsDir: true,
IsOverwrite: true,
IsNeedUnzip: false,
})
}

if b.Opts.Disable {
return nil, nil
return preTrainModelEntity, nil
}
form := ctx.Request
storageTypes := b.Opts.AcceptStorageType
if storageTypes == nil || len(storageTypes) == 0 {
return nil, response.SYSTEM_ERROR
}
//未选择预训练模型,跳过此步
if form.PretrainModelId == "" {
return nil, nil
return preTrainModelEntity, nil
}
//查出模型数据
uuids := strings.Split(form.PretrainModelId, ";")
@@ -48,7 +69,7 @@ func (b *PretrainModelBuilder) Build(ctx *context.CreationContext) ([]entity.Con
log.Error("Can not find model", err)
return nil, response.MODEL_NOT_EXISTS
}
var preTrainModelEntity []entity.ContainerData
for _, m := range modelInfoMaps {
ai_model.InitModelMeta(m.ID)
data, err := b.buildModelData(m, form.JobName)
@@ -57,6 +78,7 @@ func (b *PretrainModelBuilder) Build(ctx *context.CreationContext) ([]entity.Con
}
preTrainModelEntity = append(preTrainModelEntity, data)
}

return preTrainModelEntity, nil
}

@@ -101,7 +123,7 @@ func (b *PretrainModelBuilder) buildModelData(m *models.AiModelManage, jobName s
Bucket: uploader.GetBucket(),
EndPoint: uploader.GetEndpoint(),
ObjectKey: preTrainModelPath,
ReadOnly: b.Opts.ReadOnly,
ReadOnly: false,
ContainerPath: path.Join(b.Opts.ContainerPath, m.Name),
RealPath: uploader.GetRealPath(preTrainModelPath),
S3DownloadUrl: uploader.GetS3DownloadUrl(preTrainModelPath),


Loading…
Cancel
Save