#4357 V20230531

Merged
ychao_1983 merged 396 commits from V20230531 into develop 11 months ago
  1. +15
    -0
      README.md
  2. +0
    -1
      custom/public/css/git.openi.css
  3. +208
    -0
      entity/ai_task.go
  4. +35
    -0
      entity/ai_task_config.go
  5. +0
    -28
      entity/ai_task_entity/container.go
  6. +0
    -1
      entity/ai_task_entity/creation.go
  7. +0
    -70
      entity/ai_task_entity/task.go
  8. +12
    -0
      entity/ai_task_list.go
  9. +75
    -13
      entity/cluster.go
  10. +45
    -0
      entity/container.go
  11. +6
    -4
      entity/creation.go
  12. +13
    -0
      entity/err_code.go
  13. +3
    -0
      entity/grampus_err_code.go
  14. +1
    -1
      entity/images.go
  15. +35
    -0
      entity/operation_profile.go
  16. +21
    -0
      entity/storage.go
  17. +44
    -0
      entity/user.go
  18. +1527
    -0
      manager/client/cloudbrain_two/resty.go
  19. +233
    -0
      manager/client/cloudbrain_two_cd/resty.go
  20. +65
    -1
      manager/client/grampus/grampus.go
  21. +39
    -2
      models/action.go
  22. +14
    -0
      models/ai_model_manage.go
  23. +1
    -1
      models/attachment.go
  24. +114
    -10
      models/cloudbrain.go
  25. +24
    -0
      models/cloudbrain_spec.go
  26. +17
    -0
      models/cloudbrain_static.go
  27. +22
    -0
      models/file_chunk.go
  28. +30
    -0
      models/ip_location.go
  29. +3
    -2
      models/model_migrate_record.go
  30. +11
    -0
      models/modelarts_deploy.go
  31. +1
    -0
      models/models.go
  32. +2
    -0
      models/repo_watch.go
  33. +1
    -0
      models/task_config.go
  34. +9
    -0
      models/user_login_log.go
  35. +5
    -2
      modules/auth/wechat/cloudbrain.go
  36. +0
    -8
      modules/context/repo.go
  37. +7
    -0
      modules/grampus/grampus.go
  38. +15
    -3
      modules/grampus/resty.go
  39. +44
    -0
      modules/ipinfo/ipinfo.go
  40. +1
    -1
      modules/minio_ext/constants.go
  41. +1
    -1
      modules/modelappservice/modelsevice.go
  42. +25
    -19
      modules/modelarts/resty.go
  43. +63
    -0
      modules/modelarts/wenxinresty.go
  44. +24
    -0
      modules/setting/screen_map.go
  45. +68
    -33
      modules/setting/setting.go
  46. +1
    -0
      modules/structs/cloudbrain.go
  47. +1
    -1
      modules/templates/helper.go
  48. +16
    -4
      options/locale/locale_en-US.ini
  49. +15
    -3
      options/locale/locale_zh-CN.ini
  50. +48
    -16
      package-lock.json
  51. +1
    -1
      package.json
  52. +35
    -10
      public/home/home.js
  53. +2
    -0
      public/home/search.js
  54. +122
    -72
      routers/ai_task/ai_task.go
  55. +1
    -0
      routers/ai_task/notebook.go
  56. +75
    -13
      routers/api/v1/api.go
  57. +18
    -1
      routers/api/v1/finetune/panguervice.go
  58. +53
    -2
      routers/api/v1/repo/attachments.go
  59. +175
    -0
      routers/api/v1/repo/cloudbrain.go
  60. +109
    -0
      routers/api/v1/repo/cloudbrain_dashboard.go
  61. +13
    -0
      routers/api/v1/repo/datasets.go
  62. +11
    -0
      routers/api/v1/repo/modelarts.go
  63. +10
    -0
      routers/api/v1/repo/modelmanage.go
  64. +7
    -0
      routers/home.go
  65. +1
    -0
      routers/private/internal.go
  66. +17
    -0
      routers/private/setting.go
  67. +65
    -33
      routers/repo/attachment.go
  68. +3
    -3
      routers/repo/attachment_model.go
  69. +74
    -15
      routers/repo/cloudbrain.go
  70. +1
    -0
      routers/repo/cloudbrain_statistic.go
  71. +5
    -2
      routers/repo/dataset.go
  72. +146
    -0
      routers/repo/flow_control.go
  73. +133
    -34
      routers/repo/grampus.go
  74. +30
    -0
      routers/repo/grampus_onlineinfer.go
  75. +36
    -2
      routers/repo/modelarts.go
  76. +7
    -7
      routers/repo/setting.go
  77. +4
    -0
      routers/response/api_response.go
  78. +26
    -1
      routers/response/error.go
  79. +10
    -1
      routers/response/response_list.go
  80. +10
    -1
      routers/routes/routes.go
  81. +1
    -0
      routers/user/home.go
  82. +167
    -37
      services/ai_task_service/cluster/c2net.go
  83. +91
    -24
      services/ai_task_service/cluster/cloudbrain_one.go
  84. +297
    -0
      services/ai_task_service/cluster/cloudbrain_two.go
  85. +18
    -14
      services/ai_task_service/cluster/cluster_base.go
  86. +81
    -0
      services/ai_task_service/container_builder/code_builder.go
  87. +95
    -0
      services/ai_task_service/container_builder/common.go
  88. +40
    -12
      services/ai_task_service/container_builder/container_builder.go
  89. +7
    -1
      services/ai_task_service/container_builder/container_builder_chan.go
  90. +55
    -32
      services/ai_task_service/container_builder/dataset_builder.go
  91. +47
    -0
      services/ai_task_service/container_builder/file_notebook_code_builder.go
  92. +0
    -59
      services/ai_task_service/container_builder/minio_code_builder.go
  93. +0
    -18
      services/ai_task_service/container_builder/obs_code_builder.go
  94. +40
    -6
      services/ai_task_service/container_builder/output_path_builder.go
  95. +0
    -59
      services/ai_task_service/container_builder/output_readme_builder.go
  96. +120
    -36
      services/ai_task_service/container_builder/pre_model_builder.go
  97. +12
    -12
      services/ai_task_service/context/context.go
  98. +51
    -7
      services/ai_task_service/schedule/model_schedule.go
  99. +94
    -76
      services/ai_task_service/task/cloudbrain_one_notebook_task.go
  100. +217
    -0
      services/ai_task_service/task/cloudbrain_two_notebook_task.go

+ 15
- 0
README.md View File

@@ -172,6 +172,21 @@
> [attachment]
> PATH = /data/gitea/attachments
>
> ENABLED = true
> MAX_SIZE = 1048576
> ALLOWED_TYPES = */*
> MAX_FILES = 10
> STORE_TYPE = minio
> MINIO_ENDPOINT =
>
> MINIO_ACCESS_KEY_ID =
> MINIO_SECRET_ACCESS_KEY =
> MINIO_BUCKET =
> MINIO_LOCATION =
> MINIO_BASE_PATH = attachment/
> MINIO_USE_SSL = true
> MINIO_REAL_PATH =
>
> [log]
> MODE = file
> LEVEL = info


+ 0
- 1
custom/public/css/git.openi.css View File

@@ -419,7 +419,6 @@
@media only screen and (min-width: 1920px) {

}

/* rotation3D */
#app{
position: relative;


+ 208
- 0
entity/ai_task.go View File

@@ -0,0 +1,208 @@
package entity

import (
"strings"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/structs"
"code.gitea.io/gitea/modules/timeutil"
)

// TODO: for now this keeps the legacy attributes of the various cloudbrain types.
// CreateReq is the request payload for creating an AI task (notebook/train job).
type CreateReq struct {
JobType models.JobType `json:"job_type" binding:"Required"`
DisplayJobName string `json:"display_job_name" binding:"Required"`
JobName string `json:"job_name"`
SpecId int64 `json:"spec_id" binding:"Required"`
ComputeSourceStr string `json:"compute_source" binding:"Required"`
Cluster ClusterType `json:"cluster" binding:"Required"`
WorkServerNumber int `json:"work_server_number"`
BranchName string `json:"branch_name"`
PreTrainModelUrl string `json:"pretrain_model_url"`
PretrainModelCkptName string `json:"pretrain_model_ckpt_name"`
ImageUrl string `json:"image_url"`
ImageID string `json:"image_id"`
ImageName string `json:"image_name"`
PretrainModelName string `json:"pretrain_model_name"`
PretrainModelVersion string `json:"pretrain_model_version"`
PretrainModelId string `json:"pretrain_model_id"`
Description string `json:"description"`
LabelName string `json:"label_names"`
DatasetUUIDStr string `json:"dataset_uuid_str"`
Params string `json:"run_para_list"`
BootFile string `json:"boot_file"`
ParamArray models.Parameters
ComputeSource *models.ComputeSource
ReqCommitID string
IsFileNoteBookRequest bool
FileRepository *models.Repository
FileBranchName string
IsRestartRequest bool
DatasetNames string
}

// CreationResponse carries the result of submitting an AI task to a cluster.
type CreationResponse struct {
Error error
JobID string
Status string //todo: consider unifying status values across cluster types
CreateTime timeutil.TimeStamp
}

// QueryAITaskRes is the response for querying a single AI task: the task
// detail, earlier versions of the same job, and whether a new version can be
// created.
type QueryAITaskRes struct {
Task *AITaskDetailInfo `json:"task"`
EarlyVersionList []*AITaskDetailInfo `json:"early_version_list"`
CanCreateVersion bool `json:"can_create_version"`
}

// AITaskDetailInfo is the full task detail returned to the UI.
type AITaskDetailInfo struct {
ID int64 `json:"id"`
JobID string `json:"job_id"`
Status string `json:"status"`
JobType string `json:"job_type"`
Cluster string `json:"cluster"`
DisplayJobName string `json:"display_job_name"`
FormattedDuration string `json:"formatted_duration"`
ComputeSource string `json:"compute_source"`
AICenter string `json:"ai_center"`
BootFile string `json:"boot_file"`
PreVersionName string `json:"pre_version_name"`
CurrentVersionName string `json:"current_version_name"`
WorkServerNumber int `json:"work_server_number"`
Spec *structs.SpecificationShow `json:"spec"`
DatasetList []*models.DatasetDownload `json:"dataset_list"`
PretrainModelList []*models.ModelDownload `json:"pretrain_model_list"`
Parameters *models.Parameters `json:"parameters"`
CreatedUnix timeutil.TimeStamp `json:"created_unix"`
CodePath string `json:"code_path"`
DatasetPath string `json:"dataset_path"`
PretrainModelPath string `json:"pretrain_model_path"`
OutputPath string `json:"output_path"`
CodeUrl string `json:"code_url"`
PretrainModelName string `json:"pretrain_model_name"`
PretrainModelVersion string `json:"pretrain_model_version"`
PretrainCkptName string `json:"pretrain_model_ckpt_name"`
StartTime timeutil.TimeStamp `json:"start_time"`
EndTime timeutil.TimeStamp `json:"end_time"`
Description string `json:"description"`
CommitID string `json:"commit_id"`
BranchName string `json:"branch_name"`
ImageUrl string `json:"image_url"`
ImageID string `json:"image_id"`
ImageName string `json:"image_name"`
CreatorName string `json:"creator_name"`
EngineName string `json:"engine_name"`
}

// Tr localizes translatable display fields (currently only the AI-center
// name) into the given language.
func (a *AITaskDetailInfo) Tr(language string) {
a.AICenter = getAiCenterShow(a.AICenter, language)
}

// RemoveDatasets clears the dataset list from the detail info.
func (a *AITaskDetailInfo) RemoveDatasets() {
a.DatasetList = []*models.DatasetDownload{}
}
// RemovePretrainModelList clears the pre-trained model list from the detail info.
func (a *AITaskDetailInfo) RemovePretrainModelList() {
a.PretrainModelList = []*models.ModelDownload{}
}

// getAiCenterShow resolves the display name of an AI center for the given
// language. The raw value is expected to look like "<centerID>+<centerName>":
// when a localized entry exists in setting.C2NetMapInfo for the ID it is
// used, otherwise the raw name part is returned. Values not in that form
// yield the empty string.
func getAiCenterShow(aiCenter string, language string) string {
	parts := strings.Split(aiCenter, "+")
	if len(parts) != 2 {
		return ""
	}
	if setting.C2NetMapInfo == nil {
		return parts[1]
	}
	info, ok := setting.C2NetMapInfo[parts[0]]
	if !ok {
		return parts[1]
	}
	if language == defaultLanguage {
		return info.Content
	}
	return info.ContentEN
}

// defaultLanguage is the language code that selects the Chinese content of a
// C2Net mapping entry; any other language gets the English content.
var defaultLanguage = "zh-CN"

// CreateTaskRes is the minimal response returned after a task is created.
type CreateTaskRes struct {
ID int64 `json:"id"`
Status string `json:"status"`
}

// GetAITaskCreationInfoReq bundles the context needed to build the task
// creation page (user, repo, cluster and compute source).
type GetAITaskCreationInfoReq struct {
User *models.User
JobType models.JobType
ClusterType ClusterType
ComputeSource *models.ComputeSource
Repo *models.Repository
GitRepo *git.Repository
IsOnlineType bool
}

// AITaskBriefInfo is the compact task representation used in list views.
type AITaskBriefInfo struct {
ID int64 `json:"id"`
JobType string `json:"job_type"`
Status string `json:"status"`
DisplayJobName string `json:"display_job_name"`
CreatedUnix timeutil.TimeStamp `json:"created_unix"`
StartTime timeutil.TimeStamp `json:"start_time"`
EndTime timeutil.TimeStamp `json:"end_time"`
FormattedDuration string `json:"formatted_duration"`
Cluster string `json:"cluster"`
ComputeSource string `json:"compute_source"`
AICenter string `json:"ai_center"`
IsFileNotebook bool `json:"is_file_notebook"`
}

// Tr localizes translatable display fields (currently only the AI-center name).
func (a *AITaskBriefInfo) Tr(language string) {
a.AICenter = getAiCenterShow(a.AICenter, language)
}

// AITaskListRes is a paged task list plus the viewer's create permission.
type AITaskListRes struct {
Tasks []*AITaskInfo4List `json:"tasks"`
Total int64 `json:"total"`
PageSize int `json:"page_size"`
Page int `json:"page"`
CanCreateTask bool `json:"can_create_task"`
}
// AITaskInfo4List pairs a brief task with its creator and the viewer's
// per-task modify/delete permissions.
type AITaskInfo4List struct {
Task *AITaskBriefInfo `json:"task"`
Creator UserBriefInfo `json:"creator"`
CanModify bool `json:"can_modify"`
CanDelete bool `json:"can_delete"`
}

// ConvertCloudbrainToAITaskBriefInfo maps a persisted cloudbrain record to
// the brief representation used in task lists.
func ConvertCloudbrainToAITaskBriefInfo(task *models.Cloudbrain) *AITaskBriefInfo {
	brief := &AITaskBriefInfo{
		ID:                task.ID,
		JobType:           task.JobType,
		Status:            task.Status,
		DisplayJobName:    task.DisplayJobName,
		CreatedUnix:       task.CreatedUnix,
		FormattedDuration: task.TrainJobDuration,
		Cluster:           GetClusterTypeFromCloudbrainType(task.Type).GetParentCluster(),
		StartTime:         task.StartTime,
		EndTime:           task.EndTime,
		AICenter:          task.AiCenter,
		IsFileNotebook:    task.IsFileNoteBookTask(),
	}
	// The compute-source name is only available when the resource string maps
	// to a known compute source; otherwise it stays empty.
	if source := models.GetComputeSourceInstance(task.ComputeResource); source != nil {
		brief.ComputeSource = source.Name
	}
	return brief
}

// NotebookDataset wraps a single dataset download URL for notebook tasks.
type NotebookDataset struct {
DatasetUrl string `json:"dataset_url"`
}

+ 35
- 0
entity/ai_task_config.go View File

@@ -0,0 +1,35 @@
package entity

// AITaskConfig holds the container build configuration of an AI task: one
// ContainerBuildOpts per container data type, plus dataset size limits.
type AITaskConfig struct {
	ContainerSteps map[ContainerDataType]*ContainerBuildOpts `json:"container_configs"`
	DatasetMaxSize int
}

// ContainerConfig describes how one piece of data is mounted into a container.
type ContainerConfig struct {
	Enable            bool
	ContainerPath     string
	ReadOnly          bool
	AcceptStorageType []StorageType
}

// GetAITaskConfigOpts are the lookup options used when resolving an AITaskConfig.
type GetAITaskConfigOpts struct {
	ComputeSource         string
	IsFileNoteBookRequest bool
}

// GetContainerConfig returns the build options registered for the given
// container data type, or nil when none is configured.
func (c *AITaskConfig) GetContainerConfig(containerDataType ContainerDataType) *ContainerBuildOpts {
	if c.ContainerSteps == nil {
		return nil
	}
	return c.ContainerSteps[containerDataType]
}

// GetContainerPath returns the container mount path configured for the given
// data type; it yields the empty string when no configuration exists.
func (c *AITaskConfig) GetContainerPath(containerDataType ContainerDataType) string {
	if cfg := c.GetContainerConfig(containerDataType); cfg != nil {
		return cfg.ContainerPath
	}
	return ""
}

+ 0
- 28
entity/ai_task_entity/container.go View File

@@ -1,28 +0,0 @@
package ai_task_entity

type TaskData struct {
Code ContainerData
Dataset []ContainerData
PreTrainModel ContainerData
OutPutPath ContainerData
}

type ContainerData struct {
Name string `json:"name"`
Bucket string `json:"bucket"`
EndPoint string `json:"endPoint"`
ObjectKey string `json:"objectKey"`
ContainerPath string `json:"containerPath"`
RealPath string `json:"realPath"`
ReadOnly bool `json:"readOnly"`
}

type ContainerDataType string

const (
ContainerCode ContainerDataType = "code"
ContainerDataset ContainerDataType = "dataset"
ContainerPreTrainModel ContainerDataType = "pre_train_model"
ContainerOutPutPath ContainerDataType = "output"
ContainerCloudbrainOneOutPutReadMe ContainerDataType = "cloudbrain_one_readme"
)

+ 0
- 1
entity/ai_task_entity/creation.go View File

@@ -1 +0,0 @@
package ai_task_entity

+ 0
- 70
entity/ai_task_entity/task.go View File

@@ -1,70 +0,0 @@
package ai_task_entity

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/structs"
"code.gitea.io/gitea/modules/timeutil"
)

//todo 暂时保留之前各种云脑属性的定义
type CreateReq struct {
JobType models.JobType `json:"job_type" binding:"Required"`
DisplayJobName string `json:"display_job_name" binding:"Required"`
JobName string `json:"job_name"`
SpecId int64 `json:"spec_id" binding:"Required"`
ComputeSourceStr string `json:"compute_source" binding:"Required"`
Cluster ClusterType `json:"cluster" binding:"Required"`
WorkServerNumber int `json:"work_server_number"`
BranchName string `json:"branch_name"`
PreTrainModelUrl string `json:"pre_train_model_url"`
CkptName string `json:"ckpt_name"`
ImageUrl string `json:"image_url"`
ImageID string `json:"image_id"`
ImageName string `json:"image_name"`
ModelName string `json:"model_name"`
ModelVersion string `json:"model_version"`
ModelId string `json:"model_id"`
Description string `json:"description"`
LabelName string `json:"label_names"`
DatasetUUIDStr string `json:"dataset_uuid_str"`
Params string `json:"run_para_list"`
BootFile string `json:"boot_file"`
ParamArray models.Parameters
ComputeSource *models.ComputeSource
}

type CreationResponse struct {
Error error
JobID string
Status string //todo 考虑统一状态
CreateTime timeutil.TimeStamp
}

type QueryTaskInfo struct {
ID int64 `json:"id"`
JobID string `json:"job_id"`
Status string `json:"status"`
JobType string `json:"job_type"`
Cluster string `json:"cluster"`
DisplayJobName string `json:"display_job_name"`
Duration string `json:"duration"`
ComputeSource string `json:"compute_source"`
AiCenter string `json:"ai_center"`
WorkServerNumber int `json:"work_server_number"`
Spec *structs.SpecificationShow `json:"spec"`
DatasetList []*models.DatasetDownload `json:"dataset_list"`
}

type CreateTaskRes struct {
ID int64 `json:"id"`
}

type GetAITaskCreationInfoReq struct {
User *models.User
JobType models.JobType
ClusterType ClusterType
ComputeSource *models.ComputeSource
Repo *models.Repository
GitRepo *git.Repository
}

+ 12
- 0
entity/ai_task_list.go View File

@@ -0,0 +1,12 @@
package entity

import "code.gitea.io/gitea/models"

// GetTaskListReq is the request for listing a repository's AI tasks,
// combining paging options with the acting user and permission context.
type GetTaskListReq struct {
models.ListOptions
ComputeSource *models.ComputeSource
JobTypes []string
RepoID int64
Operator *models.User
IsRepoOwner bool
}

entity/ai_task_entity/cluster.go → entity/cluster.go View File

@@ -1,26 +1,37 @@
package ai_task_entity
package entity

import (
"encoding/json"
"fmt"
"strconv"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/timeutil"
)

type CreateNoteBookTaskRequest struct {
Name string
Description string
Tasks []NoteBookTask
PrimitiveDatasetName string
RepoName string
}

type NoteBookTask struct {
AutoStopDuration int
AutoStopDuration int64
Name string
Capacity int
CenterID []string
Code []ContainerData
Datasets []ContainerData
PreTrainModel []ContainerData
OutPut []ContainerData
ImageId string
ImageUrl string
ResourceSpecId string
BootFile string
Spec *models.Specification
}

@@ -56,6 +67,10 @@ type QueryTaskResponse struct {
Token string `json:"token"`
CenterId string `json:"center_id"`
CenterName string `json:"center_name"`
CodeUrl string `json:"code_url"`
DataUrl string `json:"data_url"`
ContainerIP string `json:"container_ip"`
ContainerID string `json:"container_id"`
}

func ConvertGrampusNotebookResponse(job models.GrampusNotebookInfo) *QueryTaskResponse {
@@ -71,10 +86,13 @@ func ConvertGrampusNotebookResponse(job models.GrampusNotebookInfo) *QueryTaskRe
if len(task.CenterName) > 0 {
centerName = task.CenterName[0]
}
var url, token string
var url, token, codeUrl, dataUrl string
if len(job.Tasks) > 0 {
url = job.Tasks[0].Url
token = job.Tasks[0].Token
t := job.Tasks[0]
url = t.Url
token = t.Token
codeUrl = t.CodeUrl
dataUrl = t.DataUrl
}
return &QueryTaskResponse{
StartedAt: timeutil.TimeStamp(job.StartedAt),
@@ -85,6 +103,8 @@ func ConvertGrampusNotebookResponse(job models.GrampusNotebookInfo) *QueryTaskRe
Url: url,
Token: token,
JobId: job.JobID,
CodeUrl: codeUrl,
DataUrl: dataUrl,
}
}
func ConvertGrampusTrainResponse(job models.GrampusJobInfo) *QueryTaskResponse {
@@ -122,16 +142,56 @@ func ConvertCloudbrainOneQueryNotebookByNameResponse(result models.JobResultInLi
}
}

func ConvertCloudbrainOneNotebookResponse(result models.JobResultPayload) *QueryTaskResponse {
if result.JobStatus.State == "" {
return nil
func ConvertCloudbrainOneNotebookResponse(input map[string]interface{}) (*QueryTaskResponse, error) {
data, _ := json.Marshal(input)
var jobResultPayload models.JobResultPayload
err := json.Unmarshal(data, &jobResultPayload)
if err != nil {
log.Error("parse cloudbrain one result err,result=%+v err=%v", input, err)
return nil, err
}
if jobResultPayload.JobStatus.State == "" {
return nil, nil
}

startTime := jobResultPayload.JobStatus.AppLaunchedTime / 1000
var endTime int64
switch jobResultPayload.JobStatus.AppCompletedTime.(type) {
case float64:
f := jobResultPayload.JobStatus.AppCompletedTime.(float64)
s := fmt.Sprintf("%.0f", f)
i, err := strconv.ParseInt(s, 10, 64)
if err == nil {
endTime = i / 1000
}
}

if jobResultPayload.JobStatus.State == string(models.JobWaiting) {
startTime = 0
endTime = 0
}
var containerIP, containerID string
taskRoles := jobResultPayload.TaskRoles
if taskRoles != nil && len(taskRoles) > 0 {
subTask := taskRoles[cloudbrain.SubTaskName]
if subTask != nil {
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
if taskRes.TaskStatuses != nil && len(taskRes.TaskStatuses) > 0 {
containerIP = taskRes.TaskStatuses[0].ContainerIP
containerID = taskRes.TaskStatuses[0].ContainerID
}
}
return &QueryTaskResponse{
StartedAt: timeutil.TimeStamp(result.JobStatus.CreatedTime / 1000),
CompletedAt: timeutil.TimeStamp(result.JobStatus.CompletedTime / 1000),
Status: result.JobStatus.State,
JobId: result.ID,
}

res := &QueryTaskResponse{
StartedAt: timeutil.TimeStamp(startTime),
CompletedAt: timeutil.TimeStamp(endTime),
Status: jobResultPayload.JobStatus.State,
JobId: jobResultPayload.ID,
ContainerIP: containerIP,
ContainerID: containerID,
}
return res, nil
}

type ClusterLog struct {
@@ -205,6 +265,8 @@ func GetClusterTypeFromCloudbrainType(t int) ClusterType {
return OpenICloudbrainTwo
case models.TypeC2Net:
return C2Net
case models.TypeCDCenter:
return OpenICloudbrainTwo
}
return ""
}

+ 45
- 0
entity/container.go View File

@@ -0,0 +1,45 @@
package entity

// TaskData groups the container mounts of one task by role.
type TaskData struct {
	Code          ContainerData
	Dataset       []ContainerData
	PreTrainModel ContainerData
	OutPutPath    ContainerData
}

// ContainerData describes a single object-storage location and how it is
// mounted inside the task container.
type ContainerData struct {
	Name          string `json:"name"`
	Bucket        string `json:"bucket"`
	EndPoint      string `json:"endPoint"`
	ObjectKey     string `json:"objectKey"`
	ContainerPath string `json:"containerPath"`
	RealPath      string `json:"realPath"`
	ReadOnly      bool   `json:"readOnly"`
}

// ContainerDataType names the role of a container mount.
type ContainerDataType string

const (
	ContainerCode             ContainerDataType = "code"
	ContainerDataset          ContainerDataType = "dataset"
	ContainerPreTrainModel    ContainerDataType = "pre_train_model"
	ContainerOutPutPath       ContainerDataType = "output"
	ContainerFileNoteBookCode ContainerDataType = "file_note_book_code"
)

// ContainerBuildOpts configures how one container data type is built and mounted.
type ContainerBuildOpts struct {
	Disable           bool
	ContainerPath     string
	ReadOnly          bool
	AcceptStorageType []StorageType
	NotArchive        bool
}

// IsStorageTypeIn reports whether storageType is one of the accepted storage
// types of these build options.
func (opts ContainerBuildOpts) IsStorageTypeIn(storageType StorageType) bool {
	for _, s := range opts.AcceptStorageType {
		// Both operands are already StorageType, so compare directly instead
		// of converting each side to string first.
		if s == storageType {
			return true
		}
	}
	return false
}

entity/ai_task_entity/task_list.go → entity/creation.go View File

@@ -1,13 +1,10 @@
package ai_task_entity
package entity

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/structs"
)

type TaskBriefInfo struct {
}

type CreationRequiredInfo struct {
//排队信息、代码分支信息、查询是否有正在运行的任务、查询镜像列表、查询资源规格(积分余额,开关)
Specs []*structs.SpecificationShow `json:"specs"`
@@ -20,6 +17,11 @@ type CreationRequiredInfo struct {
DisplayJobName string `json:"display_job_name"`
PointAccount *PointAccountInfo `json:"point_account"`
PaySwitch bool `json:"pay_switch"`
Config AITaskCreationConfig `json:"config"`
}

type AITaskCreationConfig struct {
DatasetMaxSize int `json:"dataset_max_size"`
}

type SpecificationInfo struct {

+ 13
- 0
entity/err_code.go View File

@@ -0,0 +1,13 @@
package entity

import "fmt"

// ErrCode describes a backend business error: the raw code value, the raw
// message, and the locale key used to translate it for the UI.
type ErrCode struct {
	CodeVal    string
	CodeMsg    string
	CodeTrCode string
}

// IsMatch reports whether the given code — rendered to its default string
// form — equals CodeVal. This lets callers pass the code as a string, number
// or any other printable value.
func (e *ErrCode) IsMatch(code interface{}) bool {
	return e.CodeVal == fmt.Sprintf("%v", code)
}

+ 3
- 0
entity/grampus_err_code.go View File

@@ -0,0 +1,3 @@
package entity

// GrampusJobCanNotRestart is the Grampus error (code 5005) raised when a job
// cannot be restarted; CodeTrCode is the locale key for the UI message.
var GrampusJobCanNotRestart = &ErrCode{CodeVal: "5005", CodeMsg: "Job can not restart", CodeTrCode: "ai_task.can_not_restart"}

entity/ai_task_entity/images.go → entity/images.go View File

@@ -1,4 +1,4 @@
package ai_task_entity
package entity

import "code.gitea.io/gitea/models"


+ 35
- 0
entity/operation_profile.go View File

@@ -0,0 +1,35 @@
package entity

// OperationProfile is a list of lifecycle events reported for a task.
type OperationProfile struct {
Events []ProfileEvent `json:"events"`
}

// ProfileEvent is one reported lifecycle event of a task.
type ProfileEvent struct {
Message string `json:"message"`
Name string `json:"name"`
Reason string `json:"reason"`
Timestamp string `json:"timestamp"`
Action string `json:"action"`
}

// CloudbrainOneAppExitDiagnostics mirrors the exit-diagnostics JSON of a
// cloudbrain one application: pod role names, per-pod events (keyed
// "task1-0") and extra events.
type CloudbrainOneAppExitDiagnostics struct {
PodRoleName struct {
Task10 string `json:"task1-0"`
} `json:"podRoleName"`
PodEvents struct {
Task10 []struct {
Uid string `json:"uid"`
Reason string `json:"reason"`
Message string `json:"message"`
ReportingController string `json:"reportingController"`
Action string `json:"action"`
} `json:"task1-0"`
} `json:"podEvents"`
Extras []struct {
Uid string `json:"uid"`
Reason string `json:"reason"`
Message string `json:"message"`
ReportingController string `json:"reportingController"`
Action string `json:"action"`
} `json:"extras"`
}

+ 21
- 0
entity/storage.go View File

@@ -0,0 +1,21 @@
package entity

import "code.gitea.io/gitea/models"

// StorageType identifies the object-storage backend behind a cluster.
type StorageType string

const (
	// MINIO is the storage backend used for models.TypeCloudBrainOne.
	MINIO StorageType = "MINIO"
	// OBS is the storage backend used for models.TypeCloudBrainTwo.
	OBS StorageType = "OBS"
)

// GetStorageTypeFromCloudbrainType maps a cloudbrain type constant to its
// storage backend; unknown types yield the empty string.
func GetStorageTypeFromCloudbrainType(cloudbrainType int) StorageType {
	switch cloudbrainType {
	case models.TypeCloudBrainOne:
		return MINIO
	case models.TypeCloudBrainTwo:
		return OBS
	default:
		return ""
	}
}

+ 44
- 0
entity/user.go View File

@@ -0,0 +1,44 @@
package entity

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/timeutil"
)

// UserBriefInfo is a trimmed-down user representation for API responses.
// Email is only populated when the user has not marked it private.
type UserBriefInfo struct {
	ID            int64              `json:"id"`
	LowerName     string             `json:"lower_name"`
	Name          string             `json:"name"`
	FullName      string             `json:"full_name"`
	Email         string             `json:"email"`
	Language      string             `json:"language"`
	Description   string             `json:"description"`
	RelAvatarLink string             `json:"rel_avatar_link"`
	NumMembers    int                `json:"num_members"`
	CreatedUnix   timeutil.TimeStamp `json:"created_unix"`
	UpdatedUnix   timeutil.TimeStamp `json:"updated_unix"`
}

// ConvertUserToBrief maps a full user record to its brief representation,
// preferring the display FullName over the login name when set.
func ConvertUserToBrief(u *models.User) *UserBriefInfo {
	fullName := u.Name
	if u.FullName != "" {
		fullName = u.FullName
	}
	uf := &UserBriefInfo{
		ID:          u.ID,
		LowerName:   u.LowerName,
		Name:        u.Name,
		FullName:    fullName,
		Language:    u.Language,
		Description: u.Description,
		CreatedUnix: u.CreatedUnix,
		UpdatedUnix: u.UpdatedUnix,
		NumMembers:  u.NumMembers,
	}
	// Only expose the e-mail address when the user has not marked it private.
	// (Previously Email was also set unconditionally in the struct literal,
	// which made this guard ineffective and leaked private addresses.)
	if !u.KeepEmailPrivate {
		uf.Email = u.Email
	}
	uf.RelAvatarLink = u.RelAvatarLink()
	return uf
}

+ 1527
- 0
manager/client/cloudbrain_two/resty.go View File

@@ -0,0 +1,1527 @@
package cloudbrain_two

import (
"crypto/tls"
"encoding/json"
"fmt"
"net/http"
"strconv"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"github.com/go-resty/resty/v2"
)

// Shared client state for the cloudbrain-two (ModelArts) REST API.
var (
// restyClient is the lazily initialised shared HTTP client.
restyClient *resty.Client
// HOST is the ModelArts API host; set by getToken from settings.
HOST string
// TOKEN is the cached IAM auth token; refreshed when requests return 401.
TOKEN string
// AutoStopDurationMs is the default notebook auto-stop duration (4 hours) in milliseconds.
AutoStopDurationMs = 4 * 60 * 60 * 1000
)

const (
// methodPassword is the IAM authentication method used when requesting tokens.
methodPassword = "password"

// REST resource paths.
urlGetToken = "/v3/auth/tokens"
urlNotebook = "/demanager/instances"
urlTrainJob = "/training-jobs"
urlResourceSpecs = "/job/resource-specs"
urlTrainJobConfig = "/training-job-configs"
// errorCodeExceedLimit: the selected spec exceeds its quota (see CreateJob).
errorCodeExceedLimit = "ModelArts.0118"

// notebook 2.0 resource path
urlNotebook2 = "/notebooks"

// ModelArts error codes returned in NotebookResult.ErrorCode.
modelartsIllegalToken = "ModelArts.6401"
NotebookNotFound = "ModelArts.6404"
NotebookNoPermission = "ModelArts.6407"
NotebookInvalid = "ModelArts.6400"
// UnknownErrorPrefix marks errors that carry no structured ModelArts error code.
UnknownErrorPrefix = "UNKNOWN:"

ModelArtsJobInTargetState = "ModelArts.6357"
ModelArtsJobNotExists = "ModelArts.0102"
ModelArtsJobInternalError = "ModelArts.0010"
)

// getRestyClient returns the shared resty client, creating it on first use
// with TLS certificate verification disabled (InsecureSkipVerify).
// NOTE(review): the lazy initialisation is not goroutine-safe; concurrent
// first calls may each build a client. Consider sync.Once.
func getRestyClient() *resty.Client {
if restyClient == nil {
restyClient = resty.New()
restyClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
}
return restyClient
}

// checkSetting ensures HOST, TOKEN and the REST client are initialised,
// fetching a fresh token when any of them is missing. Failures are only
// logged; callers proceed and rely on the per-request 401 retry in each
// API function.
func checkSetting() {
if len(HOST) != 0 && len(TOKEN) != 0 && restyClient != nil {
return
}

err := getToken()
if err != nil {
log.Error("getToken failed:%v", err)
}
}

// getToken authenticates against the IAM endpoint with the configured
// ModelArts username/password/domain and project scope, then caches the
// returned X-Subject-Token in the package-level TOKEN. It also (re)sets HOST
// from settings. Invoked lazily by checkSetting and again by the API
// functions when a request comes back 401/illegal-token.
func getToken() error {
HOST = setting.ModelArtsHost

client := getRestyClient()
params := models.GetTokenParams{
Auth: models.Auth{
Identity: models.Identity{
Methods: []string{methodPassword},
Password: models.Password{
User: models.NotebookUser{
Name: setting.ModelArtsUsername,
Password: setting.ModelArtsPassword,
Domain: models.Domain{
Name: setting.ModelArtsDomain,
},
},
},
},
Scope: models.Scope{
Project: models.Project{
Name: setting.ProjectName,
},
},
},
}

res, err := client.R().
SetHeader("Content-Type", "application/json").
SetBody(params).
Post(setting.IamHost + urlGetToken)
if err != nil {
return fmt.Errorf("resty getToken: %v", err)
}

// IAM answers 201 Created on success; the token travels in a response header.
if res.StatusCode() != http.StatusCreated {
return fmt.Errorf("getToken failed:%s", res.String())
}

TOKEN = res.Header().Get("X-Subject-Token")

return nil
}

// CreateJob submits a notebook creation request to ModelArts and returns the
// parsed result. On a 401 response the cached token is refreshed once and
// the request retried. A ModelArts error code in the response body is turned
// into a Go error (with a friendlier message for the quota-exceeded case).
func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateNotebookResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook)

	if err != nil {
		return nil, fmt.Errorf("resty create notebook: %s", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Parse the body a second time into the generic result to pick up error codes.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		if response.ErrorCode == errorCodeExceedLimit {
			response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
		}
		return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetJob fetches the state of a notebook 1.0 instance by job ID. On a 401
// response the cached token is refreshed once and the request retried.
func GetJob(jobID string) (*models.GetNotebookResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetNotebookResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)

	if err != nil {
		return nil, fmt.Errorf("resty GetJob: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Parse the body a second time into the generic result to pick up error codes.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetNotebook2 fetches the state of a notebook 2.0 instance by job ID. The
// request is retried once after refreshing the cached token, either on an
// HTTP 401 or on a ModelArts illegal-token error code in the body.
func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetNotebook2Result

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)

	if err != nil {
		// Fixed message: previously said "GetJob", this is GetNotebook2.
		return nil, fmt.Errorf("resty GetNotebook2: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Parse the body a second time into the generic result to pick up error codes.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("GetNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		return &result, fmt.Errorf("GetNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// ManageNotebook performs an action (e.g. start/stop) on a notebook 1.0
// instance; the action payload is sent in the request body. On a 401 response
// the cached token is refreshed once and the request retried.
func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookActionResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetBody(param).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action")

	if err != nil {
		// Fixed message: previously said "StopJob", this is ManageNotebook.
		return &result, fmt.Errorf("resty ManageNotebook: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Parse the body a second time into the generic result to pick up error codes.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// ManageNotebook2 performs an action (taken from param.Action) on a notebook
// 2.0 instance, passing the default auto-stop duration as a query parameter.
// The request is retried once after refreshing the cached token, either on an
// HTTP 401 or on a ModelArts illegal-token error code. A 502 from the gateway
// is reported with UnknownErrorPrefix so callers can recognise it.
func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookActionResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(AutoStopDurationMs))

	if err != nil {
		return &result, fmt.Errorf("resty ManageNotebook2: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Parse the body a second time into the generic result to pick up error codes.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if res.StatusCode() == http.StatusBadGateway {
		// Fixed message: previously said "createNotebook2", this is ManageNotebook2.
		return &result, fmt.Errorf(UnknownErrorPrefix+"ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	if len(response.ErrorCode) != 0 {
		log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// DelNotebook deletes a v1 notebook job via the ModelArts REST API. On
// HTTP 401 the auth token is refreshed once and the request is resent.
// Returns the deletion result or an error describing the failure.
func DelNotebook(jobID string) (*models.NotebookDelResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookDelResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)

	if err != nil {
		return &result, fmt.Errorf("resty DelJob: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: previous message read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// DelNotebook2 deletes a v2 notebook job via the ModelArts REST API. On
// HTTP 401 or an illegal-token error code the auth token is refreshed once
// and the request is resent. A "job not found" / "already in target state" /
// internal-error response is treated as a successful deletion so callers can
// proceed with local cleanup.
func DelNotebook2(jobID string) (*models.NotebookDelResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookDelResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)

	if err != nil {
		return &result, fmt.Errorf("resty DelJob: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: previous message read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		if response.ErrorCode == ModelArtsJobNotExists || response.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already being deleted; treat this
			// as a successful deletion.
			return &models.NotebookDelResult{}, nil
		}
		// Fixed copy-paste: this branch inspects the decoded response, so check
		// response.ErrorCode (result is only populated on success responses).
		if response.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.NotebookDelResult{}, nil
		}
		return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// DelJob deletes a v1 notebook job via the ModelArts REST API. On HTTP 401
// the auth token is refreshed once and the request is resent.
// NOTE(review): this function is byte-identical in behavior to DelNotebook —
// consider consolidating the two (kept separate here to preserve the API).
func DelJob(jobID string) (*models.NotebookDelResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookDelResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)

	if err != nil {
		return &result, fmt.Errorf("resty DelJob: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: previous message read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetJobToken fetches the access token of a v1 notebook job via the ModelArts
// REST API. On HTTP 401 the auth token is refreshed once and the request is
// resent. Returns the token result or an error describing the failure.
func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookGetJobTokenResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token")

	if err != nil {
		return &result, fmt.Errorf("resty GetJobToken: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: previous message read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// createTrainJobUserImage submits a ModelArts training job that runs on a
// user-supplied container image. On HTTP 401 the auth token is refreshed once
// and the request is resent. Well-known "Invalid OBS path" error messages for
// the boot file and dataset are translated into user-facing (Chinese) errors;
// 502 responses are tagged with UnknownErrorPrefix so callers can treat them
// as transient.
func createTrainJobUserImage(createJobParams models.CreateUserImageTrainJobParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		return nil, fmt.Errorf("resty create train-job: %s", err)
	}

	// Log the request payload for troubleshooting.
	req, _ := json.Marshal(createJobParams)
	log.Info("postapi json: %s", req)

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Match the exact error strings ModelArts returns for bad OBS paths so
		// the user gets a targeted message ("boot file error" / "dataset error").
		bootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		} else {
			return &result, fmt.Errorf("createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
	}

	if !result.IsSuccess {
		log.Error("createTrainJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createTrainJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// createTrainJob submits a ModelArts training job using a platform-provided
// engine. On HTTP 401 the auth token is refreshed once and the request is
// resent. Well-known "Invalid OBS path" error messages for the boot file and
// dataset are translated into user-facing (Chinese) errors; 502 responses are
// tagged with UnknownErrorPrefix so callers can treat them as transient.
func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0
	// Log the request payload for troubleshooting.
	req, _ := json.Marshal(createJobParams)
	log.Info("postapi json: %s", req)

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		return nil, fmt.Errorf("resty create train-job: %s", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Match the exact error strings ModelArts returns for bad OBS paths so
		// the user gets a targeted message ("boot file error" / "dataset error").
		bootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		} else {
			return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
	}

	if !result.IsSuccess {
		log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// createTrainJobVersion creates a new version of an existing ModelArts
// training job. On HTTP 401 the auth token is refreshed once and the request
// is resent. Well-known "Invalid OBS path" error messages for the boot file
// and dataset are translated into user-facing (Chinese) errors; 502 responses
// are tagged with UnknownErrorPrefix so callers can treat them as transient.
func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobVersionParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

	if err != nil {
		return nil, fmt.Errorf("resty create train-job version: %s", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}

		log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Match the exact error strings ModelArts returns for bad OBS paths so
		// the user gets a targeted message ("boot file error" / "dataset error").
		bootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'."
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		} else {
			return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
	}

	if !result.IsSuccess {
		log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// createTrainJobVersionUserImage creates a new version of an existing
// ModelArts training job using a user-supplied container image. On HTTP 401
// the auth token is refreshed once and the request is resent. Well-known
// "Invalid OBS path" error messages for the boot file and dataset are
// translated into user-facing (Chinese) errors.
func createTrainJobVersionUserImage(createJobVersionParams models.CreateTrainJobVersionUserImageParams, jobID string) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobVersionParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

	if err != nil {
		return nil, fmt.Errorf("resty create train-job version: %s", err)
	}

	// Log the request payload for troubleshooting.
	req, _ := json.Marshal(createJobVersionParams)
	log.Info("%s", req)

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		// Match the exact error strings ModelArts returns for bad OBS paths so
		// the user gets a targeted message ("boot file error" / "dataset error").
		BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'."
		DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'."
		// Fixed copy-paste: log/error messages previously named
		// "createTrainJobVersion" instead of this function.
		if temp.ErrorMsg == BootFileErrorMsg {
			log.Error("启动文件错误!createTrainJobVersionUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == DataSetErrorMsg {
			log.Error("数据集错误!createTrainJobVersionUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		return &result, fmt.Errorf("createTrainJobVersionUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("createTrainJobVersionUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createTrainJobVersionUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// GetResourceSpecs queries ModelArts for the available training resource
// specifications. An expired auth token is refreshed once before the request
// is retried. Returns the spec list or an error describing the failure.
func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
	checkSetting()
	restyClient := getRestyClient()
	var specs models.GetResourceSpecsResult

	var statusCode int
	var rawBody string
	// fetchOnce issues the GET request and captures status code and body.
	fetchOnce := func() error {
		res, err := restyClient.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetResult(&specs).
			Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs)
		if err != nil {
			return err
		}
		statusCode = res.StatusCode()
		rawBody = res.String()
		return nil
	}

	for attempt := 0; ; attempt++ {
		if err := fetchOnce(); err != nil {
			return nil, fmt.Errorf("resty GetResourceSpecs: %v", err)
		}
		if statusCode == http.StatusUnauthorized && attempt < 1 {
			// The auth token likely expired; refresh it and retry once.
			_ = getToken()
			continue
		}
		break
	}

	if statusCode != http.StatusOK {
		var apiErr models.ErrorResult
		if err := json.Unmarshal([]byte(rawBody), &apiErr); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", rawBody, err.Error())
			return &specs, fmt.Errorf("json.Unmarshal failed(%s): %v", rawBody, err.Error())
		}
		log.Error("GetResourceSpecs failed(%d):%s(%s)", statusCode, apiErr.ErrorCode, apiErr.ErrorMsg)
		return &specs, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", statusCode, apiErr.ErrorCode, apiErr.ErrorMsg)
	}

	if !specs.IsSuccess {
		log.Error("GetResourceSpecs failed(%s): %s", specs.ErrorCode, specs.ErrorMsg)
		return &specs, fmt.Errorf("GetResourceSpecs failed(%s): %s", specs.ErrorCode, specs.ErrorMsg)
	}

	return &specs, nil
}

// CreateTrainJobConfig saves a training-job parameter configuration via the
// ModelArts REST API. On HTTP 401 the auth token is refreshed once and the
// request is resent. Returns the creation result or an error.
func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobConfigResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(req).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)

	if err != nil {
		return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// GetConfigList fetches a paginated, sorted, optionally filtered list of
// training-job parameter configurations. On HTTP 401 the auth token is
// refreshed once and the request is resent. Failure errors use user-facing
// (Chinese) messages.
func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetConfigListResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"per_page":       strconv.Itoa(perPage),
			"page":           strconv.Itoa(page),
			"sortBy":         sortBy,
			"order":          order,
			"search_content": searchContent,
			"config_type":    configType,
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)

	if err != nil {
		return nil, fmt.Errorf("resty GetConfigList: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// GetParaConfig fetches a single training-job parameter configuration by name
// and type. On HTTP 401 the auth token is refreshed once and the request is
// resent. Note: unlike its siblings this function returns the result by value.
func GetParaConfig(configName, configType string) (models.GetConfigResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetConfigResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"config_type": configType,
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName)

	if err != nil {
		return result, fmt.Errorf("resty GetParaConfig: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return result, nil
}

// GetTrainJob fetches the details of a specific training-job version. On
// HTTP 401 the auth token is refreshed once and the request is resent.
// Failure errors use user-facing (Chinese) messages.
func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJob: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJob(%s) failed", jobID)
		return &result, fmt.Errorf("获取作业详情失败")
	}

	return &result, nil
}

// GetTrainJobLog fetches a window of log lines for a training-job version
// from the AOM log endpoint. baseLine anchors the window, lines is the number
// of lines to return, logFile selects the file, and order sets the direction.
// On HTTP 401 the auth token is refreshed once and the request is resent.
func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobLogResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"base_line": baseLine,
			"lines":     strconv.Itoa(lines),
			"log_file":  logFile,
			"order":     order,
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log")

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobLog: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobLog(%s) failed", jobID)
		return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobLogFileNames lists the log file names available for a
// training-job version. On HTTP 401 the auth token is refreshed once and the
// request is resent. Failure errors use user-facing (Chinese) messages.
func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobLogFileNamesResult

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names")

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobLogFileNames(%s) failed", jobID)
		return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// DelTrainJob deletes a training job. On HTTP 401 the auth token is refreshed
// once and the request is resent. "Job not found" / "already in target state"
// / internal-error responses are treated as a successful deletion so callers
// can proceed with local cleanup.
func DelTrainJob(jobID string) (*models.TrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.TrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)

	if err != nil {
		return &result, fmt.Errorf("resty DelTrainJob: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		if temp.ErrorCode == ModelArtsJobNotExists || temp.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already being deleted; treat this
			// as a successful deletion.
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		// Fixed copy-paste: the non-200 error was decoded into temp, so check
		// temp.ErrorCode (result is only populated on success responses).
		if temp.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("DelTrainJob(%s) failed", jobID)
		if result.ErrorCode == ModelArtsJobNotExists || result.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already being deleted; treat this
			// as a successful deletion.
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		if result.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// StopTrainJob asks ModelArts to stop the given training-job version. An
// expired auth token is refreshed once before the request is retried.
// Failure errors use user-facing (Chinese) messages.
func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) {
	checkSetting()
	restyClient := getRestyClient()
	var result models.TrainJobResult

	var statusCode int
	var rawBody string
	// stopOnce issues the stop request and captures status code and body.
	stopOnce := func() error {
		res, err := restyClient.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop")
		if err != nil {
			return err
		}
		statusCode = res.StatusCode()
		rawBody = res.String()
		return nil
	}

	for attempt := 0; ; attempt++ {
		if err := stopOnce(); err != nil {
			return &result, fmt.Errorf("resty StopTrainJob: %v", err)
		}
		if statusCode == http.StatusUnauthorized && attempt < 1 {
			// The auth token likely expired; refresh it and retry once.
			_ = getToken()
			continue
		}
		break
	}

	if statusCode != http.StatusOK {
		var apiErr models.ErrorResult
		if err := json.Unmarshal([]byte(rawBody), &apiErr); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", rawBody, err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", rawBody, err.Error())
		}
		log.Error("StopTrainJob failed(%d):%s(%s)", statusCode, apiErr.ErrorCode, apiErr.ErrorMsg)
		return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", statusCode, apiErr.ErrorCode, apiErr.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("StopTrainJob(%s) failed", jobID)
		return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// DelTrainJobVersion deletes a specific version of a training job. On
// HTTP 401 the auth token is refreshed once and the request is resent.
// "Job not found" / "already in target state" / internal-error responses are
// treated as a successful deletion so callers can proceed with local cleanup.
func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.TrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)

	if err != nil {
		return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}

		if temp.ErrorCode == ModelArtsJobNotExists || temp.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already being deleted; treat this
			// as a successful deletion.
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		// Fixed copy-paste: the non-200 error was decoded into temp, so check
		// temp.ErrorCode (result is only populated on success responses).
		if temp.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		// Fixed copy-paste: log messages previously named "DelTrainJob".
		log.Error("DelTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("DelTrainJobVersion(%s) failed", jobID)
		return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// createInferenceJob submits a ModelArts inference job (modeled as a training
// job on the train-job endpoint). On HTTP 401 the auth token is refreshed
// once and the request is resent. Well-known "Invalid OBS path" error
// messages for the boot file and dataset are translated into user-facing
// (Chinese) errors; 502 responses are tagged with UnknownErrorPrefix.
func createInferenceJob(createJobParams models.CreateInferenceJobParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		return nil, fmt.Errorf("resty create inference-job: %s", err)
	}

	// Log the request payload for troubleshooting.
	req, _ := json.Marshal(createJobParams)
	log.Info("%s", req)

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Match the exact error strings ModelArts returns for bad OBS paths so
		// the user gets a targeted message ("boot file error" / "dataset error").
		BootFileErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.BootFileUrl + "'."
		DataSetErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.DataUrl + "'."
		if temp.ErrorMsg == BootFileErrorMsg {
			log.Error("启动文件错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == DataSetErrorMsg {
			log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		} else {
			return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
	}

	if !result.IsSuccess {
		log.Error("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// createInferenceJobUserImage submits a ModelArts inference job that runs on
// a user-supplied container image. On HTTP 401 the auth token is refreshed
// once and the request is resent. Well-known "Invalid OBS path" error
// messages for the boot file and dataset are translated into user-facing
// (Chinese) errors; 502 responses are tagged with UnknownErrorPrefix.
func createInferenceJobUserImage(createJobParams models.CreateInfUserImageParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		// Fixed copy-paste: message previously said "resty create train-job".
		return nil, fmt.Errorf("resty create inference-job: %s", err)
	}

	// Log the request payload for troubleshooting.
	req, _ := json.Marshal(createJobParams)
	log.Info("%s", req)

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Match the exact error strings ModelArts returns for bad OBS paths so
		// the user gets a targeted message ("boot file error" / "dataset error").
		bootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		} else {
			return &result, fmt.Errorf("createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
	}

	if !result.IsSuccess {
		log.Error("createInferenceJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createInferenceJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// CreateNotebook2 creates a v2 notebook job via the ModelArts REST API. On
// HTTP 401 or an illegal-token error code the auth token is refreshed once
// and the request is resent. A quota-exceeded error code is rewritten into a
// user-facing (Chinese) message; 502 responses are tagged with
// UnknownErrorPrefix so callers can treat them as transient.
func CreateNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateNotebookResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2)

	if err != nil {
		return nil, fmt.Errorf("resty create notebook2: %s", err)
	}

	// Token may have expired; refresh it once and resend the request.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	// 502 responses are transient infrastructure failures; tag them so callers
	// can distinguish them from business errors.
	if res.StatusCode() == http.StatusBadGateway {
		return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	if len(response.ErrorCode) != 0 {
		log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		// Replace the raw quota error with a user-facing message.
		if response.ErrorCode == errorCodeExceedLimit {
			response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
		}
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobMetricStatistic queries per-pod resource usage metrics for one
// version of a ModelArts train job. On an HTTP 401 the auth token is
// refreshed once and the request resent.
func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) {
	checkSetting()
	restyClient := getRestyClient()
	var result models.GetTrainJobMetricStatisticResult

	metricURL := HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID +
		"/versions/" + versionID + "/pod/" + podName + "/metric-statistic?statistic_type=each"

	// sendRequest performs one GET and reports status code and raw body.
	sendRequest := func() (int, string, error) {
		res, err := restyClient.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(metricURL)
		if err != nil {
			return 0, "", err
		}
		return res.StatusCode(), res.String(), nil
	}

	statusCode, bodyText, err := sendRequest()
	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err)
	}
	if statusCode == http.StatusUnauthorized {
		// Token may have expired: refresh once and retry.
		_ = getToken()
		statusCode, bodyText, err = sendRequest()
		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err)
		}
	}

	if statusCode != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(bodyText), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", bodyText, err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", bodyText, err.Error())
		}
		log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", statusCode, temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", statusCode, temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobMetricStatistic(%s) failed", jobID)
		return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobList pages through the ModelArts train-job list.
// perPage/page control pagination, sortBy/order control sorting, and
// searchContent filters the results. On an HTTP 401 the auth token is
// refreshed once and the request resent.
func GetTrainJobList(perPage, page int, sortBy, order, searchContent string) (*models.GetTrainJobListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobListResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"per_page":       strconv.Itoa(perPage),
			"page":           strconv.Itoa(page),
			"sortBy":         sortBy,
			"order":          order,
			"search_content": searchContent,
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobList: %v", err)
	}

	// Refresh an expired token once and retry.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJobList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Use a constant format string: ErrorMsg comes from the server and
		// may itself contain '%' verbs.
		return &result, fmt.Errorf("%s", temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("%s", result.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobVersionList pages through the version list of a ModelArts
// train job. On an HTTP 401 the auth token is refreshed once and the
// request resent.
func GetTrainJobVersionList(perPage, page int, jobID string) (*models.GetTrainJobVersionListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobVersionListResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"per_page": strconv.Itoa(perPage),
			"page":     strconv.Itoa(page),
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobVersionList: %v", err)
	}

	// Refresh an expired token once and retry.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJobVersionList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Use a constant format string: ErrorMsg comes from the server and
		// may itself contain '%' verbs.
		return &result, fmt.Errorf("%s", temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobVersionList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("%s", result.ErrorMsg)
	}

	return &result, nil
}

// GetNotebookList lists ModelArts v2 notebook instances, filtered by name
// (searchContent) and ordered by sortBy/order, with limit/offset paging.
// On an HTTP 401 the auth token is refreshed once and the request resent.
func GetNotebookList(limit, offset int, sortBy, order, searchContent string) (*models.GetNotebookListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetNotebookListResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"limit":    strconv.Itoa(limit),
			"offset":   strconv.Itoa(offset),
			"name":     searchContent,
			"sort_key": sortBy,
			"sort_dir": order,
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2)

	if err != nil {
		return nil, fmt.Errorf("resty GetNotebookList: %v", err)
	}

	// Refresh an expired token once and retry.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetNotebookList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Use a constant format string: ErrorMsg comes from the server and
		// may itself contain '%' verbs.
		return &result, fmt.Errorf("%s", temp.ErrorMsg)
	}

	return &result, nil
}

+ 233
- 0
manager/client/cloudbrain_two_cd/resty.go View File

@@ -0,0 +1,233 @@
package cloudbrain_two_cd

import (
"bytes"
"code.gitea.io/gitea/modules/modelarts_gateway/core"
"crypto/tls"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"strconv"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
)

var (
httpClient *http.Client
HOST string
TOKEN string
autoStopDurationMs = 4 * 60 * 60 * 1000
)

const (
errorCodeExceedLimit = "ModelArts.0118"

//notebook 2.0
urlNotebook2 = "/notebooks"

//error code
modelartsIllegalToken = "ModelArts.6401"
NotebookNotFound = "ModelArts.6404"
NotebookNoPermission = "ModelArts.6407"
NotebookInvalid = "ModelArts.6400"
UnknownErrorPrefix = "UNKNOWN:"
ModelArtsJobNotExists = "ModelArts.0102"
ModelArtsJobInTargetState = "ModelArts.6357"
ModelArtsJobInternalError = "ModelArts.0010"
)

// getHttpClient lazily builds the package-wide HTTP client: 30s overall
// timeout, TLS certificate verification disabled (InsecureSkipVerify) for
// the ModelArts CD endpoint.
// NOTE(review): the lazy init is not synchronized; concurrent first calls
// could each construct a client. Harmless if callers are effectively
// serialized at startup — confirm, or guard with sync.Once.
func getHttpClient() *http.Client {
	if httpClient == nil {
		httpClient = &http.Client{
			Timeout:   30 * time.Second,
			Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
		}
	}
	return httpClient
}

// GetNotebook queries a ModelArts CD notebook instance by job ID using an
// AK/SK-signed HTTP request. A non-empty ErrorCode in the decoded body is
// surfaced as an error.
func GetNotebook(jobID string) (*models.GetNotebook2Result, error) {
	var result models.GetNotebook2Result

	client := getHttpClient()
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}
	r, _ := http.NewRequest(http.MethodGet,
		setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID,
		nil)

	r.Header.Add("content-type", "application/json")
	s.Sign(r)

	resp, err := client.Do(r)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	err = json.Unmarshal(body, &result)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: this message previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		log.Error("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// ManageNotebook performs a lifecycle action (param.Action, e.g. start/stop)
// on a ModelArts CD notebook instance. The auto-stop duration is passed as a
// query parameter so restarted instances keep the configured limit.
func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
	var result models.NotebookActionResult

	client := getHttpClient()
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}
	r, _ := http.NewRequest(http.MethodPost,
		setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID+"/"+param.Action+"?duration="+strconv.Itoa(autoStopDurationMs),
		nil)

	r.Header.Add("content-type", "application/json")
	s.Sign(r)

	resp, err := client.Do(r)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	err = json.Unmarshal(body, &result)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: this message previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		// Log and error now use the same identifier (was "ManageNotebook2").
		log.Error("ManageNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("ManageNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// DelNotebook deletes a ModelArts CD notebook instance.
// "Already gone" responses (job not found, job already in the target state)
// and internal ModelArts errors are treated as successful deletion so that
// cleanup is idempotent.
func DelNotebook(jobID string) (*models.NotebookDelResult, error) {
	var result models.NotebookDelResult

	client := getHttpClient()
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}

	r, _ := http.NewRequest(http.MethodDelete,
		setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID,
		nil)

	r.Header.Add("content-type", "application/json")
	s.Sign(r)

	resp, err := client.Do(r)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	err = json.Unmarshal(body, &result)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: this message previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		// Identifier normalized from "DelNotebook2" to match the function name.
		log.Error("DelNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		if result.ErrorCode == ModelArtsJobNotExists || result.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already deleted: treat as success.
			return &models.NotebookDelResult{}, nil
		}

		if result.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.NotebookDelResult{}, nil
		}
		return &result, fmt.Errorf("DelNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// CreateNotebook submits a new notebook instance to the ModelArts CD
// endpoint with a signed JSON request and decodes the creation result.
func CreateNotebook(createJobParams models.CreateNotebookWithoutPoolParams) (*models.CreateNotebookResult, error) {
	var result models.CreateNotebookResult

	payload, _ := json.Marshal(createJobParams)
	signer := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}

	request, _ := http.NewRequest(http.MethodPost,
		setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2,
		ioutil.NopCloser(bytes.NewBuffer(payload)))
	request.Header.Add("content-type", "application/json")
	signer.Sign(request)

	resp, err := getHttpClient().Do(request)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	if err = json.Unmarshal(body, &result); err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		log.Error("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		if result.ErrorCode == errorCodeExceedLimit {
			// Replace the raw quota error with a user-facing message.
			result.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
		}
		return &result, fmt.Errorf("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

+ 65
- 1
manager/client/grampus/grampus.go View File

@@ -118,7 +118,7 @@ sendjob:
log.Error("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}
log.Info("CreateNotebookJob success.req.JobName = %s ,result=%+v", req.Name, result)
return &result, nil
}

@@ -466,6 +466,70 @@ sendjob:
log.Error("resty grampus restart note book job failed(%s): %v", res.String(), err.Error())
return nil, fmt.Errorf("resty grampus restart note book job failed: %v", err)
}
log.Info("RestartNotebookJob success.jobId = %s ,result=%+v", jobID, restartResponse)

return restartResponse, nil
}

// GetDebugJobEvents fetches the event list of a grampus notebook (debug)
// job. An expired token is reported via the business error code, not an
// HTTP status; it is refreshed once and the request retried.
func GetDebugJobEvents(jobID string) (*models.GetGrampusDebugJobEventsResponse, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetGrampusDebugJobEventsResponse

	retry := 0

sendjob:
	// Leftover debug logging of the whole response object removed; this now
	// matches the sibling GetTrainJobEvents.
	_, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + urlNotebookJob + "/" + jobID + "/events")
	if err != nil {
		return nil, fmt.Errorf("resty GetDebugJobEvents: %v", err)
	}

	if result.ErrorCode == errorIllegalToken && retry < 1 {
		retry++
		log.Info("retry get token")
		_ = getToken()
		goto sendjob
	}

	if result.ErrorCode != 0 {
		log.Error("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
		return nil, fmt.Errorf("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobEvents fetches the event list of a grampus train job.
// An expired token surfaces as a business error code; it is refreshed once
// and the request retried.
func GetTrainJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) {
	checkSetting()
	restyClient := getRestyClient()
	var result models.GetGrampusJobEventsResponse

	eventsURL := HOST + urlTrainJob + "/" + jobID + "/events"

	// query performs one GET, decoding the body into result.
	query := func() error {
		_, err := restyClient.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(eventsURL)
		return err
	}

	if err := query(); err != nil {
		return nil, fmt.Errorf("resty GetTrainJobEvents: %v", err)
	}

	if result.ErrorCode == errorIllegalToken {
		log.Info("retry get token")
		_ = getToken()
		if err := query(); err != nil {
			return nil, fmt.Errorf("resty GetTrainJobEvents: %v", err)
		}
	}

	if result.ErrorCode != 0 {
		log.Error("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
		return nil, fmt.Errorf("GetTrainJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

+ 39
- 2
models/action.go View File

@@ -70,6 +70,8 @@ const (
ActionCreateGrampusGCUDebugTask //41
ActionCreateGrampusGCUTrainTask //42
ActionCreateGrampusMLUDebugTask //43
ActionCreateGrampusMLUTrainTask //44
ActionCreateGrampusGPUOnlineInferTask //45
)

// Action represents user operation type and other information to
@@ -126,6 +128,20 @@ func (a *Action) loadActUser() {
}
}

// FilterCloudbrainInfo strips the attached cloudbrain down to what may be
// exposed: only the task ID when the task still exists, nothing when it has
// been deleted.
func (a *Action) FilterCloudbrainInfo() {
	if a.Cloudbrain == nil {
		return
	}

	if !a.Cloudbrain.DeletedAt.IsZero() {
		// The underlying task was deleted: drop it entirely.
		a.Cloudbrain = nil
		return
	}

	// Keep only the ID so no other task fields leak out.
	trimmed := &Cloudbrain{}
	trimmed.ID = a.Cloudbrain.ID
	a.Cloudbrain = trimmed
}

func (a *Action) loadRepo() {
if a.Repo != nil {
return
@@ -136,6 +152,26 @@ func (a *Action) loadRepo() {
log.Error("GetRepositoryByID(%d): %v", a.RepoID, err)
}
}
// loadCloudbrain attaches the cloudbrain task referenced by this action's
// Content field, matching by either numeric ID or job ID (including
// soft-deleted rows via Unscoped). The match is only kept when the task's
// name agrees with the action's RefName.
func (a *Action) loadCloudbrain() {
	if !a.IsCloudbrainAction() {
		return
	}
	cloudbrain := &Cloudbrain{}
	cloudbrainId, _ := strconv.ParseInt(a.Content, 10, 64)
	jobId := a.Content

	// Different cloudbrain task types store different identifiers in the
	// action's Content field when publishing: some store the numeric ID,
	// others the jobId. So the lookup matches on either column.
	if has, err := x.
		Where(builder.Or(builder.Eq{"id": cloudbrainId}).Or(builder.Eq{"job_id": jobId})).Unscoped().
		Get(cloudbrain); err != nil || !has {
		return
	}
	// Guard against ID/jobId collisions: only accept the row whose display
	// or job name matches the action's ref name.
	if cloudbrain.DisplayJobName == a.RefName || cloudbrain.JobName == a.RefName {
		a.Cloudbrain = cloudbrain
	}

}

// GetActFullName gets the action's user full name.
func (a *Action) GetActFullName() string {
@@ -381,6 +417,7 @@ func (a *Action) IsCloudbrainAction() bool {
ActionCreateBenchMarkTask,
ActionCreateGPUTrainTask,
ActionCreateGrampusGPUDebugTask,
ActionCreateGrampusGPUOnlineInferTask,
ActionCreateGrampusNPUDebugTask,
ActionCreateGrampusNPUTrainTask,
ActionCreateGrampusGPUTrainTask,
@@ -463,7 +500,7 @@ func GetFeeds(opts GetFeedsOptions) ([]*Action, error) {
return nil, fmt.Errorf("Find: %v", err)
}

if err := ActionList(actions).LoadAttributes(); err != nil {
if err := ActionList(actions).LoadAllAttributes(); err != nil {
return nil, fmt.Errorf("LoadAttributes: %v", err)
}

@@ -483,7 +520,7 @@ func GetLast20PublicFeeds(opTypes []int) ([]*Action, error) {
return nil, fmt.Errorf("Find: %v", err)
}

if err := ActionList(actions).LoadAttributes(); err != nil {
if err := ActionList(actions).LoadAllAttributes(); err != nil {
return nil, fmt.Errorf("LoadAttributes: %v", err)
}



+ 14
- 0
models/ai_model_manage.go View File

@@ -819,3 +819,17 @@ func QueryModelForSearch(opts *AiModelQueryOptions) ([]*AiModelManage, int64, er

return aiModelManages, count, nil
}

func QueryModelRepoByModelID(modelId string) (*Repository, error) {
r := &Repository{}
has, err := x.Where(builder.NewCond().
And(builder.Eq{"id": builder.Select("repo_id").
From("ai_model_manage").
Where(builder.Eq{"id": modelId})})).Get(r)
if err != nil {
return nil, err
} else if !has {
return nil, &ErrRecordNotExist{}
}
return r, nil
}

+ 1
- 1
models/attachment.go View File

@@ -329,7 +329,7 @@ func DeleteAttachments(attachments []*Attachment, remove bool) (int, error) {
log.Info("Message:%s\n", obsError.Message)
}
}
DeleteFileChunkById(a.UUID)
//rf := path.Join(a.UUID[0:1], a.UUID[1:2])
/*
files, err := repo.GetDatasetDirs(a.UUID, "")


+ 114
- 10
models/cloudbrain.go View File

@@ -77,6 +77,7 @@ const (
JobTypeSim2BrainSNN JobType = "SIM2BRAIN_SNN"
JobTypeTrain JobType = "TRAIN"
JobTypeInference JobType = "INFERENCE"
JobTypeOnlineInference JobType = "ONLINEINFERENCE"

//notebook
ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
@@ -237,6 +238,7 @@ type Cloudbrain struct {
EngineID int64 //引擎id
ImageID string //grampus image_id
AiCenter string //grampus ai center: center_id+center_name
FailedReason string `xorm:"text"`

TrainUrl string //输出模型的obs路径
BranchName string `xorm:"varchar(2550)"` //分支名称
@@ -344,15 +346,56 @@ func (task *Cloudbrain) CorrectCreateUnix() {
task.CreatedUnix = task.StartTime
}
}
func (task *Cloudbrain) GetAiCenter() string {
if task.Type == TypeCloudBrainOne {
return AICenterOfCloudBrainOne
} else if task.Type == TypeCloudBrainTwo {
return AICenterOfCloudBrainTwo
} else if task.Type == TypeCDCenter {
return AICenterOfChengdu
} else {
return strings.Split(task.AiCenter, "+")[0]
}

}

//是否为在线notebook文件任务
func (task *Cloudbrain) IsFileNoteBookTask() bool {
return task.JobType == string(JobTypeDebug) && task.BootFile != ""
}

func (task *Cloudbrain) CanUserModify(user *User) bool {
if user == nil {
return false
}

return user.IsAdmin || user.ID == task.UserID
}
func (task *Cloudbrain) CanUserDelete(user *User, isRepoOwner bool) bool {
if user == nil {
return false
}

return isRepoOwner || user.IsAdmin || user.ID == task.UserID
}

func AllTerminalStatus() []string {
return []string{string(ModelArtsTrainJobCompleted), string(ModelArtsTrainJobFailed),
string(ModelArtsTrainJobKilled), string(ModelArtsStopped),
string(JobStopped), string(JobFailed),
string(ModelArtsTrainJobKilled), string(ModelArtsStopped), string(ModelArtsCreateFailed),
string(ModelArtsStartFailed), string(JobStopped), string(JobFailed),
string(JobSucceeded), GrampusStatusFailed,
GrampusStatusSucceeded, GrampusStatusStopped, LocalStatusFailed}
}

func IsCloudbrainTerminalStatus(status string) bool {
for _, s := range AllTerminalStatus() {
if strings.ToUpper(status) == strings.ToUpper(s) {
return true
}
}
return false
}

func AllStoppingStatus() []string {
return []string{string(ModelArtsStopping), string(ModelArtsDeleting),
string(ModelArtsTrainJobKilling), GrampusStatusStopping}
@@ -388,12 +431,7 @@ func AllStoppingAndTerminalStatus() []string {

func (task *Cloudbrain) IsTerminal() bool {
status := task.Status
for _, s := range AllTerminalStatus() {
if status == s {
return true
}
}
return false
return IsCloudbrainTerminalStatus(status)
}
func (task *Cloudbrain) IsPreparing() bool {
return task.Status == LocalStatusPreparing
@@ -405,6 +443,15 @@ func (task *Cloudbrain) NeedActiveStop() bool {
return task.IsCreating() || (task.IsPreparing() && int64(task.CreatedUnix) < time.Now().Add(-1*setting.PREPARING_MAX_WAIT_DURATION).Unix())
}

//是否允许创建多版本
//目前只有启智NPU可以
func (task *Cloudbrain) IsAllowedToCreateMultipleVersions() bool {
if task.Type == TypeCloudBrainTwo && task.ComputeResource == NPUResource {
return true
}
return false
}

func (task *Cloudbrain) IsNewAITask() bool {
for k, v := range setting.AI_TASK_RANGE {
if k == task.JobType+"_"+fmt.Sprint(task.Type) {
@@ -1225,6 +1272,11 @@ type GetNotebook2Result struct {
Ownership string `json:"ownership"`
Status string `json:"status"`
} `json:"volume"`
ActionProgress []struct {
Step int `json:"step"`
Status string `json:"status"`
Description string `json:"description"`
} `json:"action_progress"`
}

type GetTokenParams struct {
@@ -1694,6 +1746,11 @@ type NotebookList struct {
JobName string `json:"name"`
JobID string `json:"id"`
Status string `json:"status"`
Lease struct {
CreateTime int64 `json:"create_at"` //实例创建的时间,UTC毫秒
Duration int64 `json:"duration"` //实例运行时长,以创建时间为起点计算,即“创建时间+duration > 当前时刻”时,系统会自动停止实例
UpdateTime int64 `json:"update_at"` //实例最后更新(不包括保活心跳)的时间,UTC毫秒
} `json:"lease"` //实例自动停止的倒计时信息
}

type GetNotebookListResult struct {
@@ -1886,7 +1943,7 @@ type GrampusTasks struct {
WorkServerNumber int `json:"nodeCount"`
}
type GrampusNotebookTask struct {
AutoStopDuration int `json:"autoStopDuration"`
AutoStopDuration int64 `json:"autoStopDuration"`
Name string `json:"name"`
Capacity int `json:"capacity"`
CenterID []string `json:"centerID"`
@@ -2227,6 +2284,22 @@ func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, e
return cloudbrains, int(count), nil
}

func GetCloudbrainEarlyVersionList(task *Cloudbrain) ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0)
if err := x.Where(builder.NewCond().
And(builder.Eq{"cloudbrain.repo_id": task.RepoID}).
And(builder.Eq{"cloudbrain.type": task.Type}).
And(builder.Eq{"cloudbrain.job_id": task.JobID}).
And(builder.Eq{"cloudbrain.job_type": task.JobType}).
And(builder.Lt{"cloudbrain.created_unix": task.CreatedUnix})).
OrderBy("cloudbrain.created_unix DESC").
Find(&cloudbrains); err != nil {
return nil, fmt.Errorf("Find: %v", err)
}

return cloudbrains, nil
}

func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
session := x.NewSession()
defer session.Close()
@@ -2302,6 +2375,26 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
return getRepoCloudBrain(cb)
}

func GetCloudbrainListByJobID(jobID string) ([]*Cloudbrain, error) {
r := make([]*Cloudbrain, 0)
if err := x.Where("job_id = ?", jobID).OrderBy("id desc").Find(&r); err != nil {
return nil, err
}
return r, nil

}

func GetNewestCloudbrainByJobId(jobID string) (*Cloudbrain, error) {
r := &Cloudbrain{}
if has, err := x.Where("job_id = ?", jobID).OrderBy("id desc").Limit(1).Get(r); err != nil {
return nil, err
} else if !has {
return nil, ErrRecordNotExist{}
}
return r, nil

}

func GetCloudbrainByJobIDWithDeleted(jobID string) (*Cloudbrain, error) {
cb := &Cloudbrain{JobID: jobID}
return getRepoCloudBrainWithDeleted(cb)
@@ -2663,7 +2756,7 @@ func GetModelSafetyCountByUserID(userID int64) (int, error) {
}

func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...JobType) (int64, error) {
sess := x.Where("status=? and type=?", JobWaiting, cloudbrainType)
sess := x.Where(builder.NewCond().And(builder.In("status", JobWaiting, LocalStatusPreparing, LocalStatusCreating)).And(builder.Eq{"type": cloudbrainType}))
if len(jobTypes) > 0 {
sess.In("job_type", jobTypes)
}
@@ -2947,6 +3040,15 @@ func CloudbrainAllStatic(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, er
return cloudbrains, count, nil
}

func GetLastestNCloudbrain(n int) ([]*Cloudbrain, error) {
r := make([]*Cloudbrain, 0)
err := x.Where("ai_center!='' or type!=2").Desc("id").Limit(n).Unscoped().Find(&r)
if err != nil {
return nil, err
}
return r, nil

}
func CloudbrainAllKanBan(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
sess := x.NewSession()
defer sess.Close()
@@ -3224,3 +3326,5 @@ func GetCloudBrainByRepoIdAndModelName(repoId int64, modelName string) ([]*Cloud
err := x.AllCols().Where("model_name=? and repo_id=?", modelName, repoId).OrderBy("created_unix asc").Find(&cloudBrains)
return cloudBrains, err
}

var SubTaskName = "task1"

+ 24
- 0
models/cloudbrain_spec.go View File

@@ -134,3 +134,27 @@ func GetCloudbrainTaskUnitPrice(task Cloudbrain) (int, error) {
}
return s.UnitPrice * n, nil
}

// UpdateCloudbrainSpec overwrites the stored spec snapshot of the given
// cloudbrain task with the fields of s, keyed by cloudbrain_id.
// It returns the number of affected rows.
func UpdateCloudbrainSpec(cloudbrainId int64, s *Specification) (int64, error) {
	// Local renamed from "new" — that name shadows the builtin allocator.
	record := CloudbrainSpec{
		CloudbrainID:    cloudbrainId,
		SpecId:          s.ID,
		SourceSpecId:    s.SourceSpecId,
		AccCardsNum:     s.AccCardsNum,
		AccCardType:     s.AccCardType,
		CpuCores:        s.CpuCores,
		MemGiB:          s.MemGiB,
		GPUMemGiB:       s.GPUMemGiB,
		ShareMemGiB:     s.ShareMemGiB,
		ComputeResource: s.ComputeResource,
		UnitPrice:       s.UnitPrice,
		QueueId:         s.QueueId,
		QueueCode:       s.QueueCode,
		Cluster:         s.Cluster,
		AiCenterCode:    s.AiCenterCode,
		AiCenterName:    s.AiCenterName,
		IsExclusive:     s.IsExclusive,
		ExclusiveOrg:    s.ExclusiveOrg,
	}
	return x.Where("cloudbrain_id = ?", cloudbrainId).Update(&record)
}

+ 17
- 0
models/cloudbrain_static.go View File

@@ -144,6 +144,23 @@ func GetCloudbrainStatusCount() ([]map[string]string, error) {
return x.QueryString(countSql)
}

func GetCloudbrainCardTimeAndCountGroupByAICenter() ([]map[string]string, error) {
countSql := `select ai_center,SUM(
COALESCE(a.duration *
CASE
WHEN a.work_server_number = 0 THEN 1
ELSE COALESCE(a.work_server_number, 1)
END *
COALESCE(cloudbrain_spec.acc_cards_num, 1), 0)
) as card_duration,count(*) num from

(select id,duration,work_server_number,case when type=0 then 'OpenIOne' when type=1 then 'OpenITwo' when type=3 then 'OpenIChengdu' else split_part(ai_center, '+',1)
end ai_center
FROM public.cloudbrain ) a Left JOIN cloudbrain_spec on a.id = cloudbrain_spec.cloudbrain_id
where ai_center!='' group by a.ai_center order by card_duration desc`
return x.QueryString(countSql)
}

func GetCloudbrainTpyeDurationSum() ([]map[string]string, error) {
countSql := "SELECT type,sum(duration) FROM public.cloudbrain group by type order by sum(duration) desc"
return x.QueryString(countSql)


+ 22
- 0
models/file_chunk.go View File

@@ -5,6 +5,7 @@ import (

"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/timeutil"
"xorm.io/builder"
"xorm.io/xorm"
)

@@ -92,6 +93,27 @@ func getFileChunkByMD5AndUser(e Engine, md5 string, userID int64, typeCloudBrain
return fileChunk, nil
}

// GetFileChunksByUserId returns the file chunks uploaded by the given user,
// optionally restricted to chunks created at or after lastTime (when > 0)
// and, unless isUploadFinished is set, to chunks not yet fully uploaded.
// (Previous comment was copy-pasted from GetAttachmentByID.)
func GetFileChunksByUserId(userId int64, lastTime int64, isUploadFinished bool) ([]*FileChunk, error) {
	return getFileChunksByUserId(x, userId, lastTime, isUploadFinished)
}

// getFileChunksByUserId queries the user's file chunks on the given engine.
// lastTime > 0 narrows to chunks created at or after that timestamp; when
// isUploadFinished is false only incomplete uploads are returned.
func getFileChunksByUserId(e Engine, userId int64, lastTime int64, isUploadFinished bool) ([]*FileChunk, error) {
	// Always filter by owner, then add the optional conditions.
	cond := builder.NewCond().And(builder.Eq{"user_id": userId})
	if lastTime > 0 {
		cond = cond.And(builder.Gte{"created_unix": lastTime})
	}
	if !isUploadFinished {
		cond = cond.And(builder.Eq{"is_uploaded": 0})
	}

	fileChunks := make([]*FileChunk, 0)
	err := e.Where(cond).Find(&fileChunks)
	if err != nil {
		return nil, err
	}
	return fileChunks, nil
}

// GetAttachmentByID returns attachment by given id
func GetFileChunkByUUID(uuid string) (*FileChunk, error) {
return getFileChunkByUUID(x, uuid)


+ 30
- 0
models/ip_location.go View File

@@ -0,0 +1,30 @@
package models

type IPLocation struct {
ID int64 `xorm:"pk autoincr"`
IpAddr string `xorm:"unique"`
Longitude string
Latitude string
}

func CreateIPLocation(ipLocation *IPLocation) (err error) {
_, err = x.Insert(ipLocation)
return err

}

// GetIpLocation looks up the stored geo-coordinates for an IP address.
// It returns ErrRecordNotExist when no row exists for that address.
func GetIpLocation(ip string) (*IPLocation, error) {
	loc := &IPLocation{IpAddr: ip}
	found, err := x.Get(loc)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, ErrRecordNotExist{}
	}
	return loc, nil
}

+ 3
- 2
models/model_migrate_record.go View File

@@ -176,8 +176,9 @@ func UpdateModelMigrateRecordByStep(record *ModelMigrateRecord) error {

func GetUnfinishedModelMigrateRecords() ([]*ModelMigrateRecord, error) {
records := make([]*ModelMigrateRecord, 0, 10)
return records, x.
Where(builder.NewCond().And(builder.In("current_step", UnFinishedMigrateSteps))).
return records, x.Cols("model_migrate_record.id", "model_migrate_record.cloudbrain_id", "model_migrate_record.dest_bucket", "model_migrate_record.dest_endpoint", "model_migrate_record.dest_object_key", "model_migrate_record.dest_proxy", "model_migrate_record.src_bucket", "model_migrate_record.src_endpoint", "model_migrate_record.src_object_key", "model_migrate_record.status", "model_migrate_record.current_step", "model_migrate_record.retry_count", "model_migrate_record.created_unix", "model_migrate_record.updated_unix", "model_migrate_record.deleted_at", "model_migrate_record.remark").Table("model_migrate_record").
Join("inner", "cloudbrain", "cloudbrain.id = model_migrate_record.cloudbrain_id").
Where(builder.NewCond().And(builder.In("model_migrate_record.current_step", UnFinishedMigrateSteps)).And(builder.Eq{"cloudbrain.deleted_at": "0001-01-01 00:00:00"}.Or(builder.IsNull{"cloudbrain.deleted_at"}))).
Limit(100).
Find(&records)
}


+ 11
- 0
models/modelarts_deploy.go View File

@@ -241,3 +241,14 @@ func DeployStatusConvert(status string) string {
return statusConvert
}
}

// GetModelartsDeployFinishTimebyJobID returns the finish deadline of a
// ModelArts deploy job: its completion time plus 30 minutes. A zero
// TimeStamp is returned when the job is unknown or not yet complete.
func GetModelartsDeployFinishTimebyJobID(jobID string) (timeutil.TimeStamp, error) {
	deploy, err := GetModelartsDeployByJobID(jobID)
	// Short-circuit on lookup failure or an unset completion time.
	if err != nil || deploy.CompleteUnix == timeutil.TimeStamp(0) {
		return timeutil.TimeStamp(0), err
	}
	return deploy.CompleteUnix.Add(int64(30 * 60)), nil
}

+ 1
- 0
models/models.go View File

@@ -173,6 +173,7 @@ func init() {
new(AiModelCollect),
new(AiModelFile),
new(ModelMigrateRecord),
new(IPLocation),
new(ModelartsDeploy),
new(ModelartsDeployQueue),
)


+ 2
- 0
models/repo_watch.go View File

@@ -332,6 +332,8 @@ func NotifyWatchers(actions ...*Action) error {
func producer(actions ...*Action) {
for _, action := range actions {
if !action.IsPrivate {
action.loadCloudbrain()
action.FilterCloudbrainInfo()
ActionChan <- action
}
}


+ 1
- 0
models/task_config.go View File

@@ -42,6 +42,7 @@ func GetTaskTypeFromAction(a ActionType) TaskType {
ActionCreateGrampusGCUDebugTask,
ActionCreateGrampusGCUTrainTask,
ActionCreateGrampusMLUDebugTask,
ActionCreateGrampusGPUOnlineInferTask,
ActionCreateGrampusGPUTrainTask:
return TaskCreateCloudbrainTask
case ActionCreateRepo:


+ 9
- 0
models/user_login_log.go View File

@@ -13,6 +13,15 @@ type UserLoginLog struct {
CreatedUnix timeutil.TimeStamp `xorm:"created"`
}

// GetIpByUID returns the IP address recorded for the user's most recent
// login, or "" when there is no login history or the query fails.
func GetIpByUID(uid int64) string {
	var latest UserLoginLog
	found, err := xStatistic.Where("u_id=?", uid).Desc("id").Limit(1).Get(&latest)
	if err != nil || !found {
		return ""
	}
	return latest.IpAddr
}

func SaveLoginInfoToDb(r *http.Request, u *User) {
statictisSess := xStatistic.NewSession()
defer statictisSess.Close()


+ 5
- 2
modules/auth/wechat/cloudbrain.go View File

@@ -1,11 +1,12 @@
package wechat

import (
"fmt"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"fmt"
"time"
)

type JobOperateType string
@@ -157,6 +158,8 @@ func getJobTypeDisplayName(jobType string) string {
switch jobType {
case string(models.JobTypeDebug):
return "调试任务"
case string(models.JobTypeOnlineInference):
return "在线推理"
case string(models.JobTypeBenchmark):
return "评测任务"
case string(models.JobTypeTrain):


+ 0
- 8
modules/context/repo.go View File

@@ -378,7 +378,6 @@ func RepoAssignment() macaron.Handler {
owner *models.User
err error
)

userName := ctx.Params(":username")
repoName := ctx.Params(":reponame")

@@ -431,7 +430,6 @@ func RepoAssignment() macaron.Handler {
if ctx.Written() {
return
}

ctx.Repo.RepoLink = repo.Link()
ctx.Data["RepoLink"] = ctx.Repo.RepoLink
ctx.Data["RepoRelPath"] = ctx.Repo.Owner.Name + "/" + ctx.Repo.Repository.Name
@@ -464,7 +462,6 @@ func RepoAssignment() macaron.Handler {
ctx.ServerError("CanUserFork", err)
return
}

ctx.Data["DisableSSH"] = setting.SSH.Disabled
ctx.Data["ExposeAnonSSH"] = setting.SSH.ExposeAnonymous
ctx.Data["DisableHTTP"] = setting.Repository.DisableHTTPGit
@@ -581,7 +578,6 @@ func RepoAssignment() macaron.Handler {
}
ctx.Data["CanCompareOrPull"] = canCompare
ctx.Data["PullRequestCtx"] = ctx.Repo.PullRequest

if ctx.Query("go-get") == "1" {
ctx.Data["GoGetImport"] = ComposeGoGetImport(owner.Name, repo.Name)
prefix := setting.AppURL + path.Join(owner.Name, repo.Name, "src", "branch", ctx.Repo.BranchName)
@@ -696,7 +692,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
if ctx.Repo.Repository.IsEmpty {
return
}

var (
refName string
err error
@@ -718,7 +713,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
}
}()
}

// Get default branch.
if len(ctx.Params("*")) == 0 {
refName = ctx.Repo.Repository.DefaultBranch
@@ -789,7 +783,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
return
}
}

ctx.Data["BranchName"] = ctx.Repo.BranchName
ctx.Data["BranchNameSubURL"] = ctx.Repo.BranchNameSubURL()
ctx.Data["CommitID"] = ctx.Repo.CommitID
@@ -805,7 +798,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
return
}
ctx.Data["CommitsCount"] = ctx.Repo.CommitsCount

ctx.Next()
}
}


+ 7
- 0
modules/grampus/grampus.go View File

@@ -137,6 +137,8 @@ func getDatasetGrampus(datasetInfos map[string]models.DatasetInfo) []models.Gram
Bucket: setting.Bucket,
EndPoint: endPoint,
ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
ReadOnly: true,
ContainerPath: "/tmp/dataset/" + datasetInfo.FullName,
})

}
@@ -356,6 +358,8 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId str
Bucket: setting.Bucket,
EndPoint: getEndPoint(),
ObjectKey: req.PreTrainModelPaths[i],
ContainerPath: "/tmp/pretrainmodel/" + req.CkptName,
ReadOnly: true,
}
}
modelGrampus = append(modelGrampus, ckptGrampus)
@@ -365,8 +369,11 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId str
Bucket: setting.Bucket,
EndPoint: getEndPoint(),
ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip",
ReadOnly: false,
ContainerPath: "/tmp/code/" + cloudbrain.DefaultBranchName + ".zip",
}
outputGrampus = models.GrampusDataset{
ContainerPath: "/tmp/output",
GetBackEndpoint: getEndPoint(),
}
} else if ProcessorTypeGPU == req.ProcessType {


+ 15
- 3
modules/grampus/resty.go View File

@@ -164,6 +164,10 @@ sendjob:
}

func GetNotebookJob(jobID string) (*models.GrampusNotebookResponse, error) {
if jobID == "" {
return nil, fmt.Errorf("jobID is emmpty")
}

checkSetting()
client := getRestyClient()
var result models.GrampusNotebookResponse
@@ -295,15 +299,20 @@ sendjob:
return &result, nil
}

func GetTrainJobLog(jobID string) (string, error) {
func GetTrainJobLog(jobID string, nodeId ...int) (string, error) {
checkSetting()
client := getRestyClient()
var logContent string

url := HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log"
if len(nodeId) > 0 {
url = HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log/node/" + strconv.Itoa(nodeId[0])
}

res, err := client.R().
SetAuthToken(TOKEN).
SetResult(&logContent).
Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log")
Get(url)

if err != nil {
return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err)
@@ -324,11 +333,14 @@ func GetTrainJobLog(jobID string) (string, error) {
return logContent, nil
}

func GetGrampusMetrics(jobID string, startTime int64, endTime int64) (models.NewModelArtsMetricStatisticResult, error) {
func GetGrampusMetrics(jobID string, startTime int64, endTime int64, nodeId ...int) (models.NewModelArtsMetricStatisticResult, error) {
checkSetting()
client := getRestyClient()
var result models.NewModelArtsMetricStatisticResult
url := HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics"
if len(nodeId) > 0 {
url = HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics/node/" + strconv.Itoa(nodeId[0])
}
if startTime > 0 {
var step int64 = 60



+ 44
- 0
modules/ipinfo/ipinfo.go View File

@@ -0,0 +1,44 @@
package ipinfo

import (
"crypto/tls"
"fmt"
"net/http"

"code.gitea.io/gitea/modules/setting"

"github.com/go-resty/resty/v2"
)

var restyClient *resty.Client

// IpInfoResponse models the JSON payload returned by the ipinfo.io
// lookup endpoint for a single IP address.
type IpInfoResponse struct {
// Ip is the queried IP address, echoed back by the service.
Ip string `json:"ip"`
// Loc is the "lat,lon" location string for the IP.
Loc string `json:"loc"`
// Bogon is true when the IP is a bogon (private/reserved) address.
Bogon bool `json:"bogon"`
}

// getRestyClient returns the shared resty client used for ipinfo
// requests, building it on first use. TLS certificate verification is
// disabled for this client.
// NOTE(review): the lazy initialization is not goroutine-safe; concurrent
// first calls may each build a client — confirm callers are effectively
// single-threaded or accept the benign duplicate.
func getRestyClient() *resty.Client {
	if restyClient != nil {
		return restyClient
	}
	c := resty.New()
	c.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
	restyClient = c
	return restyClient
}

// GetLocationByIp queries the configured IPInfo service for geolocation
// data about the given IP address. It returns an error when the HTTP
// request fails or the service answers with a non-200 status.
func GetLocationByIp(ip string) (*IpInfoResponse, error) {
	var info IpInfoResponse
	resp, err := getRestyClient().R().
		SetHeader("Accept", "application/json").
		SetAuthToken(setting.IPInfo.Token).
		SetResult(&info).
		Get(setting.IPInfo.Host + "/" + ip)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode() != http.StatusOK {
		return nil, fmt.Errorf("http status is %d", resp.StatusCode())
	}
	return &info, nil
}

+ 1
- 1
modules/minio_ext/constants.go View File

@@ -40,7 +40,7 @@ const maxSinglePutObjectSize = 1024 * 1024 * 1024 * 5

// maxMultipartPutObjectSize - maximum size 5TiB of object for
// Multipart operation.
const MaxMultipartPutObjectSize = 1024 * 1024 * 1024 * 1024 * 5
const MaxMultipartPutObjectSize = 1024 * 1024 * 1024 * 200

// unsignedPayload - value to be set to X-Amz-Content-Sha256 header when
// we don't want to sign the request payload


+ 1
- 1
modules/modelappservice/modelsevice.go View File

@@ -45,7 +45,7 @@ func consumerOrder(in <-chan *models.ModelApp, url string) {
continue
}
log.Info("goroutine id=" + fmt.Sprint(goroutine_id) + " wenxin text=" + modelApp.Desc)
result, err := modelarts.CreateWenXinJob(modelApp, url)
result, err := modelarts.CreateWenXinJobToCD(modelApp, url)
if err == nil {
if !modelarts.SendPictureReivew(result.Result) {
modelApp.Status = -1


+ 25
- 19
modules/modelarts/resty.go View File

@@ -1036,15 +1036,17 @@ func DelTrainJob(jobID string) (*models.TrainJobResult, error) {
var result models.TrainJobResult

//get cloudbrain job by jobid
finetuneJob, _ := models.GetCloudbrainByJobID(jobID)
log.Info("调试:%s", finetuneJob.FineTune)
if finetuneJob, err := models.GetCloudbrainByJobID(jobID); finetuneJob != nil && err == nil {
if finetuneJob.FineTune {
err := ServiceDelete(jobID)
if err != nil {
log.Error("盘古微调部署: Delete Deploy failed:%s %v", jobID, err.Error())
return &result, err
log.Error("panguService: Delete Deploy failed:%s %v", jobID, err.Error())
return nil, err
}
}
} else if err != nil {
log.Warn("DelTrainJob GetCloudbrainByJobID from DB failed:%s %v", jobID, err.Error())
}

retry := 0

@@ -1145,14 +1147,16 @@ func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult,
var result models.TrainJobResult

//get cloudbrain job by jobid
finetuneJob, _ := models.GetCloudbrainByJobID(jobID)
log.Info("调试:%s", finetuneJob.FineTune)
if finetuneJob, err := models.GetCloudbrainByJobID(jobID); finetuneJob != nil && err == nil {
if finetuneJob.FineTune {
err := ServiceDelete(jobID)
if err != nil {
log.Error("盘古微调部署: Delete Deploy failed:%s %v", jobID, err.Error())
return &result, err
log.Error("panguService: Delete Deploy failed:%s %v", jobID, err.Error())
return nil, err
}
}
} else if err != nil {
log.Warn("DelTrainJobVersion GetCloudbrainByJobID failed, cannnot get job from DB:%s %v", jobID, err.Error())
}

retry := 0
@@ -1859,28 +1863,30 @@ sendjob:
func ServiceDelete(jobID string) error {
if deploy, _ := models.GetModelartsDeployByJobID(jobID); deploy != nil {
if deploy.Status == "STOP" || deploy.Status == "FAILED" {

if deploy.ServiceID != "" {
err := DeleteDeployService(deploy.ServiceID)
if err != nil {
if err := DeleteDeployService(deploy.ServiceID); err != nil {
log.Error("panguService: Delete DeployService API failed:%s %v", jobID, err.Error())
return err
}
} else {
log.Info("panguService: deploy service delete success %s", jobID)
}
}

if deploy.ModelID != "" {
err := DeleteDeployModel(deploy.ModelID)
if err != nil {
if err := DeleteDeployModel(deploy.ModelID); err != nil {
log.Error("panguService: Delete DeployModel API failed:%s %v", jobID, err.Error())
return err
}
} else {
log.Info("panguService: deploy model delete success %s", jobID)
}
err := models.DeleteModelartsDeploy(jobID)
if err != nil {
}

if err := models.DeleteModelartsDeploy(jobID); err != nil {
log.Error("panguService: Delete ModelartsDeploy from DB failed:%s %v", jobID, err.Error())
return err
}
} else {
log.Info("panguService: deploy DB record delete success %s", jobID)
}

} else {
log.Error("the job(%s) is a deploying finetune job, can be not deleted", jobID)
return fmt.Errorf("1")


+ 63
- 0
modules/modelarts/wenxinresty.go View File

@@ -1,6 +1,8 @@
package modelarts

import (
"bytes"
"crypto/tls"
"encoding/base64"
"encoding/json"
"fmt"
@@ -13,6 +15,7 @@ import (

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/modelarts_gateway/core"
"code.gitea.io/gitea/modules/setting"
)

@@ -29,6 +32,66 @@ type WenXinResult struct {
Result string `json:"result"`
}

// cdHttpClient is the lazily-initialized HTTP client shared by all
// requests to the CD ModelArts endpoint; see getCDHttpClient.
var cdHttpClient *http.Client

// getCDHttpClient returns the shared HTTP client for CD ModelArts calls,
// creating it on first use with a 30-second request timeout and TLS
// certificate verification disabled.
// NOTE(review): initialization is not goroutine-safe — concurrent first
// calls may each allocate a client; confirm this is acceptable.
func getCDHttpClient() *http.Client {
	if cdHttpClient != nil {
		return cdHttpClient
	}
	cdHttpClient = &http.Client{
		Timeout: 30 * time.Second,
		Transport: &http.Transport{
			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
		},
	}
	return cdHttpClient
}

// CreateWenXinJobToCD submits a WenXin text-generation job for the given
// model app to the CD ModelArts endpoint at url. The request body carries
// the app description as the prompt and is signed with the CD ModelArts
// access/secret key pair.
// It returns the decoded result on HTTP 200, or an error when building,
// sending, reading, or decoding the request/response fails.
func CreateWenXinJobToCD(modelapp *models.ModelApp, url string) (*WenXinResult, error) {
	createJobParams := &CreateWenXinParams{
		Data: WenXinText{
			Prompt: modelapp.Desc,
		},
		Parameters: make(map[string]string),
	}

	// The original ignored these two errors; surface them instead.
	payload, err := json.Marshal(createJobParams)
	if err != nil {
		log.Error("json.Marshal failed: %s", err.Error())
		return nil, fmt.Errorf("json.Marshal failed: %s", err.Error())
	}
	r, err := http.NewRequest(http.MethodPost, url, bytes.NewBuffer(payload))
	if err != nil {
		log.Error("http.NewRequest failed: %s", err.Error())
		return nil, fmt.Errorf("http.NewRequest failed: %s", err.Error())
	}
	log.Info("send to cd modelarts")
	r.Header.Add("content-type", "application/json")

	// Sign the request with the CD ModelArts AK/SK credentials.
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}
	s.Sign(r)

	res, err := getCDHttpClient().Do(r)
	if err != nil {
		log.Info("error =" + err.Error())
		return nil, fmt.Errorf("Service unavailable")
	}
	// Close the body on every path; the original leaked it on non-200.
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Info("res.status=" + fmt.Sprint(res.StatusCode))
		return nil, fmt.Errorf("Service unavailable")
	}

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return nil, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	var result WenXinResult
	// Fix the original's typo in the error string ("son.Unmarshal").
	if err := json.Unmarshal(body, &result); err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return nil, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}
	return &result, nil
}

func CreateWenXinJob(modelapp *models.ModelApp, url string) (*WenXinResult, error) {
createJobParams := &CreateWenXinParams{
Data: WenXinText{


+ 24
- 0
modules/setting/screen_map.go View File

@@ -0,0 +1,24 @@
package setting

// ScreenMap holds display configuration for the screen-map view,
// loaded from the [Screen] config section by NewScreenMapConfig.
var ScreenMap = struct {
ShowData bool
MinValue int
MaxValue int
}{}

// IPInfo holds the endpoint and auth token for the IP geolocation
// service, loaded from the [IPInfo] config section by NewScreenMapConfig.
var IPInfo = struct {
Host string
Token string
}{}

// NewScreenMapConfig loads the [Screen] and [IPInfo] configuration
// sections into the ScreenMap and IPInfo package-level structs,
// applying built-in defaults for absent keys.
// NOTE(review): the IPInfo token default is a hardcoded credential —
// consider requiring it from configuration instead of shipping a default.
func NewScreenMapConfig() {
	screen := Cfg.Section("Screen")
	ScreenMap.ShowData = screen.Key("ShowData").MustBool(false)
	ScreenMap.MinValue = screen.Key("MinValue").MustInt(130)
	ScreenMap.MaxValue = screen.Key("MaxValue").MustInt(190)

	ipinfo := Cfg.Section("IPInfo")
	IPInfo.Host = ipinfo.Key("Host").MustString("https://ipinfo.io")
	IPInfo.Token = ipinfo.Key("Token").MustString("df2b002afe582a")
}

+ 68
- 33
modules/setting/setting.go View File

@@ -70,6 +70,8 @@ type C2NetSequenceInfo struct {
Name string `json:"name"`
Content string `json:"content"`
ContentEN string `json:"content_en"`
Loc string `json:"loc"`
Type string `json:"type"`
}

type C2NetSqInfos struct {
@@ -635,6 +637,7 @@ var (
GPULocalCenterID string
AiCenterInfo string
AiCenterCodeAndNameInfo string
AiCenterCodeAndNameAndLocInfo string
UsageRateBeginTime string
GPUImageCommonName string
MultiNode string
@@ -655,6 +658,8 @@ var (
C2NetMapInfo map[string]*C2NetSequenceInfo
AiCenterCodeAndNameMapInfo map[string]*C2NetSequenceInfo

AiCenterCodeAndNameAndLocMapInfo map[string]*C2NetSequenceInfo

//elk config
ElkUrl string
ElkUser string
@@ -842,6 +847,13 @@ var (
ModelApp = struct {
DesensitizationUrl string
}{}

FLOW_CONTROL = struct {
ATTACHEMENT_NUM_A_USER_LAST24HOUR int
ATTACHEMENT_NUM_A_USER_LAST10M int
ATTACHEMENT_SIZE_A_USER int64 //G
ALL_ATTACHEMENT_NUM_SDK int
}{}
)

// DateLang transforms standard language locale name to corresponding value in datetime plugin.
@@ -1623,30 +1635,7 @@ func NewContext() {
UserBasePath = sec.Key("BASE_PATH_USER").MustString("users/")
PROXYURL = sec.Key("PROXY_URL").MustString("")

sec = Cfg.Section("modelarts")
ModelArtsHost = sec.Key("ENDPOINT").MustString("")
IamHost = sec.Key("IAMHOST").MustString("")
ProjectID = sec.Key("PROJECT_ID").MustString("")
ProjectName = sec.Key("PROJECT_NAME").MustString("")
ModelArtsUsername = sec.Key("USERNAME").MustString("")
ModelArtsPassword = sec.Key("PASSWORD").MustString("")
ModelArtsDomain = sec.Key("DOMAIN").MustString("")
AllowedOrg = sec.Key("ORGANIZATION").MustString("")
ProfileID = sec.Key("PROFILE_ID").MustString("")
PoolInfos = sec.Key("POOL_INFOS").MustString("")
ImageInfos = sec.Key("IMAGE_INFOS").MustString("")
Capacity = sec.Key("CAPACITY").MustInt(100)
MaxTempQueryTimes = sec.Key("MAX_TEMP_QUERY_TIMES").MustInt(30)
ResourcePools = sec.Key("Resource_Pools").MustString("")
Engines = sec.Key("Engines").MustString("")
EngineVersions = sec.Key("Engine_Versions").MustString("")
FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("")
TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("")
ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("")
ModelArtsMultiNode = sec.Key("MULTI_NODE").MustString("")
ModelArtsShareAddr = sec.Key("ModelArts_Share_Addr").MustString("192.168.0.30:/")
ModelArtsMountPath = sec.Key("ModelArts_Mount_Path").MustString("/cache/sfs")
ModelArtsNasType = sec.Key("ModelArts_Nas_Type").MustString("nfs")
GetModelartsConfig()

sec = Cfg.Section("elk")
ElkUrl = sec.Key("ELKURL").MustString("")
@@ -1742,13 +1731,43 @@ func NewContext() {
BaiduWenXin.RUN_WORKERS = sec.Key("RUN_WORKERS").MustInt(1)
BaiduWenXin.MODEL_SERVERS = sec.Key("MODEL_SERVERS").MustInt(1)

getGrampusConfig()
getModelartsCDConfig()
GetGrampusConfig()
GetModelartsCDConfig()
getModelConvertConfig()
getModelSafetyConfig()
getModelAppConfig()
getClearStrategy()
NewScreenMapConfig()
}

func GetModelartsConfig() {
sec := Cfg.Section("modelarts")
ModelArtsHost = sec.Key("ENDPOINT").MustString("")
IamHost = sec.Key("IAMHOST").MustString("")
ProjectID = sec.Key("PROJECT_ID").MustString("")
ProjectName = sec.Key("PROJECT_NAME").MustString("")
ModelArtsUsername = sec.Key("USERNAME").MustString("")
ModelArtsPassword = sec.Key("PASSWORD").MustString("")
ModelArtsDomain = sec.Key("DOMAIN").MustString("")
AllowedOrg = sec.Key("ORGANIZATION").MustString("")
ProfileID = sec.Key("PROFILE_ID").MustString("")
PoolInfos = sec.Key("POOL_INFOS").MustString("")
ImageInfos = sec.Key("IMAGE_INFOS").MustString("")
Capacity = sec.Key("CAPACITY").MustInt(100)
MaxTempQueryTimes = sec.Key("MAX_TEMP_QUERY_TIMES").MustInt(30)
ResourcePools = sec.Key("Resource_Pools").MustString("")
Engines = sec.Key("Engines").MustString("")
EngineVersions = sec.Key("Engine_Versions").MustString("")
FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("")
TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("")
ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("")
ModelArtsMultiNode = sec.Key("MULTI_NODE").MustString("")
ModelArtsShareAddr = sec.Key("ModelArts_Share_Addr").MustString("192.168.0.30:/")
ModelArtsMountPath = sec.Key("ModelArts_Mount_Path").MustString("/cache/sfs")
ModelArtsNasType = sec.Key("ModelArts_Nas_Type").MustString("nfs")

getFineTuneConfig()
getFlowControlConfig()
}

func getModelSafetyConfig() {
@@ -1787,14 +1806,20 @@ func getModelConvertConfig() {
ModelConvert.PaddleOnnxBootFile = sec.Key("PaddleOnnxBootFile").MustString("convert_paddle.py")
ModelConvert.MXnetOnnxBootFile = sec.Key("MXnetOnnxBootFile").MustString("convert_mxnet.py")
}
func getFlowControlConfig() {
sec := Cfg.Section("flow_control")
FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK = sec.Key("ALL_ATTACHEMENT_NUM_SDK").MustInt(100)
FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR = sec.Key("ATTACHEMENT_NUM_A_USER_LAST24HOUR").MustInt(1000)
FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M = sec.Key("ATTACHEMENT_NUM_A_USER_LAST10M").MustInt(10)
FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER = sec.Key("ATTACHEMENT_SIZE_A_USER").MustInt64(500)
}

func getModelAppConfig() {
sec := Cfg.Section("model_app")
ModelApp.DesensitizationUrl = sec.Key("desensitization_url").MustString("")

}

func getModelartsCDConfig() {
func GetModelartsCDConfig() {
sec := Cfg.Section("modelarts-cd")

ModelartsCD.Enabled = sec.Key("ENABLED").MustBool(false)
@@ -1821,7 +1846,7 @@ func getClearStrategy() {
ClearStrategy.RunAtStart = sec.Key("RUN_AT_START").MustBool(false)
}

func getGrampusConfig() {
func GetGrampusConfig() {
sec := Cfg.Section("grampus")

Grampus.Env = sec.Key("ENV").MustString("TEST")
@@ -1831,6 +1856,8 @@ func getGrampusConfig() {
Grampus.SpecialPools = sec.Key("SPECIAL_POOL").MustString("")
Grampus.C2NetSequence = sec.Key("C2NET_SEQUENCE").MustString("{\"sequence\":[{\"id\":1,\"name\":\"cloudbrain_one\",\"content\":\"鹏城云脑一号\",\"content_en\":\"Pencheng Cloudbrain Ⅰ\"},{\"id\":2,\"name\":\"cloudbrain_two\",\"content\":\"鹏城云脑二号\",\"content_en\":\"Pencheng Cloudbrain Ⅱ\"},{\"id\":3,\"name\":\"beida\",\"content\":\"北大人工智能集群系统\",\"content_en\":\"Peking University AI Center\"},{\"id\":4,\"name\":\"hefei\",\"content\":\"合肥类脑智能开放平台\",\"content_en\":\"Hefei AI Center\"},{\"id\":5,\"name\":\"wuhan\",\"content\":\"武汉人工智能计算中心\",\"content_en\":\"Wuhan AI Center\"},{\"id\":6,\"name\":\"xian\",\"content\":\"西安未来人工智能计算中心\",\"content_en\":\"Xi'an AI Center\"},{\"id\":7,\"pclcci\":\"more\",\"content\":\"鹏城云计算所\",\"content_en\":\"Pengcheng Cloud Computing Institute\"},{\"id\":8,\"name\":\"xuchang\",\"content\":\"中原人工智能计算中心\",\"content_en\":\"Zhongyuan AI Center\"},{\"id\":9,\"name\":\"chengdu\",\"content\":\"成都人工智能计算中心\",\"content_en\":\"Chengdu AI Center\"},{\"id\":10,\"name\":\"more\",\"content\":\"横琴先进智能计算中心\",\"content_en\":\"Hengqin AI Center\"},{\"id\":11,\"name\":\"more\",\"content\":\"国家超级计算济南中心\",\"content_en\":\"HPC & AI Center\"}]}")
Grampus.AiCenterCodeAndNameInfo = sec.Key("AI_CENTER_CODE_AND_NAME").MustString("{\"sequence\":[{\"id\":1,\"name\":\"cloudbrain_one\",\"content\":\"鹏城云脑一号\",\"content_en\":\"Pencheng Cloudbrain Ⅰ\"},{\"id\":2,\"name\":\"cloudbrain_two\",\"content\":\"鹏城云脑二号\",\"content_en\":\"Pencheng Cloudbrain Ⅱ\"},{\"id\":3,\"name\":\"beida\",\"content\":\"北大人工智能集群系统\",\"content_en\":\"Peking University AI Center\"},{\"id\":4,\"name\":\"hefei\",\"content\":\"合肥类脑智能开放平台\",\"content_en\":\"Hefei AI Center\"},{\"id\":5,\"name\":\"wuhan\",\"content\":\"武汉人工智能计算中心\",\"content_en\":\"Wuhan AI Center\"},{\"id\":6,\"name\":\"xian\",\"content\":\"西安未来人工智能计算中心\",\"content_en\":\"Xi'an AI Center\"},{\"id\":7,\"pclcci\":\"more\",\"content\":\"鹏城云计算所\",\"content_en\":\"Pengcheng Cloud Computing Institute\"},{\"id\":8,\"name\":\"xuchang\",\"content\":\"中原人工智能计算中心\",\"content_en\":\"Zhongyuan AI Center\"},{\"id\":9,\"name\":\"chengdu\",\"content\":\"成都人工智能计算中心\",\"content_en\":\"Chengdu AI Center\"},{\"id\":10,\"name\":\"more\",\"content\":\"横琴先进智能计算中心\",\"content_en\":\"Hengqin AI Center\"},{\"id\":11,\"name\":\"more\",\"content\":\"国家超级计算济南中心\",\"content_en\":\"HPC & AI Center\"}]}")
Grampus.AiCenterCodeAndNameAndLocInfo = sec.Key("AI_CENTER_CODE_AND_NAME_AND_LOC").MustString("{\"sequence\":[{\"id\":1,\"name\":\"cloudbrain_one\",\"content\":\"鹏城云脑一号\",\"content_en\":\"Pencheng Cloudbrain Ⅰ\"},{\"id\":2,\"name\":\"cloudbrain_two\",\"content\":\"鹏城云脑二号\",\"content_en\":\"Pencheng Cloudbrain Ⅱ\"},{\"id\":3,\"name\":\"beida\",\"content\":\"北大人工智能集群系统\",\"content_en\":\"Peking University AI Center\"},{\"id\":4,\"name\":\"hefei\",\"content\":\"合肥类脑智能开放平台\",\"content_en\":\"Hefei AI Center\"},{\"id\":5,\"name\":\"wuhan\",\"content\":\"武汉人工智能计算中心\",\"content_en\":\"Wuhan AI Center\"},{\"id\":6,\"name\":\"xian\",\"content\":\"西安未来人工智能计算中心\",\"content_en\":\"Xi'an AI Center\"},{\"id\":7,\"pclcci\":\"more\",\"content\":\"鹏城云计算所\",\"content_en\":\"Pengcheng Cloud Computing Institute\"},{\"id\":8,\"name\":\"xuchang\",\"content\":\"中原人工智能计算中心\",\"content_en\":\"Zhongyuan AI Center\"},{\"id\":9,\"name\":\"chengdu\",\"content\":\"成都人工智能计算中心\",\"content_en\":\"Chengdu AI Center\"},{\"id\":10,\"name\":\"more\",\"content\":\"横琴先进智能计算中心\",\"content_en\":\"Hengqin AI Center\"},{\"id\":11,\"name\":\"more\",\"content\":\"国家超级计算济南中心\",\"content_en\":\"HPC & AI Center\"}]}")

Grampus.UsageRateBeginTime = sec.Key("USAGE_RATE_BEGIN_TIME").MustString("2021-01-01 00:00:00")
Grampus.GPUImageCommonName = sec.Key("GPU_IMAGE_COMMON_NAME").MustString("image")
if Grampus.C2NetSequence != "" {
@@ -1842,6 +1869,15 @@ func getGrampusConfig() {
C2NetMapInfo[value.Name] = value
}
}
if Grampus.AiCenterCodeAndNameAndLocInfo != "" {
if err := json.Unmarshal([]byte(Grampus.AiCenterCodeAndNameAndLocInfo), &C2NetInfos); err != nil {
log.Error("Unmarshal(AiCenterCodeAndNameLocInfo) failed:%v", err)
}
AiCenterCodeAndNameAndLocMapInfo = make(map[string]*C2NetSequenceInfo)
for _, value := range C2NetInfos.C2NetSqInfo {
AiCenterCodeAndNameAndLocMapInfo[value.Name] = value
}
}
if Grampus.AiCenterCodeAndNameInfo != "" {
if err := json.Unmarshal([]byte(Grampus.AiCenterCodeAndNameInfo), &C2NetInfos); err != nil {
log.Error("Unmarshal(AiCenterCodeAndNameInfo) failed:%v", err)
@@ -1851,6 +1887,7 @@ func getGrampusConfig() {
AiCenterCodeAndNameMapInfo[value.Name] = value
}
}

Grampus.SyncScriptProject = sec.Key("SYNC_SCRIPT_PROJECT").MustString("script_for_grampus")
Grampus.LocalCenterID = sec.Key("LOCAL_CENTER_ID").MustString("cloudbrain2")
Grampus.GPULocalCenterID = sec.Key("GPU_LOCAL_CENTER_ID").MustString("openi")
@@ -1984,23 +2021,21 @@ func ensureLFSDirectory() {
}

func getNotebookImageInfos() {
if StImageInfos == nil {
if ModelartsCD.Enabled {
json.Unmarshal([]byte(ModelartsCD.ImageInfos), &StImageInfos)
} else {
json.Unmarshal([]byte(ImageInfos), &StImageInfos)
}
}
}

func getNotebookFlavorInfos() {
if StFlavorInfo == nil {
if ModelartsCD.Enabled {
json.Unmarshal([]byte(ModelartsCD.FlavorInfos), &StFlavorInfo)
} else {
json.Unmarshal([]byte(FlavorInfos), &StFlavorInfo)
}
}
}

// NewServices initializes the services


+ 1
- 0
modules/structs/cloudbrain.go View File

@@ -99,6 +99,7 @@ type CreateFileNotebookJobOption struct {
OwnerName string `json:"owner_name" binding:"Required"`
ProjectName string `json:"project_name" binding:"Required"`
JobId string `json:"job_id"`
ID int64 `json:"id"`
}

type Cloudbrain struct {


+ 1
- 1
modules/templates/helper.go View File

@@ -794,7 +794,7 @@ func licenses() []string {

// Dataset tasks
func tasks() []string {
return []string{"machine_translation", "question_answering_system", "information_retrieval", "knowledge_graph", "text_annotation", "text_categorization", "emotion_analysis", "language_modeling", "speech_recognition", "automatic_digest", "information_extraction", "description_generation", "image_classification", "face_recognition", "image_search", "target_detection", "image_description_generation", "vehicle_license_plate_recognition", "medical_image_analysis", "unmanned", "unmanned_security", "drone", "vr_ar", "2_d_vision", "2.5_d_vision", "3_d_reconstruction", "image_processing", "video_processing", "visual_input_system", "speech_coding", "speech_enhancement", "speech_synthesis","ROS_hmci"}
return []string{"machine_translation", "question_answering_system", "information_retrieval", "knowledge_graph", "text_annotation", "text_categorization", "emotion_analysis", "language_modeling", "speech_recognition", "automatic_digest", "information_extraction", "description_generation", "image_classification", "face_recognition", "image_search", "target_detection", "image_description_generation", "vehicle_license_plate_recognition", "medical_image_analysis", "unmanned", "unmanned_security", "drone", "vr_ar", "2_d_vision", "2.5_d_vision", "3_d_reconstruction", "image_processing", "video_processing", "visual_input_system", "speech_coding", "speech_enhancement", "speech_synthesis", "ros_hmci_datasets"}
}

func GetRefType(ref string) string {


+ 16
- 4
options/locale/locale_en-US.ini View File

@@ -933,7 +933,7 @@ task.speech_coding= speech coding
task.speech_enhancement= speech enhancement
task.speech_recognition= speech recognition
task.speech_synthesis= speech synthesis
task.ROS_hmci=ROS-hmci Community
task.ros_hmci_datasets=ROS-hmci datasets
category.computer_vision= computer vision
category.natural_language_processing= natural language processing
category.speech_processing= speech processing
@@ -968,7 +968,8 @@ download = Download
modify_description = Modify Description
set_public = Set Public
set_private = Set Private
annotation = Annotation
annotation = Image Annotation
more_annotation = More Annotation
upload_dataset_file = Upload Dataset File
file_description = File Description
data_upload = Dataset Upload
@@ -1091,6 +1092,7 @@ repo_mirror_add=Mirror Project Increment
repo_self_add=Custom Project Increment

debug=Debug
online_debug = Start
debug_again=Restart
stop=Stop
delete=Delete
@@ -1267,6 +1269,7 @@ cloudbrain.morethanonejob=You already have a running or waiting task, create it
cloudbrain.morethanonejob1=You have created an <span style="color:rgba(242, 113, 28, 1);"> equivalent task </span> that is waiting or running, please wait for the task to finish before creating it.
cloudbrain.morethanonejob2=You can view all your Cloud Brain tasks in <a href="/cloudbrains" target="_blank"> Home > Cloudbrain Task </a>.

modelarts.online_infer = Online Inference
modelarts.infer_job_model = Model
modelarts.infer_job_model_file = Model File
modelarts.infer_job = Inference Job
@@ -3176,6 +3179,7 @@ task_c2ent_gcudebugjob=`created GCU type debugging task <a href="%s/grampus/trai
task_c2ent_gcutrainjob=`created GCU type train task <a href="%s/modelarts/train-job/%s">%s</a>`
task_c2ent_mludebugjob=`created MLU type debugging task <a href="%s/grampus/train-job/%s">%s</a>`
task_c2ent_mlutrainjob=`created MLU type train task <a href="%s/modelarts/train-job/%s">%s</a>`
task_c2ent_onlineinferjob=`created GPU type online inference task <a href="%s/grampus/onlineinfer/%s">%s</a>`
task_nputrainjob=`created NPU training task <a href="%s/modelarts/train-job/%s">%s</a>`
task_inferencejob=`created reasoning task <a href="%s/modelarts/inference-job/%s">%s</a>`
task_benchmark=`created profiling task <a href="%s/cloudbrain/benchmark/%s">%s</a>`
@@ -3344,6 +3348,7 @@ SIM2BRAIN_SNN = BENCHMARK
TRAIN = TRAIN
INFERENCE = INFERENCE
BENCHMARK = BENCHMARK
ONLINEINFERENCE = ONLINEINFERENCE
brain_area = Brain Area

Delete_failed=Fail to delete the job, please try again later.
@@ -3363,7 +3368,7 @@ new_debug_gpu_tooltips1 = The code is storaged in <strong style="color:#010101">
new_train_npu_tooltips = The code is storaged in <strong style="color:#010101">%s</strong>, the pre-trained model is storaged in the run parameter <strong style="color:#010101">%s</strong>, and please put your model into <strong style="color:#010101">%s</strong> then you can download it online
new_infer_gpu_tooltips = The dataset is stored in <strong style="color:#010101">%s</strong>, the model file is stored in <strong style="color:#010101">%s</strong>, please store the inference output in <strong style="color:#010101">%s</strong> for subsequent downloads.
code_obs_address = Code OBS address
task_save_most_time = <p><span>*</span>The platform only retains the results of debugge, train, inference and evaluation tasks for nearly<span> 30 </span> days <span>Tasks over 30 days will not be able to download results and view logs, and cannot be debugged or trained again</span></p>
task_save_most_time = <p><span>*</span>The platform only retains the results of debug, train, inference and evaluation tasks for nearly<span> 30 </span> days. <span>Tasks over 30 days will not be able to download results and view logs, and cannot be debugged or trained again</span></p>
query_finetune_fail=Fail to query fine tuning job, please try again later.
finetune_max=The number of fine tuning job you created exceed the limit. please delete some first.
dataset_same_fail=The name of dataset file is used by the fine tune job, please select other dataset file.
@@ -3422,12 +3427,19 @@ multi_task = You have already a running or waiting task, can not create more
job_name_already_used = The job name did already exist
insufficient_point_balance = Insufficient point balance
create_failed = Create AI task failed
restart_failed = Restart AI task failed
restart_failed = Restart AI task failed, please try again later.
stop_failed = Fail to stop the job, please try again later.
can_not_restart = The task was not scheduled successfully before, so it cannot be restart.
dataset_size_over_limit = The size of dataset exceeds limitation (%dGB)
boot_file_must_python = The boot file must be a python file
boot_file_not_exist= The boot file is not exists.
branch_not_exists= The branch does not exist. Please refresh and select again.

[common_error]
system_error = System error.Please try again later
insufficient_permission = Insufficient permissions
param_error = The parameter you submitted is incorrect
wechat_not_bind = Please scan the code and bind to wechat first

[deployment]
deploy_max = The maximum deployment is %v per user


+ 15
- 3
options/locale/locale_zh-CN.ini View File

@@ -938,7 +938,7 @@ task.speech_coding=语音编码
task.speech_enhancement=语音增强
task.speech_recognition=语音识别
task.speech_synthesis=语音合成
task.ROS_hmci=开源开放社区
task.ros_hmci_datasets=开源开放社区数据集
category.computer_vision=计算机视觉
category.natural_language_processing=自然语言处理
category.speech_processing=语音处理
@@ -973,7 +973,8 @@ download = 下载
modify_description = 修改描述
set_public = 设为公开
set_private = 设为私有
annotation = 标注
annotation = 图片标注
more_annotation = 更多标注
upload_dataset_file = 上传数据集文件
file_description = 文件描述
data_upload = 数据上传
@@ -1090,6 +1091,7 @@ repo_mirror_add=新增镜像项目
repo_self_add=新增自建项目

debug=调试
online_debug = 在线推理
debug_again=再次调试
stop=停止
delete=删除
@@ -1279,6 +1281,7 @@ cloudbrain.morethanonejob=您已经创建了一个正在等待或运行中的同
cloudbrain.morethanonejob1=您已经有 <span style="color:rgba(242, 113, 28, 1);">同类任务</span> 正在等待或运行中,请等待任务结束再创建;
cloudbrain.morethanonejob2=可以在 “<a href="/cloudbrains" target="_blank" >个人中心 > 云脑任务</a>” 查看您所有的云脑任务。

modelarts.online_infer = 在线推理
modelarts.infer_job_model = 模型名称
modelarts.infer_job_model_file = 模型文件
modelarts.infer_job = 推理任务
@@ -3194,6 +3197,7 @@ task_c2ent_gcudebugjob=`创建了GCU类型调试任务 <a href="%s/grampus/noteb
task_c2ent_gcutrainjob=`创建了GCU类型训练任务 <a href="%s/grampus/train-job/%s">%s</a>`
task_c2ent_mludebugjob=`创建了MLU类型调试任务 <a href="%s/grampus/notebook/%s">%s</a>`
task_c2ent_mlutrainjob=`创建了MLU类型训练任务 <a href="%s/grampus/train-job/%s">%s</a>`
task_c2ent_onlineinferjob=`创建了GPU类型在线推理任务 <a href="%s/grampus/onlineinfer/%s">%s</a>`
task_nputrainjob=`创建了NPU类型训练任务 <a href="%s/modelarts/train-job/%s">%s</a>`
task_inferencejob=`创建了推理任务 <a href="%s/modelarts/inference-job/%s">%s</a>`
task_benchmark=`创建了评测任务 <a href="%s/cloudbrain/benchmark/%s">%s</a>`
@@ -3365,6 +3369,7 @@ SIM2BRAIN_SNN = 评测任务
TRAIN = 训练任务
INFERENCE = 推理任务
BENCHMARK = 评测任务
ONLINEINFERENCE = 在线推理
brain_area = 脑区

Delete_failed=任务删除失败,请稍后再试。
@@ -3444,12 +3449,19 @@ multi_task = 您已经有一个正在等待或运行中的任务,请结束该
job_name_already_used = 任务名已被使用,请换一个名称
insufficient_point_balance = 积分余额不足
create_failed = 创建AI任务失败
restart_failed = 再次调试AI任务失败
restart_failed = 再次调试失败,请稍后再试
stop_failed = 任务停止失败,请稍后再试
can_not_restart = 这个任务之前没有调度成功,不能再次调试。
dataset_size_over_limit = 数据集大小超过限制(%dGB)
boot_file_must_python = 启动文件必须是python文件
boot_file_not_exist = 启动文件不存在
branch_not_exists= 代码分支不存在,请刷新后重试

[common_error]
system_error = 当前服务不可用,请稍后再试
insufficient_permission = 权限不足
param_error = 提交的参数有误
wechat_not_bind = 请先扫码绑定微信

[deployment]
deploy_max = 每个用户只能同时创建 %v 个部署任务


+ 48
- 16
package-lock.json View File

@@ -1,5 +1,5 @@
{
"name": "aiforge",
"name": "aiforge",
"lockfileVersion": 2,
"requires": true,
"packages": {
@@ -21,7 +21,7 @@
"dayjs": "1.10.7",
"domino": "2.1.5",
"dropzone": "5.7.2",
"echarts": "3.8.5",
"echarts": "5.4.2",
"element-ui": "2.15.5",
"esdk-obs-browserjs": "3.22.3",
"esdk-obs-nodejs": "3.20.11",
@@ -5448,13 +5448,19 @@
}
},
"node_modules/echarts": {
"version": "3.8.5",
"resolved": "https://registry.npmmirror.com/echarts/download/echarts-3.8.5.tgz",
"integrity": "sha1-WOSlHSdDxvt1JXsNwKnPn1N4rA4=",
"version": "5.4.2",
"resolved": "https://registry.npmmirror.com/echarts/-/echarts-5.4.2.tgz",
"integrity": "sha512-2W3vw3oI2tWJdyAz+b8DuWS0nfXtSDqlDmqgin/lfzbkB01cuMEN66KWBlmur3YMp5nEDEEt5s23pllnAzB4EA==",
"dependencies": {
"zrender": "3.7.4"
"tslib": "2.3.0",
"zrender": "5.4.3"
}
},
"node_modules/echarts/node_modules/tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
},
"node_modules/editions": {
"version": "1.3.4",
"resolved": "https://registry.npmjs.org/editions/-/editions-1.3.4.tgz",
@@ -20940,9 +20946,17 @@
"integrity": "sha1-6NV3TRwHOKR7z6hynzcS4t7d6yU="
},
"node_modules/zrender": {
"version": "3.7.4",
"resolved": "https://registry.nlark.com/zrender/download/zrender-3.7.4.tgz",
"integrity": "sha1-+EfVOUhIHvbUKQbR6prux6y+/fI="
"version": "5.4.3",
"resolved": "https://registry.npmmirror.com/zrender/-/zrender-5.4.3.tgz",
"integrity": "sha512-DRUM4ZLnoaT0PBVvGBDO9oWIDBKFdAVieNWxWwK0niYzJCMwGchRk21/hsE+RKkIveH3XHCyvXcJDkgLVvfizQ==",
"dependencies": {
"tslib": "2.3.0"
}
},
"node_modules/zrender/node_modules/tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
}
},
"dependencies": {
@@ -25312,11 +25326,19 @@
}
},
"echarts": {
"version": "3.8.5",
"resolved": "https://registry.npmmirror.com/echarts/download/echarts-3.8.5.tgz",
"integrity": "sha1-WOSlHSdDxvt1JXsNwKnPn1N4rA4=",
"version": "5.4.2",
"resolved": "https://registry.npmmirror.com/echarts/-/echarts-5.4.2.tgz",
"integrity": "sha512-2W3vw3oI2tWJdyAz+b8DuWS0nfXtSDqlDmqgin/lfzbkB01cuMEN66KWBlmur3YMp5nEDEEt5s23pllnAzB4EA==",
"requires": {
"zrender": "3.7.4"
"tslib": "2.3.0",
"zrender": "5.4.3"
},
"dependencies": {
"tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
}
}
},
"editions": {
@@ -37726,9 +37748,19 @@
"integrity": "sha1-6NV3TRwHOKR7z6hynzcS4t7d6yU="
},
"zrender": {
"version": "3.7.4",
"resolved": "https://registry.nlark.com/zrender/download/zrender-3.7.4.tgz",
"integrity": "sha1-+EfVOUhIHvbUKQbR6prux6y+/fI="
"version": "5.4.3",
"resolved": "https://registry.npmmirror.com/zrender/-/zrender-5.4.3.tgz",
"integrity": "sha512-DRUM4ZLnoaT0PBVvGBDO9oWIDBKFdAVieNWxWwK0niYzJCMwGchRk21/hsE+RKkIveH3XHCyvXcJDkgLVvfizQ==",
"requires": {
"tslib": "2.3.0"
},
"dependencies": {
"tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
}
}
}
}
}

+ 1
- 1
package.json View File

@@ -20,7 +20,7 @@
"dayjs": "1.10.7",
"domino": "2.1.5",
"dropzone": "5.7.2",
"echarts": "3.8.5",
"echarts": "5.4.2",
"element-ui": "2.15.5",
"esdk-obs-browserjs": "3.22.3",
"esdk-obs-nodejs": "3.20.11",


+ 35
- 10
public/home/home.js View File

@@ -246,11 +246,21 @@ document.onreadystatechange = function () {
else if(record.OpType == "24" || record.OpType == "26" || record.OpType == "27" || record.OpType == "28" || record.OpType == "30"
|| record.OpType == "31" || record.OpType == "32" || record.OpType == "33" || record.OpType == "42" || record.OpType == "44"){
html += recordPrefix + actionName;
html += " <a href=\"" + getTaskLink(record) + "\" rel=\"nofollow\">" + record.RefName + "</a>"
const taskLink = getTaskLink(record);
if (taskLink) {
html += " <a href=\"" + taskLink + "\" rel=\"nofollow\">" + record.RefName + "</a>"
} else {
html += " <span style=\"color: rgba(0,0,0,0.3)\">" + record.RefName + "</span>"
}
}
else if(record.OpType == "25" || record.OpType == "29" || record.OpType == "39" || record.OpType == "40" || record.OpType == "41" || record.OpType == "43"){
else if(record.OpType == "25" || record.OpType == "29" || record.OpType == "39" || record.OpType == "40" || record.OpType == "41" || record.OpType == "43"|| record.OpType == "44"|| record.OpType == "45"){
html += recordPrefix + actionName;
html += " <a href=\"" + getTaskLink(record) + "\" rel=\"nofollow\">" + record.RefName + "</a>"
const taskLink = getTaskLink(record);
if (taskLink) {
html += " <a href=\"" + taskLink + "\" rel=\"nofollow\">" + record.RefName + "</a>"
} else {
html += " <span style=\"color: rgba(0,0,0,0.3)\">" + record.RefName + "</span>"
}
}
else if(record.OpType == "35"){
var datasetLink = "<a href=\"" + getRepoLink(record) + "/datasets" + "\" rel=\"nofollow\">" + record.Content.split('|')[1] + "</a>";
@@ -280,9 +290,17 @@ function getTaskLink(record){
if(record.OpType == 24){
re = re + "/datasets";
}else if(record.OpType == 25){
re = re + "/cloudbrain/" + record.Content;
if (record.Cloudbrain) {
re = re + "/cloudbrain/" + record.Cloudbrain.ID;
} else {
re = '';
}
}else if(record.OpType == 26){
if (record.Cloudbrain) {
re = re + "/modelarts/notebook/" + record.Content;
} else {
re = '';
}
}else if(record.OpType == 27){
re = re + "/modelarts/train-job/" + record.Content;
}else if(record.OpType == 28){
@@ -296,9 +314,14 @@ function getTaskLink(record){
}else if(record.OpType == 32 || record.OpType == 33 || record.OpType == 42 || record.OpType == 44){
re = re + "/grampus/train-job/" + record.Content;
}else if(record.OpType == 39 || record.OpType == 40 || record.OpType == 41 || record.OpType == 43){
re = re + "/grampus/notebook/" + record.Content;
if (record.Cloudbrain) {
re = re + "/grampus/notebook/" + record.Cloudbrain.ID;
} else {
re = '';
}
} else if(record.OpType == 45){
re = re + "/grampus/onlineinfer/" + record.Content;
}
re = encodeURI(re);
return re;
}
@@ -455,12 +478,13 @@ var actionNameZH={
"35":"创建的数据集 {dataset} 被设置为推荐数据集",
"36":"提交了镜像 {image}",
"37":"提交的镜像 {image} 被设置为推荐镜像",
"39":"创建了CPU/GPU类型调试任务",
"40":"创建了NPU类型调试任务",
"39":"创建了NPU类型调试任务",
"40":"创建了CPU/GPU类型调试任务",
"41":"创建了GCU类型调试任务",
"42":"创建了GCU类型训练任务",
"43":"创建了MLU类型调试任务",
"44":"创建了MLU类型训练任务",
"45":"创建了GPU在线推理任务",
};

var actionNameEN={
@@ -492,12 +516,13 @@ var actionNameEN={
"35":" created dataset {dataset} was set as recommended dataset",
"36":"committed image {image}",
"37":"committed image {image} was set as recommended image",
"39":" created CPU/GPU type debugging task ",
"40":" created NPU type debugging task ",
"39":" created NPU type debugging task ",
"40":" created CPU/GPU type debugging task ",
"41":" created GCU type debugging task ",
"42":" created GCU type training task ",
"43":" created MLU type debugging task ",
"44":" created MLU type training task ",
"45":" created GPU type online inference task ",
};

var repoAndOrgZH={


+ 2
- 0
public/home/search.js View File

@@ -390,6 +390,7 @@ var taskDesc = {
speech_enhancement: "语音增强",
speech_recognition: "语音识别",
speech_synthesis: "语音合成",
ros_hmci_datasets: "开源开放社区",
};

var taskENDesc = {
@@ -426,6 +427,7 @@ var taskENDesc = {
speech_enhancement: "speech enhancement",
speech_recognition: "speech recognition",
speech_synthesis: "speech synthesis",
ros_hmci_datasets: "ROS-hmci datasets",
};

function getCategoryDesc(isZh, key) {


+ 122
- 72
routers/ai_task/ai_task.go View File

@@ -1,46 +1,22 @@
package ai_task

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"net/http"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
"code.gitea.io/gitea/routers/response"
creation_context "code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/task"
cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
"code.gitea.io/gitea/services/lock"
"net/http"
)

func CreateAITask(ctx *context.Context, form ai_task_entity.CreateReq) {
func CreateAITask(ctx *context.Context, form entity.CreateReq) {
handCreateReq(&form)

t, err := task.GetAITask(form.JobType, form.Cluster)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}

lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: form.DisplayJobName, JobType: string(form.JobType)}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg)
ctx.JSON(http.StatusOK, response.OuterServerError(ctx.Tr(errMsg)))
return
}
res, err := t.Create(&creation_context.CreationContext{
Request: form,
GitRepo: ctx.Repo.GitRepo,
Repository: ctx.Repo.Repository,
User: ctx.User,
})
res, err := task.CreateAITask(form, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
if err != nil {
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
@@ -49,7 +25,7 @@ func CreateAITask(ctx *context.Context, form ai_task_entity.CreateReq) {
}
func DelAITask(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, _ := task.GetAITaskByCloudbrainId(id)
t, _ := task.GetAITaskTemplateByCloudbrainId(id)
if t == nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.PARAM_ERROR, ctx))
@@ -65,7 +41,7 @@ func DelAITask(ctx *context.Context) {
}
func StopAITask(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.PARAM_ERROR, ctx))
@@ -81,36 +57,7 @@ func StopAITask(ctx *context.Context) {
}
func RestartAITask(ctx *context.Context) {
id := ctx.QueryInt64("id")
cloudbrain, err := models.GetCloudbrainByCloudbrainID(id)
if err != nil {
log.Error("RestartAITask GetCloudbrainByJobID err.%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.AI_TASK_NOT_EXISTS, ctx))
return
}
t, bizErr := task.GetAITaskFromCloudbrain(cloudbrain)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.PARAM_ERROR, ctx))
return
}

lockOperator, errMsg := cloudbrainService.Lock4CloudbrainRestart(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: cloudbrain.DisplayJobName, JobType: cloudbrain.JobType}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg)
ctx.JSON(http.StatusOK, response.OuterServerError(ctx.Tr(errMsg)))
return
}
res, bizErr := t.Restart(&creation_context.CreationContext{
GitRepo: ctx.Repo.GitRepo,
Repository: ctx.Repo.Repository,
User: ctx.User,
SourceCloudbrain: cloudbrain,
})
res, bizErr := task.RestartAITask(id, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
if bizErr != nil {
ctx.JSON(http.StatusOK, response.OuterTrBizError(bizErr, ctx))
return
@@ -121,7 +68,7 @@ func RestartAITask(ctx *context.Context) {

func GetAITaskLog(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
@@ -133,18 +80,63 @@ func GetAITaskLog(ctx *context.Context) {

func GetAITaskInfo(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
cloudbrain, bizErr := models.GetCloudbrainByCloudbrainID(id)
if bizErr != nil {
log.Error("GetAITaskInfo GetCloudbrainByCloudbrainID err.%v", bizErr)
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.AI_TASK_NOT_EXISTS, ctx))
return
}
t, err := task.GetAITaskTemplateFromCloudbrain(cloudbrain)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
resultTask, err := t.Query(id)
if err != nil {
log.Error("Query error.id=%d err=%v", id, err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
//国际化
resultTask.Tr(ctx.Language())
//根据权限去掉数据集和模型信息
var operatorId int64
if ctx.User != nil {
operatorId = ctx.User.ID
}
if operatorId == 0 || cloudbrain.UserID != operatorId {
resultTask.RemoveDatasets()
resultTask.RemovePretrainModelList()
}
//加载关联版本
earlyVersionList, bizErr := task.QueryTaskEarlyVersionList(id, operatorId)
if bizErr != nil {
log.Error("QueryTaskEarlyVersionList err.id=%d err=%v", id, err)
ctx.JSON(http.StatusOK, response.OuterResponseError(bizErr))
return
}
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(&entity.QueryAITaskRes{
Task: resultTask,
EarlyVersionList: earlyVersionList,
CanCreateVersion: cloudbrain.CanUserModify(ctx.User),
}))
}
func GetAITaskBriefInfo(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
res, err := t.Query(id)
res, err := t.BriefQuery(id)
if err != nil {
log.Error("Query error.%v", err)
log.Error("BriefQuery error.%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
res.Tr(ctx.Language())
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(res))
}

@@ -154,35 +146,44 @@ func GetAITaskOutput(ctx *context.Context) {

func GetNotebookUrl(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
url, err := t.GetDebugUrl(id)
fileName := ctx.QueryTrim("file")
url, err := t.GetDebugUrl(id, fileName)
if err != nil {
log.Error("GetNotebookUrl error.%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}

m := map[string]interface{}{"url": url}
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(m))
}

func GetCreationRequiredInfo(ctx *context.Context) {
jobType := ctx.Query("job_type")
var isOnlineType bool
if models.JobType(jobType) == (models.JobTypeOnlineInference) {
isOnlineType = true
jobType = string(models.JobTypeDebug)
}
log.Info("required jobType=" + jobType)
computeSourceName := ctx.Query("compute_source")
clusterType := ctx.Query("cluster_type")
computeSource := models.GetComputeSourceInstance(computeSourceName)

result, err := task.GetAITaskCreationInfo(ai_task_entity.GetAITaskCreationInfoReq{
result, err := task.GetAITaskCreationInfo(entity.GetAITaskCreationInfoReq{
User: ctx.User,
JobType: models.JobType(jobType),
ClusterType: ai_task_entity.ClusterType(clusterType),
ClusterType: entity.ClusterType(clusterType),
ComputeSource: computeSource,
Repo: ctx.Repo.Repository,
GitRepo: ctx.Repo.GitRepo,
IsOnlineType: isOnlineType,
})
if err != nil {
log.Error("GetAITaskCreationInfo error,err=%v", err)
@@ -192,7 +193,56 @@ func GetCreationRequiredInfo(ctx *context.Context) {
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(result))
}

func handCreateReq(req *ai_task_entity.CreateReq) {
// GetAITaskList returns one page of AI tasks for the current repository,
// optionally filtered by job type and compute source.
// Query parameters: job_type, compute_source, page (defaults to 1).
// Page size comes from the UI issue-paging setting.
func GetAITaskList(ctx *context.Context) {
	jobType := ctx.Query("job_type")
	computeSourceName := ctx.Query("compute_source")
	page := ctx.QueryInt("page")
	computeSource := models.GetComputeSourceInstance(computeSourceName)
	if page <= 0 {
		page = 1
	}
	// An empty job_type means "all job types" (empty filter slice).
	jobTypes := make([]string, 0)
	if jobType != "" {
		jobTypes = append(jobTypes, jobType)
	}
	result, err := task.GetAITaskList(entity.GetTaskListReq{
		ListOptions: models.ListOptions{
			PageSize: setting.UI.IssuePagingNum,
			Page:     page,
		},
		ComputeSource: computeSource,
		JobTypes:      jobTypes,
		RepoID:        ctx.Repo.Repository.ID,
		Operator:      ctx.User,
		IsRepoOwner:   ctx.Repo.IsOwner(),
	})
	if err != nil {
		log.Error("GetAITaskList error,err=%v", err)
		ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
		return
	}
	// Whether the "create task" entry should be enabled for this viewer.
	result.CanCreateTask = cloudbrain.CanCreateOrDebugJob(ctx)
	ctx.JSON(http.StatusOK, response.OuterSuccessWithData(result))
}

// GetAITaskOperationProfile returns the operation profile (lifecycle
// events) of the AI task identified by the "id" query parameter.
func GetAITaskOperationProfile(ctx *context.Context) {
	taskID := ctx.QueryInt64("id")

	template, err := task.GetAITaskTemplateByCloudbrainId(taskID)
	if err != nil {
		log.Error("param error")
		ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
		return
	}

	profile, err := template.GetOperationProfile(taskID)
	if err != nil {
		log.Error("GetOperationProfile error.%v", err)
		ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
		return
	}

	ctx.JSON(http.StatusOK, response.OuterSuccessWithData(profile))
}

func handCreateReq(req *entity.CreateReq) {
req.JobName = util.ConvertDisplayJobNameToJobName(req.DisplayJobName)
if req.WorkServerNumber == 0 {
req.WorkServerNumber = 1


+ 1
- 0
routers/ai_task/notebook.go View File

@@ -0,0 +1 @@
package ai_task

+ 75
- 13
routers/api/v1/api.go View File

@@ -59,10 +59,15 @@
package v1

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/routers/response"
"net/http"
"strings"

"code.gitea.io/gitea/entity/ai_task_entity"


cloudbrainService "code.gitea.io/gitea/services/cloudbrain"

"code.gitea.io/gitea/routers/ai_task"

"code.gitea.io/gitea/routers/api/v1/finetune"
@@ -123,6 +128,29 @@ func sudo() macaron.Handler {
}
}

// reqAITaskInRepo is a middleware that rejects the request unless the
// cloudbrain task identified by the "id" query parameter exists and
// belongs to the repository resolved on the context. All failure modes
// (missing repo context, bad id, lookup error, cross-repo id) answer
// the same 401, so callers cannot probe for task existence.
func reqAITaskInRepo() macaron.Handler {
	return func(ctx *context.APIContext) {
		// Repo must have been populated by an earlier repoAssignment handler.
		if ctx.Repo == nil {
			ctx.Context.Error(http.StatusUnauthorized)
			return
		}
		id := ctx.QueryInt64("id")
		if id <= 0 {
			ctx.Context.Error(http.StatusUnauthorized)
			return
		}
		t, err := models.GetCloudbrainByCloudbrainID(id)
		if err != nil {
			// NOTE(review): a lookup failure may be "not found" rather than an
			// auth problem; 401 is kept as-is here to preserve behavior.
			ctx.Context.Error(http.StatusUnauthorized)
			return
		}
		// The task must live in the repository addressed by the route.
		if t.RepoID != ctx.Repo.Repository.ID {
			ctx.Context.Error(http.StatusUnauthorized)
			return
		}
	}
}

func repoAssignment() macaron.Handler {
return func(ctx *context.APIContext) {
userName := ctx.Params(":username")
@@ -341,6 +369,15 @@ func reqWeChat() macaron.Handler {
}
}

// reqWeChatStandard rejects the request when WeChat binding is enforced
// (setting.WechatAuthSwitch) and the current user has not bound a WeChat
// account. Unlike reqWeChat it answers HTTP 200 with a structured outer
// business error (WECHAT_NOT_BIND) instead of an HTTP error status.
// NOTE(review): assumes ctx.User is non-nil, i.e. an auth middleware such
// as reqToken ran earlier in the chain — confirm at each mount point.
func reqWeChatStandard() macaron.Handler {
	return func(ctx *context.Context) {
		if setting.WechatAuthSwitch && ctx.User.WechatOpenId == "" {
			ctx.JSON(http.StatusOK, response.OuterTrBizError(response.WECHAT_NOT_BIND, ctx))
			return
		}
	}
}

// reqAnyRepoReader user should have any permission to read repository or permissions of site admin
func reqAnyRepoReader() macaron.Handler {
return func(ctx *context.Context) {
@@ -610,17 +647,22 @@ func RegisterRoutes(m *macaron.Macaron) {

m.Group("/:username/:reponame", func() {
m.Group("/ai_task", func() {
m.Post("/create", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), bind(ai_task_entity.CreateReq{}), ai_task.CreateAITask)
m.Get("", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetAITaskInfo)
m.Post("/stop", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), reqAdminOrOwnerAITaskCreator(), ai_task.StopAITask)
m.Post("/del", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), reqAdminOrOwnerAITaskCreator(), ai_task.DelAITask)
m.Post("/restart", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), reqAdminOrAITaskCreator(), ai_task.RestartAITask)
m.Get("/log", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetAITaskLog)
m.Get("/output", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetAITaskOutput)
m.Get("/debug_url", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetNotebookUrl)
m.Get("/creation/required", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetCreationRequiredInfo)
}, context.RepoRef())
}, reqToken(), repoAssignment())
m.Post("/create", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), bind(entity.CreateReq{}), ai_task.CreateAITask)
m.Post("/stop", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), reqAdminOrOwnerAITaskCreator(), ai_task.StopAITask)
m.Post("/del", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), reqAdminOrOwnerAITaskCreator(), ai_task.DelAITask)
m.Post("/restart", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), reqAdminOrAITaskCreator(), ai_task.RestartAITask)
m.Get("/log", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskLog)
m.Get("/output", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskOutput)
m.Get("/debug_url", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetNotebookUrl)
m.Get("/creation/required", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetCreationRequiredInfo)
}, reqToken(), context.RepoRef())
m.Group("/ai_task", func() {
m.Get("", reqRepoReader(models.UnitTypeCloudBrain), ai_task.GetAITaskInfo)
m.Get("/brief", reqRepoReader(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskBriefInfo)
m.Get("/list", reqRepoReader(models.UnitTypeCloudBrain), ai_task.GetAITaskList)
m.Get("/operation_profile", reqRepoReader(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskOperationProfile)
})
}, repoAssignment())
// Miscellaneous
if setting.API.EnableSwagger {
m.Get("/swagger", misc.Swagger)
@@ -670,6 +712,12 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/complete_multipart", repo.CompleteMultipart)

}, reqToken())
m.Group("/attachments/model", func() {
m.Get("/get_chunks", repo.GetModelChunks)
m.Get("/new_multipart", repo.NewModelMultipart)
m.Get("/get_multipart_url", repo.GetModelMultipartUploadUrl)
m.Post("/complete_multipart", repo.CompleteModelMultipart)
})
m.Group("/pipeline", func() {
m.Post("/notification", bind(api.PipelineNotification{}), notify.PipelineNotify)

@@ -748,6 +796,9 @@ func RegisterRoutes(m *macaron.Macaron) {

//cloudbrain board
m.Get("/cloudbrainboard/cloudbrain/resource_queues", repo.GetResourceQueues)
m.Get("/cloudbrainboard/ai_center_overview", repo.GetCloubrainOverviewGroupByAiCenter)
m.Get("/cloudbrainboard/location", cloudbrainService.GetCloudbrainLocationInfo)

m.Group("/cloudbrainboard", func() {
m.Get("/downloadAll", repo.DownloadCloudBrainBoard)
m.Group("/cloudbrain", func() {
@@ -870,11 +921,16 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/my_datasets", repo.MyDatasetsMultiple)
m.Get("/public_datasets", repo.PublicDatasetMultiple)
m.Get("/my_favorite", repo.MyFavoriteDatasetMultiple)
m.Group("/model", func() {
m.Get("/getmodelfile", repo.GetDataSetSelectItemByJobId)
m.Get("/getprogress", repo.GetExportDataSetByMsgId)
m.Post("/export_exist_dataset", repo.ExportModelToExistDataSet)
})
}, reqToken(), repoAssignment())

m.Group("/file_notebook", func() {
m.Get("", repo.GetFileNoteBookInfo)
m.Post("/create", reqToken(), reqWeChat(), bind(api.CreateFileNotebookJobOption{}), repo.CreateFileNoteBook)
m.Post("/create", reqToken(), reqWeChatStandard(), bind(api.CreateFileNotebookJobOption{}), repo.CreateFileNoteBook)
m.Post("/status", reqToken(), bind(api.CreateFileNotebookJobOption{}), repo.FileNoteBookStatus)
})

@@ -1179,6 +1235,8 @@ func RegisterRoutes(m *macaron.Macaron) {
}, reqRepoReader(models.UnitTypeCloudBrain))
m.Group("/modelmanage", func() {
m.Post("/create_new_model", repo.CreateNewModel)
m.Post("/create_local_model", repo.SaveLocalModel)
m.Delete("/delete_model_file", repo.DeleteModelFile)
m.Get("/show_model_api", repo.ShowModelManageApi)
m.Delete("/delete_model", repo.DeleteModel)
m.Get("/downloadall", repo.DownloadModel)
@@ -1225,6 +1283,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/del_version", repo.DelTrainJobVersion)
m.Post("/stop_version", repo.StopTrainJobVersion)
m.Get("/result_list", repo.ResultList)
m.Get("/downloadall", repo.DownloadMultiResultFile)
})
})
}, reqRepoReader(models.UnitTypeCloudBrain))
@@ -1239,8 +1298,11 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob)
m.Get("/log", repo_ext.GrampusGetLog)
m.Get("/metrics", repo_ext.GrampusMetrics)
m.Get("/metrics/:nodeId", repo_ext.GrampusMetrics)
m.Get("/log/:nodeId", repo_ext.GrampusGetLog)
m.Get("/download_multi_model", cloudbrain.AdminOrJobCreaterRightForTrain, repo.MultiModelDownload)
m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
m.Get("/download_log/:nodeId", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
m.Get("/job_event", repo_ext.GrampusTrainJobEvents)
})
})


+ 18
- 1
routers/api/v1/finetune/panguervice.go View File

@@ -199,9 +199,26 @@ func SyncPanguDeployStatus() {

// GetPanguDeployStatus reports the deployment status and finish time of a
// Pangu fine-tune job identified by the ":jobid" route parameter.
// Errors from either lookup are returned to the caller as a business-error
// payload with HTTP 200; on success the response carries both values.
func GetPanguDeployStatus(ctx *context.APIContext) {
	var jobID = ctx.Params(":jobid")

	status, err := models.GetModelartsDeployStatusByJobID(jobID)
	if err != nil {
		// Fix: the format string previously had two verbs for three
		// arguments ("jobID %s, err %v", jobID, status, err), which `go vet`
		// flags and which garbles the log output.
		log.Info("panguService: GetPanguDeployStatus, jobID %s, status %s, err %v", jobID, status, err)
		ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
		return
	}

	finishTime, err := models.GetModelartsDeployFinishTimebyJobID(jobID)
	if err != nil {
		// Fix: same verb/argument mismatch as above.
		log.Info("panguService: GetModelartsDeployFinishTimebyJobID, jobID %s, status %s, err %v", jobID, status, err)
		ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
		return
	}

	log.Info("panguService: GetPanguDeployStatus, jobID %s, status %s, finishTime %s", jobID, status, finishTime)

	ctx.JSON(http.StatusOK, map[string]interface{}{
		"fineTuneDeployStatus":     status,
		"fineTuneDeployFinishTime": finishTime,
	})
}



+ 53
- 2
routers/api/v1/repo/attachments.go View File

@@ -2,6 +2,7 @@ package repo

import (
"net/http"
"sync"

"code.gitea.io/gitea/modules/log"

@@ -10,6 +11,8 @@ import (
routeRepo "code.gitea.io/gitea/routers/repo"
)

var mutex *sync.Mutex = new(sync.Mutex)

func GetSuccessChunks(ctx *context.APIContext) {
if errStr := checkDatasetPermission(ctx); errStr != "" {
ctx.JSON(http.StatusForbidden, ctx.Tr(errStr))
@@ -47,9 +50,34 @@ func checkDatasetPermission(ctx *context.APIContext) string {

func NewMultipart(ctx *context.APIContext) {
if errStr := checkDatasetPermission(ctx); errStr != "" {
ctx.JSON(http.StatusForbidden, ctx.Tr(errStr))
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": ctx.Tr(errStr),
})
return
}
if err := routeRepo.CheckFlowForDatasetSDK(); err != nil {
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": err.Error(),
})
return
}
mutex.Lock()
defer mutex.Unlock()
datasetId := ctx.QueryInt64("dataset_id")
fileName := ctx.Query("file_name")
re, err := routeRepo.NewMultipartForApi(ctx.Context, true)
if err != nil {
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": err.Error(),
})
} else {
routeRepo.AddFileNameToCache(datasetId, fileName, ctx.User.ID)
re["result_code"] = "0"
ctx.JSON(200, re)
}
routeRepo.NewMultipart(ctx.Context)
}
func GetMultipartUploadUrl(ctx *context.APIContext) {
if errStr := checkDatasetPermission(ctx); errStr != "" {
@@ -62,9 +90,32 @@ func CompleteMultipart(ctx *context.APIContext) {
if errStr := checkDatasetPermission(ctx); errStr != "" {
ctx.JSON(http.StatusForbidden, ctx.Tr(errStr))
}
datasetId := ctx.QueryInt64("dataset_id")
fileName := ctx.Query("file_name")
routeRepo.RemoveFileFromCache(datasetId, fileName, ctx.User.ID)
routeRepo.CompleteMultipart(ctx.Context)

}
// GetAttachment serves a single attachment; thin API wrapper delegating
// to the repo route handler.
func GetAttachment(ctx *context.APIContext) {
	routeRepo.GetAttachment(ctx.Context)
}

// GetModelChunks lists the already-uploaded chunks of a model file so a
// client can resume a chunked upload; delegates to the repo route handler.
func GetModelChunks(ctx *context.APIContext) {
	log.Info("GetModelChunks by api.")
	routeRepo.GetModelChunks(ctx.Context)
}

// NewModelMultipart starts a new chunked model-file upload; delegates to
// the repo route handler.
func NewModelMultipart(ctx *context.APIContext) {
	log.Info("NewModelMultipart by api.")
	routeRepo.NewModelMultipart(ctx.Context)
}

// GetModelMultipartUploadUrl returns the presigned URL for uploading one
// chunk of a model file; delegates to the repo route handler.
func GetModelMultipartUploadUrl(ctx *context.APIContext) {
	log.Info("GetModelMultipartUploadUrl by api.")
	routeRepo.GetModelMultipartUploadUrl(ctx.Context)
}

// CompleteModelMultipart finalizes a chunked model-file upload; delegates
// to the repo route handler.
func CompleteModelMultipart(ctx *context.APIContext) {
	log.Info("CompleteModelMultipart by api.")
	routeRepo.CompleteModelMultipart(ctx.Context)
}

+ 175
- 0
routers/api/v1/repo/cloudbrain.go View File

@@ -7,7 +7,12 @@ package repo

import (
"bufio"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/util"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/task"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
@@ -111,7 +116,123 @@ func GeneralCloudBrainJobStop(ctx *context.APIContext) {
}
// CreateFileNoteBook opens a repository file directly in a notebook.
// FileNotebookCreate runs validation/short-circuit handling first; if it
// already wrote a response the request is finished, otherwise the actual
// AI task is created.
func CreateFileNoteBook(ctx *context.APIContext, option api.CreateFileNotebookJobOption) {
	cloudbrainTask.FileNotebookCreate(ctx.Context, option)
	if ctx.Written() {
		return
	}
	CreateFileNotebookTask(ctx.Context, option)
}

// CreateFileNotebookTask creates the debug AI task backing a "file
// notebook" (opening a repository file in a notebook environment).
// Hardware selection by option.Type: 0 = CPU, up to GPUType = GPU, above
// GPUType = NPU; each choice picks spec, image, compute source and
// cluster from configuration. The task is created in the user's personal
// file-notebook repository, while the source file's location is recorded
// via FileRepository/FileBranchName.
func CreateFileNotebookTask(ctx *context.Context, option api.CreateFileNotebookJobOption) {
	displayJobName := cloudbrainService.GetDisplayJobName(ctx.User.Name)
	jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
	jobType := models.JobTypeDebug
	// Defaults assume a GPU notebook on cloudbrain one; overridden below.
	specId := setting.FileNoteBook.SpecIdGPU
	ComputeSource := models.GPU
	imageUrl := setting.FileNoteBook.ImageGPU
	imageId := ""
	imageName := imageUrl
	cluster := entity.OpenICloudbrainOne

	if option.Type == 0 {
		// CPU notebook: same image, CPU spec.
		specId = setting.FileNoteBook.SpecIdCPU
		imageName = imageUrl
	}
	if option.Type > cloudbrainTask.GPUType {
		// NPU notebook: images are addressed by id, not by URL.
		imageId = setting.FileNoteBook.ImageIdNPU
		imageName = setting.FileNoteBook.ImageNPUDescription
		imageUrl = ""
		imageNpu, err := getNpuImageId(option)
		if err != nil {
			ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.parameter_is_wrong")))
			return
		}
		if imageNpu != nil {
			// Caller named a specific NPU image; it wins over the default.
			imageId = imageNpu.Id
			imageName = imageNpu.Value
		}
		ComputeSource = models.NPU
		specId = setting.FileNoteBook.SpecIdNPU
		if setting.ModelartsCD.Enabled {
			// ModelArts-CD deployments use their own spec and image.
			specId = setting.FileNoteBook.SpecIdNPUCD
			imageName = setting.FileNoteBook.ImageNPUCDDescription
		}

		cluster = entity.OpenICloudbrainTwo
	}

	// Repository that owns the file being opened.
	sourceRepo, err := models.GetRepositoryByOwnerAndName(option.OwnerName, option.ProjectName)
	if err != nil {
		ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.notebook_file_not_exist")))
		return
	}
	// The task itself lives in the user's dedicated file-notebook repo.
	repo, _ := models.GetRepositoryByName(ctx.User.ID, setting.FileNoteBook.ProjectName)
	if repo == nil {
		log.Error("default file repository not exists")
		ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("system error"))
		return
	}

	res, bizErr := task.CreateAITask(entity.CreateReq{
		JobType:               jobType,
		DisplayJobName:        displayJobName,
		JobName:               jobName,
		SpecId:                specId,
		ComputeSourceStr:      ComputeSource,
		Cluster:               cluster,
		WorkServerNumber:      1,
		ImageUrl:              imageUrl,
		ImageName:             imageName,
		ImageID:               imageId,
		BootFile:              cloudbrainTask.GetBootFile(option.File, option.OwnerName, option.ProjectName, option.BranchName),
		FileRepository:        sourceRepo,
		FileBranchName:        option.BranchName,
		IsFileNoteBookRequest: true,
		Description:           getDescription(option),
	}, nil, repo, ctx.User)

	code := 0

	if bizErr != nil {
		// Map business errors onto the legacy numeric codes the front end
		// expects: 2 = duplicate/concurrent task, 1 = any other failure.
		switch bizErr.Code {
		case response.MULTI_TASK.Code:
			code = 2
		default:
			code = 1
		}
		ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: code, Message: ctx.Tr(bizErr.TrCode)})
		return
	}
	// On success Message carries the new task id.
	ctx.JSON(http.StatusOK, models.BaseMessageApi{
		Code:    code,
		Message: fmt.Sprint(res.ID),
	})
}

// CharacterLength caps the auto-generated task description length.
const CharacterLength = 2550

// getDescription derives a human-readable description from the notebook's
// source location (owner/project/branch/file), or returns an empty string
// when that description would exceed CharacterLength.
func getDescription(option api.CreateFileNotebookJobOption) string {
	description := option.OwnerName + "/" + option.ProjectName + "/" + option.BranchName + "/" + option.File
	if len(description) > CharacterLength {
		return ""
	}
	return description
}

// getNpuImageId resolves the NPU image selected by the request against the
// configured ModelArts image list.
// Returns (nil, nil) when no image was requested (caller falls back to the
// configured default); returns an error when the request type is not NPU or
// the requested image is not in the configured list.
func getNpuImageId(option api.CreateFileNotebookJobOption) (*setting.ImageInfoModelArts, error) {
	if option.Type != cloudbrainTask.NPUType {
		return nil, fmt.Errorf("type is not npu.")
	}
	if option.Image == "" {
		// No explicit image requested; not an error.
		return nil, nil
	}
	// Match by the image's display value, as submitted by the client.
	for _, imageInfo := range setting.StImageInfos.ImageInfo {
		if imageInfo.Value == option.Image {
			return imageInfo, nil
		}
	}
	return nil, fmt.Errorf("invalid image parameter")
}

// FileNoteBookStatus reports the status of a file-notebook task.
// The work is delegated to cloudbrainTask.FileNotebookStatus.
func FileNoteBookStatus(ctx *context.APIContext, option api.CreateFileNotebookJobOption) {
	cloudbrainTask.FileNotebookStatus(ctx.Context, option)
}
@@ -224,6 +345,36 @@ func GrampusNoteBookDebug(ctx *context.APIContext) {

}
// GrampusNotebookRestart restarts a notebook task. Tasks created through the
// new AI-task pipeline are restarted via the task service and answered here
// as JSON; legacy tasks are delegated to cloudbrainTask.GrampusNotebookRestart.
//
// Response fields: result_code ("0" on success, "-1" on failure), error_msg,
// status (new task status), id (new task id on success, route id otherwise).
func GrampusNotebookRestart(ctx *context.APIContext) {
	t := ctx.Cloudbrain
	if !t.IsNewAITask() {
		cloudbrainTask.GrampusNotebookRestart(ctx.Context)
		return
	}

	id := ctx.Params(":id")
	resultCode := "-1"
	errorMsg := ""
	status := ""

	res, bizErr := task.RestartAITask(t.ID, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
	if bizErr != nil {
		// Fixed typo: was "lRestartAITask failed".
		log.Error("RestartAITask failed:task.ID=%d err=%v", t.ID, bizErr.DefaultMsg)
		errorMsg = ctx.Tr(bizErr.TrCode)
	} else {
		id = strconv.FormatInt(res.ID, 10)
		status = res.Status
		resultCode = "0"
	}
	// Single response path for both outcomes (was duplicated inline).
	ctx.JSON(200, map[string]string{
		"result_code": resultCode,
		"error_msg":   errorMsg,
		"status":      status,
		"id":          id,
	})
}

@@ -233,6 +384,15 @@ func GrampusStopJob(ctx *context.APIContext) {
}

func GrampusNotebookDel(ctx *context.APIContext) {
if isHandled, err := task.HandleNewAITaskDelete(ctx.Cloudbrain.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", ctx.Cloudbrain.JobName, err, ctx.Data["msgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}
ctx.JSON(http.StatusOK, models.BaseOKMessageApi)
return
}
err := cloudbrainTask.DeleteGrampusJob(ctx.Context)
if err != nil {
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
@@ -293,6 +453,21 @@ func GetCloudbrainTask(ctx *context.APIContext) {
ctx.NotFound(err)
return
}

if job.IsNewAITask() {
jobAfter, _ := task.UpdateCloudbrain(job)
ctx.JSON(http.StatusOK, map[string]interface{}{
"ID": ID,
"JobName": jobAfter.JobName,
"JobStatus": jobAfter.Status,
"SubState": "",
"CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
"CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
"JobDuration": jobAfter.TrainJobDuration,
})
return
}

if job.JobType == string(models.JobTypeModelSafety) {
routerRepo.GetAiSafetyTaskByJob(job)
job, err = models.GetCloudbrainByID(ID)


+ 109
- 0
routers/api/v1/repo/cloudbrain_dashboard.go View File

@@ -4,10 +4,13 @@ import (
"fmt"
"net/http"
"net/url"
"sort"
"strconv"
"strings"
"time"

"code.gitea.io/gitea/modules/setting"

"code.gitea.io/gitea/services/cloudbrain/resource"

"code.gitea.io/gitea/models"
@@ -163,6 +166,112 @@ func GetOverviewDuration(ctx *context.Context) {
})
}

// GetCloubrainOverviewGroupByAiCenter renders the per-AI-center card-time /
// job-count overview plus map location markers for every configured center.
// Detailed numbers ("cardAndJobCount") are only exposed when the screen-map
// config allows it or the requesting user is a site admin; location markers
// are always returned.
func GetCloubrainOverviewGroupByAiCenter(ctx *context.Context) {
	cloudbrainCardTimeAndCountArray, err := models.GetCloudbrainCardTimeAndCountGroupByAICenter()
	if err != nil {
		// Best effort: log with a format verb (was missing, a vet printf
		// issue) and keep rendering with an empty result set.
		log.Error("Can not query CardTimeAndCount. %v", err)
	}

	cardTimeMap, maxCardTime, _ := getCenterCardTimeInfo(cloudbrainCardTimeAndCountArray)

	aiCenterLocationInfos := make(map[string][]*cloudbrainService.AiCenterLocationInfo)

	const AI_CENTER = "智算中心"
	for _, value := range setting.AiCenterCodeAndNameAndLocMapInfo {
		long, lat := getLongLat(value.Loc)
		aicenterArray, ok := aiCenterLocationInfos[value.Type]
		if !ok {
			aicenterArray = make([]*cloudbrainService.AiCenterLocationInfo, 0)
		}
		if value.Type == "超算中心" || value.Type == "东数西算" {
			// Non-AI centers are drawn with the minimum marker size.
			aiCenterLocationInfos[value.Type] = append(aicenterArray, &cloudbrainService.AiCenterLocationInfo{
				Name:      cloudbrainService.GetAiCenterShowByAiCenterId(value.Name, ctx),
				Longitude: long,
				Latitude:  lat,
				Value:     setting.ScreenMap.MinValue,
			})
		} else if value.Type == AI_CENTER {
			// AI centers are scaled by their accumulated card time.
			aiCenterLocationInfos[value.Type] = append(aicenterArray, &cloudbrainService.AiCenterLocationInfo{
				Name:      cloudbrainService.GetAiCenterShowByAiCenterId(value.Name, ctx),
				Longitude: long,
				Latitude:  lat,
				Value:     getAiCenterSize(value.Name, cardTimeMap, maxCardTime, 0),
			})
		}
	}
	// Largest markers first so the front-end draws prominent centers on top.
	sort.SliceStable(aiCenterLocationInfos[AI_CENTER], func(i, j int) bool {
		return aiCenterLocationInfos[AI_CENTER][i].Value > aiCenterLocationInfos[AI_CENTER][j].Value
	})

	if setting.ScreenMap.ShowData || ctx.IsUserSiteAdmin() {
		// Replace internal center ids with display names before output.
		for _, cloudbrainCardTimeAndCountMap := range cloudbrainCardTimeAndCountArray {
			centerId := cloudbrainCardTimeAndCountMap["ai_center"]
			cloudbrainCardTimeAndCountMap["ai_center"] = cloudbrainService.GetAiCenterShowByAiCenterId(centerId, ctx)
		}
		ctx.JSON(http.StatusOK, map[string]interface{}{
			"cardAndJobCount": cloudbrainCardTimeAndCountArray,
			"locationInfo":    aiCenterLocationInfos,
		})
		return
	}

	ctx.JSON(http.StatusOK, map[string]interface{}{
		"cardAndJobCount": []map[string]string{},
		"locationInfo":    aiCenterLocationInfos,
	})
}

// getAiCenterSize maps a center's accumulated card time onto the configured
// [MinValue, MaxValue] marker-size range via linear interpolation.
// Centers with no recorded card time get the minimum size; a degenerate
// range (MaxCardTime == MinCardTime) yields the maximum size.
func getAiCenterSize(name string, timeMap map[string]int64, MaxCardTime int64, MinCardTime int64) int {
	cardTime := timeMap[name] // missing key yields 0, same as "no card time"
	if cardTime == 0 {
		return setting.ScreenMap.MinValue
	}
	if MaxCardTime == MinCardTime {
		// Avoid division by zero below.
		return setting.ScreenMap.MaxValue
	}
	ratio := float64(cardTime-MinCardTime) / float64(MaxCardTime-MinCardTime)
	return int(ratio*float64(setting.ScreenMap.MaxValue-setting.ScreenMap.MinValue)) + setting.ScreenMap.MinValue
}

// getLongLat splits a "longitude,latitude" location string into its two
// components. Anything that is not exactly two comma-separated fields
// yields two empty strings.
func getLongLat(loc string) (string, string) {
	parts := strings.Split(loc, ",")
	if len(parts) == 2 {
		return parts[0], parts[1]
	}
	return "", ""
}

// getCenterCardTimeInfo extracts per-center card durations from the query
// rows. It also returns the duration of the first row and of the last row;
// the rows appear to be sorted by duration descending so these act as max
// and min — TODO confirm against the query's ORDER BY.
func getCenterCardTimeInfo(cloudbrainCardTimeAndCountArray []map[string]string) (map[string]int64, int64, int64) {
	centerCardTimeMap := make(map[string]int64, len(cloudbrainCardTimeAndCountArray))
	var maxCardTime, minCardTime int64
	for i, row := range cloudbrainCardTimeAndCountArray {
		// A malformed duration parses to 0; the error is deliberately ignored.
		cardTime, _ := strconv.ParseInt(row["card_duration"], 10, 64)
		if i == 0 {
			maxCardTime = cardTime
		}
		if i == len(cloudbrainCardTimeAndCountArray)-1 {
			minCardTime = cardTime
		}
		centerCardTimeMap[row["ai_center"]] = cardTime
	}
	return centerCardTimeMap, maxCardTime, minCardTime
}

func GetCloudbrainCardDuration(task models.Cloudbrain) string {
cardNum := int(0)
spec, err := resource.GetCloudbrainSpec(task.ID)


+ 13
- 0
routers/api/v1/repo/datasets.go View File

@@ -12,6 +12,7 @@ import (
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
routerRepo "code.gitea.io/gitea/routers/repo"
)

func PublicDatasetMultiple(ctx *context.APIContext) {
@@ -143,3 +144,15 @@ func getSearchOrderByInValues(datasetIds []int64) models.SearchOrderBy {
searchOrderBy += " ELSE 0 END"
return models.SearchOrderBy(searchOrderBy)
}

// GetDataSetSelectItemByJobId lists the dataset select items for a job id.
// The work is delegated to the repo router implementation.
func GetDataSetSelectItemByJobId(ctx *context.APIContext) {
	routerRepo.GetDataSetSelectItemByJobId(ctx.Context)
}

// GetExportDataSetByMsgId fetches an exported dataset by message id.
// The work is delegated to the repo router implementation.
func GetExportDataSetByMsgId(ctx *context.APIContext) {
	routerRepo.GetExportDataSetByMsgId(ctx.Context)
}

// ExportModelToExistDataSet exports a model into an existing dataset.
// The work is delegated to the repo router implementation.
func ExportModelToExistDataSet(ctx *context.APIContext) {
	routerRepo.ExportModelToExistDataSet(ctx.Context)
}

+ 11
- 0
routers/api/v1/repo/modelarts.go View File

@@ -15,6 +15,7 @@ import (
"time"

"code.gitea.io/gitea/services/ai_task_service/schedule"
"code.gitea.io/gitea/services/ai_task_service/task"

"code.gitea.io/gitea/routers/response"

@@ -51,12 +52,17 @@ func GetModelArtsNotebook2(ctx *context.APIContext) {
return
}
if !job.Cleared {
if job.IsNewAITask() {
job, _ = task.UpdateCloudbrain(job)
} else {
err = modelarts.HandleNotebookInfo(job)
if err != nil {
ctx.NotFound(err)
return
}
}

}
ctx.JSON(http.StatusOK, map[string]interface{}{
"ID": ID,
"JobName": job.JobName,
@@ -652,3 +658,8 @@ func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTr

return result, err
}

// DownloadMultiResultFile downloads multiple result files through the API.
// The work is delegated to the repo router implementation.
func DownloadMultiResultFile(ctx *context.APIContext) {
	log.Info("DownloadMultiResultFile by api")
	routerRepo.DownloadMultiResultFile(ctx.Context)
}

+ 10
- 0
routers/api/v1/repo/modelmanage.go View File

@@ -187,3 +187,13 @@ func DownloadModeConvertResultFile(ctx *context.APIContext) {
ctx.Context.SetParams("id", ctx.Query("id"))
routerRepo.ModelConvertDownloadModel(ctx.Context)
}

// SaveLocalModel saves a locally uploaded model through the API.
// The work is delegated to the repo router implementation.
func SaveLocalModel(ctx *context.APIContext) {
	log.Info("SaveLocalModel by api.")
	routerRepo.SaveLocalModel(ctx.Context)
}

// DeleteModelFile deletes a model file through the API.
// The work is delegated to the repo router implementation.
func DeleteModelFile(ctx *context.APIContext) {
	log.Info("DeleteModelFile by api.")
	routerRepo.DeleteModelFile(ctx.Context)
}

+ 7
- 0
routers/home.go View File

@@ -55,6 +55,8 @@ const (
tplRepoSquare base.TplName = "explore/repos/square"
tplRepoSearch base.TplName = "explore/repos/search"
tplRoshmci base.TplName = "explore/ros-hmci"

tplExploreCenterMap base.TplName = "explore/center_map"
)

// Home render home page
@@ -541,6 +543,11 @@ func ExploreDatasetsUI(ctx *context.Context) {
ctx.HTML(200, tplExploreDataset)
}

// CenterMapUI renders the AI-center map explore page.
func CenterMapUI(ctx *context.Context) {

	ctx.HTML(200, tplExploreCenterMap)
}

func getDatasetOrderBy(ctx *context.Context) models.SearchOrderBy {
var orderBy models.SearchOrderBy
switch ctx.Query("sort") {


+ 1
- 0
routers/private/internal.go View File

@@ -59,6 +59,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/repos/cnt_stat/handle_historical_task", admin.RefreshHistorySpec)
m.Post("/duration_statisctic/history_handle", repo.CloudbrainUpdateHistoryData)
m.Post("/square/repo/stat/refresh", repository.RefreshRepoStatData)
m.Get("/setting/refresh", RefreshSetting)

}, CheckInternalToken)
}

+ 17
- 0
routers/private/setting.go View File

@@ -0,0 +1,17 @@
package private

import (
"code.gitea.io/gitea/modules/setting"
"gitea.com/macaron/macaron"
)

// RefreshSetting reloads the ini configuration from disk and re-derives the
// screen-map, grampus and ModelArts (incl. CD) settings, then answers
// "success". Exposed only on the internal, token-protected API.
// NOTE(review): the derived-config calls presumably must run after Reload —
// keep the order.
func RefreshSetting(ctx *macaron.Context) {

	setting.Cfg.Reload()
	setting.NewScreenMapConfig()
	setting.GetGrampusConfig()
	setting.GetModelartsConfig()
	setting.GetModelartsCDConfig()
	ctx.PlainText(200, []byte("success"))

}

+ 65
- 33
routers/repo/attachment.go View File

@@ -667,23 +667,36 @@ func GetSuccessChunks(ctx *context.Context) {

}

func NewMultipart(ctx *context.Context) {
func NewMultipartForApi(ctx *context.Context, isFlowControl bool) (map[string]string, error) {
if !setting.Attachment.Enabled {
ctx.Error(404, "attachment is not enabled")
return
return nil, errors.New("attachment is not enabled")
}

err := upload.VerifyFileType(ctx.Query("fileType"), strings.Split(setting.Attachment.AllowedTypes, ","))
typeCloudBrain := ctx.QueryInt("type")
fileMD5 := ctx.Query("md5")
fileChunk, err := models.GetFileChunkByMD5AndUser(fileMD5, ctx.User.ID, typeCloudBrain)
if err == nil {
if fileChunk != nil {
log.Info("cannot reupload,name" + ctx.Query("file_name"))
return nil, errors.New("Cannot upload repeatedly,name is " + ctx.Query("file_name"))
}
}
if isFlowControl {
err = CheckFlowForDataset(ctx)
if err != nil {
ctx.Error(400, err.Error())
return
log.Info("check error," + err.Error())
return nil, err
}
}
err = upload.VerifyFileType(ctx.Query("fileType"), strings.Split(setting.Attachment.AllowedTypes, ","))
if err != nil {
log.Info("VerifyFileType error," + err.Error())
return nil, errors.New("Not support file type.")
}

typeCloudBrain := ctx.QueryInt("type")
err = checkTypeCloudBrain(typeCloudBrain)
if err != nil {
ctx.ServerError("checkTypeCloudBrain failed", err)
return
log.Info("checkTypeCloudBrain error," + err.Error())
return nil, err
}

fileName := ctx.Query("file_name")
@@ -691,14 +704,15 @@ func NewMultipart(ctx *context.Context) {
if setting.Attachment.StoreType == storage.MinioStorageType {
totalChunkCounts := ctx.QueryInt("totalChunkCounts")
if totalChunkCounts > minio_ext.MaxPartsCount {
ctx.Error(400, fmt.Sprintf("chunk counts(%d) is too much", totalChunkCounts))
return
log.Info(fmt.Sprintf("chunk counts(%d) is too much", totalChunkCounts))
return nil, errors.New(fmt.Sprintf("chunk counts(%d) is too much", totalChunkCounts))

}

fileSize := ctx.QueryInt64("size")
if fileSize > minio_ext.MaxMultipartPutObjectSize {
ctx.Error(400, fmt.Sprintf("file size(%d) is too big", fileSize))
return
log.Info(fmt.Sprintf("file size(%d) is too big", fileSize))
return nil, errors.New(fmt.Sprintf("file size(%d) is too big", fileSize))
}

uuid := gouuid.NewV4().String()
@@ -706,17 +720,16 @@ func NewMultipart(ctx *context.Context) {
if typeCloudBrain == models.TypeCloudBrainOne {
uploadID, err = storage.NewMultiPartUpload(strings.TrimPrefix(path.Join(setting.Attachment.Minio.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid)), "/"))
if err != nil {
ctx.ServerError("NewMultipart", err)
return
log.Info("NewMultipart " + err.Error())
return nil, err
}
} else {
uploadID, err = storage.NewObsMultiPartUpload(strings.TrimPrefix(path.Join(setting.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid, fileName)), "/"))
if err != nil {
ctx.ServerError("NewObsMultiPartUpload", err)
return
log.Info("NewObsMultiPartUpload " + err.Error())
return nil, err
}
}

_, err = models.InsertFileChunk(&models.FileChunk{
UUID: uuid,
UserID: ctx.User.ID,
@@ -728,18 +741,26 @@ func NewMultipart(ctx *context.Context) {
})

if err != nil {
ctx.Error(500, fmt.Sprintf("InsertFileChunk: %v", err))
return
log.Info(fmt.Sprintf("InsertFileChunk: %v", err))
return nil, err
}

ctx.JSON(200, map[string]string{
return map[string]string{
"uuid": uuid,
"uploadID": uploadID,
})
}, nil
} else {
ctx.Error(404, "storage type is not enabled")
return nil, errors.New("storage type is not enabled")
}

}

// NewMultipart starts a new multipart upload from the web UI. It delegates
// to NewMultipartForApi with flow control disabled and renders the returned
// uuid/uploadID pair as JSON, or a server error on failure.
func NewMultipart(ctx *context.Context) {
	re, err := NewMultipartForApi(ctx, false)
	if err != nil {
		ctx.ServerError("NewMultipart failed", err)
		return
	}
	ctx.JSON(200, re)
}

func PutOBSProxyUpload(ctx *context.Context) {
@@ -850,24 +871,31 @@ func CompleteMultipart(ctx *context.Context) {

fileChunk, err := models.GetFileChunkByUUID(uuid)
if err != nil {
if models.IsErrFileChunkNotExist(err) {
ctx.Error(404)
} else {
ctx.ServerError("GetFileChunkByUUID", err)
}
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": "The upload file not found.",
})
return
}

if typeCloudBrain == models.TypeCloudBrainOne {
_, err = storage.CompleteMultiPartUpload(strings.TrimPrefix(path.Join(setting.Attachment.Minio.BasePath, path.Join(fileChunk.UUID[0:1], fileChunk.UUID[1:2], fileChunk.UUID)), "/"), uploadID, fileChunk.TotalChunks)
if err != nil {
ctx.Error(500, fmt.Sprintf("CompleteMultiPartUpload failed: %v", err))
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": fmt.Sprintf("CompleteMultiPartUpload failed: %v", err),
})
//ctx.Error(500, fmt.Sprintf("CompleteMultiPartUpload failed: %v", err))
return
}
} else {
err = storage.CompleteObsMultiPartUpload(strings.TrimPrefix(path.Join(setting.BasePath, path.Join(fileChunk.UUID[0:1], fileChunk.UUID[1:2], fileChunk.UUID, fileName)), "/"), uploadID, fileChunk.TotalChunks)
if err != nil {
ctx.Error(500, fmt.Sprintf("CompleteObsMultiPartUpload failed: %v", err))
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": fmt.Sprintf("CompleteObsMultiPartUpload failed: %v", err),
})
//ctx.Error(500, fmt.Sprintf("CompleteObsMultiPartUpload failed: %v", err))
return
}
}
@@ -876,7 +904,11 @@ func CompleteMultipart(ctx *context.Context) {

err = models.UpdateFileChunk(fileChunk)
if err != nil {
ctx.Error(500, fmt.Sprintf("UpdateFileChunk: %v", err))
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": fmt.Sprintf("UpdateFileChunk: %v", err),
})
//ctx.Error(500, fmt.Sprintf("UpdateFileChunk: %v", err))
return
}



+ 3
- 3
routers/repo/attachment_model.go View File

@@ -20,9 +20,9 @@ func GetModelChunks(ctx *context.Context) {
fileMD5 := ctx.Query("md5")
typeCloudBrain := ctx.QueryInt("type")
fileName := ctx.Query("file_name")
scene := ctx.Query("scene")
//scene := ctx.Query("scene")
modeluuid := ctx.Query("modeluuid")
log.Info("scene=" + scene + " typeCloudBrain=" + fmt.Sprint(typeCloudBrain))
log.Info(" typeCloudBrain=" + fmt.Sprint(typeCloudBrain))
var chunks string

err := checkTypeCloudBrain(typeCloudBrain)
@@ -131,7 +131,7 @@ func GetModelChunks(ctx *context.Context) {
"attachID": "0",
"datasetID": "0",
"fileName": "",
"datasetName": "",
"modelName": "",
})
}
}


+ 74
- 15
routers/repo/cloudbrain.go View File

@@ -200,14 +200,16 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
}

func CloudBrainNew(ctx *context.Context) {
err := cloudBrainNewDataPrepare(ctx, string(models.JobTypeDebug))
if err != nil {
ctx.ServerError("get new cloudbrain info failed", err)
return
}
ctx.Data["PageIsGPUDebug"] = true
// err := cloudBrainNewDataPrepare(ctx, string(models.JobTypeDebug))
// if err != nil {
// ctx.ServerError("get new cloudbrain info failed", err)
// return
// }
// ctx.Data["PageIsGPUDebug"] = true
ctx.Data["PageIsCloudBrain"] = true
ctx.HTML(200, tplCloudBrainNew)
}

func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
ctx.Data["IsCreate"] = true
cloudBrainCreate(ctx, form)
@@ -703,6 +705,32 @@ func CloudBrainRestart(ctx *context.Context) {
var status = string(models.JobWaiting)
task := ctx.Cloudbrain

if task.IsNewAITask() {
res, bizErr := ai_task.RestartAITask(task.ID, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
if bizErr != nil {
log.Error("RestartAITask failed:task.ID=%d err=%v", task.ID, bizErr.DefaultMsg)
errorMsg = ctx.Tr(bizErr.TrCode)
resultCode = "-1"
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": ID,
})
return
}
id := strconv.FormatInt(res.ID, 10)
status = res.Status
resultCode = "0"
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": id,
})
return
}

lockOperator, errMsg := cloudbrainService.Lock4CloudbrainRestart(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{JobType: task.JobType}, User: ctx.User})
defer func() {
if lockOperator != nil {
@@ -838,7 +866,9 @@ func CloudBrainBenchMarkShow(ctx *context.Context) {
}

func CloudBrainShow(ctx *context.Context) {
cloudBrainShow(ctx, tplCloudBrainShow, models.JobTypeDebug)
// cloudBrainShow(ctx, tplCloudBrainShow, models.JobTypeDebug)
ctx.Data["PageIsCloudBrain"] = true
ctx.HTML(200, tplCloudBrainShow)
}

func CloudBrainTrainJobShow(ctx *context.Context) {
@@ -871,12 +901,16 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
return
}
if task.Status == string(models.JobWaiting) || task.Status == string(models.JobRunning) {
if task.IsNewAITask() {
task, _ = ai_task.UpdateCloudbrain(task)
} else {
task, err = cloudbrainTask.SyncCloudBrainOneStatus(task)
if err != nil {
log.Info("error:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
}

}

@@ -1307,6 +1341,16 @@ func CloudBrainStop(ctx *context.Context) {
resultCode = task.Status
break
}
if res, isHandled, err := ai_task.HandleNewAITaskStop(task.ID); isHandled {
if err != nil {
log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
resultCode = "-1"
errorMsg = "cloudbrain.Stopped_failed"
break
}
status = res.Status
break
}

err := cloudbrain.StopJob(task.JobID)
if err != nil {
@@ -1513,6 +1557,14 @@ func CloudBrainDel(ctx *context.Context) {
func deleteCloudbrainJob(ctx *context.Context) error {
task := ctx.Cloudbrain

if isHandled, err := ai_task.HandleNewAITaskDelete(task.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
return err
}
return nil
}

if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
return errors.New("the job has not been stopped")
@@ -1949,6 +2001,13 @@ func mkPathAndReadMeFile(path string, text string) error {
}

func SyncCloudbrainStatus() {
defer func() {
if err := recover(); err != nil {
combinedErr := fmt.Errorf("%s\n%s", err, log.Stack(2))
log.Error("PANIC:%v", combinedErr)
}
}()

cloudBrains, err := models.GetCloudBrainUnStoppedJob()
if err != nil {
log.Error("GetCloudBrainUnStoppedJob failed:", err.Error())
@@ -1956,12 +2015,18 @@ func SyncCloudbrainStatus() {
}

for _, task := range cloudBrains {

if task.JobType == string(models.JobTypeModelSafety) {
continue
}

if task.IsNewAITask() {
task, _ = ai_task.UpdateCloudbrain(task)
if task.Duration >= setting.MaxDuration && task.JobType == string(models.JobTypeDebug) {
ai_task.StopCloudbrain(task)
}
continue
}
if task.Type == models.TypeCloudBrainOne {

task, err = cloudbrainTask.SyncCloudBrainOneStatus(task)
if err != nil {
log.Error("Sync cloud brain one (%s) failed:%v", task.JobName, err)
@@ -1986,13 +2051,7 @@ func SyncCloudbrainStatus() {
}
} else if task.Type == models.TypeC2Net {
if task.JobType == string(models.JobTypeDebug) {
if task.IsNewAITask() {
ai_task.UpdateCloudbrain(task)
task, _ = models.GetCloudbrainByCloudbrainID(task.ID)
} else {
cloudbrainTask.SyncGrampusNotebookStatus(task)
}

} else {
result, err := grampus.GetJob(task.JobID)
if err != nil {


+ 1
- 0
routers/repo/cloudbrain_statistic.go View File

@@ -15,6 +15,7 @@ import (

func CloudbrainDurationStatisticHour() {
if setting.IsCloudbrainTimingEnabled {
log.Info("CloudbrainDurationStatisticHour start")
var statisticTime time.Time
var count int64
recordDurationUpdateTime, err := models.GetDurationRecordUpdateTime()


+ 5
- 2
routers/repo/dataset.go View File

@@ -173,7 +173,10 @@ func DatasetIndex(ctx *context.Context) {

//load attachment creator
for _, attachment := range pageAttachments {
uploader, _ := models.GetUserByID(attachment.UploaderID)
uploader, err1 := models.GetUserByID(attachment.UploaderID)
if err1 != nil {
log.Info("query dataset user error." + err1.Error())
}
attachment.Uploader = uploader
if !strings.HasSuffix(attachment.Name, ".zip") && !strings.HasSuffix(attachment.Name, ".tar.gz") {
attachment.DecompressState = -1 //非压缩文件
@@ -192,7 +195,7 @@ func DatasetIndex(ctx *context.Context) {
ctx.Data["Type"] = cloudbrainType

renderAttachmentSettings(ctx)
log.Info("dataset index finished.")
ctx.HTML(200, tplIndex)
}



+ 146
- 0
routers/repo/flow_control.go View File

@@ -0,0 +1,146 @@
package repo

import (
"encoding/json"
"errors"
"fmt"
"strconv"
"sync"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/redis/redis_client"
"code.gitea.io/gitea/modules/setting"
)

const (
	// REDIS_FLOW_ATTACHMENT_KEY is the redis key holding the map of
	// in-flight SDK dataset uploads used for flow control.
	REDIS_FLOW_ATTACHMENT_KEY = "flow_attachment_key"
)

// mutex serialises read-modify-write cycles on the redis-backed upload map.
var mutex *sync.RWMutex = new(sync.RWMutex)

// CheckFlowForDataset enforces per-user flow control for dataset uploads.
// It rejects the upload when, over the last 24 hours, the user exceeds the
// configured file count or total size, or when too many uploads started in
// the last 10 minutes. Returns nil when the upload may proceed.
func CheckFlowForDataset(ctx *context.Context) error {
	if ctx.User == nil {
		return errors.New("User not login.")
	}
	log.Info("start to check flow for upload dataset file.")
	fileName := ctx.Query("file_name")
	currentTimeNow := time.Now()
	currentLongTime := currentTimeNow.Unix()
	last24Hour := currentTimeNow.AddDate(0, 0, -1).Unix()
	filechunks, err := models.GetFileChunksByUserId(ctx.User.ID, last24Hour, true)
	// NOTE(review): a query error silently skips every check (best effort).
	if err == nil {
		// Check 1: file count in the last 24 hours.
		if len(filechunks) > setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR {
			log.Info("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR) + " files within the last 24 hours. so " + fileName + " is rejected. user id=" + fmt.Sprint(ctx.User.ID))
			return errors.New("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR) + " files within the last 24 hours.")
		}
		// totalSize starts from the size of the upload being checked.
		var totalSize int64
		totalSize += ctx.QueryInt64("size")
		concurrentUpload := 0
		for _, file := range filechunks {
			totalSize += file.Size
			// Uploads started within the last 10 minutes count as concurrent.
			if (currentLongTime - int64(file.CreatedUnix)) < 10*60 {
				log.Info("the file " + file.Md5 + " in 10min upload." + file.CreatedUnix.Format("2006-01-02 15:04:05"))
				concurrentUpload += 1
			} else {
				log.Info("the file " + file.Md5 + " not in 10min upload." + file.CreatedUnix.Format("2006-01-02 15:04:05"))
			}
		}
		log.Info("The concurrentUpload is " + fmt.Sprint(concurrentUpload) + " to checked " + fileName + ". user id=" + fmt.Sprint(ctx.User.ID))
		// Check 2: concurrent uploads in the last 10 minutes.
		if concurrentUpload >= setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M {
			log.Info("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M) + " files within the past 10 minutes. so " + fileName + " is rejected. user id=" + fmt.Sprint(ctx.User.ID))
			return errors.New("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M) + " files within the past 10 minutes.")
		}
		// Check 3: total size in the last 24 hours (limit configured in GiB).
		if totalSize >= setting.FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER*1024*1024*1024 {
			log.Info("The total file size uploaded by a single user within the past 24 hours cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER) + "G. so " + fileName + " is rejected. user id=" + fmt.Sprint(ctx.User.ID))
			return errors.New("The total file size uploaded by a single user within the past 24 hours cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER) + "G.")
		}
	}
	return nil
}

// AddFileNameToCache records an in-flight SDK upload (keyed by dataset id,
// file name and user id) in the shared redis map, first evicting entries
// older than 24 hours. Guarded by mutex because it is a read-modify-write
// cycle on the shared redis value.
func AddFileNameToCache(datasetId int64, fileName string, userId int64) {
	mutex.Lock()
	defer mutex.Unlock()
	cacheMap := getSDKUploadFileMap(REDIS_FLOW_ATTACHMENT_KEY)
	expireTimeKeys := make([]string, 0)
	currentTime := time.Now().Unix()
	for tmpKey, tmpValue := range cacheMap {
		// Values are upload start times (unix seconds). "ts" avoids
		// shadowing the time package (the original named this "time").
		ts, err := strconv.ParseInt(tmpValue, 10, 64)
		if err == nil && currentTime-ts > 24*3600 {
			expireTimeKeys = append(expireTimeKeys, tmpKey)
		}
	}
	for _, delKey := range expireTimeKeys {
		delete(cacheMap, delKey)
	}
	key := fmt.Sprint(datasetId) + "_" + fileName + "_" + fmt.Sprint(userId)
	// Reuse currentTime rather than calling time.Now() a second time.
	value := fmt.Sprint(currentTime)
	cacheMap[key] = value
	log.Info("set key=" + key + " value=" + value + " to cache.")
	setSDKUploadFileCache(REDIS_FLOW_ATTACHMENT_KEY, cacheMap)
}

// RemoveFileFromCache drops the record of a finished or aborted SDK upload
// from the shared redis map. Guarded by mutex because it is a
// read-modify-write cycle on the shared redis value.
func RemoveFileFromCache(datasetId int64, fileName string, userId int64) {
	mutex.Lock()
	defer mutex.Unlock()
	cacheKey := fmt.Sprint(datasetId) + "_" + fileName + "_" + fmt.Sprint(userId)
	uploads := getSDKUploadFileMap(REDIS_FLOW_ATTACHMENT_KEY)
	delete(uploads, cacheKey)
	log.Info("remove key=" + cacheKey + " from cache.")
	setSDKUploadFileCache(REDIS_FLOW_ATTACHMENT_KEY, uploads)
}

// getSDKUploadFileMap loads the upload-tracking map stored under msgKey in
// redis. Load or decode failures are logged and an empty map is returned,
// so callers always get a usable map.
func getSDKUploadFileMap(msgKey string) map[string]string {
	msgMap := make(map[string]string, 0)
	valueStr, err := redis_client.Get(msgKey)
	if err != nil {
		log.Info("Failed to load from reids. " + err.Error())
		return msgMap
	}
	if valueStr == "" {
		return msgMap
	}
	if err1 := json.Unmarshal([]byte(valueStr), &msgMap); err1 != nil {
		log.Info("unmarshal json failed. " + err1.Error())
	}
	return msgMap
}

// setSDKUploadFileCache serialises msgMap to JSON and stores it under
// msgKey in redis with a 24-hour expiry. Failures are logged only.
func setSDKUploadFileCache(msgKey string, msgMap map[string]string) {
	encoded, _ := json.Marshal(msgMap)
	redisValue := string(encoded)
	log.Info("set redis key=" + msgKey + " value=" + redisValue)
	if re, err := redis_client.Setex(msgKey, redisValue, 24*3600*time.Second); err != nil {
		log.Info("set redis error:" + err.Error())
	} else {
		log.Info("re =" + fmt.Sprint(re))
	}
}

// CheckFlowForDatasetSDK rejects a new SDK dataset upload when the number
// of uploads still tracked within the last 24 hours reaches the configured
// global limit. Returns nil when the upload may proceed.
func CheckFlowForDatasetSDK() error {
	cacheMap := getSDKUploadFileMap(REDIS_FLOW_ATTACHMENT_KEY)
	currentTime := time.Now().Unix()
	count := 0
	for _, tmpValue := range cacheMap {
		// "ts" avoids shadowing the time package (original named it "time").
		ts, err := strconv.ParseInt(tmpValue, 10, 64)
		if err == nil && currentTime-ts > 24*3600 {
			// Older than 24h: no longer counted as an active upload.
			// Unparseable timestamps remain counted, matching prior behavior.
			continue
		}
		count += 1
	}
	log.Info("total find " + fmt.Sprint(count) + " uploading files.")
	if count >= setting.FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK {
		log.Info("The number of datasets uploaded using the SDK simultaneously cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK))
		return errors.New("The number of datasets uploaded using the SDK simultaneously cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK))
	}
	return nil
}

+ 133
- 34
routers/repo/grampus.go View File

@@ -8,6 +8,7 @@ import (
"net/http"
"os"
"path"
"strconv"
"strings"
"time"
"unicode/utf8"
@@ -70,31 +71,33 @@ const (
)

func GrampusNotebookNew(ctx *context.Context) {
ctx.Data["IsCreate"] = true
ctx.Data["PageIsCloudBrain"] = true
notebookType := ctx.QueryInt("type")
processType := grampus.ProcessorTypeGPU
if notebookType == 1 {
processType = grampus.ProcessorTypeNPU
} else if notebookType == 2 {
processType = grampus.ProcessorTypeGCU
} else if notebookType == 3 {
processType = grampus.ProcessorTypeMLU
ctx.HTML(http.StatusOK, tplGrampusNotebookMLUNew)
return
}
err := grampusNotebookNewDataPrepare(ctx, processType)
if err != nil {
ctx.ServerError("get new notebook-job info failed", err)
return
}
if processType == grampus.ProcessorTypeGPU {
ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
} else if processType == grampus.ProcessorTypeNPU {
ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
} else if processType == grampus.ProcessorTypeGCU {
ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
}
ctx.HTML(http.StatusOK, tplGrampusNotebookNew)
// ctx.Data["IsCreate"] = true
// ctx.Data["PageIsCloudBrain"] = true
// notebookType := ctx.QueryInt("type")
// processType := grampus.ProcessorTypeGPU
// if notebookType == 1 {
// processType = grampus.ProcessorTypeNPU
// } else if notebookType == 2 {
// processType = grampus.ProcessorTypeGCU
// } else if notebookType == 3 {
// processType = grampus.ProcessorTypeMLU
// ctx.HTML(http.StatusOK, tplGrampusNotebookMLUNew)
// return
// }
// err := grampusNotebookNewDataPrepare(ctx, processType)
// if err != nil {
// ctx.ServerError("get new notebook-job info failed", err)
// return
// }
// if processType == grampus.ProcessorTypeGPU {
// ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
// } else if processType == grampus.ProcessorTypeNPU {
// ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
// } else if processType == grampus.ProcessorTypeGCU {
// ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
// }
}

func GrampusTrainJobGPUNew(ctx *context.Context) {
@@ -1207,12 +1210,12 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}

//todo: upload code (send to file_server todo this work?)
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
/**if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
}*/

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
@@ -1339,8 +1342,7 @@ func GetGrampusNotebook(ctx *context.APIContext) {

var jobAfter *models.Cloudbrain
if job.IsNewAITask() {
ai_task.UpdateCloudbrain(job)
jobAfter, _ = models.GetCloudbrainByCloudbrainID(job.ID)
jobAfter, _ = ai_task.UpdateCloudbrain(job)
} else {
jobAfter, err = cloudbrainTask.SyncGrampusNotebookStatus(job)
}
@@ -1365,16 +1367,46 @@ func GetGrampusNotebook(ctx *context.APIContext) {
}

// GrampusStopJob stops a grampus job. Jobs created through the new AI-task
// pipeline are stopped via ai_task.HandleNewAITaskStop and answered here as
// JSON; legacy jobs are delegated to cloudbrainTask.GrampusStopJob.
func GrampusStopJob(ctx *context.Context) {
	// isHandled is true whenever the task belongs to the new pipeline,
	// regardless of whether the stop itself succeeded.
	if res, isHandled, err := ai_task.HandleNewAITaskStop(ctx.Cloudbrain.ID); isHandled {
		if err != nil {
			log.Error("StopJob(%s) failed:%v", ctx.Cloudbrain.JobName, err, ctx.Data["msgID"])
			ctx.JSON(200, map[string]interface{}{
				"result_code": "-1",
				"error_msg":   ctx.Tr("cloudbrain.Stopped_failed"),
				"status":      "",
				"id":          ctx.Params(":id"),
				"StatusOK":    0,
			})
			return
		}
		ctx.JSON(200, map[string]interface{}{
			"result_code": "0",
			"error_msg":   "",
			"status":      res.Status,
			"id":          ctx.Params(":id"),
			"StatusOK":    0,
		})
		return
	}
	cloudbrainTask.GrampusStopJob(ctx)
}

func GrampusNotebookDel(ctx *context.Context) {
var listType = ctx.Query("listType")

if isHandled, err := ai_task.HandleNewAITaskDelete(ctx.Cloudbrain.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", ctx.Cloudbrain.JobName, err, ctx.Data["msgID"])
ctx.ServerError(err.Error(), err)
return
}
} else {
if err := cloudbrainTask.DeleteGrampusJob(ctx); err != nil {
log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
ctx.ServerError(err.Error(), err)
return
}
}

var isAdminPage = ctx.Query("isadminpage")
var isHomePage = ctx.Query("ishomepage")
@@ -1412,7 +1444,9 @@ type NotebookDataset struct {

func GrampusNotebookShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true

ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
return
/*
var task *models.Cloudbrain
task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
if err != nil {
@@ -1423,8 +1457,7 @@ func GrampusNotebookShow(ctx *context.Context) {
task.ContainerIp = ""

if task.IsNewAITask() {
ai_task.UpdateCloudbrain(task)
task, _ = models.GetCloudbrainByCloudbrainID(task.ID)
task, _ = ai_task.UpdateCloudbrain(task)
} else if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
result, err := grampus.GetNotebookJob(task.JobID)
if err != nil {
@@ -1500,6 +1533,7 @@ func GrampusNotebookShow(ctx *context.Context) {
ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
ctx.Data["model_path"] = cloudbrain.ModelMountPath
ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
*/
}

func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
@@ -1659,13 +1693,26 @@ func GrampusDownloadLog(ctx *context.Context) {
ctx.ServerError(err.Error(), err)
return
}
fileName := job.JobName + "-log.txt"

content, err := grampus.GetTrainJobLog(job.JobID)
nodeIdStr := ctx.Params(":nodeId")
var content string
if nodeIdStr != "" {
nodeId, _ := strconv.Atoi(nodeIdStr)
fileName = job.JobName + "-" + strconv.Itoa(nodeId+1) + "-log.txt"
if job.WorkServerNumber < 1 || nodeId > job.WorkServerNumber-1 {
ctx.NotFound("query parameter is wrong", nil)
return
}
content, err = grampus.GetTrainJobLog(job.JobID, nodeId)
} else {
content, err = grampus.GetTrainJobLog(job.JobID)
}
if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
content = ""
}
fileName := job.JobName + "-log.txt"
ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
var b []byte = []byte(content)
@@ -1696,7 +1743,19 @@ func GrampusGetLog(ctx *context.Context) {
exitDiagnostics = result.ExitDiagnostics
}

content, err := grampus.GetTrainJobLog(job.JobID)
nodeIdStr := ctx.Params(":nodeId")
var content string
if nodeIdStr != "" {
nodeId, _ := strconv.Atoi(nodeIdStr)
if job.WorkServerNumber < 1 || nodeId > job.WorkServerNumber-1 {
ctx.NotFound("query parameter is wrong", nil)
return
}
content, err = grampus.GetTrainJobLog(job.JobID, nodeId)
} else {
content, err = grampus.GetTrainJobLog(job.JobID)
}

if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, map[string]interface{}{
@@ -1734,7 +1793,17 @@ func GrampusMetrics(ctx *context.Context) {
}
var result models.NewModelArtsMetricStatisticResult
if job.IsNPUTask() {
nodeIdStr := ctx.Params(":nodeId")
if nodeIdStr != "" {
nodeId, _ := strconv.Atoi(nodeIdStr)
if job.WorkServerNumber < 1 || nodeId > job.WorkServerNumber-1 {
ctx.NotFound("query parameter is wrong", nil)
return
}
result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0, nodeId)
} else {
result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0)
}
} else if job.IsGPUTask() {
startTime := int64(job.StartTime)
if startTime == 0 {
@@ -2016,6 +2085,36 @@ func GrampusNotebookDebug(ctx *context.Context) {
}

// GrampusNotebookRestart restarts a Grampus notebook. New-style AI tasks are
// restarted through the ai_task service and answered as JSON; legacy tasks
// fall through to the cloudbrainTask implementation.
//
// Fixes: log message typo ("lRestartAITask" -> "RestartAITask") and the
// duplicated JSON-emission blocks in the original.
func GrampusNotebookRestart(ctx *context.Context) {
	t := ctx.Cloudbrain
	if !t.IsNewAITask() {
		cloudbrainTask.GrampusNotebookRestart(ctx)
		return
	}

	var (
		id         = ctx.Params(":id")
		resultCode = "-1"
		errorMsg   = ""
		status     = ""
	)
	res, bizErr := ai_task.RestartAITask(t.ID, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
	if bizErr != nil {
		log.Error("RestartAITask failed:task.ID=%d err=%v", t.ID, bizErr.DefaultMsg)
		errorMsg = ctx.Tr(bizErr.TrCode)
	} else {
		// A restart creates a new task record; report its id and status.
		id = strconv.FormatInt(res.ID, 10)
		status = res.Status
		resultCode = "0"
	}
	ctx.JSON(200, map[string]string{
		"result_code": resultCode,
		"error_msg":   errorMsg,
		"status":      status,
		"id":          id,
	})
}



+ 30
- 0
routers/repo/grampus_onlineinfer.go View File

@@ -0,0 +1,30 @@
package repo

import (
"net/http"

"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/context"
)

// Template paths for the Grampus online-inference pages
// (list, detail, and creation views).
const (
	tplGrampusOnlineInferIndex base.TplName = "repo/grampus/onlineinfer/list"
	tplGrampusOnlineInferShow  base.TplName = "repo/grampus/onlineinfer/show"
	tplGrampusOnlineInferNew   base.TplName = "repo/grampus/onlineinfer/new"
)

// GrampusOnlineInferNew renders the online-inference job creation page.
// All data loading happens client-side; the handler only serves the template.
func GrampusOnlineInferNew(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusOnlineInferNew)
}

// GrampusOnlineInferShow renders the online-inference job detail page.
// All data loading happens client-side; the handler only serves the template.
func GrampusOnlineInferShow(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusOnlineInferShow)
}

// GrampusOnlineInferIndex renders the online-inference job list page.
// All data loading happens client-side; the handler only serves the template.
func GrampusOnlineInferIndex(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusOnlineInferIndex)

}

+ 36
- 2
routers/repo/modelarts.go View File

@@ -2,6 +2,7 @@ package repo

import (
"archive/zip"
ai_task "code.gitea.io/gitea/services/ai_task_service/task"
"encoding/json"
"errors"
"fmt"
@@ -124,8 +125,8 @@ func MustEnableModelArts(ctx *context.Context) {
}

func NotebookNew(ctx *context.Context) {
notebookNewDataPrepare(ctx)
// notebookNewDataPrepare(ctx)
ctx.Data["PageIsCloudBrain"] = true
ctx.HTML(200, tplModelArtsNotebookNew)
}

@@ -305,6 +306,9 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm

func NotebookShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true
ctx.HTML(200, tplModelArtsNotebookShow)
return
/*
debugListType := ctx.Query("debugListType")
if debugListType == "" {
debugListType = "all"
@@ -362,6 +366,7 @@ func NotebookShow(ctx *context.Context) {
ctx.Data["jobName"] = task.JobName
ctx.Data["debugListType"] = debugListType
ctx.HTML(200, tplModelArtsNotebookShow)
*/
}

func GetModelDownload(task *models.Cloudbrain) models.ModelDownload {
@@ -673,6 +678,19 @@ func NotebookStop(ctx *context.Context) {
errorMsg = ctx.Tr("cloudbrain.Already_stopped")
break
}
if res, isHandled, err := ai_task.HandleNewAITaskStop(task.ID); isHandled {
if err != nil {
log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = err.Error()
if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
errorMsg = "the job's version is too old and can not be restarted"
}
break
}
status = res.Status
break
}

err, res := StopModelArtsNotebook(task)

@@ -734,6 +752,22 @@ func NotebookDel(ctx *context.Context) {
var listType = ctx.Query("debugListType")
task := ctx.Cloudbrain

if isHandled, err := ai_task.HandleNewAITaskDelete(task.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
ctx.RenderWithErr("DeleteJob failed", tplDebugJobIndex, nil)
}
var isAdminPage = ctx.Query("isadminpage")
var isHomePage = ctx.Query("ishomepage")
if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
} else if isHomePage == "true" {
ctx.Redirect(setting.AppSubURL + "/cloudbrains")
} else {
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
}
}

if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsDeleted) {
log.Error("the job(%s) has not been stopped", task.JobName)
ctx.RenderWithErr("the job has not been stopped", tplDebugJobIndex, nil)


+ 7
- 7
routers/repo/setting.go View File

@@ -489,20 +489,20 @@ func SettingsPost(ctx *context.Context, form auth.RepoSettingForm) {
ctx.RenderWithErr(ctx.Tr("form.enterred_invalid_repo_name"), tplSettingsOptions, nil)
return
}
deployments, err := models.GetRunningServiceByUser(ctx.User.ID)
if err != nil {
ctx.ServerError("GetRunningServiceByUser", err)
return
}
if deployments != nil {

// finetune: openi-notebook repo can not be deleted if it has running service
if repo.Name == "openi-notebook" {
if deployments, err := models.GetRunningServiceByUser(ctx.User.ID); deployments != nil && err == nil {
if len(deployments) > 0 {
ctx.Data["Err_RepoName"] = nil
log.Error("盘古部署删除项目失败,repo id %v, 用户 id%v", repo.ID, ctx.User.ID)
log.Error("panguService: delete repo failed, repo %s, user %s", repo.ID, ctx.User.ID)
ctx.Flash.Error(ctx.Tr("deployment.deletion_notice_repo"))
ctx.Redirect(ctx.Repo.RepoLink + "/settings")
return
}
}
}

count, err := models.GetCloudbrainRunCountByRepoID(repo.ID)
if err != nil {
ctx.ServerError("GetCloudbrainCountByRepoID failed", err)


+ 4
- 0
routers/response/api_response.go View File

@@ -29,7 +29,11 @@ type TrFunc func(string, ...interface{}) string
// OuterTrBizError converts a BizError into an AiforgeOuterResponse. When a
// locale and a translation code are available the message is localized,
// passing TrParams through when the translation has placeholders; otherwise
// the error's default message is used.
func OuterTrBizError(err *BizError, locale macaron.Locale) *AiforgeOuterResponse {
	msg := err.DefaultMsg
	if locale != nil && err.TrCode != "" {
		// len() is 0 for a nil slice, so the explicit nil check was redundant.
		if len(err.TrParams) == 0 {
			msg = locale.Tr(err.TrCode)
		} else {
			msg = locale.Tr(err.TrCode, err.TrParams...)
		}
	}
	return &AiforgeOuterResponse{Code: err.Code, Msg: msg}
}


+ 26
- 1
routers/response/error.go View File

@@ -4,10 +4,28 @@ type BizError struct {
Code int
DefaultMsg string
TrCode string
TrParams []interface{}
}

// WithParams returns a copy of the error with the given translation
// placeholder parameters bound. A brand-new BizError (with a freshly
// allocated TrParams slice) is returned so that shared sentinel errors are
// never mutated concurrently.
//
// Fixes the original, which (a) passed the whole params slice as a single
// element instead of spreading it (append(e.TrParams, params) vs
// append(e.TrParams, params...)), and (b) could mutate the sentinel's
// backing array by appending to e.TrParams in place.
func (e *BizError) WithParams(params ...interface{}) *BizError {
	merged := make([]interface{}, 0, len(e.TrParams)+len(params))
	merged = append(merged, e.TrParams...)
	merged = append(merged, params...)
	return &BizError{
		Code:       e.Code,
		DefaultMsg: e.DefaultMsg,
		TrCode:     e.TrCode,
		TrParams:   merged,
	}
}

func NewBizError(err error) *BizError {
return &BizError{Code: RESPONSE_CODE_ERROR_DEFAULT, DefaultMsg: err.Error()}
return &BizError{Code: RESPONSE_CODE_ERROR_DEFAULT, DefaultMsg: err.Error(), TrCode: err.Error()}
}

func BuildBizError(code int, defaultMsg string, trCode ...string) *BizError {
@@ -17,3 +35,10 @@ func BuildBizError(code int, defaultMsg string, trCode ...string) *BizError {
}
return &BizError{Code: code, DefaultMsg: defaultMsg, TrCode: t}
}
// BuildDefaultBizError builds a BizError with the default error code.
// trCode is optional; when supplied, only the first value is used as the
// translation code.
//
// Fixes the original, which tested `len(t) == 0` (always true for the fresh
// empty string) and therefore indexed trCode[0] unconditionally — a panic
// whenever the variadic argument was omitted.
func BuildDefaultBizError(defaultMsg string, trCode ...string) *BizError {
	t := ""
	if len(trCode) > 0 {
		t = trCode[0]
	}
	return &BizError{Code: RESPONSE_CODE_ERROR_DEFAULT, DefaultMsg: defaultMsg, TrCode: t}
}

+ 10
- 1
routers/response/response_list.go View File

@@ -11,6 +11,7 @@ var BADGES_STILL_HAS_USERS = &BizError{Code: 1005, DefaultMsg: "Please delete us
var SYSTEM_ERROR = &BizError{Code: 9009, DefaultMsg: "System error.Please try again later", TrCode: "common_error.system_error"}
var INSUFFICIENT_PERMISSION = &BizError{Code: 9003, DefaultMsg: "insufficient permissions", TrCode: "common_error.insufficient_permission"}
var PARAM_ERROR = &BizError{Code: 9001, DefaultMsg: "param error", TrCode: "common_error.param_error"}
var WECHAT_NOT_BIND = &BizError{Code: 9002, DefaultMsg: "Please scan the code and bind to wechat first", TrCode: "common_error.wechat_not_bind"}

//云脑任务相关错误
var AI_TASK_NOT_EXISTS = &BizError{Code: 2001, DefaultMsg: "AI task not exists", TrCode: "ai_task.task_not_exists"}
@@ -23,4 +24,12 @@ var DATASET_NOT_EXISTS = &BizError{Code: 2007, DefaultMsg: "The part of datasets
var MODEL_NOT_EXISTS = &BizError{Code: 2008, DefaultMsg: "The model in the task does not exist or has been deleted, please create a new debug job.", TrCode: "repo.debug.manage.model_not_exist"}
var RESULT_CLEARD = &BizError{Code: 2009, DefaultMsg: "The files of the task have been cleared, can not restart any more, please create a new debug task instead.", TrCode: "cloudbrain.result_cleared"}
var CREATE_FAILED = &BizError{Code: 2010, DefaultMsg: "Create AI task failed", TrCode: "ai_task.create_failed"}
var RESTART_FAILED = &BizError{Code: 2010, DefaultMsg: "Restart AI task failed", TrCode: "ai_task.restart_failed"}
var RESTART_FAILED = &BizError{Code: 2011, DefaultMsg: "Restart AI task failed", TrCode: "ai_task.restart_failed"}
var STOP_FAILED = &BizError{Code: 2012, DefaultMsg: "Stop AI task failed", TrCode: "ai_task.stop_failed"}
var DATASET_SIZE_OVER_LIMIT = &BizError{Code: 2013, DefaultMsg: "The size of dataset exceeds limitation", TrCode: "ai_task.dataset_size_over_limit"}
var BOOT_FILE_MUST_BE_PYTHON = &BizError{Code: 2013, DefaultMsg: "The boot file must be a python file", TrCode: "ai_task.boot_file_must_python"}
var BOOT_FILE_NOT_EXIST = &BizError{Code: 2014, DefaultMsg: "The boot file not exist", TrCode: "ai_task.boot_file_not_exist"}
var DATASET_SELECT_ERROR = &BizError{Code: 2017, DefaultMsg: "Dataset select error: the count exceed the limit or has same name", TrCode: "cloudbrain.error.dataset_select"}
var PARTIAL_DATASETS_NOT_AVAILABLE = &BizError{Code: 2018, DefaultMsg: "There are non-existent or deleted files in the selected dataset file, please select again", TrCode: "cloudbrain.error.partial_datasets_not_available"}
var LOAD_CODE_FAILED = &BizError{Code: 2019, DefaultMsg: "Fail to load code, please check if the right branch is selected.", TrCode: "cloudbrain.load_code_failed"}
var BRANCH_NOT_EXISTS = &BizError{Code: 2020, DefaultMsg: "The branch does not exist", TrCode: "ai_task.branch_not_exists"}

+ 10
- 1
routers/routes/routes.go View File

@@ -385,7 +385,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("", modelapp.ModelBaseUI)
m.Group("/pangufinetune", func() {
m.Get("", modelapp.PanguFinetuneUI)
m.Get("/create", reqSignIn, modelapp.PanguFinetuneCreateUI)
m.Get("/create", reqSignIn, reqWechatBind, modelapp.PanguFinetuneCreateUI)
m.Get("/inference", reqSignIn, modelapp.PanguInferenceUI)
})

@@ -428,6 +428,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/data_analysis/ProTrend", routers.ExploreDataAnalysisProTrend)
m.Get("/data_analysis/Overview", routers.ExploreDataAnalysisOverview)
m.Get("/data_analysis/BrainAnalysis", routers.ExploreDataAnalysisBrainAnalysis)
m.Get("/center_map", reqSignIn, routers.CenterMapUI)

}, ignSignIn)
m.Combo("/install", routers.InstallInit).Get(routers.Install).
@@ -1318,6 +1319,14 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusNotebookForm{}), context.PointAccount(), repo.GrampusNotebookCreate)
})

m.Group("/onlineinfer", func() {
m.Get("", reqRepoCloudBrainReader, repo.GrampusOnlineInferIndex)
m.Group("/:id", func() {
m.Get("", reqRepoCloudBrainReader, repo.GrampusOnlineInferShow)
})
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.GrampusOnlineInferNew)
})

m.Group("/train-job", func() {
m.Group("/:jobid", func() {
m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow)


+ 1
- 0
routers/user/home.go View File

@@ -85,6 +85,7 @@ func retrieveFeeds(ctx *context.Context, options models.GetFeedsOptions) {
if act.ActUser != nil {
userCache[act.ActUserID] = act.ActUser
}
act.FilterCloudbrainInfo()
}

for _, act := range actions {


+ 167
- 37
services/ai_task_service/cluster/c2net.go View File

@@ -1,14 +1,18 @@
package cluster

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"errors"
"fmt"
"strings"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/manager/client/grampus"
"code.gitea.io/gitea/models"
model_grampus "code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"errors"
"fmt"
"strings"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
)

type C2NetClusterAdapter struct {
@@ -16,10 +20,10 @@ type C2NetClusterAdapter struct {

func init() {
//注册到一个Map
AddCluster(ai_task_entity.C2Net, new(C2NetClusterAdapter))
AddCluster(entity.C2Net, new(C2NetClusterAdapter))
}

func (c C2NetClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNoteBookTaskRequest) (*ai_task_entity.CreateNoteBookTaskResponse, error) {
func (c C2NetClusterAdapter) CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
jobResult, err := grampus.CreateNotebookJob(convertNoteBookReq2Grampus(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
@@ -32,7 +36,20 @@ func (c C2NetClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNoteBookTas
return convertGrampus2NoteBookRes(jobResult), nil
}

func (c C2NetClusterAdapter) GetImages(req ai_task_entity.GetImageReq) ([]ai_task_entity.ClusterImage, bool, error) {
func (c C2NetClusterAdapter) CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
jobResult, err := grampus.CreateNotebookJob(convertOnlineInfer2Grampus(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
return nil, err
}
if jobResult.ErrorCode > 0 {
log.Error("CreateNotebookJob err.req.Name = %s ErrorCode = %d ErrorMsg = %s", req.Name, jobResult.ErrorCode, jobResult.ErrorMsg)
return nil, errors.New(fmt.Sprintf("CreateNotebookJob err[%d%s]", jobResult.ErrorCode, jobResult.ErrorMsg))
}
return convertGrampus2NoteBookRes(jobResult), nil
}

func (c C2NetClusterAdapter) GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error) {
processType := req.ComputeSource.FullName
images, err := grampus.GetImages(processType, string(req.JobType))
if err != nil {
@@ -42,24 +59,36 @@ func (c C2NetClusterAdapter) GetImages(req ai_task_entity.GetImageReq) ([]ai_tas
if images == nil || images.Infos == nil || len(images.Infos) == 0 {
return nil, true, err
}
r := make([]ai_task_entity.ClusterImage, len(images.Infos))
r := make([]entity.ClusterImage, len(images.Infos))
for i, v := range images.Infos {
r[i] = ConvertGrampusImageToStandard(v)
}
return r, false, nil
}

func ConvertGrampusImageToStandard(image models.GrampusImage) ai_task_entity.ClusterImage {
return ai_task_entity.ClusterImage{
func ConvertGrampusImageToStandard(image models.GrampusImage) entity.ClusterImage {
return entity.ClusterImage{
ImageId: image.ID,
ImageName: image.Name,
}
}

func convertNoteBookReq2Grampus(req ai_task_entity.CreateNoteBookTaskRequest) models.CreateGrampusNotebookRequest {
var commandGpuDebug = "mkdir -p /dataset;! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='/code' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
command := fmt.Sprintf(commandGpuDebug, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval)
func convertNoteBookReq2Grampus(req entity.CreateNoteBookTaskRequest) models.CreateGrampusNotebookRequest {
codePath := "/code"
if len(req.Tasks[0].Code) > 0 {
codePath = req.Tasks[0].Code[0].ContainerPath
if strings.Contains(codePath, "/") {
codePath = codePath[0:strings.LastIndex(codePath, "/")]
}
}

var commandGpuDebug = "mkdir -p /dataset;! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='%s' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
command := fmt.Sprintf(commandGpuDebug, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval, codePath)
// command := "bash && cd /code && unzip master.zip && cd test-export-data && uvicorn train:app --host 0.0.0.0 --port $OCTOPUS_NOTEBOOK_PORT"
if models.NPU == req.Tasks[0].Spec.ComputeResource {
command = ""
}
log.Info("debug cmd=" + command)
tasks := make([]models.GrampusNotebookTask, len(req.Tasks))
for i := 0; i < len(req.Tasks); i++ {
t := req.Tasks[i]
@@ -69,7 +98,64 @@ func convertNoteBookReq2Grampus(req ai_task_entity.CreateNoteBookTaskRequest) mo
return models.CreateGrampusNotebookRequest{Name: req.Name, Tasks: tasks}
}

func convertNoteBookTask2Grampus(t ai_task_entity.NoteBookTask, command string) models.GrampusNotebookTask {
// convertOnlineInfer2Grampus maps a generic notebook-creation request onto
// the Grampus request format, using the online-inference startup command
// derived from the first task's boot file.
func convertOnlineInfer2Grampus(req entity.CreateNoteBookTaskRequest) models.CreateGrampusNotebookRequest {
	cmd := generateCommand(req.RepoName, req.Tasks[0].BootFile, req.PrimitiveDatasetName)

	grampusTasks := make([]models.GrampusNotebookTask, 0, len(req.Tasks))
	for _, task := range req.Tasks {
		grampusTasks = append(grampusTasks, convertNoteBookTask2Grampus(task, cmd))
	}

	return models.CreateGrampusNotebookRequest{Name: req.Name, Tasks: grampusTasks}
}

// generateCommand assembles the container startup shell command for an
// online-inference job: install gradio/fastapi, fetch and unzip the repo
// code and datasets, then launch the boot file's FastAPI `app` via uvicorn
// on the Octopus notebook port. The exact command string is the contract
// with the cluster side — do not reorder or reformat it casually.
func generateCommand(repoName, bootFile, datasetName string) string {

	//prepare
	//command := "bash && cd /code && unzip master.zip && cd test-export-data && uvicorn train:app --host 0.0.0.0 --port $OCTOPUS_NOTEBOOK_PORT"
	workDir := "/"
	command := "pip install gradio fastapi -i https://pypi.tuna.tsinghua.edu.cn/simple;"
	// NOTE(review): fmt.Sprintf with no args is flagged by `go vet`; it also
	// mangles any literal '%' in CommandPrepareScriptGpu — confirm the
	// constant contains no '%' or drop the Sprintf.
	command += "pwd; cd " + workDir + fmt.Sprintf(model_grampus.CommandPrepareScriptGpu)

	//unzip code & dataset
	unZipDatasetCommand := cloudbrainTask.GenerateDatasetUnzipCommand(datasetName)
	// Normalize Windows-style separators so the path splitting below works.
	bootFile = strings.ReplaceAll(bootFile, "\\", "/")
	bootfilepath := ""
	bootonlyfile := bootFile
	if strings.Index(bootFile, "/") != -1 {
		// Split bootFile into directory (without leading '/') and file name.
		bootfilepath = bootFile[0:strings.LastIndex(bootFile, "/")]
		if strings.HasPrefix(bootfilepath, "/") {
			bootfilepath = bootfilepath[1:]
		}
		bootonlyfile = bootFile[strings.LastIndex(bootFile, "/")+1:]
	}
	log.Info("bootfilepath=" + bootfilepath + " bootonlyfile=" + bootonlyfile)
	// Datasets are copied next to the boot file before being unzipped there.
	copyDatasetCmd := getCopyCmd(datasetName, repoName, bootfilepath)
	copyDatasetPath := "/code/" + strings.ToLower(repoName) + "/" + bootfilepath
	commandUnzip := "export OPENI_GRADIO_URL=$OCTOPUS_NOTEBOOK_BASE_URL;" + "cd " + workDir + "code;echo \"start unzip code\";unzip -q master.zip; " + copyDatasetCmd + " echo \"start to unzip dataset\";cd " + copyDatasetPath + "; " + unZipDatasetCommand
	//commandUnzip := "cd " + workDir + "code;echo \"start unzip code\";unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
	command += commandUnzip
	command += "echo \"unzip finished;start to exec code;\";"
	// uvicorn takes a module name, so strip the ".py" extension.
	if strings.HasSuffix(bootonlyfile, ".py") {
		bootonlyfile = bootonlyfile[0 : len(bootonlyfile)-3]
	}
	command += "cd " + copyDatasetPath + ";uvicorn " + bootonlyfile + ":app --host 0.0.0.0 --port $OCTOPUS_NOTEBOOK_PORT "

	// NOTE(review): "comand" is a typo in this log line.
	log.Info("comand=" + command)
	return command
}
// getCopyCmd builds the shell commands that copy each ';'-separated dataset
// archive from /dataset into the repository code directory next to the boot
// file (repository directories are lowercased on disk).
func getCopyCmd(datasetName, repoName, bootfilepath string) string {
	target := " /code/" + strings.ToLower(repoName) + "/" + bootfilepath + ";"
	var b strings.Builder
	for _, name := range strings.Split(datasetName, ";") {
		b.WriteString("cp /dataset/" + name + target)
	}
	return b.String()
}

func convertNoteBookTask2Grampus(t entity.NoteBookTask, command string) models.GrampusNotebookTask {

code := models.GrampusDataset{}
codeArray := convertContainerArray2Grampus(t.Code)
@@ -90,7 +176,7 @@ func convertNoteBookTask2Grampus(t ai_task_entity.NoteBookTask, command string)
}
}

func convertContainerArray2Grampus(containerDatas []ai_task_entity.ContainerData) []models.GrampusDataset {
func convertContainerArray2Grampus(containerDatas []entity.ContainerData) []models.GrampusDataset {
res := make([]models.GrampusDataset, len(containerDatas))
for i := 0; i < len(containerDatas); i++ {
d := containerDatas[i]
@@ -99,7 +185,7 @@ func convertContainerArray2Grampus(containerDatas []ai_task_entity.ContainerData
return res
}

func convertContainer2Grampus(d ai_task_entity.ContainerData) models.GrampusDataset {
func convertContainer2Grampus(d entity.ContainerData) models.GrampusDataset {
return models.GrampusDataset{
Name: d.Name,
Bucket: d.Bucket,
@@ -110,9 +196,9 @@ func convertContainer2Grampus(d ai_task_entity.ContainerData) models.GrampusData
}
}

func convertGrampus2NoteBookRes(res *models.GrampusNotebookResponse) *ai_task_entity.CreateNoteBookTaskResponse {
func convertGrampus2NoteBookRes(res *models.GrampusNotebookResponse) *entity.CreateNoteBookTaskResponse {
jobInfo := res.JobInfo
return &ai_task_entity.CreateNoteBookTaskResponse{
return &entity.CreateNoteBookTaskResponse{
StartedAt: jobInfo.StartedAt,
RunSec: jobInfo.RunSec,
CompletedAt: jobInfo.CompletedAt,
@@ -126,7 +212,7 @@ func convertGrampus2NoteBookRes(res *models.GrampusNotebookResponse) *ai_task_en
}
}

func (c C2NetClusterAdapter) RestartNoteBook(jobId string) (*ai_task_entity.RestartNoteBookTaskResponse, error) {
func (c C2NetClusterAdapter) RestartNoteBook(jobId string) (*entity.RestartNoteBookTaskResponse, error) {
res, err := grampus.RestartNotebookJob(jobId)
if err != nil {
log.Error("RestartNotebookJob err jobId=%s .%v", jobId, err)
@@ -134,13 +220,16 @@ func (c C2NetClusterAdapter) RestartNoteBook(jobId string) (*ai_task_entity.Rest
}
if res.ErrorCode > 0 {
log.Error("RestartNotebookJob err.jobId = %s ErrorCode = %d ErrorMsg = %s", jobId, res.ErrorCode, res.ErrorMsg)
return nil, errors.New(fmt.Sprintf("RestartNotebookJob err[%d%s]", res.ErrorCode, res.ErrorMsg))
if entity.GrampusJobCanNotRestart.IsMatch(res.ErrorCode) {
return nil, errors.New(entity.GrampusJobCanNotRestart.CodeTrCode)
}
return nil, errors.New(response.RESTART_FAILED.TrCode)
}
return convertToCreateNoteBookTaskResponse(res), nil
}

func convertToCreateNoteBookTaskResponse(res *models.GrampusNotebookRestartResponse) *ai_task_entity.RestartNoteBookTaskResponse {
return &ai_task_entity.RestartNoteBookTaskResponse{
func convertToCreateNoteBookTaskResponse(res *models.GrampusNotebookRestartResponse) *entity.RestartNoteBookTaskResponse {
return &entity.RestartNoteBookTaskResponse{
JobId: res.NewId,
Status: res.Status,
}
@@ -159,7 +248,10 @@ func (c C2NetClusterAdapter) StopNoteBook(jobId string) error {
return nil
}

func (c C2NetClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entity.QueryTaskResponse, error) {
func (c C2NetClusterAdapter) QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error) {
if jobId == "" {
return nil, errors.New("jobID is empty")
}
result, err := grampus.GetNotebookJob(jobId)
if err != nil {
return nil, err
@@ -167,19 +259,19 @@ func (c C2NetClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entity.QueryT
if result == nil {
return nil, nil
}
return ai_task_entity.ConvertGrampusNotebookResponse(result.JobInfo), nil
return entity.ConvertGrampusNotebookResponse(result.JobInfo), nil
}

func (c C2NetClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*ai_task_entity.QueryTaskResponse, error) {
func (c C2NetClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error) {
res, err := grampus.GetJobListByJobName(jobName)
if err != nil {
return nil, err
}
result := make([]*ai_task_entity.QueryTaskResponse, 0)
result := make([]*entity.QueryTaskResponse, 0)
if res != nil {
for i := 0; i < len(res.JobInfos); i++ {
if res.JobInfos[i].Name == jobName {
result = append(result, ai_task_entity.ConvertGrampusTrainResponse(res.JobInfos[i]))
result = append(result, entity.ConvertGrampusTrainResponse(res.JobInfos[i]))
}

}
@@ -187,7 +279,7 @@ func (c C2NetClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*ai_task_
return result, nil
}

func (c C2NetClusterAdapter) GetNoteBookLog(jobId string) (*ai_task_entity.ClusterLog, error) {
func (c C2NetClusterAdapter) GetNoteBookLog(jobId string) (*entity.ClusterLog, error) {
return nil, nil
}

@@ -201,8 +293,46 @@ func (c C2NetClusterAdapter) GetNoteBookUrl(jobId string) (string, error) {
}
return res.Url + "?token=" + res.Token, nil
}
// GetNoteBookOperationProfile collects the notebook's cluster events and,
// when the job exposes exit diagnostics, appends them as an extra "Exit"
// event. Failures fetching the job itself are ignored (best-effort).
func (c C2NetClusterAdapter) GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error) {
	if jobId == "" {
		log.Error("jobid is empty")
		return nil, errors.New("jobid is empty")
	}
	eventsRes, err := grampus.GetDebugJobEvents(jobId)
	if err != nil {
		log.Error("GetDebugJobEvents failed:%v", err)
		return nil, err
	}

	profile := parseC2NetEventsToOperationProfile(eventsRes.NotebookEvents)
	if jobRes, jobErr := grampus.GetJob(jobId); jobErr == nil && jobRes != nil && jobRes.ExitDiagnostics != "" {
		profile.Events = append(profile.Events, entity.ProfileEvent{
			Message: jobRes.ExitDiagnostics,
			Reason:  "Exit",
		})
	}
	return profile, nil
}

// parseC2NetEventsToOperationProfile converts Grampus job events into the
// standard OperationProfile, skipping entries whose message is empty.
func parseC2NetEventsToOperationProfile(notebookEvents []models.GrampusJobEvents) *entity.OperationProfile {
	profileEvents := make([]entity.ProfileEvent, 0, len(notebookEvents))
	for _, ev := range notebookEvents {
		if ev.Message == "" {
			continue
		}
		profileEvents = append(profileEvents, entity.ProfileEvent{
			Message:   ev.Message,
			Reason:    ev.Reason,
			Name:      ev.Name,
			Timestamp: ev.Timestamp,
		})
	}
	return &entity.OperationProfile{Events: profileEvents}
}

func (c C2NetClusterAdapter) CreateTrainJob(req ai_task_entity.CreateTrainTaskRequest) (*ai_task_entity.CreateTrainTaskResponse, error) {
func (c C2NetClusterAdapter) CreateTrainJob(req entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error) {
jobResult, err := grampus.CreateJob(convertTrainReq2Grampus(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
@@ -211,7 +341,7 @@ func (c C2NetClusterAdapter) CreateTrainJob(req ai_task_entity.CreateTrainTaskRe
return convertGrampus2TrainRes(jobResult), nil
}

func convertTrainReq2Grampus(req ai_task_entity.CreateTrainTaskRequest) models.CreateGrampusJobRequest {
func convertTrainReq2Grampus(req entity.CreateTrainTaskRequest) models.CreateGrampusJobRequest {
command := generateGrampusTrainCommand(req)

tasks := make([]models.GrampusTasks, len(req.Tasks))
@@ -223,7 +353,7 @@ func convertTrainReq2Grampus(req ai_task_entity.CreateTrainTaskRequest) models.C
return models.CreateGrampusJobRequest{Name: req.Name, Tasks: tasks}
}

func generateGrampusTrainCommand(req ai_task_entity.CreateTrainTaskRequest) string {
func generateGrampusTrainCommand(req entity.CreateTrainTaskRequest) string {
var command string
t := req.Tasks[0]
computeResource := t.Spec.ComputeResource
@@ -298,7 +428,7 @@ func getNpuModelObjectKey(jobName string) string {
return setting.CodePathPrefix + jobName + RemoteModelPath + "/" + models.ModelSuffix
}

func convertTrainTask2Grampus(t ai_task_entity.TrainTask, command string) models.GrampusTasks {
func convertTrainTask2Grampus(t entity.TrainTask, command string) models.GrampusTasks {
return models.GrampusTasks{
Name: t.Name,
ResourceSpecId: t.ResourceSpecId,
@@ -315,9 +445,9 @@ func convertTrainTask2Grampus(t ai_task_entity.TrainTask, command string) models
}
}

func convertGrampus2TrainRes(res *models.CreateGrampusJobResponse) *ai_task_entity.CreateTrainTaskResponse {
func convertGrampus2TrainRes(res *models.CreateGrampusJobResponse) *entity.CreateTrainTaskResponse {
jobInfo := res.JobInfo
return &ai_task_entity.CreateTrainTaskResponse{
return &entity.CreateTrainTaskResponse{
StartedAt: jobInfo.StartedAt,
RunSec: jobInfo.RunSec,
CompletedAt: jobInfo.CompletedAt,
@@ -337,13 +467,13 @@ func (c C2NetClusterAdapter) DeleteTrainJob(string) error {
func (c C2NetClusterAdapter) StopTrainJob(string) error {
return nil
}
func (c C2NetClusterAdapter) QueryTrainJob(string) (*ai_task_entity.QueryTaskResponse, error) {
func (c C2NetClusterAdapter) QueryTrainJob(string) (*entity.QueryTaskResponse, error) {
return nil, nil
}
func (c C2NetClusterAdapter) RestartTrainJob(string) (*ai_task_entity.CreateTrainTaskResponse, error) {
func (c C2NetClusterAdapter) RestartTrainJob(string) (*entity.CreateTrainTaskResponse, error) {
return nil, nil
}

func (c C2NetClusterAdapter) GetTrainLog(jobId string) (*ai_task_entity.ClusterLog, error) {
func (c C2NetClusterAdapter) GetTrainLog(jobId string) (*entity.ClusterLog, error) {
return nil, nil
}

+ 91
- 24
services/ai_task_service/cluster/cloudbrain_one.go View File

@@ -2,12 +2,14 @@ package cluster

import "C"
import (
"code.gitea.io/gitea/entity/ai_task_entity"
"encoding/json"
"errors"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"errors"
)

type CloudbrainOneClusterAdapter struct {
@@ -15,10 +17,10 @@ type CloudbrainOneClusterAdapter struct {

func init() {
//注册到一个Map
AddCluster(ai_task_entity.OpenICloudbrainOne, new(CloudbrainOneClusterAdapter))
AddCluster(entity.OpenICloudbrainOne, new(CloudbrainOneClusterAdapter))
}

func (c CloudbrainOneClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNoteBookTaskRequest) (*ai_task_entity.CreateNoteBookTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
jobResult, err := cloudbrain.CreateJob(req.Name, convertNoteBookReq2CloudbrainOne(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
@@ -27,13 +29,17 @@ func (c CloudbrainOneClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNot
return convertCloudbrainOne2NoteBookRes(jobResult), nil
}

func (c CloudbrainOneClusterAdapter) GetImages(req ai_task_entity.GetImageReq) ([]ai_task_entity.ClusterImage, bool, error) {
func (c CloudbrainOneClusterAdapter) CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
return nil, nil
}

func (c CloudbrainOneClusterAdapter) GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error) {
return nil, true, nil
}

var SubTaskName = "task1"

func convertNoteBookReq2CloudbrainOne(req ai_task_entity.CreateNoteBookTaskRequest) models.CreateJobParams {
func convertNoteBookReq2CloudbrainOne(req entity.CreateNoteBookTaskRequest) models.CreateJobParams {
var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
t := req.Tasks[0]

@@ -58,11 +64,11 @@ func convertNoteBookReq2CloudbrainOne(req ai_task_entity.CreateNoteBookTaskReque
UseNNI: false,
},
},
Volumes: convertContainerDataArray2Volume(t.Code, t.Datasets, t.PreTrainModel),
Volumes: convertContainerDataArray2Volume(t.Code, t.Datasets, t.PreTrainModel, t.OutPut),
}
}

func convertContainerDataArray2Volume(containerDataArray ...[]ai_task_entity.ContainerData) []models.Volume {
func convertContainerDataArray2Volume(containerDataArray ...[]entity.ContainerData) []models.Volume {
r := make([]models.Volume, 0)
for _, array := range containerDataArray {
for _, d := range array {
@@ -72,7 +78,7 @@ func convertContainerDataArray2Volume(containerDataArray ...[]ai_task_entity.Con
return r
}

func convertContainerData2Volume(d ai_task_entity.ContainerData) models.Volume {
func convertContainerData2Volume(d entity.ContainerData) models.Volume {
return models.Volume{
HostPath: models.StHostPath{
Path: d.RealPath,
@@ -82,15 +88,15 @@ func convertContainerData2Volume(d ai_task_entity.ContainerData) models.Volume {
}
}

func convertCloudbrainOne2NoteBookRes(res *models.CreateJobResult) *ai_task_entity.CreateNoteBookTaskResponse {
func convertCloudbrainOne2NoteBookRes(res *models.CreateJobResult) *entity.CreateNoteBookTaskResponse {
playload := res.Payload
return &ai_task_entity.CreateNoteBookTaskResponse{
return &entity.CreateNoteBookTaskResponse{
JobID: playload["jobId"].(string),
Status: string(models.JobWaiting),
}
}

func (c CloudbrainOneClusterAdapter) RestartNoteBook(string) (*ai_task_entity.RestartNoteBookTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) RestartNoteBook(string) (*entity.RestartNoteBookTaskResponse, error) {

return nil, nil
}
@@ -99,10 +105,15 @@ func (c CloudbrainOneClusterAdapter) DeleteNoteBook(string) error {
}

// StopNoteBook stops the cloudbrain-one notebook job identified by jobId
// by delegating to the cloudbrain job API. The error, if any, is logged and
// passed through to the caller unchanged.
func (c CloudbrainOneClusterAdapter) StopNoteBook(jobId string) error {
	err := cloudbrain.StopJob(jobId)
	if err != nil {
		log.Error("StopNoteBook(%s) failed:%v", jobId, err)
		return err
	}
	return nil
}

func (c CloudbrainOneClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entity.QueryTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error) {
if jobId == "" {
log.Error("jobid is empty")
return nil, errors.New("jobid is empty")
@@ -112,15 +123,14 @@ func (c CloudbrainOneClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entit
log.Error("QueryNoteBook failed:%v", err)
return nil, err
}
result, err := models.ConvertToJobResultPayload(jobResult.Payload)
if err != nil {
log.Error("ConvertToJobResultPayload failed:%v", err)
return nil, err
}
return ai_task_entity.ConvertCloudbrainOneNotebookResponse(result), nil
return entity.ConvertCloudbrainOneNotebookResponse(jobResult.Payload)
}

func (c CloudbrainOneClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*ai_task_entity.QueryTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error) {
jobResult, err := cloudbrain.GetJobListByName(jobName)
if err != nil {
log.Error("GetJobListByName failed:%v", err)
@@ -131,23 +141,80 @@ func (c CloudbrainOneClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*
log.Error("ConvertToJobListResultPayload failed:%v", err)
return nil, err
}
r := make([]*ai_task_entity.QueryTaskResponse, 0)
r := make([]*entity.QueryTaskResponse, 0)
for i := 0; i < len(result.Jobs); i++ {
if result.Jobs[i].Name == jobName {
r = append(r, ai_task_entity.ConvertCloudbrainOneQueryNotebookByNameResponse(result.Jobs[i]))
r = append(r, entity.ConvertCloudbrainOneQueryNotebookByNameResponse(result.Jobs[i]))
}
}
return r, nil
}

func (c CloudbrainOneClusterAdapter) GetNoteBookLog(jobId string) (*ai_task_entity.ClusterLog, error) {
func (c CloudbrainOneClusterAdapter) GetNoteBookLog(jobId string) (*entity.ClusterLog, error) {
return nil, nil
}

func (c CloudbrainOneClusterAdapter) GetNoteBookUrl(jobId string) (string, error) {
return "", nil
return setting.DebugServerHost + "jpylab_" + jobId + "_" + models.SubTaskName, nil
}
func (c CloudbrainOneClusterAdapter) CreateTrainJob(ai_task_entity.CreateTrainTaskRequest) (*ai_task_entity.CreateTrainTaskResponse, error) {

// GetNoteBookOperationProfile returns the operation events of a
// cloudbrain-one notebook job. It fetches the job payload and parses the
// AppExitDiagnostics JSON field into an OperationProfile.
// Returns an error when jobId is empty or the remote query fails.
func (c CloudbrainOneClusterAdapter) GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error) {
	if jobId == "" {
		log.Error("jobid is empty")
		return nil, errors.New("jobid is empty")
	}
	// Fetch the raw job record from the cloudbrain-one service.
	jobResult, err := cloudbrain.GetJob(jobId)
	if err != nil {
		log.Error("QueryNoteBook failed:%v", err)
		return nil, err
	}
	result, err := models.ConvertToJobResultPayload(jobResult.Payload)
	if err != nil {
		log.Error("ConvertToJobResultPayload failed:%v", err)
		return nil, err
	}
	// AppExitDiagnostics is a JSON document carrying pod events and extras.
	return parseDiagnosticsToOperationProfile(result.JobStatus.AppExitDiagnostics), nil
}

// parseDiagnosticsToOperationProfile converts the AppExitDiagnostics JSON
// string produced by cloudbrain-one into an OperationProfile. Events with an
// empty message are dropped. Returns nil for an empty or unparsable input.
func parseDiagnosticsToOperationProfile(appExitDiagnostics string) *entity.OperationProfile {
	if appExitDiagnostics == "" {
		return nil
	}

	var diagnostics entity.CloudbrainOneAppExitDiagnostics
	if err := json.Unmarshal([]byte(appExitDiagnostics), &diagnostics); err != nil {
		log.Error("json.Unmarshal appExitDiagnostics err.%v", err)
		return nil
	}

	events := make([]entity.ProfileEvent, 0)
	// Pod events of the first task come first, followed by the extras list.
	for _, podEvent := range diagnostics.PodEvents.Task10 {
		if podEvent.Message == "" {
			continue
		}
		events = append(events, entity.ProfileEvent{
			Message: podEvent.Message,
			Reason:  podEvent.Reason,
			Action:  podEvent.Action,
		})
	}
	for _, extra := range diagnostics.Extras {
		if extra.Message == "" {
			continue
		}
		events = append(events, entity.ProfileEvent{
			Message: extra.Message,
			Reason:  extra.Reason,
			Action:  extra.Action,
		})
	}
	return &entity.OperationProfile{Events: events}
}

func (c CloudbrainOneClusterAdapter) CreateTrainJob(entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error) {
return nil, nil
}
func (c CloudbrainOneClusterAdapter) DeleteTrainJob(string) error {
@@ -156,12 +223,12 @@ func (c CloudbrainOneClusterAdapter) DeleteTrainJob(string) error {
func (c CloudbrainOneClusterAdapter) StopTrainJob(string) error {
return nil
}
func (c CloudbrainOneClusterAdapter) QueryTrainJob(string) (*ai_task_entity.QueryTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) QueryTrainJob(string) (*entity.QueryTaskResponse, error) {
return nil, nil
}
func (c CloudbrainOneClusterAdapter) RestartTrainJob(string) (*ai_task_entity.CreateTrainTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) RestartTrainJob(string) (*entity.CreateTrainTaskResponse, error) {
return nil, nil
}
func (c CloudbrainOneClusterAdapter) GetTrainLog(string) (*ai_task_entity.ClusterLog, error) {
func (c CloudbrainOneClusterAdapter) GetTrainLog(string) (*entity.ClusterLog, error) {
return nil, nil
}

+ 297
- 0
services/ai_task_service/cluster/cloudbrain_two.go View File

@@ -0,0 +1,297 @@
package cluster

import "C"
import (
"encoding/json"
"fmt"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/manager/client/cloudbrain_two"
"code.gitea.io/gitea/manager/client/cloudbrain_two_cd"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
)

// CloudbrainTwoClusterAdapter implements ClusterAdapter for the
// cloudbrain-two (ModelArts based) cluster, including its CD-center variant.
type CloudbrainTwoClusterAdapter struct {
}

// Register this adapter under the cloudbrain-two cluster type at startup.
func init() {
	AddCluster(entity.OpenICloudbrainTwo, new(CloudbrainTwoClusterAdapter))
}

// CreateNoteBook creates a notebook job on cloudbrain-two. When ModelartsCD
// is enabled the CD-center client (no resource pool) is used; otherwise the
// regular cloudbrain-two client is used with the first configured pool.
// Only the first entry of req.Tasks is consumed.
func (c CloudbrainTwoClusterAdapter) CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
	t := req.Tasks[0]

	var jobResult *models.CreateNotebookResult
	var err error
	if setting.ModelartsCD.Enabled {
		// CD deployments create notebooks without specifying a resource pool.
		jobResult, err = cloudbrain_two_cd.CreateNotebook(models.CreateNotebookWithoutPoolParams{
			JobName:     req.Name,
			Description: req.Description,
			Flavor:      t.Spec.SourceSpecId,
			Duration:    t.AutoStopDuration,
			ImageID:     t.ImageId,
			Feature:     models.NotebookFeature,
			Volume: models.VolumeReq{
				Capacity:  setting.Capacity,
				Category:  models.EVSCategory,
				Ownership: models.ManagedOwnership,
			},
			WorkspaceID: "0",
		})
	} else {
		// Lazily parse the configured pool list on first use. BUGFIX: the
		// Unmarshal error was previously ignored and PoolInfo[0] was indexed
		// unconditionally, panicking when the setting is absent or malformed.
		if poolInfos == nil {
			if uErr := json.Unmarshal([]byte(setting.PoolInfos), &poolInfos); uErr != nil {
				log.Error("json.Unmarshal PoolInfos failed:%v", uErr)
				return nil, uErr
			}
		}
		if poolInfos == nil || len(poolInfos.PoolInfo) == 0 {
			return nil, fmt.Errorf("no cloudbrain two resource pool configured")
		}
		jobResult, err = cloudbrain_two.CreateNotebook2(models.CreateNotebook2Params{
			JobName:     req.Name,
			Description: req.Description,
			Flavor:      t.Spec.SourceSpecId,
			Duration:    t.AutoStopDuration,
			ImageID:     t.ImageId,
			PoolID:      poolInfos.PoolInfo[0].PoolId,
			Feature:     models.NotebookFeature,
			Volume: models.VolumeReq{
				Capacity:  setting.Capacity,
				Category:  models.EVSCategory,
				Ownership: models.ManagedOwnership,
			},
			WorkspaceID: "0",
		})
	}

	if err != nil {
		log.Error("CreateNoteBook failed: %v", err.Error())
		return nil, err
	}
	return convertCloudbrainTwo2NoteBookRes(jobResult), nil
}

// CreateOnlineInfer is a stub: online inference is not supported on the
// cloudbrain-two adapter. It exists to satisfy the ClusterAdapter interface.
func (c CloudbrainTwoClusterAdapter) CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
	return nil, nil
}

// cloudbrainTwoImages caches the image list parsed from settings; it is built
// once on first use because the configured list never changes at runtime.
var cloudbrainTwoImages []entity.ClusterImage

// GetImages returns the cloudbrain-two image list configured in settings.
// The second return value is false: images are restricted to this list.
func (c CloudbrainTwoClusterAdapter) GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error) {
	// len() of a nil slice is 0, so a single length check covers both the
	// nil and empty cases (was: x == nil || len(x) == 0).
	if len(cloudbrainTwoImages) == 0 {
		// NOTE(review): this lazy cache is not goroutine-safe; concurrent
		// first calls may rebuild it. Harmless as long as the source list is
		// immutable — confirm no concurrent writers of setting.StImageInfos.
		images := setting.StImageInfos.ImageInfo
		cache := make([]entity.ClusterImage, len(images))
		for i := range images {
			cache[i] = entity.ClusterImage{
				ImageId:   images[i].Id,
				ImageName: images[i].Value,
			}
		}
		cloudbrainTwoImages = cache
	}

	return cloudbrainTwoImages, false, nil
}

// poolInfos holds the resource pools parsed lazily from setting.PoolInfos;
// nil until the first notebook creation on the non-CD path.
var poolInfos *models.PoolInfos

// convertCloudbrainTwo2NoteBookRes maps the raw creation result onto the
// cluster-neutral CreateNoteBookTaskResponse.
func convertCloudbrainTwo2NoteBookRes(res *models.CreateNotebookResult) *entity.CreateNoteBookTaskResponse {
	return &entity.CreateNoteBookTaskResponse{
		JobID:  res.ID,
		Status: res.Status,
	}
}

// RestartNoteBook issues a start action for the notebook behind jobId,
// routing the call to the backend (cloudbrain-two or CD center) that owns it.
func (c CloudbrainTwoClusterAdapter) RestartNoteBook(jobId string) (*entity.RestartNoteBookTaskResponse, error) {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return nil, err
	}

	action := models.NotebookAction{
		Action: models.ActionStart,
	}
	var res *models.NotebookActionResult
	switch task.Type {
	case models.TypeCloudBrainTwo:
		res, err = cloudbrain_two.ManageNotebook2(task.JobID, action)
	case models.TypeCDCenter:
		res, err = cloudbrain_two_cd.ManageNotebook(task.JobID, action)
	}
	if err != nil {
		log.Error("ManageNotebook err.jobID=%s err=%v", jobId, err)
		return nil, err
	}
	return convertCloudbrainTwo2NoteBookRestartRes(jobId, res), nil
}

// convertCloudbrainTwo2NoteBookRestartRes wraps a notebook action result in a
// RestartNoteBookTaskResponse, keeping the caller-supplied job id.
func convertCloudbrainTwo2NoteBookRestartRes(jobId string, res *models.NotebookActionResult) *entity.RestartNoteBookTaskResponse {
	return &entity.RestartNoteBookTaskResponse{
		JobId:  jobId,
		Status: res.Status,
	}
}

// DeleteNoteBook removes the notebook job behind jobId, dispatching to the
// backend (cloudbrain-two or CD center) that created it.
func (c CloudbrainTwoClusterAdapter) DeleteNoteBook(jobId string) error {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return err
	}

	switch task.Type {
	case models.TypeCloudBrainTwo:
		_, err = cloudbrain_two.DelNotebook2(task.JobID)
	case models.TypeCDCenter:
		_, err = cloudbrain_two_cd.DelNotebook(task.JobID)
	}
	if err != nil {
		log.Error("DeleteNoteBook err.jobID=%s err=%v", jobId, err)
		return err
	}
	return nil
}

// StopNoteBook issues a stop action for the notebook behind jobId, routed to
// whichever backend owns the task.
func (c CloudbrainTwoClusterAdapter) StopNoteBook(jobId string) error {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return err
	}

	action := models.NotebookAction{
		Action: models.ActionStop,
	}
	switch task.Type {
	case models.TypeCloudBrainTwo:
		_, err = cloudbrain_two.ManageNotebook2(task.JobID, action)
	case models.TypeCDCenter:
		_, err = cloudbrain_two_cd.ManageNotebook(task.JobID, action)
	}
	if err != nil {
		log.Error("StopNoteBook err.jobID=%s err=%v", jobId, err)
		return err
	}
	return nil
}

// QueryNoteBook looks up the stored cloudbrain task for jobId and queries the
// matching backend (cloudbrain-two or CD center) for the notebook's state.
func (c CloudbrainTwoClusterAdapter) QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error) {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return nil, err
	}

	var result *models.GetNotebook2Result
	if task.Type == models.TypeCloudBrainTwo {
		result, err = cloudbrain_two.GetNotebook2(task.JobID)
	} else if task.Type == models.TypeCDCenter {
		result, err = cloudbrain_two_cd.GetNotebook(task.JobID)
	}
	// NOTE(review): for any other task.Type, result stays nil with err nil and
	// convertCloudbrainTwo2QueryRes would dereference nil — confirm task types
	// reaching this adapter are restricted upstream.
	if err != nil {
		log.Error("GetNotebook(%s) failed:%v", task.DisplayJobName, err)
		return nil, err
	}
	return convertCloudbrainTwo2QueryRes(result), nil
}

// convertCloudbrainTwo2QueryRes maps a raw notebook query result onto the
// cluster-neutral QueryTaskResponse.
func convertCloudbrainTwo2QueryRes(res *models.GetNotebook2Result) *entity.QueryTaskResponse {
	var started, completed timeutil.TimeStamp
	// Lease.UpdateTime is in milliseconds; convert to seconds.
	if res.Lease.UpdateTime > 0 {
		started = timeutil.TimeStamp(res.Lease.UpdateTime / 1000)
	}
	// A job in a terminal state is stamped as completed at query time.
	if models.IsCloudbrainTerminalStatus(res.Status) {
		completed = timeutil.TimeStampNow()
	}
	return &entity.QueryTaskResponse{
		StartedAt:   started,
		CompletedAt: completed,
		JobId:       res.ID,
		Status:      res.Status,
		Url:         res.Url,
		Token:       res.Token,
	}
}

// QueryNoteBookByJobName lists notebooks from cloudbrain-two and returns the
// ones whose job name matches jobName exactly (the remote list call filters
// by a name pattern, so exact matching is re-applied here).
func (c CloudbrainTwoClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error) {
	result, err := cloudbrain_two.GetNotebookList(1000, 0, "createTime", "DESC", jobName)
	if err != nil {
		log.Error("QueryNoteBookByJobName failed:jobName=%s err=%v", jobName, err)
		return nil, err
	}
	matches := make([]*entity.QueryTaskResponse, 0)
	for _, nb := range result.NotebookList {
		if nb.JobName != jobName {
			continue
		}
		matches = append(matches, convertCloudbrainTwoQueryNotebookByNameResponse(nb))
	}
	return matches, nil
}

// convertCloudbrainTwoQueryNotebookByNameResponse maps a notebook list entry
// onto QueryTaskResponse. Lease.CreateTime is in milliseconds.
func convertCloudbrainTwoQueryNotebookByNameResponse(notebook models.NotebookList) *entity.QueryTaskResponse {
	return &entity.QueryTaskResponse{
		StartedAt: timeutil.TimeStamp(notebook.Lease.CreateTime / 1000),
		Status:    notebook.Status,
		JobId:     notebook.JobID,
	}
}

// GetNoteBookLog is a stub: notebook logs are not exposed by this adapter.
func (c CloudbrainTwoClusterAdapter) GetNoteBookLog(jobId string) (*entity.ClusterLog, error) {
	return nil, nil
}

// GetNoteBookUrl queries the notebook's current state and builds the access
// URL with its token appended as a query parameter.
func (c CloudbrainTwoClusterAdapter) GetNoteBookUrl(jobId string) (string, error) {
	res, err := c.QueryNoteBook(jobId)
	if err != nil {
		return "", err
	}
	return res.Url + "?token=" + res.Token, nil
}

// GetNoteBookOperationProfile fetches the notebook behind jobId from its
// owning backend and converts its action progress into an OperationProfile.
func (c CloudbrainTwoClusterAdapter) GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error) {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return nil, err
	}

	var notebook *models.GetNotebook2Result
	switch task.Type {
	case models.TypeCloudBrainTwo:
		notebook, err = cloudbrain_two.GetNotebook2(task.JobID)
	case models.TypeCDCenter:
		notebook, err = cloudbrain_two_cd.GetNotebook(task.JobID)
	}
	if err != nil {
		log.Error("GetNotebook(%s) failed:%v", task.DisplayJobName, err)
		return nil, err
	}
	return parseCloudbrainTwoEventsToOperationProfile(notebook), nil
}

// parseCloudbrainTwoEventsToOperationProfile converts the ActionProgress
// records of a notebook query result into an OperationProfile. Records with
// an empty description are skipped. Returns nil when there is no progress.
func parseCloudbrainTwoEventsToOperationProfile(result *models.GetNotebook2Result) *entity.OperationProfile {
	// len() of a nil slice is 0, so one length check replaces the former
	// x == nil || len(x) == 0 test (staticcheck S1009).
	if len(result.ActionProgress) == 0 {
		return nil
	}
	events := make([]entity.ProfileEvent, 0, len(result.ActionProgress))
	for _, p := range result.ActionProgress {
		if p.Description == "" {
			continue
		}
		events = append(events, entity.ProfileEvent{
			Message: p.Description,
			// Step is numeric; render it as text for the generic Reason field.
			Reason: fmt.Sprint(p.Step),
			Name:   p.Status,
		})
	}
	return &entity.OperationProfile{Events: events}
}

// The train-job methods below are stubs required by the ClusterAdapter
// interface; train jobs are not handled by the cloudbrain-two adapter.

// CreateTrainJob is a stub; always returns (nil, nil).
func (c CloudbrainTwoClusterAdapter) CreateTrainJob(entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error) {
	return nil, nil
}

// DeleteTrainJob is a stub; always returns nil.
func (c CloudbrainTwoClusterAdapter) DeleteTrainJob(string) error {
	return nil
}

// StopTrainJob is a stub; always returns nil.
func (c CloudbrainTwoClusterAdapter) StopTrainJob(string) error {
	return nil
}

// QueryTrainJob is a stub; always returns (nil, nil).
func (c CloudbrainTwoClusterAdapter) QueryTrainJob(string) (*entity.QueryTaskResponse, error) {
	return nil, nil
}

// RestartTrainJob is a stub; always returns (nil, nil).
func (c CloudbrainTwoClusterAdapter) RestartTrainJob(string) (*entity.CreateTrainTaskResponse, error) {
	return nil, nil
}

// GetTrainLog is a stub; always returns (nil, nil).
func (c CloudbrainTwoClusterAdapter) GetTrainLog(string) (*entity.ClusterLog, error) {
	return nil, nil
}

+ 18
- 14
services/ai_task_service/cluster/cluster_base.go View File

@@ -1,17 +1,18 @@
package cluster

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"errors"

"code.gitea.io/gitea/entity"
)

var clusterMap = map[ai_task_entity.ClusterType]ClusterAdapter{}
var clusterMap = map[entity.ClusterType]ClusterAdapter{}

func AddCluster(t ai_task_entity.ClusterType, cluster ClusterAdapter) {
func AddCluster(t entity.ClusterType, cluster ClusterAdapter) {
clusterMap[t] = cluster
}

func GetCluster(t ai_task_entity.ClusterType) (ClusterAdapter, error) {
func GetCluster(t entity.ClusterType) (ClusterAdapter, error) {
if t == "" {
return nil, errors.New("ClusterType is empty")
}
@@ -23,22 +24,25 @@ func GetCluster(t ai_task_entity.ClusterType) (ClusterAdapter, error) {
}

type ClusterAdapter interface {
CreateNoteBook(req ai_task_entity.CreateNoteBookTaskRequest) (*ai_task_entity.CreateNoteBookTaskResponse, error)
RestartNoteBook(jobId string) (*ai_task_entity.RestartNoteBookTaskResponse, error)
CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error)
RestartNoteBook(jobId string) (*entity.RestartNoteBookTaskResponse, error)
DeleteNoteBook(jobId string) error
StopNoteBook(jobId string) error
QueryNoteBook(jobId string) (*ai_task_entity.QueryTaskResponse, error)
QueryNoteBookByJobName(jobName string) ([]*ai_task_entity.QueryTaskResponse, error)
GetNoteBookLog(jobId string) (*ai_task_entity.ClusterLog, error)
QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error)
QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error)
GetNoteBookLog(jobId string) (*entity.ClusterLog, error)
GetNoteBookUrl(jobId string) (string, error)
CreateTrainJob(req ai_task_entity.CreateTrainTaskRequest) (*ai_task_entity.CreateTrainTaskResponse, error)
GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error)
CreateTrainJob(req entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error)
DeleteTrainJob(jobId string) error
StopTrainJob(string) error
RestartTrainJob(jobId string) (*ai_task_entity.CreateTrainTaskResponse, error)
QueryTrainJob(jobId string) (*ai_task_entity.QueryTaskResponse, error)
GetTrainLog(jobId string) (*ai_task_entity.ClusterLog, error)
RestartTrainJob(jobId string) (*entity.CreateTrainTaskResponse, error)
QueryTrainJob(jobId string) (*entity.QueryTaskResponse, error)
GetTrainLog(jobId string) (*entity.ClusterLog, error)

//GetImages return available list of clusters
//The second parameter will return true if image is no limit
GetImages(req ai_task_entity.GetImageReq) ([]ai_task_entity.ClusterImage, bool, error)
GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error)

CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error)
}

+ 81
- 0
services/ai_task_service/container_builder/code_builder.go View File

@@ -0,0 +1,81 @@
package container_builder

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"strings"
)

// CodeBuilder builds the repository-code container mount for an AI task.
type CodeBuilder struct {
	// Opts controls storage type, container path, read-only flag and whether
	// the code is uploaded as an archive.
	Opts *entity.ContainerBuildOpts
}

// Register the builder so BuildContainerDataChain can instantiate it by type.
func init() {
	o := &CodeBuilder{}
	RegisterContainerBuilder(o)
}

// SetOpts stores the build options used by Build.
func (b *CodeBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
	b.Opts = opts
}

// GetContainerType identifies this builder as the code-container builder.
func (b *CodeBuilder) GetContainerType() entity.ContainerDataType {
	return entity.ContainerCode
}

// Build prepares the repository code for a new AI task: it downloads the code
// to a local job directory, uploads it to the cluster's storage backend, and
// returns the resulting ContainerData describing the mount.
// Returns (nil, nil) when the builder is disabled, and a BizError on failure.
func (b *CodeBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
	opts := b.Opts
	if opts.Disable {
		return nil, nil
	}
	// The first accepted storage type selects the uploader (OBS or Minio).
	storageTypes := opts.AcceptStorageType
	if storageTypes == nil || len(storageTypes) == 0 {
		return nil, response.SYSTEM_ERROR
	}

	jobName := ctx.Request.JobName
	repo := ctx.Repository
	codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
	uploader := upload.SelectUploaderFromStorageType(storageTypes[0])

	remoteDir := uploader.GetJobDefaultObjectKeyPrefix(jobName) + cloudbrain.CodeMountPath
	// Restarted debug tasks and file-notebook runs reuse code uploaded
	// earlier, so download/upload is skipped for them.
	if !ctx.Request.IsRestartRequest && !ctx.Request.IsFileNoteBookRequest {
		if err := DownloadCode(ctx, codeLocalPath, b.Opts.NotArchive); err != nil {
			log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
			return nil, response.LOAD_CODE_FAILED
		}

		if err := uploader.UploadDir(codeLocalPath, remoteDir); err != nil {
			log.Error("Failed to UploadDir: %s (%v)", repo.FullName(), err)
			return nil, response.LOAD_CODE_FAILED
		}
	}

	codeArchiveName := ""
	// When the code is uploaded as an archive, the zip is named after the
	// default branch (inherited from the original logic).
	if !b.Opts.NotArchive {
		codeArchiveName = cloudbrain.DefaultBranchName + ".zip"
	}

	containerPath := ""
	if opts.ContainerPath != "" {
		containerPath = opts.ContainerPath + "/" + codeArchiveName
	}
	objectKey := remoteDir + "/" + codeArchiveName
	codeData := entity.ContainerData{
		Name:          strings.ToLower(repo.Name),
		Bucket:        uploader.GetBucket(),
		EndPoint:      uploader.GetEndpoint(),
		ObjectKey:     objectKey,
		ReadOnly:      opts.ReadOnly,
		ContainerPath: containerPath,
		RealPath:      uploader.GetRealPath(objectKey),
	}
	return []entity.ContainerData{codeData}, nil
}

+ 95
- 0
services/ai_task_service/container_builder/common.go View File

@@ -0,0 +1,95 @@
package container_builder

import (
"bufio"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"errors"
"io"
"io/ioutil"
"os"
"strings"
)

// DownloadCode ensures the repository code is present at codeLocalPath and
// records the downloaded commit id in ctx.CommitID. When no commit is pinned
// (ReqCommitID is empty), any stale code is removed and the latest code of
// the requested branch is fetched — as a plain checkout when notArchive is
// true, otherwise as a zip archive.
func DownloadCode(ctx *context.CreationContext, codeLocalPath string, notArchive bool) error {
	dir, err := ioutil.ReadDir(codeLocalPath)
	// No pinned commit means the latest code is wanted: drop the stale copy.
	if err == nil && len(dir) != 0 && ctx.Request.ReqCommitID == "" {
		os.RemoveAll(codeLocalPath)
		// BUGFIX: the directory is empty now, so the download below must run.
		// Previously the stale listing kept len(dir) != 0 and the download was
		// skipped, leaving an empty code directory.
		dir = nil
	}
	var commitId string

	// An empty directory means the code has to be (re)downloaded.
	if len(dir) == 0 {
		if notArchive {
			commitId, err = upload.DownloadCode(ctx.GitRepo, ctx.Repository, codeLocalPath, ctx.Request.BranchName)
		} else {
			commitId, err = upload.DownloadZipCode(ctx.GitRepo, codeLocalPath, ctx.Request.BranchName)
		}
		if err != nil {
			log.Error("downloadZipCode failed, server timed out: %s (%v)", ctx.Repository.FullName(), err)
			return errors.New("cloudbrain.load_code_failed")
		}
	}
	// NOTE(review): when the code already existed (no download), commitId is
	// "" and overwrites ctx.CommitID — confirm callers tolerate an empty id.
	ctx.CommitID = commitId
	return nil
}

// Shared uploader singletons: OBS for cloudbrain-two storage, Minio for
// cloudbrain-one storage.
var obsUploader = &upload.OBSUploader{}
var minioUploader = &upload.MinioUploader{}

// CLONE_FILE_PREFIX is prepended to the local repository path so git treats
// the clone source as a file:// URL, which makes the --depth option valid.
const CLONE_FILE_PREFIX = "file:///"

// DownloadBranch shallow-clones the given branch of repo into codePath, then
// rewrites the remote url line in .git/config in place so it points at the
// public HTTPS clone link instead of the server-local file:// path.
// Reaching EOF without finding a url line is treated as success (logged only).
func DownloadBranch(repo *models.Repository, codePath, branchName string) error {
	//add "file:///" prefix to make the depth valid
	if err := git.Clone(CLONE_FILE_PREFIX+repo.RepoPath(), codePath, git.CloneRepoOptions{Branch: branchName, Depth: 1}); err != nil {
		log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err)
		return err
	}

	configFile, err := os.OpenFile(codePath+"/.git/config", os.O_RDWR, 0666)
	if err != nil {
		// BUGFIX: the logged path previously read "/,git/config".
		log.Error("open file(%s) failed:%v", codePath+"/.git/config", err)
		return err
	}

	defer configFile.Close()

	// Scan line by line, tracking the byte offset so the matching line can be
	// overwritten in place with WriteAt.
	pos := int64(0)
	reader := bufio.NewReader(configFile)
	for {
		line, err := reader.ReadString('\n')
		if err != nil {
			if err == io.EOF {
				log.Error("not find the remote-url")
				return nil
			}
			log.Error("read error: %v", err)
			return err
		}

		if strings.Contains(line, "url") && strings.Contains(line, ".git") {
			originUrl := "\turl = " + repo.CloneLink().HTTPS + "\n"
			// Pad with spaces so the replacement exactly covers the old line.
			// NOTE(review): if the new url were LONGER than the old line,
			// WriteAt would clobber the start of the next line — confirm the
			// HTTPS link is always shorter than the local file:// path.
			if len(line) > len(originUrl) {
				originUrl += strings.Repeat(" ", len(line)-len(originUrl))
			}
			if _, err := configFile.WriteAt([]byte(originUrl), pos); err != nil {
				log.Error("WriteAt failed:%v", err)
				return err
			}
			break
		}

		pos += int64(len(line))
	}

	return nil
}

+ 40
- 12
services/ai_task_service/container_builder/container_builder.go View File

@@ -1,24 +1,52 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"strings"
"fmt"
"reflect"
)

type ContainerBuilder interface {
Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error)
GetContainerType() ai_task_entity.ContainerDataType
Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError)
GetContainerType() entity.ContainerDataType
SetOpts(opts *entity.ContainerBuildOpts)
}

type ContainerBuildOpts struct {
ContainerPath string
ReadOnly bool
var containerBuilderMap = map[entity.ContainerDataType]reflect.Type{}

func RegisterContainerBuilder(builder ContainerBuilder) {
containerBuilderMap[builder.GetContainerType()] = reflect.TypeOf(builder)
}

func CreateContainerBuilder(containerType entity.ContainerDataType, opts *entity.ContainerBuildOpts) ContainerBuilder {
defer func() {
if err := recover(); err != nil {
combinedErr := fmt.Errorf("%s\n%s", err, log.Stack(2))
log.Error("PANIC:%v", combinedErr)
}
}()
t := containerBuilderMap[containerType]
if t == nil {
return nil
}
b := reflect.New(t.Elem()).Interface().(ContainerBuilder)
//.Interface().(ContainerBuilder)
//b.SetOpts(opts)
b.SetOpts(opts)
return b
}

func GetEndPoint() string {
index := strings.Index(setting.Endpoint, "//")
endpoint := setting.Endpoint[index+2:]
return endpoint
func BuildContainerDataChain(configMap map[entity.ContainerDataType]*entity.ContainerBuildOpts) *BuilderChain {
c := NewBuilderChain()
for k, v := range configMap {
b := CreateContainerBuilder(k, v)
if b == nil {
continue
}
c.Next(b)
}
return c
}

+ 7
- 1
services/ai_task_service/container_builder/container_builder_chan.go View File

@@ -1,6 +1,7 @@
package container_builder

import (
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
)

@@ -17,8 +18,13 @@ func (c *BuilderChain) Next(b ContainerBuilder) *BuilderChain {
return c
}

func (c *BuilderChain) Run(ctx *context.CreationContext) error {
func (c *BuilderChain) Run(ctx *context.CreationContext) *response.BizError {
for _, builder := range c.builderList {
current := ctx.GetContainerDataArray(builder.GetContainerType())
//如果已经存在则不需要再构建
if current != nil && len(current) > 0 {
continue
}
d, err := builder.Build(ctx)
if err != nil {
return err


+ 55
- 32
services/ai_task_service/container_builder/dataset_builder.go View File

@@ -1,70 +1,93 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"errors"
"strings"
)

type DatasetBuilder struct {
Opts ContainerBuildOpts
Opts *entity.ContainerBuildOpts
}

func (b DatasetBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
func init() {
o := &DatasetBuilder{}
RegisterContainerBuilder(o)
}

func (b *DatasetBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *DatasetBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}
uuid := ctx.Request.DatasetUUIDStr
if uuid == "" {
return nil, nil
}
var attachSize int64
datasetInfos, _, err := models.GetDatasetInfo(uuid, ctx.Request.ComputeSource.Name)
var datasetInfos map[string]models.DatasetInfo
var datasetNames string
var err error
// models.GetDatasetInfo 是使用的以前的方法,所以此处按集群类型适配
if ctx.Request.Cluster == models.C2NetCluster {
datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, ctx.Request.ComputeSource.Name)
} else {
datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid)
}
if err != nil {
log.Error("GetDatasetInfo failed: %v", err)
return nil, errors.New("cloudbrain.error.dataset_select")
return nil, response.DATASET_SELECT_ERROR
}
uuidArray := strings.Split(uuid, ";")
if datasetInfos == nil || len(datasetInfos) < len(uuidArray) {
return nil, errors.New("cloudbrain.error.partial_datasets_not_available")
}
for _, infos := range datasetInfos {
attachSize += infos.Size
return nil, response.PARTIAL_DATASETS_NOT_AVAILABLE
}
if attachSize > int64(setting.DebugAttachSize*1000*1000*1000) {
log.Error("The DatasetSize exceeds the limit (%dGB)", setting.DebugAttachSize) // GB
return nil, errors.New("cloudbrain.error.debug_datasetsize")
}
var data []ai_task_entity.ContainerData
obsEndPoint := GetEndPoint()
var data []entity.ContainerData
for _, datasetInfo := range datasetInfos {
name := datasetInfo.FullName
//如果不是压缩包,那么文件名是去掉后缀以后的数据集名称
if b.Opts.NotArchive {
name = datasetInfo.Name
}
if datasetInfo.Type == models.TypeCloudBrainOne {
data = append(data, ai_task_entity.ContainerData{
Name: datasetInfo.FullName,
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
ObjectKey: datasetInfo.DataLocalPath,
//如果返回的localPath已经带了实际路径的前缀,需要去除掉以后才是在minio上的objectKey
objectKey := datasetInfo.DataLocalPath
objectKey = strings.TrimPrefix(objectKey, setting.Attachment.Minio.RealPath)
objectKey = strings.TrimPrefix(objectKey, setting.Attachment.Minio.Bucket)
objectKey = strings.TrimPrefix(objectKey, "/")
data = append(data, entity.ContainerData{
Name: name,
Bucket: minioUploader.GetBucket(),
EndPoint: minioUploader.GetEndpoint(),
ObjectKey: objectKey,
ReadOnly: b.Opts.ReadOnly,
ContainerPath: b.Opts.ContainerPath + "/" + datasetInfo.FullName,
RealPath: setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + datasetInfo.DataLocalPath,
ContainerPath: b.Opts.ContainerPath + "/" + name,
RealPath: minioUploader.GetRealPath(objectKey),
})

} else {
data = append(data, ai_task_entity.ContainerData{
Name: datasetInfo.FullName,
Bucket: setting.Bucket,
EndPoint: obsEndPoint,
ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
objectKey := datasetInfo.DataLocalPath + datasetInfo.FullName
data = append(data, entity.ContainerData{
Name: name,
Bucket: obsUploader.GetBucket(),
EndPoint: obsUploader.GetEndpoint(),
ObjectKey: objectKey,
ReadOnly: b.Opts.ReadOnly,
ContainerPath: b.Opts.ContainerPath + "/" + datasetInfo.FullName,
ContainerPath: b.Opts.ContainerPath + "/" + name,
})
}

}
ctx.Request.DatasetNames = datasetNames
return data, nil
}

func (b DatasetBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerDataset
func (b *DatasetBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerDataset
}

+ 47
- 0
services/ai_task_service/container_builder/file_notebook_code_builder.go View File

@@ -0,0 +1,47 @@
package container_builder

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
)

type FileNoteBookCodeBuilder struct {
Opts *entity.ContainerBuildOpts
}

func init() {
o := &FileNoteBookCodeBuilder{}
RegisterContainerBuilder(o)
}

func (b *FileNoteBookCodeBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *FileNoteBookCodeBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerFileNoteBookCode
}

func (b *FileNoteBookCodeBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}
repo := ctx.Request.FileRepository
if repo == nil {
return nil, nil
}
err := DownloadBranch(repo, getCodePath(ctx.Request.JobName, repo, ctx.Request.FileBranchName), ctx.Request.FileBranchName)
if err != nil {
log.Error("download code failed", err)
return nil, response.LOAD_CODE_FAILED
}
return nil, nil
}

func getCodePath(jobName string, repo *models.Repository, branchName string) string {
return setting.JobPath + jobName + "/code" + "/" + repo.OwnerName + "/" + repo.Name + "/" + branchName
}

+ 0
- 59
services/ai_task_service/container_builder/minio_code_builder.go View File

@@ -1,59 +0,0 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"errors"
"io/ioutil"
"os"
"strings"
)

type MinioCodeBuilder struct {
Opts ContainerBuildOpts
}

func (b MinioCodeBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerCode
}

func (b MinioCodeBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
opts := b.Opts
var err error
jobName := ctx.Request.JobName
repo := ctx.Repository
codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
os.RemoveAll(codeLocalPath)
}

commitId, err := upload.DownloadZipCode(ctx.GitRepo, codeLocalPath, ctx.Request.BranchName)
if err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
return nil, errors.New("cloudbrain.load_code_failed")
}
if err := upload.UploadDirToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err)
return nil, errors.New("cloudbrain.load_code_failed")
}

codeArchiveName := cloudbrain.DefaultBranchName + ".zip"
codeData := ai_task_entity.ContainerData{
Name: strings.ToLower(repo.Name),
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
ObjectKey: setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" + codeArchiveName,
ReadOnly: opts.ReadOnly,
ContainerPath: opts.ContainerPath + "/" + codeArchiveName,
RealPath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"+codeArchiveName),
}
//todo 更好的方法?
ctx.CommitID = commitId
return []ai_task_entity.ContainerData{codeData}, nil
}

+ 0
- 18
services/ai_task_service/container_builder/obs_code_builder.go View File

@@ -1,18 +0,0 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/services/ai_task_service/context"
)

type ObsCodeBuilder struct {
Opts ContainerBuildOpts
}

func (b ObsCodeBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerCode
}

func (b ObsCodeBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
return nil, nil
}

+ 40
- 6
services/ai_task_service/container_builder/output_path_builder.go View File

@@ -1,21 +1,55 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
)

type OutputPathBuilder struct {
Opts ContainerBuildOpts
Opts *entity.ContainerBuildOpts
}

func (b OutputPathBuilder) Build(*context.CreationContext) ([]ai_task_entity.ContainerData, error) {
return []ai_task_entity.ContainerData{{
func init() {
o := &OutputPathBuilder{}
RegisterContainerBuilder(o)
}

func (b *OutputPathBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *OutputPathBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}
storageTypes := b.Opts.AcceptStorageType
if storageTypes == nil || len(storageTypes) == 0 {
return nil, response.SYSTEM_ERROR
}

jobName := ctx.Request.JobName

uploader := upload.SelectUploaderFromStorageType(storageTypes[0])
remoteDir := uploader.GetJobDefaultObjectKeyPrefix(jobName) + cloudbrain.ModelMountPath
err := uploader.MKDIR(remoteDir)
if err != nil {
log.Error("MKDIR err.displayJobName = %s err=%v", ctx.Request.DisplayJobName, err)
return nil, response.NewBizError(err)
}
return []entity.ContainerData{{
ContainerPath: b.Opts.ContainerPath,
ReadOnly: b.Opts.ReadOnly,
ObjectKey: remoteDir,
RealPath: uploader.GetRealPath(remoteDir),
Bucket: uploader.GetBucket(),
EndPoint: uploader.GetEndpoint(),
}}, nil
}

func (b OutputPathBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerOutPutPath
func (b *OutputPathBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerOutPutPath
}

+ 0
- 59
services/ai_task_service/container_builder/output_readme_builder.go View File

@@ -1,59 +0,0 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"errors"
"os"
)

type CloudbrainOneOutputReadmeBuilder struct {
Opts ContainerBuildOpts
}

const README = "README"

func (b CloudbrainOneOutputReadmeBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
modelPath := setting.JobPath + ctx.Request.JobName + b.Opts.ContainerPath + "/"
text := "You can put the files into this directory and download the files by the web page."
err := os.MkdirAll(modelPath, os.ModePerm)
if err != nil {
log.Error("MkdirAll(%s) failed:%v", modelPath, err)
return nil, err
}
fileName := modelPath + README
f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
if err != nil {
log.Error("OpenFile failed", err.Error())
return nil, err
}

defer f.Close()

_, err = f.WriteString(text)
if err != nil {
log.Error("WriteString failed", err.Error())
return nil, err
}

if err := upload.UploadDirToMinio(modelPath, ctx.Request.JobName, b.Opts.ContainerPath+"/"); err != nil {
log.Error("Failed to UploadDirToMinio: %s (%v)", ctx.Request.JobName, err)
return nil, errors.New("cloudbrain.load_code_failed")
}

return []ai_task_entity.ContainerData{{
Name: README,
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
ObjectKey: setting.CBCodePathPrefix + ctx.Request.JobName + b.Opts.ContainerPath + "/" + README,
ContainerPath: b.Opts.ContainerPath,
ReadOnly: b.Opts.ReadOnly,
}}, nil
}

func (b CloudbrainOneOutputReadmeBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerCloudbrainOneOutPutReadMe
}

+ 120
- 36
services/ai_task_service/container_builder/pre_model_builder.go View File

@@ -1,66 +1,113 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/routers/response"
"fmt"
"strings"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
"errors"
"strings"
)

type PretrainModelBuilder struct {
Opts ContainerBuildOpts
Opts *entity.ContainerBuildOpts
}

func (b PretrainModelBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
form := ctx.Request
func init() {
o := &PretrainModelBuilder{}
RegisterContainerBuilder(o)
}

if form.ModelName == "" {
func (b *PretrainModelBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *PretrainModelBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}

m, err := models.QueryModelByPath(form.PreTrainModelUrl)
form := ctx.Request
storageTypes := b.Opts.AcceptStorageType
if storageTypes == nil || len(storageTypes) == 0 {
return nil, response.SYSTEM_ERROR
}
//未选择预训练模型,跳过此步
if form.PretrainModelName == "" {
return nil, nil
}
if form.PretrainModelId == "" {
//异常数据,理论上应该都有modelId
return nil, response.RESULT_CLEARD
}
//查出模型数据
m, err := models.QueryModelById(form.PretrainModelId)
if err != nil {
log.Error("Can not find model", err)
return nil, errors.New("repo.modelconvert.manage.model_not_exist")
return nil, response.MODEL_NOT_EXISTS
}
if !cloudbrainTask.IsModelFileExists(m, form.CkptName) {
log.Error("model file not exist.name = %s", form.CkptName)
return nil, errors.New("repo.modelconvert.manage.model_file_not_exist")
preTrainModelUrl := m.Path
if err != nil {
log.Error("Can not find model", err)
return nil, response.MODEL_NOT_EXISTS
}
//模型文件存储方式
oldStorageType := entity.GetStorageTypeFromCloudbrainType(m.Type)
if oldStorageType == "" {
log.Error("model storage type error.modelId=%d", m.ID)
return nil, response.SYSTEM_ERROR
}
preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)

var modelData ai_task_entity.ContainerData
switch m.Type {
case models.TypeCloudBrainOne:
modelData = ai_task_entity.ContainerData{
Name: form.ModelName,
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
var preTrainModelPath string
var preTrainModelEntity []entity.ContainerData
storageType := oldStorageType
ckptNames := strings.Split(form.PretrainModelCkptName, ";")
for _, ckptName := range ckptNames {
if !cloudbrainTask.IsModelFileExists(m, ckptName) {
log.Error("model file not exist.name = %s", ckptName)
return nil, response.MODEL_NOT_EXISTS
}
preTrainModelPath = getPreTrainModelPath(preTrainModelUrl, ckptName)
if !b.Opts.IsStorageTypeIn(oldStorageType) {
//意味着模型之前存储的位置不符合要求,需要转存到指定存储
newStorageType := b.Opts.AcceptStorageType[0]
//todo 可优化
if newStorageType == entity.MINIO && oldStorageType == entity.OBS {
//复用以前代码
minioPreModelURL, err := dealModelInfo(form.PretrainModelId, form.JobName, ckptName)
if err != nil {
log.Error("Can not find model,modelId=%d err=%v", form.PretrainModelId, err)
return nil, response.MODEL_NOT_EXISTS
}
preTrainModelUrl = minioPreModelURL
preTrainModelPath = getPreTrainModelPath(minioPreModelURL, ckptName)
storageType = entity.MINIO
}
}
uploader := upload.SelectUploaderFromStorageType(storageType)
modelData := entity.ContainerData{
Name: form.PretrainModelName,
Bucket: uploader.GetBucket(),
EndPoint: uploader.GetEndpoint(),
ObjectKey: preTrainModelPath,
ReadOnly: b.Opts.ReadOnly,
ContainerPath: b.Opts.ContainerPath + "/" + form.CkptName,
RealPath: setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + preTrainModelPath,
}
case models.TypeCloudBrainTwo:
modelData = ai_task_entity.ContainerData{
Name: form.ModelName,
Bucket: setting.Bucket,
EndPoint: GetEndPoint(),
ReadOnly: b.Opts.ReadOnly,
ObjectKey: preTrainModelPath,
ContainerPath: b.Opts.ContainerPath + "/" + form.CkptName,
ContainerPath: b.Opts.ContainerPath + "/" + ckptName,
RealPath: uploader.GetRealPath(preTrainModelPath),
}
preTrainModelEntity = append(preTrainModelEntity, modelData)
}

return []ai_task_entity.ContainerData{modelData}, nil
form.PreTrainModelUrl = preTrainModelUrl
return preTrainModelEntity, nil
}

func (b PretrainModelBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerPreTrainModel
func (b *PretrainModelBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerPreTrainModel
}

func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
@@ -73,3 +120,40 @@ func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
}

}

func dealModelInfo(modelId string, jobName string, ckptName string) (string, error) {
preModel, err := models.QueryModelById(modelId)
if err != nil || preModel == nil || preModel.ID == "" {
log.Error("Can not find model", err)
return "", fmt.Errorf("Can not find model: %v", ckptName)
}
minioPreModelURL, err := downloadModelFromObs(preModel, jobName, cloudbrain.PretrainModelMountPath, ckptName)
if err != nil {
log.Error("Can not find model", err)

return "", err
}
return minioPreModelURL, nil
}

func downloadModelFromObs(preModel *models.AiModelManage, jobName, suffixPath string, ckptFileName string) (string, error) {
destPath := setting.CBCodePathPrefix + jobName + suffixPath + "/"
destFile := destPath + ckptFileName
returnStr := setting.Attachment.Minio.Bucket + "/" + destPath
srcUrl := preModel.Path[len(setting.Bucket)+1:] + ckptFileName
log.Info("dest model Path=" + returnStr + " src path=" + preModel.Path + ckptFileName)
body, err := storage.ObsDownloadAFile(setting.Bucket, srcUrl)
if err == nil {
defer body.Close()
_, err = storage.Attachments.UploadContent(setting.Attachment.Minio.Bucket, destFile, body)
if err != nil {
log.Error("UploadObject(%s) failed: %s", preModel.Path+ckptFileName, err.Error())
return "", err
}
} else {
log.Info("download model failed. as " + err.Error())
return "", err
}
log.Info("download model from obs succeed")
return returnStr, nil
}

+ 12
- 12
services/ai_task_service/context/context.go View File

@@ -1,48 +1,48 @@
package context

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
)

type CreationContext struct {
Request ai_task_entity.CreateReq
ContainerData map[ai_task_entity.ContainerDataType][]ai_task_entity.ContainerData
Request *entity.CreateReq
ContainerData map[entity.ContainerDataType][]entity.ContainerData
GitRepo *git.Repository
Repository *models.Repository
Spec *models.Specification
User *models.User
Datasets map[string]models.DatasetInfo
CommitID string
Response *ai_task_entity.CreationResponse
Response *entity.CreationResponse
SourceCloudbrain *models.Cloudbrain
NewCloudbrain *models.Cloudbrain
AITaskConfig entity.AITaskConfig
}

func (ctx *CreationContext) AddContainerData(t ai_task_entity.ContainerDataType, d []ai_task_entity.ContainerData) {
func (ctx *CreationContext) AddContainerData(t entity.ContainerDataType, d []entity.ContainerData) {
if ctx.ContainerData == nil {
ctx.ContainerData = make(map[ai_task_entity.ContainerDataType][]ai_task_entity.ContainerData, 0)
ctx.ContainerData = make(map[entity.ContainerDataType][]entity.ContainerData, 0)
}
ctx.ContainerData[t] = d
}
func (ctx *CreationContext) GetContainerDataArray(t ai_task_entity.ContainerDataType) []ai_task_entity.ContainerData {
func (ctx *CreationContext) GetContainerDataArray(t entity.ContainerDataType) []entity.ContainerData {
if ctx.ContainerData == nil {
return nil
}
return ctx.ContainerData[t]
}
func (ctx *CreationContext) GetContainerData(t ai_task_entity.ContainerDataType) ai_task_entity.ContainerData {
func (ctx *CreationContext) GetContainerData(t entity.ContainerDataType) entity.ContainerData {
a := ctx.GetContainerDataArray(t)
if a == nil || len(a) == 0 {
return ai_task_entity.ContainerData{}
return entity.ContainerData{}
}
return a[0]
}
func (ctx *CreationContext) WriteResponse(t ai_task_entity.ContainerDataType) ai_task_entity.ContainerData {
func (ctx *CreationContext) WriteResponse(t entity.ContainerDataType) entity.ContainerData {
a := ctx.GetContainerDataArray(t)
if a == nil || len(a) == 0 {
return ai_task_entity.ContainerData{}
return entity.ContainerData{}
}
return a[0]
}

+ 51
- 7
services/ai_task_service/schedule/model_schedule.go View File

@@ -2,6 +2,18 @@ package schedule

import (
"bytes"
"encoding/json"
"errors"
"fmt"
"os/exec"
"path"
"strings"
"time"

"code.gitea.io/gitea/modules/modelarts"

"code.gitea.io/gitea/modules/obs"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/labelmsg"
@@ -11,14 +23,7 @@ import (
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/modules/util"
"encoding/json"
"errors"
"fmt"
"github.com/minio/minio-go"
"os/exec"
"path"
"strings"
"time"
)

const NPUModelDefaultName = "models.zip"
@@ -205,7 +210,33 @@ func LocalMigrateOperate(jobName, computeSource string, r *models.ModelMigrateRe
}
if computeSource == models.NPUResource {
//因为NPU的输出会被压缩,因此需要解压+移桶
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", jobName, err)

return err
}
log.Info("DestObjectKey", r.DestObjectKey)
if strings.Contains(r.DestObjectKey, ".") {
decompress(r.DestBucket+"/"+r.DestObjectKey, setting.Bucket+"/"+strings.TrimSuffix(r.DestObjectKey, models.ModelSuffix))

} else { //如果是文件夹,遍历文件
fileInfos, err := storage.GetOneLevelObjectsUnderDir(r.DestBucket, "", r.DestObjectKey)
if err != nil {
log.Error("UpdateModelMigrateStatusByStep err. r.ID=%d step=%d err=%v", r.ID, models.BucketMoveFailed, err)
return err
}

for _, fileInfo := range fileInfos {
log.Info("decompress file:", fileInfo.FileName)
sourceFilPath := r.DestBucket + "/" + r.DestObjectKey + fileInfo.FileName
if !strings.HasSuffix(r.DestObjectKey, "/") {
sourceFilPath = r.DestBucket + "/" + r.DestObjectKey + "/" + fileInfo.FileName
}
decompress(sourceFilPath, setting.Bucket+"/"+strings.TrimSuffix(r.DestObjectKey, models.ModelSuffix))
}

}

} else {
//因为调度无法指定桶,所以调度成功后我们还需要移桶
if setting.UseLocalMinioMigrate {
@@ -233,6 +264,19 @@ func LocalMigrateOperate(jobName, computeSource string, r *models.ModelMigrateRe
return nil
}

func obsMkdir(dir string) error {
input := &obs.PutObjectInput{}
input.Bucket = setting.Bucket
input.Key = dir
_, err := storage.ObsCli.PutObject(input)
if err != nil {
log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
return err
}

return nil
}

func TryToUpdateNPUMoveBucketResult(record *models.ModelMigrateRecord, jobName, versionName string) error {
if IsNPUModelDirHasFile(jobName, versionName) {
if err := models.UpdateModelMigrateStatusByStep(record, models.BucketMoveSuccess); err != nil {


+ 94
- 76
services/ai_task_service/task/cloudbrain_one_notebook_task.go View File

@@ -1,14 +1,13 @@
package task

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/notification"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/cluster"
"code.gitea.io/gitea/services/ai_task_service/container_builder"
"code.gitea.io/gitea/services/ai_task_service/context"
"strconv"
@@ -16,106 +15,139 @@ import (
)

type CloudbrainOneNotebookTaskTemplate struct {
DefaultCreationHandler
DefaultAITaskTemplate
}

func init() {
t := &CloudbrainOneNotebookTaskTemplate{
DefaultAITaskTemplate: DefaultAITaskTemplate{
ClusterType: ai_task_entity.OpenICloudbrainOne,
ClusterType: entity.OpenICloudbrainOne,
JobType: models.JobTypeDebug,
},
}
RegisterTask(models.JobTypeDebug, ai_task_entity.OpenICloudbrainOne, t)
RegisterTask(models.JobTypeDebug, entity.OpenICloudbrainOne, t)
}

func (g CloudbrainOneNotebookTaskTemplate) MyClusterType() ai_task_entity.ClusterType {
return ""
}

func (t CloudbrainOneNotebookTaskTemplate) Create(ctx *context.CreationContext) (*ai_task_entity.CreateTaskRes, *response.BizError) {
func (t CloudbrainOneNotebookTaskTemplate) Create(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
c := &CreateOperator{}
err := c.Next(t.CheckParam).
Next(t.CheckMulti).
Next(t.CheckDisplayJobName).
Next(t.LoadSpec).
Next(t.CheckPointBalance).
Next(t.CheckDatasetSize).
Next(t.CheckDatasetExists).
Next(t.CheckBranchExists).
Next(t.InsertCloudbrainRecord4Async).
AsyncNext(t.BuildContainerData, t.CallCreationAPI, t.AfterCallCreationAPI4Async, t.NotifyCreation).
AsyncNextWithErrFun(t.BuildContainerData, t.CallCreationAPI, t.AfterCallCreationAPI4Async, t.NotifyCreation, t.HandleErr4Async).
Operate(ctx)
if err != nil {
log.Error("create CloudbrainOneNotebookTask err.%v", err)
return nil, err
}
return &ai_task_entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID}, nil
return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID}, nil

}

func (g CloudbrainOneNotebookTaskTemplate) Restart(*context.CreationContext) (*ai_task_entity.CreateTaskRes, *response.BizError) {
return nil, nil
}

func (g CloudbrainOneNotebookTaskTemplate) CallCreationAPI(ctx *context.CreationContext) *response.BizError {
c, err := cluster.GetCluster(ai_task_entity.OpenICloudbrainOne)
if err != nil {
return response.SYSTEM_ERROR
func (g CloudbrainOneNotebookTaskTemplate) GetConfig(opts entity.GetAITaskConfigOpts) entity.AITaskConfig {
if opts.IsFileNoteBookRequest {
return entity.AITaskConfig{
ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
entity.ContainerFileNoteBookCode: {},
entity.ContainerCode: {
ContainerPath: "/code",
ReadOnly: false,
AcceptStorageType: []entity.StorageType{entity.MINIO},
NotArchive: true,
},
},
}
form := ctx.Request
req := ai_task_entity.CreateNoteBookTaskRequest{
Name: form.JobName,
Tasks: []ai_task_entity.NoteBookTask{
{
Name: form.JobName,
ResourceSpecId: ctx.Spec.SourceSpecId,
ImageId: form.ImageID,
ImageUrl: strings.TrimSpace(form.ImageUrl),
Datasets: ctx.GetContainerDataArray(ai_task_entity.ContainerDataset),
Code: ctx.GetContainerDataArray(ai_task_entity.ContainerCode),
PreTrainModel: ctx.GetContainerDataArray(ai_task_entity.ContainerPreTrainModel),
AutoStopDuration: autoStopDurationMs,
Capacity: setting.Capacity,
CenterID: ctx.Spec.GetAvailableCenterIds(ctx.User.ID, form.JobType),
Spec: ctx.Spec,
}
return entity.AITaskConfig{
DatasetMaxSize: setting.DebugAttachSize * 1000 * 1000 * 1000,
ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
entity.ContainerCode: {
ContainerPath: "/code",
ReadOnly: false,
AcceptStorageType: []entity.StorageType{entity.MINIO},
NotArchive: true,
},
entity.ContainerDataset: {
ContainerPath: "/dataset",
ReadOnly: true,
AcceptStorageType: []entity.StorageType{entity.MINIO},
NotArchive: true,
},
entity.ContainerPreTrainModel: {
ContainerPath: "/pretrainmodel",
ReadOnly: true,
AcceptStorageType: []entity.StorageType{entity.MINIO},
},
entity.ContainerOutPutPath: {
ContainerPath: "/model",
ReadOnly: false,
AcceptStorageType: []entity.StorageType{entity.MINIO},
},
},
}
createTime := timeutil.TimeStampNow()
res, err := c.CreateNoteBook(req)
}

func (t CloudbrainOneNotebookTaskTemplate) Restart(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
c := &CreateOperator{}
err := c.Next(t.BuildRequest4Restart).
Next(t.CheckOutput4Restart).
Next(t.CheckModel).
Next(t.CheckDatasetExists).
Next(t.CheckParam).
Next(t.CheckMulti).
Next(t.LoadSpec).
Next(t.CheckPointBalance).
Next(t.BuildContainerData).
Next(t.CallRestartAPI).
Next(t.CreateCloudbrainRecord4Restart).
Next(t.NotifyCreation).
Operate(ctx)
if err != nil {
log.Error("CloudbrainOneNotebookTask CreateNoteBook err.req=%+v err=%v", req, err)
ctx.Response = &ai_task_entity.CreationResponse{
Error: err,
}
return nil
} else {
ctx.Response = &ai_task_entity.CreationResponse{
JobID: res.JobID,
Status: res.Status,
CreateTime: createTime,
log.Error("Restart GrampusNoteBookTask err.%v", err)
return nil, err
}
if err != nil {
log.Error("Restart GrampusNoteBookTask err.%v", err)
return nil, err
}
return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID, Status: ctx.NewCloudbrain.Status}, nil

return nil
}

func (g CloudbrainOneNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationContext) *response.BizError {
c, err := cluster.GetCluster(ai_task_entity.OpenICloudbrainOne)
func (c CloudbrainOneNotebookTaskTemplate) BuildContainerData(ctx *context.CreationContext) *response.BizError {
err := container_builder.BuildContainerDataChain(c.GetConfig(entity.GetAITaskConfigOpts{
ComputeSource: ctx.Request.ComputeSource.Name,
IsFileNoteBookRequest: ctx.Request.IsFileNoteBookRequest,
}).ContainerSteps).Run(ctx)
if err != nil {
return err
}
return nil
}

func (g CloudbrainOneNotebookTaskTemplate) CallCreationAPI(ctx *context.CreationContext) *response.BizError {
c := g.GetMyCluster()
if c == nil {
return response.SYSTEM_ERROR
}
form := ctx.Request
req := ai_task_entity.CreateNoteBookTaskRequest{
req := entity.CreateNoteBookTaskRequest{
Name: form.JobName,
Tasks: []ai_task_entity.NoteBookTask{
Tasks: []entity.NoteBookTask{
{
Name: form.JobName,
ResourceSpecId: ctx.Spec.SourceSpecId,
ImageId: form.ImageID,
ImageUrl: strings.TrimSpace(form.ImageUrl),
Datasets: ctx.GetContainerDataArray(ai_task_entity.ContainerDataset),
Code: ctx.GetContainerDataArray(ai_task_entity.ContainerCode),
PreTrainModel: ctx.GetContainerDataArray(ai_task_entity.ContainerPreTrainModel),
Datasets: ctx.GetContainerDataArray(entity.ContainerDataset),
Code: ctx.GetContainerDataArray(entity.ContainerCode),
PreTrainModel: ctx.GetContainerDataArray(entity.ContainerPreTrainModel),
OutPut: ctx.GetContainerDataArray(entity.ContainerOutPutPath),
AutoStopDuration: autoStopDurationMs,
Capacity: setting.Capacity,
CenterID: ctx.Spec.GetAvailableCenterIds(ctx.User.ID, form.JobType),
@@ -129,33 +161,19 @@ func (g CloudbrainOneNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationC
log.Error("CloudbrainOneNotebookTask CreateNoteBook err.req=%+v err=%v", req, err)
return response.NewBizError(err)
}
ctx.Response = &ai_task_entity.CreationResponse{

ctx.Response = &entity.CreationResponse{
JobID: res.JobID,
Status: res.Status,
CreateTime: createTime,
}

return nil
}

func (CloudbrainOneNotebookTaskTemplate) BuildContainerData(ctx *context.CreationContext) *response.BizError {
err := container_builder.NewBuilderChain().
Next(container_builder.ObsCodeBuilder{Opts: container_builder.ContainerBuildOpts{
ContainerPath: "/code",
ReadOnly: false,
}}).
Next(container_builder.DatasetBuilder{Opts: container_builder.ContainerBuildOpts{
ContainerPath: "/dataset",
ReadOnly: true,
}}).
Next(container_builder.PretrainModelBuilder{Opts: container_builder.ContainerBuildOpts{
ContainerPath: "/pretrainmodel",
ReadOnly: false,
}}).
Run(ctx)
if err != nil {
return response.NewBizError(err)
}
return nil
func (g CloudbrainOneNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationContext) *response.BizError {
//云脑一没有再次调试接口,通过使用同样的参数新建接口来模拟
return g.CallCreationAPI(ctx)
}

func (CloudbrainOneNotebookTaskTemplate) NotifyCreation(ctx *context.CreationContext) *response.BizError {


+ 217
- 0
services/ai_task_service/task/cloudbrain_two_notebook_task.go View File

@@ -0,0 +1,217 @@
package task

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/convert"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/notification"
"code.gitea.io/gitea/modules/setting"
api "code.gitea.io/gitea/modules/structs"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/container_builder"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/cloudbrain/resource"
"strconv"
"strings"
)

type CloudbrainTwoNotebookTaskTemplate struct {
DefaultAITaskTemplate
}

func init() {
t := &CloudbrainTwoNotebookTaskTemplate{
DefaultAITaskTemplate: DefaultAITaskTemplate{
ClusterType: entity.OpenICloudbrainTwo,
JobType: models.JobTypeDebug,
},
}
RegisterTask(models.JobTypeDebug, entity.OpenICloudbrainTwo, t)
}

func (t CloudbrainTwoNotebookTaskTemplate) Create(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
c := &CreateOperator{}
err := c.Next(t.CheckParam).
Next(t.CheckMulti).
Next(t.CheckDisplayJobName).
Next(t.LoadSpec).
Next(t.CheckPointBalance).
Next(t.CheckDatasetSize).
Next(t.CheckDatasetExists).
Next(t.CheckBranchExists).
Next(t.InsertCloudbrainRecord4Async).
AsyncNextWithErrFun(t.BuildContainerData, t.CallCreationAPI, t.AfterCallCreationAPI4Async, t.NotifyCreation, t.HandleErr4Async).
Operate(ctx)
if err != nil {
log.Error("create CloudbrainOneNotebookTask err.%v", err)
return nil, err
}
return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID}, nil

}

func (g CloudbrainTwoNotebookTaskTemplate) GetConfig(opts entity.GetAITaskConfigOpts) entity.AITaskConfig {
if opts.IsFileNoteBookRequest {
return entity.AITaskConfig{
ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
entity.ContainerFileNoteBookCode: {},
},
}
}

return entity.AITaskConfig{
DatasetMaxSize: setting.DebugAttachSize * 1000 * 1000 * 1000,
ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
entity.ContainerCode: {
Disable: true,
AcceptStorageType: []entity.StorageType{entity.OBS},
},
entity.ContainerDataset: {
Disable: true,
AcceptStorageType: []entity.StorageType{entity.OBS},
},
entity.ContainerPreTrainModel: {
Disable: true,
AcceptStorageType: []entity.StorageType{entity.OBS},
},
},
}
}

func (t CloudbrainTwoNotebookTaskTemplate) Restart(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
c := &CreateOperator{}
err := c.Next(t.BuildRequest4Restart).
Next(t.CheckOutput4Restart).
Next(t.CheckModel).
Next(t.CheckDatasetExists).
Next(t.CheckIsCleared).
Next(t.CheckParam).
Next(t.CheckMulti).
Next(t.LoadSpec).
Next(t.CheckPointBalance).
Next(t.CallRestartAPI).
Next(t.CreateCloudbrainRecord4Restart).
Next(t.NotifyCreation).
Operate(ctx)
if err != nil {
log.Error("Restart GrampusNoteBookTask err.%v", err)
return nil, err
}
if err != nil {
log.Error("Restart GrampusNoteBookTask err.%v", err)
return nil, err
}
return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID, Status: ctx.NewCloudbrain.Status}, nil

}

func (g CloudbrainTwoNotebookTaskTemplate) CallCreationAPI(ctx *context.CreationContext) *response.BizError {
c := g.GetMyCluster()
if c == nil {
return response.SYSTEM_ERROR
}
form := ctx.Request
req := entity.CreateNoteBookTaskRequest{
Name: form.JobName,
Description: form.Description,
Tasks: []entity.NoteBookTask{
{
Name: form.JobName,
ResourceSpecId: ctx.Spec.SourceSpecId,
ImageId: form.ImageID,
ImageUrl: strings.TrimSpace(form.ImageUrl),
AutoStopDuration: autoStopDurationMs,
Spec: ctx.Spec,
},
},
}
createTime := timeutil.TimeStampNow()
res, err := c.CreateNoteBook(req)
if err != nil {
log.Error("CloudbrainTwoNotebookTaskTemplate CreateNoteBook err.req=%+v err=%v", req, err)
return response.NewBizError(err)
}
ctx.Response = &entity.CreationResponse{
JobID: res.JobID,
Status: res.Status,
CreateTime: createTime,
}

return nil
}

// CallRestartAPI asks the cluster backend to restart the source notebook
// job and records the resulting job id and status in ctx.Response. An
// empty job id in the backend reply is treated as a failed restart.
func (g CloudbrainTwoNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationContext) *response.BizError {
	cluster := g.GetMyCluster()
	if cluster == nil {
		log.Error("Get cluster failed")
		return response.SYSTEM_ERROR
	}
	// Capture the timestamp before the remote call, as the original did.
	submittedAt := timeutil.TimeStampNow()
	sourceJobID := ctx.SourceCloudbrain.JobID
	res, err := cluster.RestartNoteBook(sourceJobID)
	if err != nil {
		log.Error("CloudbrainTwoNotebookTaskTemplate RestartNoteBook err.Cloudbrain.JobID=%s err=%v", sourceJobID, err)
		return response.NewBizError(err)
	}
	if res.JobId == "" {
		log.Error("CloudbrainTwoNotebookTaskTemplate RestartNoteBook failed.Cloudbrain.JobID=%s", sourceJobID)
		return response.RESTART_FAILED
	}
	ctx.Response = &entity.CreationResponse{
		JobID:      res.JobId,
		Status:     res.Status,
		CreateTime: submittedAt,
	}
	return nil
}

// BuildContainerData runs the container-data build chain configured for
// this task type (the ContainerSteps of its config) against the creation
// context, propagating any step error to the caller.
func (c CloudbrainTwoNotebookTaskTemplate) BuildContainerData(ctx *context.CreationContext) *response.BizError {
	opts := entity.GetAITaskConfigOpts{
		ComputeSource:         ctx.Request.ComputeSource.Name,
		IsFileNoteBookRequest: ctx.Request.IsFileNoteBookRequest,
	}
	steps := c.GetConfig(opts).ContainerSteps
	if err := container_builder.BuildContainerDataChain(steps).Run(ctx); err != nil {
		return err
	}
	return nil
}

// NotifyCreation looks up the cloudbrain record created for the new job
// (by the job id stored in ctx.Response) and emits the "create debug NPU
// task" repository notification for it.
func (CloudbrainTwoNotebookTaskTemplate) NotifyCreation(ctx *context.CreationContext) *response.BizError {
	req := ctx.Request
	jobID := ctx.Response.JobID
	task, err := models.GetCloudbrainByJobID(jobID)
	if err != nil {
		// Pass the error value to %v directly instead of err.Error().
		log.Error("GetCloudbrainByJobID failed: %v", err)
		return response.NewBizError(err)
	}

	stringId := strconv.FormatInt(task.ID, 10)
	notification.NotifyOtherTask(ctx.User, ctx.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugNPUTask)
	return nil
}

// GetSpecs returns the resource specifications available to the given
// user for the given compute source on this cluster, converted to the
// API presentation type. A lookup failure maps to SPEC_NOT_AVAILABLE.
func (g CloudbrainTwoNotebookTaskTemplate) GetSpecs(userId int64, computeSource models.ComputeSource) ([]*api.SpecificationShow, *response.BizError) {
	// The Chengdu AI center takes over when ModelArts-CD is enabled.
	aiCenterCode := models.AICenterOfCloudBrainTwo
	if setting.ModelartsCD.Enabled {
		aiCenterCode = models.AICenterOfChengdu
	}
	specs, err := resource.FindAvailableSpecs(userId, models.FindSpecsOptions{
		JobType:         g.JobType,
		ComputeResource: computeSource.Name,
		Cluster:         g.ClusterType.GetParentCluster(),
		AiCenterCode:    aiCenterCode,
	})
	if err != nil {
		log.Error("GetSpecs err.%v", err)
		return nil, response.SPEC_NOT_AVAILABLE
	}
	shows := make([]*api.SpecificationShow, 0, len(specs))
	for _, spec := range specs {
		shows = append(shows, convert.ToSpecification(spec))
	}
	return shows, nil
}

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save