#4357 V20230531

Merged
ychao_1983 merged 396 commits from V20230531 into develop 10 months ago
  1. +15
    -0
      README.md
  2. +36
    -37
      custom/public/css/git.openi.css
  3. +208
    -0
      entity/ai_task.go
  4. +35
    -0
      entity/ai_task_config.go
  5. +0
    -28
      entity/ai_task_entity/container.go
  6. +0
    -1
      entity/ai_task_entity/creation.go
  7. +0
    -70
      entity/ai_task_entity/task.go
  8. +12
    -0
      entity/ai_task_list.go
  9. +77
    -15
      entity/cluster.go
  10. +45
    -0
      entity/container.go
  11. +6
    -4
      entity/creation.go
  12. +13
    -0
      entity/err_code.go
  13. +3
    -0
      entity/grampus_err_code.go
  14. +1
    -1
      entity/images.go
  15. +35
    -0
      entity/operation_profile.go
  16. +21
    -0
      entity/storage.go
  17. +44
    -0
      entity/user.go
  18. +1527
    -0
      manager/client/cloudbrain_two/resty.go
  19. +233
    -0
      manager/client/cloudbrain_two_cd/resty.go
  20. +65
    -1
      manager/client/grampus/grampus.go
  21. +59
    -22
      models/action.go
  22. +14
    -0
      models/ai_model_manage.go
  23. +1
    -1
      models/attachment.go
  24. +123
    -19
      models/cloudbrain.go
  25. +24
    -0
      models/cloudbrain_spec.go
  26. +17
    -0
      models/cloudbrain_static.go
  27. +22
    -0
      models/file_chunk.go
  28. +30
    -0
      models/ip_location.go
  29. +3
    -2
      models/model_migrate_record.go
  30. +11
    -0
      models/modelarts_deploy.go
  31. +1
    -0
      models/models.go
  32. +2
    -0
      models/repo_watch.go
  33. +1
    -0
      models/task_config.go
  34. +9
    -0
      models/user_login_log.go
  35. +5
    -2
      modules/auth/wechat/cloudbrain.go
  36. +0
    -8
      modules/context/repo.go
  37. +19
    -12
      modules/grampus/grampus.go
  38. +15
    -3
      modules/grampus/resty.go
  39. +44
    -0
      modules/ipinfo/ipinfo.go
  40. +1
    -1
      modules/minio_ext/constants.go
  41. +1
    -1
      modules/modelappservice/modelsevice.go
  42. +31
    -25
      modules/modelarts/resty.go
  43. +63
    -0
      modules/modelarts/wenxinresty.go
  44. +24
    -0
      modules/setting/screen_map.go
  45. +90
    -55
      modules/setting/setting.go
  46. +1
    -0
      modules/structs/cloudbrain.go
  47. +1
    -1
      modules/templates/helper.go
  48. +17
    -5
      options/locale/locale_en-US.ini
  49. +16
    -4
      options/locale/locale_zh-CN.ini
  50. +48
    -16
      package-lock.json
  51. +1
    -1
      package.json
  52. +36
    -11
      public/home/home.js
  53. +2
    -0
      public/home/search.js
  54. +122
    -72
      routers/ai_task/ai_task.go
  55. +1
    -0
      routers/ai_task/notebook.go
  56. +75
    -13
      routers/api/v1/api.go
  57. +19
    -2
      routers/api/v1/finetune/panguervice.go
  58. +53
    -2
      routers/api/v1/repo/attachments.go
  59. +175
    -0
      routers/api/v1/repo/cloudbrain.go
  60. +109
    -0
      routers/api/v1/repo/cloudbrain_dashboard.go
  61. +13
    -0
      routers/api/v1/repo/datasets.go
  62. +15
    -4
      routers/api/v1/repo/modelarts.go
  63. +10
    -0
      routers/api/v1/repo/modelmanage.go
  64. +7
    -0
      routers/home.go
  65. +1
    -0
      routers/private/internal.go
  66. +17
    -0
      routers/private/setting.go
  67. +65
    -33
      routers/repo/attachment.go
  68. +10
    -10
      routers/repo/attachment_model.go
  69. +80
    -21
      routers/repo/cloudbrain.go
  70. +1
    -0
      routers/repo/cloudbrain_statistic.go
  71. +5
    -2
      routers/repo/dataset.go
  72. +146
    -0
      routers/repo/flow_control.go
  73. +214
    -115
      routers/repo/grampus.go
  74. +30
    -0
      routers/repo/grampus_onlineinfer.go
  75. +85
    -51
      routers/repo/modelarts.go
  76. +12
    -12
      routers/repo/setting.go
  77. +5
    -1
      routers/response/api_response.go
  78. +26
    -1
      routers/response/error.go
  79. +10
    -1
      routers/response/response_list.go
  80. +10
    -1
      routers/routes/routes.go
  81. +1
    -0
      routers/user/home.go
  82. +167
    -37
      services/ai_task_service/cluster/c2net.go
  83. +91
    -24
      services/ai_task_service/cluster/cloudbrain_one.go
  84. +297
    -0
      services/ai_task_service/cluster/cloudbrain_two.go
  85. +18
    -14
      services/ai_task_service/cluster/cluster_base.go
  86. +81
    -0
      services/ai_task_service/container_builder/code_builder.go
  87. +95
    -0
      services/ai_task_service/container_builder/common.go
  88. +40
    -12
      services/ai_task_service/container_builder/container_builder.go
  89. +7
    -1
      services/ai_task_service/container_builder/container_builder_chan.go
  90. +55
    -32
      services/ai_task_service/container_builder/dataset_builder.go
  91. +47
    -0
      services/ai_task_service/container_builder/file_notebook_code_builder.go
  92. +0
    -59
      services/ai_task_service/container_builder/minio_code_builder.go
  93. +0
    -18
      services/ai_task_service/container_builder/obs_code_builder.go
  94. +40
    -6
      services/ai_task_service/container_builder/output_path_builder.go
  95. +0
    -59
      services/ai_task_service/container_builder/output_readme_builder.go
  96. +120
    -36
      services/ai_task_service/container_builder/pre_model_builder.go
  97. +12
    -12
      services/ai_task_service/context/context.go
  98. +52
    -8
      services/ai_task_service/schedule/model_schedule.go
  99. +95
    -77
      services/ai_task_service/task/cloudbrain_one_notebook_task.go
  100. +217
    -0
      services/ai_task_service/task/cloudbrain_two_notebook_task.go

+ 15
- 0
README.md View File

@@ -172,6 +172,21 @@
> [attachment]
> PATH = /data/gitea/attachments
>
> ENABLED = true
> MAX_SIZE = 1048576
> ALLOWED_TYPES = */*
> MAX_FILES = 10
> STORE_TYPE = minio
> MINIO_ENDPOINT =
>
> MINIO_ACCESS_KEY_ID =
> MINIO_SECRET_ACCESS_KEY =
> MINIO_BUCKET =
> MINIO_LOCATION =
> MINIO_BASE_PATH = attachment/
> MINIO_USE_SSL = true
> MINIO_REAL_PATH =
>
> [log]
> MODE = file
> LEVEL = info


+ 36
- 37
custom/public/css/git.openi.css View File

@@ -419,41 +419,40 @@
@media only screen and (min-width: 1920px) {

}

/* rotation3D */
#app{
position: relative;
width: 800px;
margin: 0 auto;
z-index: 4;
}
.rotation3D-baseMap{
position: absolute; left: 0; right: 0; top: 104px; margin: auto;
width: 800px; height: 516px;
background: url("../rotation3D/img/baseMap.png") no-repeat;
background-size: cover;
}
.rotation3D-baseMap::before{
position: absolute;
margin: auto; z-index: 99;
left:50%; top: -150px;
transform:translate(-50%,0);
width: 342px; height: 470px; display: block; content: '';
background: url("../rotation3D/img/baseLogo.svg");
/*animation: 10s bounceUpDown infinite;*/
}
.rotation3D-baseMap::after{
position: absolute;
margin: auto; z-index: 100;
left:50%; top:0;
transform:translate(-50%,0);
width: 110px; height: 86px; display: block; content: '';
background: url("../rotation3D/img/brain.svg");
animation: 6s bounceUpDown infinite;
mix-blend-mode: color-dodge;
}
@keyframes bounceUpDown{
0% {transform: translate(-50%, 0px);}
50% {transform: translate(-50%, -15px);}
100% {transform: translate(-50%, 0px);}
/* rotation3D */
#app{
position: relative;
width: 800px;
margin: 0 auto;
z-index: 4;
}
.rotation3D-baseMap{
position: absolute; left: 0; right: 0; top: 104px; margin: auto;
width: 800px; height: 516px;
background: url("../rotation3D/img/baseMap.png") no-repeat;
background-size: cover;
}
.rotation3D-baseMap::before{
position: absolute;
margin: auto; z-index: 99;
left:50%; top: -150px;
transform:translate(-50%,0);
width: 342px; height: 470px; display: block; content: '';
background: url("../rotation3D/img/baseLogo.svg");
/*animation: 10s bounceUpDown infinite;*/
}
.rotation3D-baseMap::after{
position: absolute;
margin: auto; z-index: 100;
left:50%; top:0;
transform:translate(-50%,0);
width: 110px; height: 86px; display: block; content: '';
background: url("../rotation3D/img/brain.svg");
animation: 6s bounceUpDown infinite;
mix-blend-mode: color-dodge;
}
@keyframes bounceUpDown{
0% {transform: translate(-50%, 0px);}
50% {transform: translate(-50%, -15px);}
100% {transform: translate(-50%, 0px);}
}

+ 208
- 0
entity/ai_task.go View File

@@ -0,0 +1,208 @@
package entity

import (
"strings"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/structs"
"code.gitea.io/gitea/modules/timeutil"
)

// TODO: for now this keeps the attribute definitions carried over from the
// previous per-cluster cloudbrain request types.

// CreateReq is the request payload for creating an AI task. The json-tagged
// fields are bound from the client request (binding:"Required" marks the
// mandatory ones); the untagged fields at the end are populated server-side
// while the request is processed.
type CreateReq struct {
JobType models.JobType `json:"job_type" binding:"Required"`
DisplayJobName string `json:"display_job_name" binding:"Required"`
JobName string `json:"job_name"`
SpecId int64 `json:"spec_id" binding:"Required"`
ComputeSourceStr string `json:"compute_source" binding:"Required"`
Cluster ClusterType `json:"cluster" binding:"Required"`
WorkServerNumber int `json:"work_server_number"`
BranchName string `json:"branch_name"`
PreTrainModelUrl string `json:"pretrain_model_url"`
PretrainModelCkptName string `json:"pretrain_model_ckpt_name"`
ImageUrl string `json:"image_url"`
ImageID string `json:"image_id"`
ImageName string `json:"image_name"`
PretrainModelName string `json:"pretrain_model_name"`
PretrainModelVersion string `json:"pretrain_model_version"`
PretrainModelId string `json:"pretrain_model_id"`
Description string `json:"description"`
LabelName string `json:"label_names"`
DatasetUUIDStr string `json:"dataset_uuid_str"`
Params string `json:"run_para_list"`
BootFile string `json:"boot_file"`
// Fields below are filled in internally, not bound from JSON.
ParamArray models.Parameters
ComputeSource *models.ComputeSource
ReqCommitID string
IsFileNoteBookRequest bool
FileRepository *models.Repository
FileBranchName string
IsRestartRequest bool
DatasetNames string
}

// CreationResponse is the normalized result of submitting a task to a
// cluster backend.
type CreationResponse struct {
Error error
JobID string
Status string //todo consider unifying status representations across clusters
CreateTime timeutil.TimeStamp
}

type QueryAITaskRes struct {
Task *AITaskDetailInfo `json:"task"`
EarlyVersionList []*AITaskDetailInfo `json:"early_version_list"`
CanCreateVersion bool `json:"can_create_version"`
}

type AITaskDetailInfo struct {
ID int64 `json:"id"`
JobID string `json:"job_id"`
Status string `json:"status"`
JobType string `json:"job_type"`
Cluster string `json:"cluster"`
DisplayJobName string `json:"display_job_name"`
FormattedDuration string `json:"formatted_duration"`
ComputeSource string `json:"compute_source"`
AICenter string `json:"ai_center"`
BootFile string `json:"boot_file"`
PreVersionName string `json:"pre_version_name"`
CurrentVersionName string `json:"current_version_name"`
WorkServerNumber int `json:"work_server_number"`
Spec *structs.SpecificationShow `json:"spec"`
DatasetList []*models.DatasetDownload `json:"dataset_list"`
PretrainModelList []*models.ModelDownload `json:"pretrain_model_list"`
Parameters *models.Parameters `json:"parameters"`
CreatedUnix timeutil.TimeStamp `json:"created_unix"`
CodePath string `json:"code_path"`
DatasetPath string `json:"dataset_path"`
PretrainModelPath string `json:"pretrain_model_path"`
OutputPath string `json:"output_path"`
CodeUrl string `json:"code_url"`
PretrainModelName string `json:"pretrain_model_name"`
PretrainModelVersion string `json:"pretrain_model_version"`
PretrainCkptName string `json:"pretrain_model_ckpt_name"`
StartTime timeutil.TimeStamp `json:"start_time"`
EndTime timeutil.TimeStamp `json:"end_time"`
Description string `json:"description"`
CommitID string `json:"commit_id"`
BranchName string `json:"branch_name"`
ImageUrl string `json:"image_url"`
ImageID string `json:"image_id"`
ImageName string `json:"image_name"`
CreatorName string `json:"creator_name"`
EngineName string `json:"engine_name"`
}

// Tr localizes display fields for the given language; currently only the
// AI-center name is translated (via getAiCenterShow).
func (a *AITaskDetailInfo) Tr(language string) {
a.AICenter = getAiCenterShow(a.AICenter, language)
}

// RemoveDatasets empties the dataset list in place.
func (a *AITaskDetailInfo) RemoveDatasets() {
a.DatasetList = []*models.DatasetDownload{}
}
// RemovePretrainModelList empties the pre-train model list in place.
func (a *AITaskDetailInfo) RemovePretrainModelList() {
a.PretrainModelList = []*models.ModelDownload{}
}

// getAiCenterShow resolves the display name of an AI-center identifier of
// the form "<centerID>+<centerName>". When a mapping for the center ID
// exists in setting.C2NetMapInfo, the localized content is returned
// (Chinese for the default language, English otherwise); when the map is
// absent or has no entry, the raw center name is used. An identifier that
// is not in the two-part form yields "".
func getAiCenterShow(aiCenter string, language string) string {
	parts := strings.Split(aiCenter, "+")
	if len(parts) != 2 {
		return ""
	}
	if setting.C2NetMapInfo == nil {
		return parts[1]
	}
	info, ok := setting.C2NetMapInfo[parts[0]]
	if !ok {
		return parts[1]
	}
	if language == defaultLanguage {
		return info.Content
	}
	return info.ContentEN
}

var defaultLanguage = "zh-CN"

type CreateTaskRes struct {
ID int64 `json:"id"`
Status string `json:"status"`
}

type GetAITaskCreationInfoReq struct {
User *models.User
JobType models.JobType
ClusterType ClusterType
ComputeSource *models.ComputeSource
Repo *models.Repository
GitRepo *git.Repository
IsOnlineType bool
}

type AITaskBriefInfo struct {
ID int64 `json:"id"`
JobType string `json:"job_type"`
Status string `json:"status"`
DisplayJobName string `json:"display_job_name"`
CreatedUnix timeutil.TimeStamp `json:"created_unix"`
StartTime timeutil.TimeStamp `json:"start_time"`
EndTime timeutil.TimeStamp `json:"end_time"`
FormattedDuration string `json:"formatted_duration"`
Cluster string `json:"cluster"`
ComputeSource string `json:"compute_source"`
AICenter string `json:"ai_center"`
IsFileNotebook bool `json:"is_file_notebook"`
}

func (a *AITaskBriefInfo) Tr(language string) {
a.AICenter = getAiCenterShow(a.AICenter, language)
}

type AITaskListRes struct {
Tasks []*AITaskInfo4List `json:"tasks"`
Total int64 `json:"total"`
PageSize int `json:"page_size"`
Page int `json:"page"`
CanCreateTask bool `json:"can_create_task"`
}
type AITaskInfo4List struct {
Task *AITaskBriefInfo `json:"task"`
Creator UserBriefInfo `json:"creator"`
CanModify bool `json:"can_modify"`
CanDelete bool `json:"can_delete"`
}

// ConvertCloudbrainToAITaskBriefInfo maps a models.Cloudbrain record to the
// brief representation used in task lists. The compute-source display name
// is resolved via models.GetComputeSourceInstance and left empty when the
// resource is unknown; the cluster is derived from the cloudbrain type and
// collapsed to its parent cluster.
func ConvertCloudbrainToAITaskBriefInfo(task *models.Cloudbrain) *AITaskBriefInfo {
computeSource := ""
c := models.GetComputeSourceInstance(task.ComputeResource)
if c != nil {
computeSource = c.Name
}
return &AITaskBriefInfo{
ID: task.ID,
JobType: task.JobType,
Status: task.Status,
DisplayJobName: task.DisplayJobName,
CreatedUnix: task.CreatedUnix,
// NOTE(review): TrainJobDuration appears to already be a formatted
// string — it is mapped as-is; confirm against the model definition.
FormattedDuration: task.TrainJobDuration,
Cluster: GetClusterTypeFromCloudbrainType(task.Type).GetParentCluster(),
ComputeSource: computeSource,
StartTime: task.StartTime,
EndTime: task.EndTime,
AICenter: task.AiCenter,
IsFileNotebook: task.IsFileNoteBookTask(),
}
}

type NotebookDataset struct {
DatasetUrl string `json:"dataset_url"`
}

+ 35
- 0
entity/ai_task_config.go View File

@@ -0,0 +1,35 @@
package entity

type AITaskConfig struct {
ContainerSteps map[ContainerDataType]*ContainerBuildOpts `json:"container_configs"`
DatasetMaxSize int
}

type ContainerConfig struct {
Enable bool
ContainerPath string
ReadOnly bool
AcceptStorageType []StorageType
}

type GetAITaskConfigOpts struct {
ComputeSource string
IsFileNoteBookRequest bool
}

// GetContainerConfig returns the build options configured for the given
// container data type. It returns nil when no step map is configured or
// when the type has no entry (a missing map key yields the nil zero value).
func (c *AITaskConfig) GetContainerConfig(containerDataType ContainerDataType) *ContainerBuildOpts {
	if c.ContainerSteps == nil {
		return nil
	}
	return c.ContainerSteps[containerDataType]
}
// GetContainerPath returns the in-container mount path configured for the
// given container data type, or "" when no configuration exists for it.
func (c *AITaskConfig) GetContainerPath(containerDataType ContainerDataType) string {
	if cfg := c.GetContainerConfig(containerDataType); cfg != nil {
		return cfg.ContainerPath
	}
	return ""
}

+ 0
- 28
entity/ai_task_entity/container.go View File

@@ -1,28 +0,0 @@
package ai_task_entity

type TaskData struct {
Code ContainerData
Dataset []ContainerData
PreTrainModel ContainerData
OutPutPath ContainerData
}

type ContainerData struct {
Name string `json:"name"`
Bucket string `json:"bucket"`
EndPoint string `json:"endPoint"`
ObjectKey string `json:"objectKey"`
ContainerPath string `json:"containerPath"`
RealPath string `json:"realPath"`
ReadOnly bool `json:"readOnly"`
}

type ContainerDataType string

const (
ContainerCode ContainerDataType = "code"
ContainerDataset ContainerDataType = "dataset"
ContainerPreTrainModel ContainerDataType = "pre_train_model"
ContainerOutPutPath ContainerDataType = "output"
ContainerCloudbrainOneOutPutReadMe ContainerDataType = "cloudbrain_one_readme"
)

+ 0
- 1
entity/ai_task_entity/creation.go View File

@@ -1 +0,0 @@
package ai_task_entity

+ 0
- 70
entity/ai_task_entity/task.go View File

@@ -1,70 +0,0 @@
package ai_task_entity

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/structs"
"code.gitea.io/gitea/modules/timeutil"
)

//todo 暂时保留之前各种云脑属性的定义
type CreateReq struct {
JobType models.JobType `json:"job_type" binding:"Required"`
DisplayJobName string `json:"display_job_name" binding:"Required"`
JobName string `json:"job_name"`
SpecId int64 `json:"spec_id" binding:"Required"`
ComputeSourceStr string `json:"compute_source" binding:"Required"`
Cluster ClusterType `json:"cluster" binding:"Required"`
WorkServerNumber int `json:"work_server_number"`
BranchName string `json:"branch_name"`
PreTrainModelUrl string `json:"pre_train_model_url"`
CkptName string `json:"ckpt_name"`
ImageUrl string `json:"image_url"`
ImageID string `json:"image_id"`
ImageName string `json:"image_name"`
ModelName string `json:"model_name"`
ModelVersion string `json:"model_version"`
ModelId string `json:"model_id"`
Description string `json:"description"`
LabelName string `json:"label_names"`
DatasetUUIDStr string `json:"dataset_uuid_str"`
Params string `json:"run_para_list"`
BootFile string `json:"boot_file"`
ParamArray models.Parameters
ComputeSource *models.ComputeSource
}

type CreationResponse struct {
Error error
JobID string
Status string //todo 考虑统一状态
CreateTime timeutil.TimeStamp
}

type QueryTaskInfo struct {
ID int64 `json:"id"`
JobID string `json:"job_id"`
Status string `json:"status"`
JobType string `json:"job_type"`
Cluster string `json:"cluster"`
DisplayJobName string `json:"display_job_name"`
Duration string `json:"duration"`
ComputeSource string `json:"compute_source"`
AiCenter string `json:"ai_center"`
WorkServerNumber int `json:"work_server_number"`
Spec *structs.SpecificationShow `json:"spec"`
DatasetList []*models.DatasetDownload `json:"dataset_list"`
}

type CreateTaskRes struct {
ID int64 `json:"id"`
}

type GetAITaskCreationInfoReq struct {
User *models.User
JobType models.JobType
ClusterType ClusterType
ComputeSource *models.ComputeSource
Repo *models.Repository
GitRepo *git.Repository
}

+ 12
- 0
entity/ai_task_list.go View File

@@ -0,0 +1,12 @@
package entity

import "code.gitea.io/gitea/models"

type GetTaskListReq struct {
models.ListOptions
ComputeSource *models.ComputeSource
JobTypes []string
RepoID int64
Operator *models.User
IsRepoOwner bool
}

entity/ai_task_entity/cluster.go → entity/cluster.go View File

@@ -1,26 +1,37 @@
package ai_task_entity
package entity

import (
"encoding/json"
"fmt"
"strconv"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/timeutil"
)

type CreateNoteBookTaskRequest struct {
Name string
Tasks []NoteBookTask
Name string
Description string
Tasks []NoteBookTask
PrimitiveDatasetName string
RepoName string
}

type NoteBookTask struct {
AutoStopDuration int
AutoStopDuration int64
Name string
Capacity int
CenterID []string
Code []ContainerData
Datasets []ContainerData
PreTrainModel []ContainerData
OutPut []ContainerData
ImageId string
ImageUrl string
ResourceSpecId string
BootFile string
Spec *models.Specification
}

@@ -56,6 +67,10 @@ type QueryTaskResponse struct {
Token string `json:"token"`
CenterId string `json:"center_id"`
CenterName string `json:"center_name"`
CodeUrl string `json:"code_url"`
DataUrl string `json:"data_url"`
ContainerIP string `json:"container_ip"`
ContainerID string `json:"container_id"`
}

func ConvertGrampusNotebookResponse(job models.GrampusNotebookInfo) *QueryTaskResponse {
@@ -71,10 +86,13 @@ func ConvertGrampusNotebookResponse(job models.GrampusNotebookInfo) *QueryTaskRe
if len(task.CenterName) > 0 {
centerName = task.CenterName[0]
}
var url, token string
var url, token, codeUrl, dataUrl string
if len(job.Tasks) > 0 {
url = job.Tasks[0].Url
token = job.Tasks[0].Token
t := job.Tasks[0]
url = t.Url
token = t.Token
codeUrl = t.CodeUrl
dataUrl = t.DataUrl
}
return &QueryTaskResponse{
StartedAt: timeutil.TimeStamp(job.StartedAt),
@@ -85,6 +103,8 @@ func ConvertGrampusNotebookResponse(job models.GrampusNotebookInfo) *QueryTaskRe
Url: url,
Token: token,
JobId: job.JobID,
CodeUrl: codeUrl,
DataUrl: dataUrl,
}
}
func ConvertGrampusTrainResponse(job models.GrampusJobInfo) *QueryTaskResponse {
@@ -122,16 +142,56 @@ func ConvertCloudbrainOneQueryNotebookByNameResponse(result models.JobResultInLi
}
}

func ConvertCloudbrainOneNotebookResponse(result models.JobResultPayload) *QueryTaskResponse {
if result.JobStatus.State == "" {
return nil
func ConvertCloudbrainOneNotebookResponse(input map[string]interface{}) (*QueryTaskResponse, error) {
data, _ := json.Marshal(input)
var jobResultPayload models.JobResultPayload
err := json.Unmarshal(data, &jobResultPayload)
if err != nil {
log.Error("parse cloudbrain one result err,result=%+v err=%v", input, err)
return nil, err
}
return &QueryTaskResponse{
StartedAt: timeutil.TimeStamp(result.JobStatus.CreatedTime / 1000),
CompletedAt: timeutil.TimeStamp(result.JobStatus.CompletedTime / 1000),
Status: result.JobStatus.State,
JobId: result.ID,
if jobResultPayload.JobStatus.State == "" {
return nil, nil
}

startTime := jobResultPayload.JobStatus.AppLaunchedTime / 1000
var endTime int64
switch jobResultPayload.JobStatus.AppCompletedTime.(type) {
case float64:
f := jobResultPayload.JobStatus.AppCompletedTime.(float64)
s := fmt.Sprintf("%.0f", f)
i, err := strconv.ParseInt(s, 10, 64)
if err == nil {
endTime = i / 1000
}
}

if jobResultPayload.JobStatus.State == string(models.JobWaiting) {
startTime = 0
endTime = 0
}
var containerIP, containerID string
taskRoles := jobResultPayload.TaskRoles
if taskRoles != nil && len(taskRoles) > 0 {
subTask := taskRoles[cloudbrain.SubTaskName]
if subTask != nil {
taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
if taskRes.TaskStatuses != nil && len(taskRes.TaskStatuses) > 0 {
containerIP = taskRes.TaskStatuses[0].ContainerIP
containerID = taskRes.TaskStatuses[0].ContainerID
}
}
}

res := &QueryTaskResponse{
StartedAt: timeutil.TimeStamp(startTime),
CompletedAt: timeutil.TimeStamp(endTime),
Status: jobResultPayload.JobStatus.State,
JobId: jobResultPayload.ID,
ContainerIP: containerIP,
ContainerID: containerID,
}
return res, nil
}

type ClusterLog struct {
@@ -205,6 +265,8 @@ func GetClusterTypeFromCloudbrainType(t int) ClusterType {
return OpenICloudbrainTwo
case models.TypeC2Net:
return C2Net
case models.TypeCDCenter:
return OpenICloudbrainTwo
}
return ""
}

+ 45
- 0
entity/container.go View File

@@ -0,0 +1,45 @@
package entity

type TaskData struct {
Code ContainerData
Dataset []ContainerData
PreTrainModel ContainerData
OutPutPath ContainerData
}

type ContainerData struct {
Name string `json:"name"`
Bucket string `json:"bucket"`
EndPoint string `json:"endPoint"`
ObjectKey string `json:"objectKey"`
ContainerPath string `json:"containerPath"`
RealPath string `json:"realPath"`
ReadOnly bool `json:"readOnly"`
}

type ContainerDataType string

const (
ContainerCode ContainerDataType = "code"
ContainerDataset ContainerDataType = "dataset"
ContainerPreTrainModel ContainerDataType = "pre_train_model"
ContainerOutPutPath ContainerDataType = "output"
ContainerFileNoteBookCode ContainerDataType = "file_note_book_code"
)

type ContainerBuildOpts struct {
Disable bool
ContainerPath string
ReadOnly bool
AcceptStorageType []StorageType
NotArchive bool
}

// IsStorageTypeIn reports whether storageType is among the storage types
// this container step accepts.
func (opts ContainerBuildOpts) IsStorageTypeIn(storageType StorageType) bool {
	want := string(storageType)
	for _, accepted := range opts.AcceptStorageType {
		if string(accepted) == want {
			return true
		}
	}
	return false
}

entity/ai_task_entity/task_list.go → entity/creation.go View File

@@ -1,13 +1,10 @@
package ai_task_entity
package entity

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/structs"
)

type TaskBriefInfo struct {
}

type CreationRequiredInfo struct {
//排队信息、代码分支信息、查询是否有正在运行的任务、查询镜像列表、查询资源规格(积分余额,开关)
Specs []*structs.SpecificationShow `json:"specs"`
@@ -20,6 +17,11 @@ type CreationRequiredInfo struct {
DisplayJobName string `json:"display_job_name"`
PointAccount *PointAccountInfo `json:"point_account"`
PaySwitch bool `json:"pay_switch"`
Config AITaskCreationConfig `json:"config"`
}

type AITaskCreationConfig struct {
DatasetMaxSize int `json:"dataset_max_size"`
}

type SpecificationInfo struct {

+ 13
- 0
entity/err_code.go View File

@@ -0,0 +1,13 @@
package entity

import "fmt"

// ErrCode describes a business error: its code value, a human-readable
// message, and the translation key used for localized display.
type ErrCode struct {
	CodeVal    string
	CodeMsg    string
	CodeTrCode string
}

// IsMatch reports whether the given code — of any type — formats to the
// same string as this error's code value.
func (e *ErrCode) IsMatch(code interface{}) bool {
	return e.CodeVal == fmt.Sprintf("%v", code)
}

+ 3
- 0
entity/grampus_err_code.go View File

@@ -0,0 +1,3 @@
package entity

var GrampusJobCanNotRestart = &ErrCode{CodeVal: "5005", CodeMsg: "Job can not restart", CodeTrCode: "ai_task.can_not_restart"}

entity/ai_task_entity/images.go → entity/images.go View File

@@ -1,4 +1,4 @@
package ai_task_entity
package entity

import "code.gitea.io/gitea/models"


+ 35
- 0
entity/operation_profile.go View File

@@ -0,0 +1,35 @@
package entity

type OperationProfile struct {
Events []ProfileEvent `json:"events"`
}

type ProfileEvent struct {
Message string `json:"message"`
Name string `json:"name"`
Reason string `json:"reason"`
Timestamp string `json:"timestamp"`
Action string `json:"action"`
}

type CloudbrainOneAppExitDiagnostics struct {
PodRoleName struct {
Task10 string `json:"task1-0"`
} `json:"podRoleName"`
PodEvents struct {
Task10 []struct {
Uid string `json:"uid"`
Reason string `json:"reason"`
Message string `json:"message"`
ReportingController string `json:"reportingController"`
Action string `json:"action"`
} `json:"task1-0"`
} `json:"podEvents"`
Extras []struct {
Uid string `json:"uid"`
Reason string `json:"reason"`
Message string `json:"message"`
ReportingController string `json:"reportingController"`
Action string `json:"action"`
} `json:"extras"`
}

+ 21
- 0
entity/storage.go View File

@@ -0,0 +1,21 @@
package entity

import "code.gitea.io/gitea/models"

type StorageType string

const (
MINIO StorageType = "MINIO"
OBS StorageType = "OBS"
)

// GetStorageTypeFromCloudbrainType maps a cloudbrain cluster type to the
// object-storage backend it uses: MINIO for cloudbrain one, OBS for
// cloudbrain two, and "" for any other type.
func GetStorageTypeFromCloudbrainType(cloudbrainType int) StorageType {
	switch cloudbrainType {
	case models.TypeCloudBrainOne:
		return MINIO
	case models.TypeCloudBrainTwo:
		return OBS
	default:
		return ""
	}
}

+ 44
- 0
entity/user.go View File

@@ -0,0 +1,44 @@
package entity

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/timeutil"
)

type UserBriefInfo struct {
ID int64 `json:"id"`
LowerName string `json:"lower_name"`
Name string `json:"name"`
FullName string `json:"full_name"`
Email string `json:"email"`
Language string `json:"language"`
Description string `json:"description"`
RelAvatarLink string `json:"rel_avatar_link"`
NumMembers int `json:"num_members"`
CreatedUnix timeutil.TimeStamp `json:"created_unix"`
UpdatedUnix timeutil.TimeStamp `json:"updated_unix"`
}

// ConvertUserToBrief maps a models.User to its public brief representation.
// FullName falls back to Name when unset; the email address is exposed only
// when the user has not marked it private.
func ConvertUserToBrief(u *models.User) *UserBriefInfo {
	fullName := u.Name
	if u.FullName != "" {
		fullName = u.FullName
	}
	uf := &UserBriefInfo{
		ID:          u.ID,
		LowerName:   u.LowerName,
		Name:        u.Name,
		FullName:    fullName,
		Language:    u.Language,
		Description: u.Description,
		CreatedUnix: u.CreatedUnix,
		UpdatedUnix: u.UpdatedUnix,
		NumMembers:  u.NumMembers,
	}
	// BUG FIX: Email was previously populated unconditionally in the struct
	// literal, making this guard a no-op and leaking addresses of users who
	// set KeepEmailPrivate. Only expose the email when the user allows it.
	if !u.KeepEmailPrivate {
		uf.Email = u.Email
	}
	uf.RelAvatarLink = u.RelAvatarLink()
	return uf
}

+ 1527
- 0
manager/client/cloudbrain_two/resty.go View File

@@ -0,0 +1,1527 @@
package cloudbrain_two

import (
"crypto/tls"
"encoding/json"
"fmt"
"net/http"
"strconv"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"github.com/go-resty/resty/v2"
)

var (
restyClient *resty.Client
HOST string
TOKEN string
AutoStopDurationMs = 4 * 60 * 60 * 1000
)

const (
methodPassword = "password"

urlGetToken = "/v3/auth/tokens"
urlNotebook = "/demanager/instances"
urlTrainJob = "/training-jobs"
urlResourceSpecs = "/job/resource-specs"
urlTrainJobConfig = "/training-job-configs"
errorCodeExceedLimit = "ModelArts.0118"

//notebook 2.0
urlNotebook2 = "/notebooks"

//error code
modelartsIllegalToken = "ModelArts.6401"
NotebookNotFound = "ModelArts.6404"
NotebookNoPermission = "ModelArts.6407"
NotebookInvalid = "ModelArts.6400"
UnknownErrorPrefix = "UNKNOWN:"

ModelArtsJobInTargetState = "ModelArts.6357"
ModelArtsJobNotExists = "ModelArts.0102"
ModelArtsJobInternalError = "ModelArts.0010"
)

// getRestyClient returns the shared resty HTTP client, creating it on first
// use with TLS certificate verification disabled.
// NOTE(review): this lazy init is unsynchronized; concurrent first calls can
// race on the package-level restyClient — consider sync.Once unless all
// callers are known to be serialized.
func getRestyClient() *resty.Client {
if restyClient == nil {
restyClient = resty.New()
restyClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
}
return restyClient
}

// checkSetting ensures the package-level HOST, TOKEN and resty client are
// initialized, fetching a fresh IAM token when any of them is missing.
// A failed token fetch is only logged; request functions rely on their own
// 401 retry to recover afterwards.
func checkSetting() {
if len(HOST) != 0 && len(TOKEN) != 0 && restyClient != nil {
return
}

err := getToken()
if err != nil {
log.Error("getToken failed:%v", err)
}
}

// getToken requests a fresh IAM token using password authentication and
// stores it in the package-level TOKEN; it also (re)sets HOST from settings.
// On success the IAM service answers 201 Created and delivers the token in
// the X-Subject-Token response header.
func getToken() error {
HOST = setting.ModelArtsHost

client := getRestyClient()
// Password-scoped auth payload per the IAM v3 token API.
params := models.GetTokenParams{
Auth: models.Auth{
Identity: models.Identity{
Methods: []string{methodPassword},
Password: models.Password{
User: models.NotebookUser{
Name: setting.ModelArtsUsername,
Password: setting.ModelArtsPassword,
Domain: models.Domain{
Name: setting.ModelArtsDomain,
},
},
},
},
Scope: models.Scope{
Project: models.Project{
Name: setting.ProjectName,
},
},
},
}

res, err := client.R().
SetHeader("Content-Type", "application/json").
SetBody(params).
Post(setting.IamHost + urlGetToken)
if err != nil {
return fmt.Errorf("resty getToken: %v", err)
}

// Anything other than 201 Created means authentication failed.
if res.StatusCode() != http.StatusCreated {
return fmt.Errorf("getToken failed:%s", res.String())
}

TOKEN = res.Header().Get("X-Subject-Token")

return nil
}

// CreateJob creates a cloudbrain-two notebook instance (v1 demanager API).
// On HTTP 401 it refreshes the IAM token once and resends. A non-empty
// ErrorCode in the response body is surfaced as an error; the quota-exceeded
// code is given a friendlier user-facing message.
func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateNotebookResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook)

	if err != nil {
		return nil, fmt.Errorf("resty create notebook: %v", err)
	}

	// Token expired: refresh once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Decode the body a second time to capture the error envelope, which
	// SetResult does not pick up.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// BUG FIX: error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		if response.ErrorCode == errorCodeExceedLimit {
			response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
		}
		return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetJob queries a cloudbrain-two notebook instance by job ID (v1 demanager
// API). On HTTP 401 the IAM token is refreshed once and the request resent;
// a non-empty ErrorCode in the response body is surfaced as an error.
func GetJob(jobID string) (*models.GetNotebookResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetNotebookResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)

	if err != nil {
		return nil, fmt.Errorf("resty GetJob: %v", err)
	}

	// Token expired: refresh once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Decode the error envelope, which SetResult does not capture.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// BUG FIX: error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("GetJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetNotebook2 queries a notebook instance via the notebook 2.0 API
// (/notebooks). It retries once on HTTP 401 and once more when the body
// carries the ModelArts illegal-token error code, refreshing the IAM token
// each time; any other non-empty ErrorCode is surfaced as an error.
func GetNotebook2(jobID string) (*models.GetNotebook2Result, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetNotebook2Result

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)

	if err != nil {
		// BUG FIX: message previously said "GetJob" (copy-paste).
		return nil, fmt.Errorf("resty GetNotebook2: %v", err)
	}

	// Token expired: refresh once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Decode the error envelope, which SetResult does not capture.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// BUG FIX: error text previously read "son.Unmarshal failed".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("GetNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		// An expired/invalid token can surface as an in-body error code
		// rather than a 401; refresh the token once and resend.
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		return &result, fmt.Errorf("GetNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// ManageNotebook posts an action (e.g. start/stop) for the notebook
// instance identified by jobID.
// On HTTP 401 the auth token is refreshed once and the request retried.
func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookActionResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetBody(param).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/action")

	if err != nil {
		// Fix: message previously said "resty StopJob" (copy-paste).
		return &result, fmt.Errorf("resty ManageNotebook: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Re-parse the body into the generic result type to surface
	// service-level error codes that can arrive with a 200 status.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("ManageNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// ManageNotebook2 triggers an action (param.Action) on a v2 notebook
// instance, passing the configured auto-stop duration as a query param.
// The auth token is refreshed and the request retried once, either on
// HTTP 401 or on a modelartsIllegalToken error code in the body.
func ManageNotebook2(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookActionResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID + "/" + param.Action + "?duration=" + strconv.Itoa(AutoStopDurationMs))

	if err != nil {
		return &result, fmt.Errorf("resty ManageNotebook2: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Re-parse the body into the generic result type to surface
	// service-level error codes that can arrive with a 200 status.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	// A 502 is reported with the UnknownErrorPrefix so callers can tell
	// gateway failures from service-level rejections.
	if res.StatusCode() == http.StatusBadGateway {
		// Fix: message previously said "createNotebook2" (copy-paste).
		return &result, fmt.Errorf(UnknownErrorPrefix+"ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	if len(response.ErrorCode) != 0 {
		log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		// An illegal-token error is retried once after refreshing the token.
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		return &result, fmt.Errorf("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// DelNotebook deletes the notebook instance identified by jobID.
// On HTTP 401 the auth token is refreshed once and the request retried.
func DelNotebook(jobID string) (*models.NotebookDelResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookDelResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)

	if err != nil {
		// Fix: message previously said "resty DelJob" (copy-paste).
		return &result, fmt.Errorf("resty DelNotebook: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Re-parse the body into the generic result type to surface
	// service-level error codes that can arrive with a 200 status.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("DelNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("DelNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// DelNotebook2 deletes a v2 notebook instance.
// The auth token is refreshed and the request retried once, either on
// HTTP 401 or on a modelartsIllegalToken error code in the body.
// "Job not found" / "already in target state" / internal-error responses
// are treated as successful deletion.
func DelNotebook2(jobID string) (*models.NotebookDelResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookDelResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook2 + "/" + jobID)

	if err != nil {
		// Fix: message previously said "resty DelJob" (copy-paste).
		return &result, fmt.Errorf("resty DelNotebook2: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Re-parse the body into the generic result type to surface
	// service-level error codes that can arrive with a 200 status.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		// An illegal-token error is retried once after refreshing the token.
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		if response.ErrorCode == ModelArtsJobNotExists || response.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already deleted; treat as success.
			return &models.NotebookDelResult{}, nil
		}
		// Fix: this previously tested result.ErrorCode, which is never
		// populated here (the parsed error lives in response), so the
		// internal-error tolerance branch was unreachable.
		if response.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.NotebookDelResult{}, nil
		}
		return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// DelJob deletes the notebook instance identified by jobID.
// NOTE(review): this duplicates DelNotebook line-for-line; candidates for
// consolidation — confirm both are still referenced before merging.
// On HTTP 401 the auth token is refreshed once and the request retried.
func DelJob(jobID string) (*models.NotebookDelResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookDelResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID)

	if err != nil {
		return &result, fmt.Errorf("resty DelJob: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Re-parse the body into the generic result type to surface
	// service-level error codes that can arrive with a 200 status.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fix: error text previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("DelJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetJobToken fetches the access token for the notebook instance
// identified by jobID.
// On HTTP 401 the auth token is refreshed once and the request retried.
func GetJobToken(jobID string) (*models.NotebookGetJobTokenResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.NotebookGetJobTokenResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook + "/" + jobID + "/token")

	if err != nil {
		return &result, fmt.Errorf("resty GetJobToken: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Re-parse the body into the generic result type to surface
	// service-level error codes that can arrive with a 200 status.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fix: error text previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(response.ErrorCode) != 0 {
		log.Error("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		return &result, fmt.Errorf("GetJobToken failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// createTrainJobUserImage submits a ModelArts training job that runs a
// user-supplied image.
// On HTTP 401 the auth token is refreshed once and the request retried.
// ModelArts "Invalid OBS path" rejections for the boot file and the
// dataset are translated into user-facing messages.
func createTrainJobUserImage(createJobParams models.CreateUserImageTrainJobParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0
	// Fix: log the request payload once before sending, instead of after
	// the Post inside the retry path (which logged it twice on a 401
	// retry); this also matches createTrainJob.
	req, _ := json.Marshal(createJobParams)
	log.Info("postapi json: %s", req)

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		return nil, fmt.Errorf("resty create train-job: %s", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		bootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		// A 502 carries the UnknownErrorPrefix so callers can identify it.
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
		return &result, fmt.Errorf("createTrainJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("createTrainJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createTrainJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// createTrainJob posts a training-job creation request to ModelArts.
// The auth token is refreshed and the request resent once on HTTP 401.
// ModelArts "Invalid OBS path" rejections for the boot file and the
// dataset are translated into user-facing messages.
func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	payload, _ := json.Marshal(createJobParams)
	log.Info("postapi json: %s", payload)

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetBody(createJobParams).
			SetResult(&result).
			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

		if err != nil {
			return nil, fmt.Errorf("resty create train-job: %s", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			// ModelArts reports a bad boot-file or dataset path with a
			// fixed "Invalid OBS path" message; map those for end users.
			switch apiErr.ErrorMsg {
			case "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'.":
				log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
				return &result, fmt.Errorf("启动文件错误!")
			case "Invalid OBS path '" + createJobParams.Config.DataUrl + "'.":
				log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
				return &result, fmt.Errorf("数据集错误!")
			}
			if res.StatusCode() == http.StatusBadGateway {
				return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJob failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			}
			return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

// createTrainJobVersion creates a new version of an existing ModelArts
// training job identified by jobID.
// The auth token is refreshed and the request resent once on HTTP 401.
// ModelArts "Invalid OBS path" rejections for the boot file and the
// dataset are translated into user-facing messages.
func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetBody(createJobVersionParams).
			SetResult(&result).
			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

		if err != nil {
			return nil, fmt.Errorf("resty create train-job version: %s", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}

			log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			// Map the fixed "Invalid OBS path" messages to user-facing text.
			switch apiErr.ErrorMsg {
			case "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'.":
				log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
				return &result, fmt.Errorf("启动文件错误!")
			case "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'.":
				log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
				return &result, fmt.Errorf("数据集错误!")
			}
			if res.StatusCode() == http.StatusBadGateway {
				return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			}
			return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

// createTrainJobVersionUserImage creates a new version of an existing
// training job that runs a user-supplied image.
// On HTTP 401 the auth token is refreshed once and the request retried.
// ModelArts "Invalid OBS path" rejections for the boot file and the
// dataset are translated into user-facing messages.
func createTrainJobVersionUserImage(createJobVersionParams models.CreateTrainJobVersionUserImageParams, jobID string) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0
	// Fix: log the request payload once before sending, instead of after
	// the Post inside the retry path (which logged it twice on a 401 retry).
	req, _ := json.Marshal(createJobVersionParams)
	log.Info("%s", req)

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobVersionParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

	if err != nil {
		return nil, fmt.Errorf("resty create train-job version: %s", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		bootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'."
		// Fix: log/error text previously named "createTrainJobVersion",
		// masking which function actually failed.
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createTrainJobVersionUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createTrainJobVersionUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		return &result, fmt.Errorf("createTrainJobVersionUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("createTrainJobVersionUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createTrainJobVersionUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// GetResourceSpecs fetches the resource specifications available to the
// configured ModelArts project.
// The auth token is refreshed and the request resent once on HTTP 401.
func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetResourceSpecsResult

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs)

		if err != nil {
			return nil, fmt.Errorf("resty GetResourceSpecs: %v", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

// CreateTrainJobConfig saves a training-job parameter configuration.
// The auth token is refreshed and the request resent once on HTTP 401.
func CreateTrainJobConfig(req models.CreateConfigParams) (*models.CreateTrainJobConfigResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobConfigResult

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetBody(req).
			SetResult(&result).
			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)

		if err != nil {
			return nil, fmt.Errorf("resty CreateTrainJobConfig: %s", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("CreateTrainJobConfig failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("CreateTrainJobConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

// GetConfigList retrieves one page of saved training-job parameter
// configurations, filtered and sorted by the given arguments.
// The auth token is refreshed and the request resent once on HTTP 401.
func GetConfigList(perPage, page int, sortBy, order, searchContent, configType string) (*models.GetConfigListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetConfigListResult

	query := map[string]string{
		"per_page":       strconv.Itoa(perPage),
		"page":           strconv.Itoa(page),
		"sortBy":         sortBy,
		"order":          order,
		"search_content": searchContent,
		"config_type":    configType,
	}

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetQueryParams(query).
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig)

		if err != nil {
			return nil, fmt.Errorf("resty GetConfigList: %v", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetConfigList failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("获取参数配置列表失败(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetConfigList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("获取参数配置列表失败(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

// GetParaConfig fetches a single saved parameter configuration by name
// and type. The result is returned by value.
// The auth token is refreshed and the request resent once on HTTP 401.
func GetParaConfig(configName, configType string) (models.GetConfigResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetConfigResult

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetQueryParams(map[string]string{
				"config_type": configType,
			}).
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig + "/" + configName)

		if err != nil {
			return result, fmt.Errorf("resty GetParaConfig: %v", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetParaConfig failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return result, fmt.Errorf("获取参数配置详情失败(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetParaConfig failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return result, fmt.Errorf("获取参数配置详情失败(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return result, nil
	}
}

// GetTrainJob fetches the details of one version of a training job.
// The auth token is refreshed and the request resent once on HTTP 401.
func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobResult

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)

		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJob: %v", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("获取作业详情失败(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetTrainJob(%s) failed", jobID)
			return &result, fmt.Errorf("获取作业详情失败")
		}

		return &result, nil
	}
}

// GetTrainJobLog reads a window of log lines from one training-job
// version via the aom-log endpoint, paged by baseLine/lines/order.
// The auth token is refreshed and the request resent once on HTTP 401.
func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int) (*models.GetTrainJobLogResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobLogResult

	query := map[string]string{
		"base_line": baseLine,
		"lines":     strconv.Itoa(lines),
		"log_file":  logFile,
		"order":     order,
	}

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetQueryParams(query).
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/aom-log")

		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobLog: %v", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetTrainJobLog failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("获取作业日志失败(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetTrainJobLog(%s) failed", jobID)
			return &result, fmt.Errorf("获取作业日志失败:%s", result.ErrorMsg)
		}

		return &result, nil
	}
}

// GetTrainJobLogFileNames lists the log file names available for one
// training-job version.
// The auth token is refreshed and the request resent once on HTTP 401.
func GetTrainJobLogFileNames(jobID, versionID string) (*models.GetTrainJobLogFileNamesResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobLogFileNamesResult

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/log/file-names")

		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobLogFileNames: %v", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("GetTrainJobLogFileNames failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("GetTrainJobLogFileNames(%s) failed", jobID)
			return &result, fmt.Errorf("获取作业日志文件失败:%s", result.ErrorMsg)
		}

		return &result, nil
	}
}

// DelTrainJob deletes a ModelArts training job.
// On HTTP 401 the auth token is refreshed once and the request retried.
// "Job not found" / "already in target state" / internal-error responses
// are treated as successful deletion.
func DelTrainJob(jobID string) (*models.TrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.TrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID)

	if err != nil {
		return &result, fmt.Errorf("resty DelTrainJob: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		if temp.ErrorCode == ModelArtsJobNotExists || temp.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already deleted; treat as success.
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		// Fix: this previously tested result.ErrorCode, which resty never
		// populates on a non-2xx response (the parsed error lives in temp),
		// so the internal-error tolerance branch was unreachable.
		if temp.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		return &result, fmt.Errorf("删除训练作业失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("DelTrainJob(%s) failed", jobID)
		if result.ErrorCode == ModelArtsJobNotExists || result.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already deleted; treat as success.
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		if result.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		return &result, fmt.Errorf("删除训练作业失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// StopTrainJob asks ModelArts to stop one version of a training job.
// The auth token is refreshed and the request resent once on HTTP 401.
func StopTrainJob(jobID, versionID string) (*models.TrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.TrainJobResult

	for attempt := 0; ; attempt++ {
		res, err := client.R().
			SetAuthToken(TOKEN).
			SetResult(&result).
			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/stop")

		if err != nil {
			return &result, fmt.Errorf("resty StopTrainJob: %v", err)
		}

		// An expired token yields 401; refresh it once and resend.
		if res.StatusCode() == http.StatusUnauthorized && attempt < 1 {
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			var apiErr models.ErrorResult
			if err = json.Unmarshal([]byte(res.String()), &apiErr); err != nil {
				log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
				return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			}
			log.Error("StopTrainJob failed(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
			return &result, fmt.Errorf("停止训练作业失败(%d):%s(%s)", res.StatusCode(), apiErr.ErrorCode, apiErr.ErrorMsg)
		}

		if !result.IsSuccess {
			log.Error("StopTrainJob(%s) failed", jobID)
			return &result, fmt.Errorf("停止训练作业失败:%s", result.ErrorMsg)
		}

		return &result, nil
	}
}

// DelTrainJobVersion deletes one version of a ModelArts training job.
// On HTTP 401 the auth token is refreshed once and the request retried.
// "Job not found" / "already in target state" / internal-error responses
// are treated as successful deletion.
func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.TrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID)

	if err != nil {
		return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}

		if temp.ErrorCode == ModelArtsJobNotExists || temp.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already deleted; treat as success.
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		// Fix: this previously tested result.ErrorCode, which resty never
		// populates on a non-2xx response (the parsed error lives in temp),
		// so the internal-error tolerance branch was unreachable.
		if temp.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.TrainJobResult{IsSuccess: true}, nil
		}
		log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("DelTrainJob(%s) failed", jobID)
		return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// createInferenceJob submits a ModelArts inference job (modeled as a
// training job against the inference config).
// On HTTP 401 the auth token is refreshed once and the request retried.
// ModelArts "Invalid OBS path" rejections for the boot file and the
// dataset are translated into user-facing messages.
func createInferenceJob(createJobParams models.CreateInferenceJobParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0
	// Fix: log the request payload once before sending, instead of after
	// the Post inside the retry path (which logged it twice on a 401 retry).
	req, _ := json.Marshal(createJobParams)
	log.Info("%s", req)

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		return nil, fmt.Errorf("resty create inference-job: %s", err)
	}

	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		bootFileErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobParams.InfConfig.DataUrl + "'."
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		// A 502 carries the UnknownErrorPrefix so callers can identify it.
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
		return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createInferenceJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// createInferenceJobUserImage submits an inference job that runs on a
// user-supplied image by POSTing to the ModelArts train-job endpoint.
// On HTTP 401 the auth token is refreshed once and the request resent.
// Returns the parsed result; on transport failure the result is nil.
func createInferenceJobUserImage(createJobParams models.CreateInfUserImageParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		// Fixed copy-paste: this function creates an inference job, not a
		// train job (message previously read "resty create train-job").
		return nil, fmt.Errorf("resty create inference-job: %s", err)
	}

	// Log the request payload for troubleshooting.
	// NOTE(review): this logs the full job definition; confirm it never
	// carries credentials or other secrets.
	req, _ := json.Marshal(createJobParams)
	log.Info("%s", req)

	// Token may have expired; refresh it once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Map the two known "invalid OBS path" server messages onto
		// user-facing errors for the boot file and the dataset.
		bootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
		dataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
		if temp.ErrorMsg == bootFileErrorMsg {
			log.Error("启动文件错误!createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("启动文件错误!")
		}
		if temp.ErrorMsg == dataSetErrorMsg {
			log.Error("数据集错误!createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
			return &result, fmt.Errorf("数据集错误!")
		}
		// 502 is tagged with UnknownErrorPrefix so callers can classify it.
		if res.StatusCode() == http.StatusBadGateway {
			return &result, fmt.Errorf(UnknownErrorPrefix+"createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		}
		return &result, fmt.Errorf("createInferenceJobUserImage failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("createInferenceJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createInferenceJobUserImage failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// CreateNotebook2 creates a ModelArts notebook instance via the v2 API.
// It retries the POST at most once after refreshing the auth token,
// triggered either by a transport-level HTTP 401 or by the ModelArts
// "illegal token" application error code. Note that `result` is filled
// by resty's SetResult while `response` is a second, manual decode of
// the same body used only for error-envelope inspection.
func CreateNotebook2(createJobParams models.CreateNotebook2Params) (*models.CreateNotebookResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateNotebookResult

	retry := 0

sendjob:
	res, err := client.R().
		SetHeader("Content-Type", "application/json").
		SetAuthToken(TOKEN).
		SetBody(createJobParams).
		SetResult(&result).
		Post(HOST + "/v1/" + setting.ProjectID + urlNotebook2)

	if err != nil {
		return nil, fmt.Errorf("resty create notebook2: %s", err)
	}

	// First retry trigger: the server rejected the token outright.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	// Decode the raw body into the ModelArts error envelope.
	var response models.NotebookResult
	err = json.Unmarshal(res.Body(), &response)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	// 502 is tagged with UnknownErrorPrefix so callers can classify it.
	if res.StatusCode() == http.StatusBadGateway {
		return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	if len(response.ErrorCode) != 0 {
		log.Error("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
		if response.ErrorCode == errorCodeExceedLimit {
			// Replace the server message with a user-facing quota hint.
			response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
		}
		// Second retry trigger: application-level illegal-token code
		// (shares the same single-retry budget as the 401 path above).
		if response.ErrorCode == modelartsIllegalToken && retry < 1 {
			retry++
			_ = getToken()
			goto sendjob
		}
		return &result, fmt.Errorf("createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobMetricStatistic fetches per-metric ("each") resource-usage
// statistics for one pod of a train-job version. On HTTP 401 the auth
// token is refreshed once and the request resent.
func GetTrainJobMetricStatistic(jobID, versionID, podName string) (*models.GetTrainJobMetricStatisticResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobMetricStatisticResult

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID + "/pod/" + podName + "/metric-statistic?statistic_type=each")

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobMetricStatistic: %v", err)
	}

	// Token may have expired; refresh it once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		// Non-200 bodies carry a ModelArts error envelope; surface it.
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		return &result, fmt.Errorf("GetTrainJobMetricStatistic failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobMetricStatistic(%s) failed", jobID)
		return &result, fmt.Errorf("获取任务资源占用情况失败:%s", result.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobList fetches one page of ModelArts train jobs.
// perPage/page control pagination, sortBy/order control sorting, and
// searchContent filters by name. On HTTP 401 the auth token is
// refreshed once and the request resent.
func GetTrainJobList(perPage, page int, sortBy, order, searchContent string) (*models.GetTrainJobListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobListResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"per_page":       strconv.Itoa(perPage),
			"page":           strconv.Itoa(page),
			"sortBy":         sortBy,
			"order":          order,
			"search_content": searchContent,
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobList: %v", err)
	}

	// Token may have expired; refresh it once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJobList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Use "%s": passing ErrorMsg as the format string would misrender
		// any '%' it contains (go vet printf).
		return &result, fmt.Errorf("%s", temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("%s", result.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobVersionList fetches one page of versions belonging to the
// given train job. On HTTP 401 the auth token is refreshed once and the
// request resent.
func GetTrainJobVersionList(perPage, page int, jobID string) (*models.GetTrainJobVersionListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetTrainJobVersionListResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"per_page": strconv.Itoa(perPage),
			"page":     strconv.Itoa(page),
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

	if err != nil {
		return nil, fmt.Errorf("resty GetTrainJobVersionList: %v", err)
	}

	// Token may have expired; refresh it once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetTrainJobVersionList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Use "%s": passing ErrorMsg as the format string would misrender
		// any '%' it contains (go vet printf).
		return &result, fmt.Errorf("%s", temp.ErrorMsg)
	}

	if !result.IsSuccess {
		log.Error("GetTrainJobVersionList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("%s", result.ErrorMsg)
	}

	return &result, nil
}

// GetNotebookList fetches one page of notebook instances. limit/offset
// control pagination, sortBy/order control sorting, and searchContent
// filters by instance name. On HTTP 401 the auth token is refreshed
// once and the request resent.
func GetNotebookList(limit, offset int, sortBy, order, searchContent string) (*models.GetNotebookListResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetNotebookListResult

	retry := 0

sendjob:
	res, err := client.R().
		SetQueryParams(map[string]string{
			"limit":    strconv.Itoa(limit),
			"offset":   strconv.Itoa(offset),
			"name":     searchContent,
			"sort_key": sortBy,
			"sort_dir": order,
		}).
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2)

	if err != nil {
		return nil, fmt.Errorf("resty GetNotebookList: %v", err)
	}

	// Token may have expired; refresh it once and resend.
	if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
		retry++
		_ = getToken()
		goto sendjob
	}

	if res.StatusCode() != http.StatusOK {
		var temp models.ErrorResult
		if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
			log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
			return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
		}
		log.Error("GetNotebookList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
		// Use "%s": passing ErrorMsg as the format string would misrender
		// any '%' it contains (go vet printf).
		return &result, fmt.Errorf("%s", temp.ErrorMsg)
	}

	return &result, nil
}

+ 233
- 0
manager/client/cloudbrain_two_cd/resty.go View File

@@ -0,0 +1,233 @@
package cloudbrain_two_cd

import (
"bytes"
"code.gitea.io/gitea/modules/modelarts_gateway/core"
"crypto/tls"
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"strconv"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
)

var (
	httpClient *http.Client // lazily built shared client; see getHttpClient
	HOST       string
	TOKEN      string
	// autoStopDurationMs is the notebook auto-stop countdown passed to
	// ManageNotebook: 4 hours, in milliseconds.
	autoStopDurationMs = 4 * 60 * 60 * 1000
)

const (
	// errorCodeExceedLimit is returned when the spec quota is exhausted.
	errorCodeExceedLimit = "ModelArts.0118"

	// notebook 2.0 API path suffix
	urlNotebook2 = "/notebooks"

	// ModelArts error codes
	modelartsIllegalToken     = "ModelArts.6401"
	NotebookNotFound          = "ModelArts.6404"
	NotebookNoPermission      = "ModelArts.6407"
	NotebookInvalid           = "ModelArts.6400"
	UnknownErrorPrefix        = "UNKNOWN:" // prepended to errors callers should treat as unclassified
	ModelArtsJobNotExists     = "ModelArts.0102"
	ModelArtsJobInTargetState = "ModelArts.6357"
	ModelArtsJobInternalError = "ModelArts.0010"
)

// getHttpClient lazily builds and returns the shared HTTP client used
// for all ModelArts CD requests (30s timeout).
// NOTE(review): the lazy init is not goroutine-safe (no sync.Once); two
// concurrent first calls may each build a client — confirm callers are
// effectively serialized or that the benign double-build is acceptable.
// NOTE(review): TLS certificate verification is disabled
// (InsecureSkipVerify) — confirm this is intentional for the endpoint.
func getHttpClient() *http.Client {
	if httpClient == nil {
		httpClient = &http.Client{
			Timeout:   30 * time.Second,
			Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
		}
	}
	return httpClient
}

// GetNotebook queries a ModelArts CD notebook instance by job id.
// The request is signed with the configured AK/SK (no bearer token).
func GetNotebook(jobID string) (*models.GetNotebook2Result, error) {
	var result models.GetNotebook2Result

	client := getHttpClient()
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}
	r, _ := http.NewRequest(http.MethodGet,
		setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID,
		nil)

	r.Header.Add("content-type", "application/json")
	s.Sign(r)

	resp, err := client.Do(r)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	err = json.Unmarshal(body, &result)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the returned message previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		log.Error("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("GetNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// ManageNotebook performs a lifecycle action (param.Action, e.g.
// start/stop) on a ModelArts CD notebook. The auto-stop countdown is
// passed via the duration query parameter (autoStopDurationMs).
func ManageNotebook(jobID string, param models.NotebookAction) (*models.NotebookActionResult, error) {
	var result models.NotebookActionResult

	client := getHttpClient()
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}
	r, _ := http.NewRequest(http.MethodPost,
		setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID+"/"+param.Action+"?duration="+strconv.Itoa(autoStopDurationMs),
		nil)

	r.Header.Add("content-type", "application/json")
	s.Sign(r)

	resp, err := client.Do(r)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	err = json.Unmarshal(body, &result)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the returned message previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		log.Error("ManageNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("ManageNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// DelNotebook deletes a ModelArts CD notebook instance. Deletion is
// treated as successful when the job no longer exists, is already in
// the target state, or the server reports an internal error.
func DelNotebook(jobID string) (*models.NotebookDelResult, error) {
	var result models.NotebookDelResult

	client := getHttpClient()
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}

	r, _ := http.NewRequest(http.MethodDelete,
		setting.ModelartsCD.EndPoint+"/v1/"+setting.ModelartsCD.ProjectID+urlNotebook2+"/"+jobID,
		nil)

	r.Header.Add("content-type", "application/json")
	s.Sign(r)

	resp, err := client.Do(r)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}

	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	err = json.Unmarshal(body, &result)
	if err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		// Fixed typo: the returned message previously read "son.Unmarshal".
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		log.Error("DelNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		if result.ErrorCode == ModelArtsJobNotExists || result.ErrorCode == ModelArtsJobInTargetState {
			// The job does not exist or is already deleted; treat as success.
			return &models.NotebookDelResult{}, nil
		}

		if result.ErrorCode == ModelArtsJobInternalError {
			log.Error("ModelArt internal error when del job,jobId=%s", jobID)
			return &models.NotebookDelResult{}, nil
		}
		return &result, fmt.Errorf("DelNotebook2 failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// CreateNotebook provisions a ModelArts CD notebook instance from the
// given parameters. The JSON payload is signed with the configured
// AK/SK before being POSTed; a quota-exceeded error code is rewritten
// into a user-facing message before being returned.
func CreateNotebook(createJobParams models.CreateNotebookWithoutPoolParams) (*models.CreateNotebookResult, error) {
	var result models.CreateNotebookResult

	httpCli := getHttpClient()
	signer := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}

	payload, _ := json.Marshal(createJobParams)
	endpoint := setting.ModelartsCD.EndPoint + "/v1/" + setting.ModelartsCD.ProjectID + urlNotebook2
	request, _ := http.NewRequest(http.MethodPost, endpoint, ioutil.NopCloser(bytes.NewBuffer(payload)))
	request.Header.Add("content-type", "application/json")
	signer.Sign(request)

	response, err := httpCli.Do(request)
	if err != nil {
		log.Error("client.Do failed: %s", err.Error())
		return &result, fmt.Errorf("client.Do failed: %s", err.Error())
	}
	defer response.Body.Close()

	raw, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}

	if err = json.Unmarshal(raw, &result); err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}

	if len(result.ErrorCode) != 0 {
		log.Error("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		if result.ErrorCode == errorCodeExceedLimit {
			result.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
		}
		return &result, fmt.Errorf("createNotebook failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

+ 65
- 1
manager/client/grampus/grampus.go View File

@@ -118,7 +118,7 @@ sendjob:
log.Error("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf("CreateNotebookJob failed(%d): %s", result.ErrorCode, result.ErrorMsg)
}
log.Info("CreateNotebookJob success.req.JobName = %s ,result=%+v", req.Name, result)
return &result, nil
}

@@ -466,6 +466,70 @@ sendjob:
log.Error("resty grampus restart note book job failed(%s): %v", res.String(), err.Error())
return nil, fmt.Errorf("resty grampus restart note book job failed: %v", err)
}
log.Info("RestartNotebookJob success.jobId = %s ,result=%+v", jobID, restartResponse)

return restartResponse, nil
}

// GetDebugJobEvents fetches the event list of a Grampus notebook (debug)
// job. If the response carries the illegal-token error code, the token
// is refreshed once and the request resent. Note ErrorCode here is an
// integer (0 means success), unlike the string codes of the ModelArts
// client.
func GetDebugJobEvents(jobID string) (*models.GetGrampusDebugJobEventsResponse, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetGrampusDebugJobEventsResponse

	retry := 0

sendjob:
	res, err := client.R().
		SetAuthToken(TOKEN).
		SetResult(&result).
		Get(HOST + urlNotebookJob + "/" + jobID + "/events")
	// Full response logged for troubleshooting (debug jobs only).
	log.Info("res=%v", res)
	if err != nil {
		return nil, fmt.Errorf("resty GetDebugJobEvents: %v", err)
	}

	if result.ErrorCode == errorIllegalToken && retry < 1 {
		retry++
		log.Info("retry get token")
		_ = getToken()
		goto sendjob
	}

	if result.ErrorCode != 0 {
		log.Error("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
		return nil, fmt.Errorf("GetDebugJobEvents failed(%d): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

// GetTrainJobEvents fetches the event list of a Grampus train job,
// refreshing the auth token once if the server reports it as illegal.
func GetTrainJobEvents(jobID string) (*models.GetGrampusJobEventsResponse, error) {
	checkSetting()
	restyCli := getRestyClient()
	var eventsResp models.GetGrampusJobEventsResponse

	// Send the request; on an illegal-token error code, refresh the
	// token once and resend.
	for attempt := 0; ; attempt++ {
		_, err := restyCli.R().
			SetAuthToken(TOKEN).
			SetResult(&eventsResp).
			Get(HOST + urlTrainJob + "/" + jobID + "/events")
		if err != nil {
			return nil, fmt.Errorf("resty GetTrainJobEvents: %v", err)
		}
		if eventsResp.ErrorCode == errorIllegalToken && attempt < 1 {
			log.Info("retry get token")
			_ = getToken()
			continue
		}
		break
	}

	if eventsResp.ErrorCode != 0 {
		log.Error("GetTrainJobEvents failed(%d): %s", eventsResp.ErrorCode, eventsResp.ErrorMsg)
		return nil, fmt.Errorf("GetTrainJobEvents failed(%d): %s", eventsResp.ErrorCode, eventsResp.ErrorMsg)
	}

	return &eventsResp, nil
}

+ 59
- 22
models/action.go View File

@@ -50,26 +50,28 @@ const (
ActionRejectPullRequest // 22
ActionCommentPull // 23

ActionUploadAttachment //24
ActionCreateDebugGPUTask //25
ActionCreateDebugNPUTask //26
ActionCreateTrainTask //27
ActionCreateInferenceTask // 28
ActionCreateBenchMarkTask //29
ActionCreateNewModelTask //30
ActionCreateGPUTrainTask //31
ActionCreateGrampusNPUTrainTask //32
ActionCreateGrampusGPUTrainTask //33
ActionBindWechat //34
ActionDatasetRecommended //35
ActionCreateImage //36
ActionImageRecommend //37
ActionChangeUserAvatar //38
ActionCreateGrampusNPUDebugTask //39
ActionCreateGrampusGPUDebugTask //40
ActionCreateGrampusGCUDebugTask //41
ActionCreateGrampusGCUTrainTask //42
ActionCreateGrampusMLUDebugTask //43
ActionUploadAttachment //24
ActionCreateDebugGPUTask //25
ActionCreateDebugNPUTask //26
ActionCreateTrainTask //27
ActionCreateInferenceTask // 28
ActionCreateBenchMarkTask //29
ActionCreateNewModelTask //30
ActionCreateGPUTrainTask //31
ActionCreateGrampusNPUTrainTask //32
ActionCreateGrampusGPUTrainTask //33
ActionBindWechat //34
ActionDatasetRecommended //35
ActionCreateImage //36
ActionImageRecommend //37
ActionChangeUserAvatar //38
ActionCreateGrampusNPUDebugTask //39
ActionCreateGrampusGPUDebugTask //40
ActionCreateGrampusGCUDebugTask //41
ActionCreateGrampusGCUTrainTask //42
ActionCreateGrampusMLUDebugTask //43
ActionCreateGrampusMLUTrainTask //44
ActionCreateGrampusGPUOnlineInferTask //45
)

// Action represents user operation type and other information to
@@ -126,6 +128,20 @@ func (a *Action) loadActUser() {
}
}

// FilterCloudbrainInfo strips the attached cloudbrain task down to what
// feed consumers may see: for a live (not soft-deleted) task only the
// ID is kept; a soft-deleted task is removed entirely.
func (a *Action) FilterCloudbrainInfo() {
	if a.Cloudbrain == nil {
		return
	}

	// A zero DeletedAt means the task still exists.
	if a.Cloudbrain.DeletedAt.IsZero() {
		newCloudbrain := &Cloudbrain{}
		newCloudbrain.ID = a.Cloudbrain.ID
		a.Cloudbrain = newCloudbrain
	} else {
		a.Cloudbrain = nil
	}
}

func (a *Action) loadRepo() {
if a.Repo != nil {
return
@@ -136,6 +152,26 @@ func (a *Action) loadRepo() {
log.Error("GetRepositoryByID(%d): %v", a.RepoID, err)
}
}
// loadCloudbrain attaches the Cloudbrain task referenced by this action
// when the action is a cloudbrain action.
func (a *Action) loadCloudbrain() {
	if !a.IsCloudbrainAction() {
		return
	}
	cloudbrain := &Cloudbrain{}
	// Content may not be numeric; a failed parse yields 0, which simply
	// matches no row by id.
	cloudbrainId, _ := strconv.ParseInt(a.Content, 10, 64)
	jobId := a.Content

	// When publishing an action, the different cloudbrain task types put
	// different identifiers into Content (some the numeric ID, some the
	// jobId string), so the lookup matches on either column. Unscoped()
	// includes soft-deleted rows.
	if has, err := x.
		Where(builder.Or(builder.Eq{"id": cloudbrainId}).Or(builder.Eq{"job_id": jobId})).Unscoped().
		Get(cloudbrain); err != nil || !has {
		return
	}
	// Guard against id/job_id collisions: only accept the row when its
	// name matches the action's RefName.
	if cloudbrain.DisplayJobName == a.RefName || cloudbrain.JobName == a.RefName {
		a.Cloudbrain = cloudbrain
	}

}

// GetActFullName gets the action's user full name.
func (a *Action) GetActFullName() string {
@@ -381,6 +417,7 @@ func (a *Action) IsCloudbrainAction() bool {
ActionCreateBenchMarkTask,
ActionCreateGPUTrainTask,
ActionCreateGrampusGPUDebugTask,
ActionCreateGrampusGPUOnlineInferTask,
ActionCreateGrampusNPUDebugTask,
ActionCreateGrampusNPUTrainTask,
ActionCreateGrampusGPUTrainTask,
@@ -463,7 +500,7 @@ func GetFeeds(opts GetFeedsOptions) ([]*Action, error) {
return nil, fmt.Errorf("Find: %v", err)
}

if err := ActionList(actions).LoadAttributes(); err != nil {
if err := ActionList(actions).LoadAllAttributes(); err != nil {
return nil, fmt.Errorf("LoadAttributes: %v", err)
}

@@ -483,7 +520,7 @@ func GetLast20PublicFeeds(opTypes []int) ([]*Action, error) {
return nil, fmt.Errorf("Find: %v", err)
}

if err := ActionList(actions).LoadAttributes(); err != nil {
if err := ActionList(actions).LoadAllAttributes(); err != nil {
return nil, fmt.Errorf("LoadAttributes: %v", err)
}



+ 14
- 0
models/ai_model_manage.go View File

@@ -819,3 +819,17 @@ func QueryModelForSearch(opts *AiModelQueryOptions) ([]*AiModelManage, int64, er

return aiModelManages, count, nil
}

// QueryModelRepoByModelID returns the repository that owns the model
// with the given id, via a subquery on ai_model_manage.repo_id.
// Returns ErrRecordNotExist when neither model nor repository is found.
func QueryModelRepoByModelID(modelId string) (*Repository, error) {
	r := &Repository{}
	has, err := x.Where(builder.NewCond().
		And(builder.Eq{"id": builder.Select("repo_id").
			From("ai_model_manage").
			Where(builder.Eq{"id": modelId})})).Get(r)
	if err != nil {
		return nil, err
	} else if !has {
		return nil, &ErrRecordNotExist{}
	}
	return r, nil
}

+ 1
- 1
models/attachment.go View File

@@ -329,7 +329,7 @@ func DeleteAttachments(attachments []*Attachment, remove bool) (int, error) {
log.Info("Message:%s\n", obsError.Message)
}
}
DeleteFileChunkById(a.UUID)
//rf := path.Join(a.UUID[0:1], a.UUID[1:2])
/*
files, err := repo.GetDatasetDirs(a.UUID, "")


+ 123
- 19
models/cloudbrain.go View File

@@ -68,15 +68,16 @@ const (

ModelSafetyTesting CloudbrainStatus = "TESTING"

JobTypeDebug JobType = "DEBUG"
JobTypeBenchmark JobType = "BENCHMARK"
JobTypeModelSafety JobType = "MODELSAFETY"
JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
JobTypeBrainScore JobType = "BRAINSCORE"
JobTypeSnn4Ecoset JobType = "SNN4ECOSET"
JobTypeSim2BrainSNN JobType = "SIM2BRAIN_SNN"
JobTypeTrain JobType = "TRAIN"
JobTypeInference JobType = "INFERENCE"
JobTypeDebug JobType = "DEBUG"
JobTypeBenchmark JobType = "BENCHMARK"
JobTypeModelSafety JobType = "MODELSAFETY"
JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
JobTypeBrainScore JobType = "BRAINSCORE"
JobTypeSnn4Ecoset JobType = "SNN4ECOSET"
JobTypeSim2BrainSNN JobType = "SIM2BRAIN_SNN"
JobTypeTrain JobType = "TRAIN"
JobTypeInference JobType = "INFERENCE"
JobTypeOnlineInference JobType = "ONLINEINFERENCE"

//notebook
ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
@@ -237,6 +238,7 @@ type Cloudbrain struct {
EngineID int64 //引擎id
ImageID string //grampus image_id
AiCenter string //grampus ai center: center_id+center_name
FailedReason string `xorm:"text"`

TrainUrl string //输出模型的obs路径
BranchName string `xorm:"varchar(2550)"` //分支名称
@@ -344,15 +346,56 @@ func (task *Cloudbrain) CorrectCreateUnix() {
task.CreatedUnix = task.StartTime
}
}
// GetAiCenter resolves the AI-center code for this task. The three
// fixed cluster types map to their well-known center constants; any
// other type derives the code from the stored AiCenter field.
func (task *Cloudbrain) GetAiCenter() string {
	switch task.Type {
	case TypeCloudBrainOne:
		return AICenterOfCloudBrainOne
	case TypeCloudBrainTwo:
		return AICenterOfCloudBrainTwo
	case TypeCDCenter:
		return AICenterOfChengdu
	default:
		// AiCenter is stored as "center_id+center_name"; keep the id part.
		return strings.Split(task.AiCenter, "+")[0]
	}
}

// IsFileNoteBookTask reports whether this is an online notebook file
// task: a DEBUG job that was started from a specific boot file.
func (task *Cloudbrain) IsFileNoteBookTask() bool {
	return task.JobType == string(JobTypeDebug) && task.BootFile != ""
}

// CanUserModify reports whether user may modify this task: admins and
// the task owner only. A nil user (anonymous) may not.
func (task *Cloudbrain) CanUserModify(user *User) bool {
	if user == nil {
		return false
	}

	return user.IsAdmin || user.ID == task.UserID
}

// CanUserDelete reports whether user may delete this task: the repo
// owner, admins, and the task owner. A nil user (anonymous) may not.
func (task *Cloudbrain) CanUserDelete(user *User, isRepoOwner bool) bool {
	if user == nil {
		return false
	}

	return isRepoOwner || user.IsAdmin || user.ID == task.UserID
}

func AllTerminalStatus() []string {
return []string{string(ModelArtsTrainJobCompleted), string(ModelArtsTrainJobFailed),
string(ModelArtsTrainJobKilled), string(ModelArtsStopped),
string(JobStopped), string(JobFailed),
string(ModelArtsTrainJobKilled), string(ModelArtsStopped), string(ModelArtsCreateFailed),
string(ModelArtsStartFailed), string(JobStopped), string(JobFailed),
string(JobSucceeded), GrampusStatusFailed,
GrampusStatusSucceeded, GrampusStatusStopped, LocalStatusFailed}
}

// IsCloudbrainTerminalStatus reports whether status matches, case-
// insensitively, one of the terminal task states from AllTerminalStatus.
func IsCloudbrainTerminalStatus(status string) bool {
	for _, s := range AllTerminalStatus() {
		// EqualFold compares case-insensitively without allocating the two
		// upper-cased copies that strings.ToUpper(a) == strings.ToUpper(b)
		// would (the form staticcheck flags as SA6005).
		if strings.EqualFold(status, s) {
			return true
		}
	}
	return false
}

func AllStoppingStatus() []string {
return []string{string(ModelArtsStopping), string(ModelArtsDeleting),
string(ModelArtsTrainJobKilling), GrampusStatusStopping}
@@ -388,12 +431,7 @@ func AllStoppingAndTerminalStatus() []string {

func (task *Cloudbrain) IsTerminal() bool {
status := task.Status
for _, s := range AllTerminalStatus() {
if status == s {
return true
}
}
return false
return IsCloudbrainTerminalStatus(status)
}
func (task *Cloudbrain) IsPreparing() bool {
return task.Status == LocalStatusPreparing
@@ -405,6 +443,15 @@ func (task *Cloudbrain) NeedActiveStop() bool {
return task.IsCreating() || (task.IsPreparing() && int64(task.CreatedUnix) < time.Now().Add(-1*setting.PREPARING_MAX_WAIT_DURATION).Unix())
}

// IsAllowedToCreateMultipleVersions reports whether this task may spawn
// multiple versions. Currently only OpenI NPU tasks (cloudbrain-two
// cluster with NPU compute) support this.
func (task *Cloudbrain) IsAllowedToCreateMultipleVersions() bool {
	if task.Type == TypeCloudBrainTwo && task.ComputeResource == NPUResource {
		return true
	}
	return false
}

func (task *Cloudbrain) IsNewAITask() bool {
for k, v := range setting.AI_TASK_RANGE {
if k == task.JobType+"_"+fmt.Sprint(task.Type) {
@@ -1225,6 +1272,11 @@ type GetNotebook2Result struct {
Ownership string `json:"ownership"`
Status string `json:"status"`
} `json:"volume"`
ActionProgress []struct {
Step int `json:"step"`
Status string `json:"status"`
Description string `json:"description"`
} `json:"action_progress"`
}

type GetTokenParams struct {
@@ -1694,6 +1746,11 @@ type NotebookList struct {
JobName string `json:"name"`
JobID string `json:"id"`
Status string `json:"status"`
Lease struct {
CreateTime int64 `json:"create_at"` //实例创建的时间,UTC毫秒
Duration int64 `json:"duration"` //实例运行时长,以创建时间为起点计算,即“创建时间+duration > 当前时刻”时,系统会自动停止实例
UpdateTime int64 `json:"update_at"` //实例最后更新(不包括保活心跳)的时间,UTC毫秒
} `json:"lease"` //实例自动停止的倒计时信息
}

type GetNotebookListResult struct {
@@ -1886,7 +1943,7 @@ type GrampusTasks struct {
WorkServerNumber int `json:"nodeCount"`
}
type GrampusNotebookTask struct {
AutoStopDuration int `json:"autoStopDuration"`
AutoStopDuration int64 `json:"autoStopDuration"`
Name string `json:"name"`
Capacity int `json:"capacity"`
CenterID []string `json:"centerID"`
@@ -2227,6 +2284,22 @@ func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, e
return cloudbrains, int(count), nil
}

// GetCloudbrainEarlyVersionList returns the earlier versions of the
// given task: rows in the same repo with the same type, job id and job
// type that were created strictly before it, newest first.
func GetCloudbrainEarlyVersionList(task *Cloudbrain) ([]*Cloudbrain, error) {
	cloudbrains := make([]*Cloudbrain, 0)
	if err := x.Where(builder.NewCond().
		And(builder.Eq{"cloudbrain.repo_id": task.RepoID}).
		And(builder.Eq{"cloudbrain.type": task.Type}).
		And(builder.Eq{"cloudbrain.job_id": task.JobID}).
		And(builder.Eq{"cloudbrain.job_type": task.JobType}).
		And(builder.Lt{"cloudbrain.created_unix": task.CreatedUnix})).
		OrderBy("cloudbrain.created_unix DESC").
		Find(&cloudbrains); err != nil {
		return nil, fmt.Errorf("Find: %v", err)
	}

	return cloudbrains, nil
}

func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
session := x.NewSession()
defer session.Close()
@@ -2302,6 +2375,26 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
return getRepoCloudBrain(cb)
}

// GetCloudbrainListByJobID returns every cloudbrain row sharing the
// given job id, newest (highest id) first. The result is an empty,
// non-nil slice when no rows match.
func GetCloudbrainListByJobID(jobID string) ([]*Cloudbrain, error) {
	tasks := make([]*Cloudbrain, 0)
	err := x.Where("job_id = ?", jobID).OrderBy("id desc").Find(&tasks)
	if err != nil {
		return nil, err
	}
	return tasks, nil
}

// GetNewestCloudbrainByJobId returns the most recent (highest id)
// cloudbrain row for the given job id, or ErrRecordNotExist when the
// job id is unknown.
func GetNewestCloudbrainByJobId(jobID string) (*Cloudbrain, error) {
	r := &Cloudbrain{}
	if has, err := x.Where("job_id = ?", jobID).OrderBy("id desc").Limit(1).Get(r); err != nil {
		return nil, err
	} else if !has {
		return nil, ErrRecordNotExist{}
	}
	return r, nil

}

func GetCloudbrainByJobIDWithDeleted(jobID string) (*Cloudbrain, error) {
cb := &Cloudbrain{JobID: jobID}
return getRepoCloudBrainWithDeleted(cb)
@@ -2663,7 +2756,7 @@ func GetModelSafetyCountByUserID(userID int64) (int, error) {
}

func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTypes ...JobType) (int64, error) {
sess := x.Where("status=? and type=?", JobWaiting, cloudbrainType)
sess := x.Where(builder.NewCond().And(builder.In("status", JobWaiting, LocalStatusPreparing, LocalStatusCreating)).And(builder.Eq{"type": cloudbrainType}))
if len(jobTypes) > 0 {
sess.In("job_type", jobTypes)
}
@@ -2947,6 +3040,15 @@ func CloudbrainAllStatic(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, er
return cloudbrains, count, nil
}

// GetLastestNCloudbrain returns the n most recent cloudbrain rows
// (including soft-deleted ones, via Unscoped) that either have an
// ai_center set or are not of type 2.
// NOTE(review): "Lastest" is a typo for "Latest"; kept because renaming
// would break callers.
func GetLastestNCloudbrain(n int) ([]*Cloudbrain, error) {
	r := make([]*Cloudbrain, 0)
	err := x.Where("ai_center!='' or type!=2").Desc("id").Limit(n).Unscoped().Find(&r)
	if err != nil {
		return nil, err
	}
	return r, nil

}
func CloudbrainAllKanBan(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
sess := x.NewSession()
defer sess.Close()
@@ -3224,3 +3326,5 @@ func GetCloudBrainByRepoIdAndModelName(repoId int64, modelName string) ([]*Cloud
err := x.AllCols().Where("model_name=? and repo_id=?", modelName, repoId).OrderBy("created_unix asc").Find(&cloudBrains)
return cloudBrains, err
}

var SubTaskName = "task1"

+ 24
- 0
models/cloudbrain_spec.go View File

@@ -134,3 +134,27 @@ func GetCloudbrainTaskUnitPrice(task Cloudbrain) (int, error) {
}
return s.UnitPrice * n, nil
}

// UpdateCloudbrainSpec overwrites the stored CloudbrainSpec row for the
// given cloudbrain id with the values of the supplied Specification.
// Returns the number of affected rows.
func UpdateCloudbrainSpec(cloudbrainId int64, s *Specification) (int64, error) {
	// Renamed from `new`: shadowing the builtin `new` is legal but confusing.
	spec := CloudbrainSpec{
		CloudbrainID:    cloudbrainId,
		SpecId:          s.ID,
		SourceSpecId:    s.SourceSpecId,
		AccCardsNum:     s.AccCardsNum,
		AccCardType:     s.AccCardType,
		CpuCores:        s.CpuCores,
		MemGiB:          s.MemGiB,
		GPUMemGiB:       s.GPUMemGiB,
		ShareMemGiB:     s.ShareMemGiB,
		ComputeResource: s.ComputeResource,
		UnitPrice:       s.UnitPrice,
		QueueId:         s.QueueId,
		QueueCode:       s.QueueCode,
		Cluster:         s.Cluster,
		AiCenterCode:    s.AiCenterCode,
		AiCenterName:    s.AiCenterName,
		IsExclusive:     s.IsExclusive,
		ExclusiveOrg:    s.ExclusiveOrg,
	}
	return x.Where("cloudbrain_id = ?", cloudbrainId).Update(&spec)
}

+ 17
- 0
models/cloudbrain_static.go View File

@@ -144,6 +144,23 @@ func GetCloudbrainStatusCount() ([]map[string]string, error) {
return x.QueryString(countSql)
}

// GetCloudbrainCardTimeAndCountGroupByAICenter aggregates, per AI
// center, the total card time (duration * node count * accelerator
// cards) and the task count, ordered by card time descending. Types
// 0/1/3 map to fixed center names; other rows derive the center from
// the "center_id+center_name" ai_center column.
// NOTE(review): split_part is PostgreSQL-specific — this query assumes
// a PostgreSQL backend.
func GetCloudbrainCardTimeAndCountGroupByAICenter() ([]map[string]string, error) {
	countSql := `select ai_center,SUM(
COALESCE(a.duration *
CASE
WHEN a.work_server_number = 0 THEN 1
ELSE COALESCE(a.work_server_number, 1)
END *
COALESCE(cloudbrain_spec.acc_cards_num, 1), 0)
) as card_duration,count(*) num from

(select id,duration,work_server_number,case when type=0 then 'OpenIOne' when type=1 then 'OpenITwo' when type=3 then 'OpenIChengdu' else split_part(ai_center, '+',1)
end ai_center
FROM public.cloudbrain ) a Left JOIN cloudbrain_spec on a.id = cloudbrain_spec.cloudbrain_id
where ai_center!='' group by a.ai_center order by card_duration desc`
	return x.QueryString(countSql)
}

func GetCloudbrainTpyeDurationSum() ([]map[string]string, error) {
countSql := "SELECT type,sum(duration) FROM public.cloudbrain group by type order by sum(duration) desc"
return x.QueryString(countSql)


+ 22
- 0
models/file_chunk.go View File

@@ -5,6 +5,7 @@ import (

"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/timeutil"
"xorm.io/builder"
"xorm.io/xorm"
)

@@ -92,6 +93,27 @@ func getFileChunkByMD5AndUser(e Engine, md5 string, userID int64, typeCloudBrain
return fileChunk, nil
}

// GetFileChunksByUserId returns the file chunks uploaded by the given
// user. A lastTime (unix seconds) > 0 restricts the result to chunks
// created at or after that time; when isUploadFinished is false only
// chunks whose upload has not completed are returned.
func GetFileChunksByUserId(userId int64, lastTime int64, isUploadFinished bool) ([]*FileChunk, error) {
	return getFileChunksByUserId(x, userId, lastTime, isUploadFinished)
}

// getFileChunksByUserId is the engine-parameterized implementation of
// GetFileChunksByUserId.
func getFileChunksByUserId(e Engine, userId int64, lastTime int64, isUploadFinished bool) ([]*FileChunk, error) {
	fileChunks := make([]*FileChunk, 0)
	cond := builder.NewCond()
	cond = cond.And(builder.Eq{"user_id": userId})
	if lastTime > 0 {
		cond = cond.And(builder.Gte{"created_unix": lastTime})
	}
	if !isUploadFinished {
		// is_uploaded == 0 marks chunks whose upload is still in progress.
		cond = cond.And(builder.Eq{"is_uploaded": 0})
	}
	if err := e.Where(cond).Find(&fileChunks); err != nil {
		return nil, err
	}
	return fileChunks, nil
}

// GetAttachmentByID returns attachment by given id
func GetFileChunkByUUID(uuid string) (*FileChunk, error) {
return getFileChunkByUUID(x, uuid)


+ 30
- 0
models/ip_location.go View File

@@ -0,0 +1,30 @@
package models

// IPLocation stores the geographic coordinates resolved for an IP address,
// one record per address.
type IPLocation struct {
	ID        int64  `xorm:"pk autoincr"`
	IpAddr    string `xorm:"unique"` // the IP address this record belongs to
	Longitude string
	Latitude  string
}

// CreateIPLocation inserts a new IP-to-coordinates record into the database.
func CreateIPLocation(ipLocation *IPLocation) (err error) {
	_, err = x.Insert(ipLocation)
	return err
}

// GetIpLocation looks up the stored location record for the given IP address.
// It returns ErrRecordNotExist when no record for that address exists.
func GetIpLocation(ip string) (*IPLocation, error) {
	loc := &IPLocation{IpAddr: ip}
	has, err := x.Get(loc)
	if err != nil {
		return nil, err
	}
	if !has {
		return nil, ErrRecordNotExist{}
	}
	return loc, nil
}

+ 3
- 2
models/model_migrate_record.go View File

@@ -176,8 +176,9 @@ func UpdateModelMigrateRecordByStep(record *ModelMigrateRecord) error {

func GetUnfinishedModelMigrateRecords() ([]*ModelMigrateRecord, error) {
records := make([]*ModelMigrateRecord, 0, 10)
return records, x.
Where(builder.NewCond().And(builder.In("current_step", UnFinishedMigrateSteps))).
return records, x.Cols("model_migrate_record.id", "model_migrate_record.cloudbrain_id", "model_migrate_record.dest_bucket", "model_migrate_record.dest_endpoint", "model_migrate_record.dest_object_key", "model_migrate_record.dest_proxy", "model_migrate_record.src_bucket", "model_migrate_record.src_endpoint", "model_migrate_record.src_object_key", "model_migrate_record.status", "model_migrate_record.current_step", "model_migrate_record.retry_count", "model_migrate_record.created_unix", "model_migrate_record.updated_unix", "model_migrate_record.deleted_at", "model_migrate_record.remark").Table("model_migrate_record").
Join("inner", "cloudbrain", "cloudbrain.id = model_migrate_record.cloudbrain_id").
Where(builder.NewCond().And(builder.In("model_migrate_record.current_step", UnFinishedMigrateSteps)).And(builder.Eq{"cloudbrain.deleted_at": "0001-01-01 00:00:00"}.Or(builder.IsNull{"cloudbrain.deleted_at"}))).
Limit(100).
Find(&records)
}


+ 11
- 0
models/modelarts_deploy.go View File

@@ -241,3 +241,14 @@ func DeployStatusConvert(status string) string {
return statusConvert
}
}

// GetModelartsDeployFinishTimebyJobID returns the moment the ModelArts deploy
// of the given job is considered finished: its completion time plus a
// 30-minute grace period. It returns the zero timestamp when the record
// cannot be loaded, does not exist, or has not completed yet.
func GetModelartsDeployFinishTimebyJobID(jobID string) (finishTime timeutil.TimeStamp, err error) {
	finishTime = timeutil.TimeStamp(0)
	deploy, err := GetModelartsDeployByJobID(jobID)
	if err != nil {
		return finishTime, err
	}
	// Guard against a nil record so a missing deploy cannot panic; an unset
	// completion time means the deploy has not finished.
	if deploy == nil || deploy.CompleteUnix == timeutil.TimeStamp(0) {
		return finishTime, nil
	}
	// 30 minutes (in seconds) after the recorded completion time.
	return deploy.CompleteUnix.Add(int64(30 * 60)), nil
}

+ 1
- 0
models/models.go View File

@@ -173,6 +173,7 @@ func init() {
new(AiModelCollect),
new(AiModelFile),
new(ModelMigrateRecord),
new(IPLocation),
new(ModelartsDeploy),
new(ModelartsDeployQueue),
)


+ 2
- 0
models/repo_watch.go View File

@@ -332,6 +332,8 @@ func NotifyWatchers(actions ...*Action) error {
func producer(actions ...*Action) {
for _, action := range actions {
if !action.IsPrivate {
action.loadCloudbrain()
action.FilterCloudbrainInfo()
ActionChan <- action
}
}


+ 1
- 0
models/task_config.go View File

@@ -42,6 +42,7 @@ func GetTaskTypeFromAction(a ActionType) TaskType {
ActionCreateGrampusGCUDebugTask,
ActionCreateGrampusGCUTrainTask,
ActionCreateGrampusMLUDebugTask,
ActionCreateGrampusGPUOnlineInferTask,
ActionCreateGrampusGPUTrainTask:
return TaskCreateCloudbrainTask
case ActionCreateRepo:


+ 9
- 0
models/user_login_log.go View File

@@ -13,6 +13,15 @@ type UserLoginLog struct {
CreatedUnix timeutil.TimeStamp `xorm:"created"`
}

// GetIpByUID returns the IP address recorded on the most recent login-log
// entry of the given user, or the empty string when there is no entry or the
// query fails.
func GetIpByUID(uid int64) string {
	latest := new(UserLoginLog)
	found, err := xStatistic.Where("u_id=?", uid).Desc("id").Limit(1).Get(latest)
	if err != nil || !found {
		return ""
	}
	return latest.IpAddr
}

func SaveLoginInfoToDb(r *http.Request, u *User) {
statictisSess := xStatistic.NewSession()
defer statictisSess.Close()


+ 5
- 2
modules/auth/wechat/cloudbrain.go View File

@@ -1,11 +1,12 @@
package wechat

import (
"fmt"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"fmt"
"time"
)

type JobOperateType string
@@ -157,6 +158,8 @@ func getJobTypeDisplayName(jobType string) string {
switch jobType {
case string(models.JobTypeDebug):
return "调试任务"
case string(models.JobTypeOnlineInference):
return "在线推理"
case string(models.JobTypeBenchmark):
return "评测任务"
case string(models.JobTypeTrain):


+ 0
- 8
modules/context/repo.go View File

@@ -378,7 +378,6 @@ func RepoAssignment() macaron.Handler {
owner *models.User
err error
)

userName := ctx.Params(":username")
repoName := ctx.Params(":reponame")

@@ -431,7 +430,6 @@ func RepoAssignment() macaron.Handler {
if ctx.Written() {
return
}

ctx.Repo.RepoLink = repo.Link()
ctx.Data["RepoLink"] = ctx.Repo.RepoLink
ctx.Data["RepoRelPath"] = ctx.Repo.Owner.Name + "/" + ctx.Repo.Repository.Name
@@ -464,7 +462,6 @@ func RepoAssignment() macaron.Handler {
ctx.ServerError("CanUserFork", err)
return
}

ctx.Data["DisableSSH"] = setting.SSH.Disabled
ctx.Data["ExposeAnonSSH"] = setting.SSH.ExposeAnonymous
ctx.Data["DisableHTTP"] = setting.Repository.DisableHTTPGit
@@ -581,7 +578,6 @@ func RepoAssignment() macaron.Handler {
}
ctx.Data["CanCompareOrPull"] = canCompare
ctx.Data["PullRequestCtx"] = ctx.Repo.PullRequest

if ctx.Query("go-get") == "1" {
ctx.Data["GoGetImport"] = ComposeGoGetImport(owner.Name, repo.Name)
prefix := setting.AppURL + path.Join(owner.Name, repo.Name, "src", "branch", ctx.Repo.BranchName)
@@ -696,7 +692,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
if ctx.Repo.Repository.IsEmpty {
return
}

var (
refName string
err error
@@ -718,7 +713,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
}
}()
}

// Get default branch.
if len(ctx.Params("*")) == 0 {
refName = ctx.Repo.Repository.DefaultBranch
@@ -789,7 +783,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
return
}
}

ctx.Data["BranchName"] = ctx.Repo.BranchName
ctx.Data["BranchNameSubURL"] = ctx.Repo.BranchNameSubURL()
ctx.Data["CommitID"] = ctx.Repo.CommitID
@@ -805,7 +798,6 @@ func RepoRefByType(refType RepoRefType) macaron.Handler {
return
}
ctx.Data["CommitsCount"] = ctx.Repo.CommitsCount

ctx.Next()
}
}


+ 19
- 12
modules/grampus/grampus.go View File

@@ -133,10 +133,12 @@ func getDatasetGrampus(datasetInfos map[string]models.DatasetInfo) []models.Gram
endPoint := getEndPoint()
for _, datasetInfo := range datasetInfos {
datasetGrampus = append(datasetGrampus, models.GrampusDataset{
Name: datasetInfo.FullName,
Bucket: setting.Bucket,
EndPoint: endPoint,
ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
Name: datasetInfo.FullName,
Bucket: setting.Bucket,
EndPoint: endPoint,
ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
ReadOnly: true,
ContainerPath: "/tmp/dataset/" + datasetInfo.FullName,
})

}
@@ -352,21 +354,26 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId str
for i, ckptName := range req.CkptNames {
if len(req.CkptNames) != 0 {
ckptGrampus = models.GrampusDataset{
Name: ckptName,
Bucket: setting.Bucket,
EndPoint: getEndPoint(),
ObjectKey: req.PreTrainModelPaths[i],
Name: ckptName,
Bucket: setting.Bucket,
EndPoint: getEndPoint(),
ObjectKey: req.PreTrainModelPaths[i],
ContainerPath: "/tmp/pretrainmodel/" + req.CkptName,
ReadOnly: true,
}
}
modelGrampus = append(modelGrampus, ckptGrampus)
}
codeGrampus = models.GrampusDataset{
Name: req.CodeName,
Bucket: setting.Bucket,
EndPoint: getEndPoint(),
ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip",
Name: req.CodeName,
Bucket: setting.Bucket,
EndPoint: getEndPoint(),
ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip",
ReadOnly: false,
ContainerPath: "/tmp/code/" + cloudbrain.DefaultBranchName + ".zip",
}
outputGrampus = models.GrampusDataset{
ContainerPath: "/tmp/output",
GetBackEndpoint: getEndPoint(),
}
} else if ProcessorTypeGPU == req.ProcessType {


+ 15
- 3
modules/grampus/resty.go View File

@@ -164,6 +164,10 @@ sendjob:
}

func GetNotebookJob(jobID string) (*models.GrampusNotebookResponse, error) {
if jobID == "" {
return nil, fmt.Errorf("jobID is emmpty")
}

checkSetting()
client := getRestyClient()
var result models.GrampusNotebookResponse
@@ -295,15 +299,20 @@ sendjob:
return &result, nil
}

func GetTrainJobLog(jobID string) (string, error) {
func GetTrainJobLog(jobID string, nodeId ...int) (string, error) {
checkSetting()
client := getRestyClient()
var logContent string

url := HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log"
if len(nodeId) > 0 {
url = HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log/node/" + strconv.Itoa(nodeId[0])
}

res, err := client.R().
SetAuthToken(TOKEN).
SetResult(&logContent).
Get(HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/log")
Get(url)

if err != nil {
return logContent, fmt.Errorf("resty GetTrainJobLog: %v", err)
@@ -324,11 +333,14 @@ func GetTrainJobLog(jobID string) (string, error) {
return logContent, nil
}

func GetGrampusMetrics(jobID string, startTime int64, endTime int64) (models.NewModelArtsMetricStatisticResult, error) {
func GetGrampusMetrics(jobID string, startTime int64, endTime int64, nodeId ...int) (models.NewModelArtsMetricStatisticResult, error) {
checkSetting()
client := getRestyClient()
var result models.NewModelArtsMetricStatisticResult
url := HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics"
if len(nodeId) > 0 {
url = HOST + urlTrainJob + "/" + jobID + "/task/0/replica/0/metrics/node/" + strconv.Itoa(nodeId[0])
}
if startTime > 0 {
var step int64 = 60



+ 44
- 0
modules/ipinfo/ipinfo.go View File

@@ -0,0 +1,44 @@
package ipinfo

import (
"crypto/tls"
"fmt"
"net/http"

"code.gitea.io/gitea/modules/setting"

"github.com/go-resty/resty/v2"
)

var restyClient *resty.Client

// IpInfoResponse is the subset of the ipinfo service's JSON reply that we
// decode: the queried IP, its location string (presumably "lat,long" — confirm
// against the service docs), and whether the IP is a bogon
// (private/unroutable) address.
type IpInfoResponse struct {
	Ip    string `json:"ip"`
	Loc   string `json:"loc"`
	Bogon bool   `json:"bogon"`
}

// getRestyClient lazily creates the shared resty client used for ipinfo
// requests; TLS certificate verification is disabled.
// NOTE(review): the lazy initialization is not goroutine-safe — concurrent
// first calls can race on restyClient. Confirm single-threaded use or guard
// with sync.Once.
func getRestyClient() *resty.Client {
	if restyClient == nil {
		restyClient = resty.New()
		restyClient.SetTLSClientConfig(&tls.Config{InsecureSkipVerify: true})
	}
	return restyClient
}

// GetLocationByIp queries the configured IPInfo service for the geographic
// location of the given IP address and returns the decoded response. A non-200
// HTTP status is reported as an error.
func GetLocationByIp(ip string) (*IpInfoResponse, error) {
	var info IpInfoResponse
	resp, err := getRestyClient().R().
		SetHeader("Accept", "application/json").
		SetAuthToken(setting.IPInfo.Token).
		SetResult(&info).
		Get(setting.IPInfo.Host + "/" + ip)
	if err != nil {
		return nil, err
	}
	if code := resp.StatusCode(); code != http.StatusOK {
		return nil, fmt.Errorf("http status is %d", code)
	}
	return &info, nil
}

+ 1
- 1
modules/minio_ext/constants.go View File

@@ -40,7 +40,7 @@ const maxSinglePutObjectSize = 1024 * 1024 * 1024 * 5

// maxMultipartPutObjectSize - maximum size 5TiB of object for
// Multipart operation.
const MaxMultipartPutObjectSize = 1024 * 1024 * 1024 * 1024 * 5
const MaxMultipartPutObjectSize = 1024 * 1024 * 1024 * 200

// unsignedPayload - value to be set to X-Amz-Content-Sha256 header when
// we don't want to sign the request payload


+ 1
- 1
modules/modelappservice/modelsevice.go View File

@@ -45,7 +45,7 @@ func consumerOrder(in <-chan *models.ModelApp, url string) {
continue
}
log.Info("goroutine id=" + fmt.Sprint(goroutine_id) + " wenxin text=" + modelApp.Desc)
result, err := modelarts.CreateWenXinJob(modelApp, url)
result, err := modelarts.CreateWenXinJobToCD(modelApp, url)
if err == nil {
if !modelarts.SendPictureReivew(result.Result) {
modelApp.Status = -1


+ 31
- 25
modules/modelarts/resty.go View File

@@ -1036,14 +1036,16 @@ func DelTrainJob(jobID string) (*models.TrainJobResult, error) {
var result models.TrainJobResult

//get cloudbrain job by jobid
finetuneJob, _ := models.GetCloudbrainByJobID(jobID)
log.Info("调试:%s", finetuneJob.FineTune)
if finetuneJob.FineTune {
err := ServiceDelete(jobID)
if err != nil {
log.Error("盘古微调部署: Delete Deploy failed:%s %v", jobID, err.Error())
return &result, err
if finetuneJob, err := models.GetCloudbrainByJobID(jobID); finetuneJob != nil && err == nil {
if finetuneJob.FineTune {
err := ServiceDelete(jobID)
if err != nil {
log.Error("panguService: Delete Deploy failed:%s %v", jobID, err.Error())
return nil, err
}
}
} else if err != nil {
log.Warn("DelTrainJob GetCloudbrainByJobID from DB failed:%s %v", jobID, err.Error())
}

retry := 0
@@ -1145,14 +1147,16 @@ func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult,
var result models.TrainJobResult

//get cloudbrain job by jobid
finetuneJob, _ := models.GetCloudbrainByJobID(jobID)
log.Info("调试:%s", finetuneJob.FineTune)
if finetuneJob.FineTune {
err := ServiceDelete(jobID)
if err != nil {
log.Error("盘古微调部署: Delete Deploy failed:%s %v", jobID, err.Error())
return &result, err
if finetuneJob, err := models.GetCloudbrainByJobID(jobID); finetuneJob != nil && err == nil {
if finetuneJob.FineTune {
err := ServiceDelete(jobID)
if err != nil {
log.Error("panguService: Delete Deploy failed:%s %v", jobID, err.Error())
return nil, err
}
}
} else if err != nil {
log.Warn("DelTrainJobVersion GetCloudbrainByJobID failed, cannnot get job from DB:%s %v", jobID, err.Error())
}

retry := 0
@@ -1859,28 +1863,30 @@ sendjob:
func ServiceDelete(jobID string) error {
if deploy, _ := models.GetModelartsDeployByJobID(jobID); deploy != nil {
if deploy.Status == "STOP" || deploy.Status == "FAILED" {

if deploy.ServiceID != "" {
err := DeleteDeployService(deploy.ServiceID)
if err != nil {
if err := DeleteDeployService(deploy.ServiceID); err != nil {
log.Error("panguService: Delete DeployService API failed:%s %v", jobID, err.Error())
return err
} else {
log.Info("panguService: deploy service delete success %s", jobID)
}
log.Info("panguService: deploy service delete success %s", jobID)
}

if deploy.ModelID != "" {
err := DeleteDeployModel(deploy.ModelID)
if err != nil {
if err := DeleteDeployModel(deploy.ModelID); err != nil {
log.Error("panguService: Delete DeployModel API failed:%s %v", jobID, err.Error())
return err
} else {
log.Info("panguService: deploy model delete success %s", jobID)
}
log.Info("panguService: deploy model delete success %s", jobID)
}
err := models.DeleteModelartsDeploy(jobID)
if err != nil {
if err := models.DeleteModelartsDeploy(jobID); err != nil {
log.Error("panguService: Delete ModelartsDeploy from DB failed:%s %v", jobID, err.Error())
return err
} else {
log.Info("panguService: deploy DB record delete success %s", jobID)
}
log.Info("panguService: deploy DB record delete success %s", jobID)
} else {
log.Error("the job(%s) is a deploying finetune job, can be not deleted", jobID)
return fmt.Errorf("1")


+ 63
- 0
modules/modelarts/wenxinresty.go View File

@@ -1,6 +1,8 @@
package modelarts

import (
"bytes"
"crypto/tls"
"encoding/base64"
"encoding/json"
"fmt"
@@ -13,6 +15,7 @@ import (

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/modelarts_gateway/core"
"code.gitea.io/gitea/modules/setting"
)

@@ -29,6 +32,66 @@ type WenXinResult struct {
Result string `json:"result"`
}

var (
cdHttpClient *http.Client
)

// getCDHttpClient lazily builds the shared HTTP client used for requests to
// the Chengdu ModelArts endpoint: 30-second overall timeout, TLS certificate
// verification disabled.
// NOTE(review): the lazy initialization is not goroutine-safe — concurrent
// first calls can race on cdHttpClient. Confirm single-threaded use or guard
// with sync.Once.
func getCDHttpClient() *http.Client {
	if cdHttpClient == nil {
		cdHttpClient = &http.Client{
			Timeout:   30 * time.Second,
			Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
		}
	}
	return cdHttpClient
}

// CreateWenXinJobToCD sends a WenXin text-generation request, signed with the
// ModelartsCD AK/SK, to the given Chengdu ModelArts endpoint and decodes the
// JSON result.
//
// Parameters:
//   modelapp - supplies the prompt text via modelapp.Desc.
//   url      - full URL of the CD ModelArts inference endpoint.
//
// Returns the decoded WenXinResult, or an error when the request cannot be
// built or sent, the service answers with a non-200 status, or the body
// cannot be read/decoded.
func CreateWenXinJobToCD(modelapp *models.ModelApp, url string) (*WenXinResult, error) {
	createJobParams := &CreateWenXinParams{
		Data: WenXinText{
			Prompt: modelapp.Desc,
		},
		Parameters: make(map[string]string),
	}
	var result WenXinResult

	// Marshal and request construction errors were silently ignored before;
	// surface them so a bad URL or payload is not reported as a send failure.
	payload, err := json.Marshal(createJobParams)
	if err != nil {
		log.Error("json.Marshal failed: %s", err.Error())
		return nil, fmt.Errorf("json.Marshal failed: %s", err.Error())
	}
	r, err := http.NewRequest(http.MethodPost, url, bytes.NewBuffer(payload))
	if err != nil {
		log.Error("http.NewRequest failed: %s", err.Error())
		return nil, fmt.Errorf("http.NewRequest failed: %s", err.Error())
	}
	log.Info("send to cd modelarts")
	r.Header.Add("content-type", "application/json")

	// Requests to CD ModelArts must carry an AK/SK signature.
	s := core.Signer{
		Key:    setting.ModelartsCD.AccessKey,
		Secret: setting.ModelartsCD.SecretKey,
	}
	s.Sign(r)

	res, err := getCDHttpClient().Do(r)
	if err != nil {
		log.Info("error =" + err.Error())
		return nil, fmt.Errorf("Service unavailable")
	}
	// Close the body on every path — previously it leaked on non-200 statuses.
	defer res.Body.Close()

	if res.StatusCode != 200 {
		log.Info("res.status=" + fmt.Sprint(res.StatusCode))
		return nil, fmt.Errorf("Service unavailable")
	}

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		log.Error("ioutil.ReadAll failed: %s", err.Error())
		return &result, fmt.Errorf("ioutil.ReadAll failed: %s", err.Error())
	}
	if err = json.Unmarshal(body, &result); err != nil {
		log.Error("json.Unmarshal failed: %s", err.Error())
		return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
	}
	return &result, nil
}

func CreateWenXinJob(modelapp *models.ModelApp, url string) (*WenXinResult, error) {
createJobParams := &CreateWenXinParams{
Data: WenXinText{


+ 24
- 0
modules/setting/screen_map.go View File

@@ -0,0 +1,24 @@
package setting

// ScreenMap holds the [Screen] section settings that control the data-screen
// map display and its value range.
var ScreenMap = struct {
	ShowData bool // whether map data is shown at all
	MinValue int  // lower bound of the displayed value range
	MaxValue int  // upper bound of the displayed value range
}{}

// IPInfo holds the [IPInfo] section settings for the external IP geolocation
// service used by modules/ipinfo.
var IPInfo = struct {
	Host  string // base URL of the ipinfo service
	Token string // API token sent as a bearer auth token
}{}

// NewScreenMapConfig loads the [Screen] map-display settings and the [IPInfo]
// geolocation-service settings from the configuration file.
func NewScreenMapConfig() {
	sec := Cfg.Section("Screen")
	ScreenMap.ShowData = sec.Key("ShowData").MustBool(false)
	ScreenMap.MinValue = sec.Key("MinValue").MustInt(130)
	ScreenMap.MaxValue = sec.Key("MaxValue").MustInt(190)

	sec = Cfg.Section("IPInfo")

	IPInfo.Host = sec.Key("Host").MustString("https://ipinfo.io")
	// SECURITY(review): a real-looking API token is hard-coded as the default
	// below and is used whenever the config omits IPInfo.Token. Move it out of
	// source control and default to "" instead.
	IPInfo.Token = sec.Key("Token").MustString("df2b002afe582a")
}

+ 90
- 55
modules/setting/setting.go View File

@@ -70,6 +70,8 @@ type C2NetSequenceInfo struct {
Name string `json:"name"`
Content string `json:"content"`
ContentEN string `json:"content_en"`
Loc string `json:"loc"`
Type string `json:"type"`
}

type C2NetSqInfos struct {
@@ -624,20 +626,21 @@ var (

//grampus config
Grampus = struct {
Env string
Host string
UserName string
Password string
SpecialPools string
C2NetSequence string
SyncScriptProject string
LocalCenterID string
GPULocalCenterID string
AiCenterInfo string
AiCenterCodeAndNameInfo string
UsageRateBeginTime string
GPUImageCommonName string
MultiNode string
Env string
Host string
UserName string
Password string
SpecialPools string
C2NetSequence string
SyncScriptProject string
LocalCenterID string
GPULocalCenterID string
AiCenterInfo string
AiCenterCodeAndNameInfo string
AiCenterCodeAndNameAndLocInfo string
UsageRateBeginTime string
GPUImageCommonName string
MultiNode string
}{}

ClearStrategy = struct {
@@ -655,6 +658,8 @@ var (
C2NetMapInfo map[string]*C2NetSequenceInfo
AiCenterCodeAndNameMapInfo map[string]*C2NetSequenceInfo

AiCenterCodeAndNameAndLocMapInfo map[string]*C2NetSequenceInfo

//elk config
ElkUrl string
ElkUser string
@@ -842,6 +847,13 @@ var (
ModelApp = struct {
DesensitizationUrl string
}{}

FLOW_CONTROL = struct {
ATTACHEMENT_NUM_A_USER_LAST24HOUR int
ATTACHEMENT_NUM_A_USER_LAST10M int
ATTACHEMENT_SIZE_A_USER int64 //G
ALL_ATTACHEMENT_NUM_SDK int
}{}
)

// DateLang transforms standard language locale name to corresponding value in datetime plugin.
@@ -1623,30 +1635,7 @@ func NewContext() {
UserBasePath = sec.Key("BASE_PATH_USER").MustString("users/")
PROXYURL = sec.Key("PROXY_URL").MustString("")

sec = Cfg.Section("modelarts")
ModelArtsHost = sec.Key("ENDPOINT").MustString("")
IamHost = sec.Key("IAMHOST").MustString("")
ProjectID = sec.Key("PROJECT_ID").MustString("")
ProjectName = sec.Key("PROJECT_NAME").MustString("")
ModelArtsUsername = sec.Key("USERNAME").MustString("")
ModelArtsPassword = sec.Key("PASSWORD").MustString("")
ModelArtsDomain = sec.Key("DOMAIN").MustString("")
AllowedOrg = sec.Key("ORGANIZATION").MustString("")
ProfileID = sec.Key("PROFILE_ID").MustString("")
PoolInfos = sec.Key("POOL_INFOS").MustString("")
ImageInfos = sec.Key("IMAGE_INFOS").MustString("")
Capacity = sec.Key("CAPACITY").MustInt(100)
MaxTempQueryTimes = sec.Key("MAX_TEMP_QUERY_TIMES").MustInt(30)
ResourcePools = sec.Key("Resource_Pools").MustString("")
Engines = sec.Key("Engines").MustString("")
EngineVersions = sec.Key("Engine_Versions").MustString("")
FlavorInfos = sec.Key("FLAVOR_INFOS").MustString("")
TrainJobFLAVORINFOS = sec.Key("TrainJob_FLAVOR_INFOS").MustString("")
ModelArtsSpecialPools = sec.Key("SPECIAL_POOL").MustString("")
ModelArtsMultiNode = sec.Key("MULTI_NODE").MustString("")
ModelArtsShareAddr = sec.Key("ModelArts_Share_Addr").MustString("192.168.0.30:/")
ModelArtsMountPath = sec.Key("ModelArts_Mount_Path").MustString("/cache/sfs")
ModelArtsNasType = sec.Key("ModelArts_Nas_Type").MustString("nfs")
GetModelartsConfig()

sec = Cfg.Section("elk")
ElkUrl = sec.Key("ELKURL").MustString("")
@@ -1742,13 +1731,43 @@ func NewContext() {
BaiduWenXin.RUN_WORKERS = sec.Key("RUN_WORKERS").MustInt(1)
BaiduWenXin.MODEL_SERVERS = sec.Key("MODEL_SERVERS").MustInt(1)

getGrampusConfig()
getModelartsCDConfig()
GetGrampusConfig()
GetModelartsCDConfig()
getModelConvertConfig()
getModelSafetyConfig()
getModelAppConfig()
getClearStrategy()
NewScreenMapConfig()
}

// GetModelartsConfig reads every key of the [modelarts] section into the
// corresponding package-level setting variables, then loads the dependent
// fine-tune and flow-control configurations.
func GetModelartsConfig() {
	cfg := Cfg.Section("modelarts")

	// Endpoint and account information.
	ModelArtsHost = cfg.Key("ENDPOINT").MustString("")
	IamHost = cfg.Key("IAMHOST").MustString("")
	ProjectID = cfg.Key("PROJECT_ID").MustString("")
	ProjectName = cfg.Key("PROJECT_NAME").MustString("")
	ModelArtsUsername = cfg.Key("USERNAME").MustString("")
	ModelArtsPassword = cfg.Key("PASSWORD").MustString("")
	ModelArtsDomain = cfg.Key("DOMAIN").MustString("")
	AllowedOrg = cfg.Key("ORGANIZATION").MustString("")
	ProfileID = cfg.Key("PROFILE_ID").MustString("")

	// Resource pools, images, engines and capacity limits.
	PoolInfos = cfg.Key("POOL_INFOS").MustString("")
	ImageInfos = cfg.Key("IMAGE_INFOS").MustString("")
	Capacity = cfg.Key("CAPACITY").MustInt(100)
	MaxTempQueryTimes = cfg.Key("MAX_TEMP_QUERY_TIMES").MustInt(30)
	ResourcePools = cfg.Key("Resource_Pools").MustString("")
	Engines = cfg.Key("Engines").MustString("")
	EngineVersions = cfg.Key("Engine_Versions").MustString("")
	FlavorInfos = cfg.Key("FLAVOR_INFOS").MustString("")
	TrainJobFLAVORINFOS = cfg.Key("TrainJob_FLAVOR_INFOS").MustString("")
	ModelArtsSpecialPools = cfg.Key("SPECIAL_POOL").MustString("")
	ModelArtsMultiNode = cfg.Key("MULTI_NODE").MustString("")

	// Shared storage mount used by ModelArts jobs.
	ModelArtsShareAddr = cfg.Key("ModelArts_Share_Addr").MustString("192.168.0.30:/")
	ModelArtsMountPath = cfg.Key("ModelArts_Mount_Path").MustString("/cache/sfs")
	ModelArtsNasType = cfg.Key("ModelArts_Nas_Type").MustString("nfs")

	getFineTuneConfig()
	getFlowControlConfig()
}

func getModelSafetyConfig() {
@@ -1787,14 +1806,20 @@ func getModelConvertConfig() {
ModelConvert.PaddleOnnxBootFile = sec.Key("PaddleOnnxBootFile").MustString("convert_paddle.py")
ModelConvert.MXnetOnnxBootFile = sec.Key("MXnetOnnxBootFile").MustString("convert_mxnet.py")
}
// getFlowControlConfig loads attachment upload flow-control limits from the
// [flow_control] section.
func getFlowControlConfig() {
	fc := Cfg.Section("flow_control")
	FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR = fc.Key("ATTACHEMENT_NUM_A_USER_LAST24HOUR").MustInt(1000)
	FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M = fc.Key("ATTACHEMENT_NUM_A_USER_LAST10M").MustInt(10)
	FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER = fc.Key("ATTACHEMENT_SIZE_A_USER").MustInt64(500)
	FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK = fc.Key("ALL_ATTACHEMENT_NUM_SDK").MustInt(100)
}

// getModelAppConfig loads the [model_app] section settings.
func getModelAppConfig() {
	ModelApp.DesensitizationUrl = Cfg.Section("model_app").Key("desensitization_url").MustString("")
}

func getModelartsCDConfig() {
func GetModelartsCDConfig() {
sec := Cfg.Section("modelarts-cd")

ModelartsCD.Enabled = sec.Key("ENABLED").MustBool(false)
@@ -1821,7 +1846,7 @@ func getClearStrategy() {
ClearStrategy.RunAtStart = sec.Key("RUN_AT_START").MustBool(false)
}

func getGrampusConfig() {
func GetGrampusConfig() {
sec := Cfg.Section("grampus")

Grampus.Env = sec.Key("ENV").MustString("TEST")
@@ -1831,6 +1856,8 @@ func getGrampusConfig() {
Grampus.SpecialPools = sec.Key("SPECIAL_POOL").MustString("")
Grampus.C2NetSequence = sec.Key("C2NET_SEQUENCE").MustString("{\"sequence\":[{\"id\":1,\"name\":\"cloudbrain_one\",\"content\":\"鹏城云脑一号\",\"content_en\":\"Pencheng Cloudbrain Ⅰ\"},{\"id\":2,\"name\":\"cloudbrain_two\",\"content\":\"鹏城云脑二号\",\"content_en\":\"Pencheng Cloudbrain Ⅱ\"},{\"id\":3,\"name\":\"beida\",\"content\":\"北大人工智能集群系统\",\"content_en\":\"Peking University AI Center\"},{\"id\":4,\"name\":\"hefei\",\"content\":\"合肥类脑智能开放平台\",\"content_en\":\"Hefei AI Center\"},{\"id\":5,\"name\":\"wuhan\",\"content\":\"武汉人工智能计算中心\",\"content_en\":\"Wuhan AI Center\"},{\"id\":6,\"name\":\"xian\",\"content\":\"西安未来人工智能计算中心\",\"content_en\":\"Xi'an AI Center\"},{\"id\":7,\"pclcci\":\"more\",\"content\":\"鹏城云计算所\",\"content_en\":\"Pengcheng Cloud Computing Institute\"},{\"id\":8,\"name\":\"xuchang\",\"content\":\"中原人工智能计算中心\",\"content_en\":\"Zhongyuan AI Center\"},{\"id\":9,\"name\":\"chengdu\",\"content\":\"成都人工智能计算中心\",\"content_en\":\"Chengdu AI Center\"},{\"id\":10,\"name\":\"more\",\"content\":\"横琴先进智能计算中心\",\"content_en\":\"Hengqin AI Center\"},{\"id\":11,\"name\":\"more\",\"content\":\"国家超级计算济南中心\",\"content_en\":\"HPC & AI Center\"}]}")
Grampus.AiCenterCodeAndNameInfo = sec.Key("AI_CENTER_CODE_AND_NAME").MustString("{\"sequence\":[{\"id\":1,\"name\":\"cloudbrain_one\",\"content\":\"鹏城云脑一号\",\"content_en\":\"Pencheng Cloudbrain Ⅰ\"},{\"id\":2,\"name\":\"cloudbrain_two\",\"content\":\"鹏城云脑二号\",\"content_en\":\"Pencheng Cloudbrain Ⅱ\"},{\"id\":3,\"name\":\"beida\",\"content\":\"北大人工智能集群系统\",\"content_en\":\"Peking University AI Center\"},{\"id\":4,\"name\":\"hefei\",\"content\":\"合肥类脑智能开放平台\",\"content_en\":\"Hefei AI Center\"},{\"id\":5,\"name\":\"wuhan\",\"content\":\"武汉人工智能计算中心\",\"content_en\":\"Wuhan AI Center\"},{\"id\":6,\"name\":\"xian\",\"content\":\"西安未来人工智能计算中心\",\"content_en\":\"Xi'an AI Center\"},{\"id\":7,\"pclcci\":\"more\",\"content\":\"鹏城云计算所\",\"content_en\":\"Pengcheng Cloud Computing Institute\"},{\"id\":8,\"name\":\"xuchang\",\"content\":\"中原人工智能计算中心\",\"content_en\":\"Zhongyuan AI Center\"},{\"id\":9,\"name\":\"chengdu\",\"content\":\"成都人工智能计算中心\",\"content_en\":\"Chengdu AI Center\"},{\"id\":10,\"name\":\"more\",\"content\":\"横琴先进智能计算中心\",\"content_en\":\"Hengqin AI Center\"},{\"id\":11,\"name\":\"more\",\"content\":\"国家超级计算济南中心\",\"content_en\":\"HPC & AI Center\"}]}")
Grampus.AiCenterCodeAndNameAndLocInfo = sec.Key("AI_CENTER_CODE_AND_NAME_AND_LOC").MustString("{\"sequence\":[{\"id\":1,\"name\":\"cloudbrain_one\",\"content\":\"鹏城云脑一号\",\"content_en\":\"Pencheng Cloudbrain Ⅰ\"},{\"id\":2,\"name\":\"cloudbrain_two\",\"content\":\"鹏城云脑二号\",\"content_en\":\"Pencheng Cloudbrain Ⅱ\"},{\"id\":3,\"name\":\"beida\",\"content\":\"北大人工智能集群系统\",\"content_en\":\"Peking University AI Center\"},{\"id\":4,\"name\":\"hefei\",\"content\":\"合肥类脑智能开放平台\",\"content_en\":\"Hefei AI Center\"},{\"id\":5,\"name\":\"wuhan\",\"content\":\"武汉人工智能计算中心\",\"content_en\":\"Wuhan AI Center\"},{\"id\":6,\"name\":\"xian\",\"content\":\"西安未来人工智能计算中心\",\"content_en\":\"Xi'an AI Center\"},{\"id\":7,\"pclcci\":\"more\",\"content\":\"鹏城云计算所\",\"content_en\":\"Pengcheng Cloud Computing Institute\"},{\"id\":8,\"name\":\"xuchang\",\"content\":\"中原人工智能计算中心\",\"content_en\":\"Zhongyuan AI Center\"},{\"id\":9,\"name\":\"chengdu\",\"content\":\"成都人工智能计算中心\",\"content_en\":\"Chengdu AI Center\"},{\"id\":10,\"name\":\"more\",\"content\":\"横琴先进智能计算中心\",\"content_en\":\"Hengqin AI Center\"},{\"id\":11,\"name\":\"more\",\"content\":\"国家超级计算济南中心\",\"content_en\":\"HPC & AI Center\"}]}")

Grampus.UsageRateBeginTime = sec.Key("USAGE_RATE_BEGIN_TIME").MustString("2021-01-01 00:00:00")
Grampus.GPUImageCommonName = sec.Key("GPU_IMAGE_COMMON_NAME").MustString("image")
if Grampus.C2NetSequence != "" {
@@ -1842,6 +1869,15 @@ func getGrampusConfig() {
C2NetMapInfo[value.Name] = value
}
}
if Grampus.AiCenterCodeAndNameAndLocInfo != "" {
if err := json.Unmarshal([]byte(Grampus.AiCenterCodeAndNameAndLocInfo), &C2NetInfos); err != nil {
log.Error("Unmarshal(AiCenterCodeAndNameLocInfo) failed:%v", err)
}
AiCenterCodeAndNameAndLocMapInfo = make(map[string]*C2NetSequenceInfo)
for _, value := range C2NetInfos.C2NetSqInfo {
AiCenterCodeAndNameAndLocMapInfo[value.Name] = value
}
}
if Grampus.AiCenterCodeAndNameInfo != "" {
if err := json.Unmarshal([]byte(Grampus.AiCenterCodeAndNameInfo), &C2NetInfos); err != nil {
log.Error("Unmarshal(AiCenterCodeAndNameInfo) failed:%v", err)
@@ -1851,6 +1887,7 @@ func getGrampusConfig() {
AiCenterCodeAndNameMapInfo[value.Name] = value
}
}

Grampus.SyncScriptProject = sec.Key("SYNC_SCRIPT_PROJECT").MustString("script_for_grampus")
Grampus.LocalCenterID = sec.Key("LOCAL_CENTER_ID").MustString("cloudbrain2")
Grampus.GPULocalCenterID = sec.Key("GPU_LOCAL_CENTER_ID").MustString("openi")
@@ -1984,22 +2021,20 @@ func ensureLFSDirectory() {
}

func getNotebookImageInfos() {
if StImageInfos == nil {
if ModelartsCD.Enabled {
json.Unmarshal([]byte(ModelartsCD.ImageInfos), &StImageInfos)
} else {
json.Unmarshal([]byte(ImageInfos), &StImageInfos)
}

if ModelartsCD.Enabled {
json.Unmarshal([]byte(ModelartsCD.ImageInfos), &StImageInfos)
} else {
json.Unmarshal([]byte(ImageInfos), &StImageInfos)
}
}

func getNotebookFlavorInfos() {
if StFlavorInfo == nil {
if ModelartsCD.Enabled {
json.Unmarshal([]byte(ModelartsCD.FlavorInfos), &StFlavorInfo)
} else {
json.Unmarshal([]byte(FlavorInfos), &StFlavorInfo)
}

if ModelartsCD.Enabled {
json.Unmarshal([]byte(ModelartsCD.FlavorInfos), &StFlavorInfo)
} else {
json.Unmarshal([]byte(FlavorInfos), &StFlavorInfo)
}
}



+ 1
- 0
modules/structs/cloudbrain.go View File

@@ -99,6 +99,7 @@ type CreateFileNotebookJobOption struct {
OwnerName string `json:"owner_name" binding:"Required"`
ProjectName string `json:"project_name" binding:"Required"`
JobId string `json:"job_id"`
ID int64 `json:"id"`
}

type Cloudbrain struct {


+ 1
- 1
modules/templates/helper.go View File

@@ -794,7 +794,7 @@ func licenses() []string {

// Dataset tasks
func tasks() []string {
return []string{"machine_translation", "question_answering_system", "information_retrieval", "knowledge_graph", "text_annotation", "text_categorization", "emotion_analysis", "language_modeling", "speech_recognition", "automatic_digest", "information_extraction", "description_generation", "image_classification", "face_recognition", "image_search", "target_detection", "image_description_generation", "vehicle_license_plate_recognition", "medical_image_analysis", "unmanned", "unmanned_security", "drone", "vr_ar", "2_d_vision", "2.5_d_vision", "3_d_reconstruction", "image_processing", "video_processing", "visual_input_system", "speech_coding", "speech_enhancement", "speech_synthesis","ROS_hmci"}
return []string{"machine_translation", "question_answering_system", "information_retrieval", "knowledge_graph", "text_annotation", "text_categorization", "emotion_analysis", "language_modeling", "speech_recognition", "automatic_digest", "information_extraction", "description_generation", "image_classification", "face_recognition", "image_search", "target_detection", "image_description_generation", "vehicle_license_plate_recognition", "medical_image_analysis", "unmanned", "unmanned_security", "drone", "vr_ar", "2_d_vision", "2.5_d_vision", "3_d_reconstruction", "image_processing", "video_processing", "visual_input_system", "speech_coding", "speech_enhancement", "speech_synthesis", "ros_hmci_datasets"}
}

func GetRefType(ref string) string {


+ 17
- 5
options/locale/locale_en-US.ini View File

@@ -933,7 +933,7 @@ task.speech_coding= speech coding
task.speech_enhancement= speech enhancement
task.speech_recognition= speech recognition
task.speech_synthesis= speech synthesis
task.ROS_hmci=ROS-hmci Community
task.ros_hmci_datasets=ROS-hmci datasets
category.computer_vision= computer vision
category.natural_language_processing= natural language processing
category.speech_processing= speech processing
@@ -968,7 +968,8 @@ download = Download
modify_description = Modify Description
set_public = Set Public
set_private = Set Private
annotation = Annotation
annotation = Image Annotation
more_annotation = More Annotation
upload_dataset_file = Upload Dataset File
file_description = File Description
data_upload = Dataset Upload
@@ -1091,6 +1092,7 @@ repo_mirror_add=Mirror Project Increment
repo_self_add=Custom Project Increment

debug=Debug
online_debug = Start
debug_again=Restart
stop=Stop
delete=Delete
@@ -1267,6 +1269,7 @@ cloudbrain.morethanonejob=You already have a running or waiting task, create it
cloudbrain.morethanonejob1=You have created an <span style="color:rgba(242, 113, 28, 1);"> equivalent task </span> that is waiting or running, please wait for the task to finish before creating it.
cloudbrain.morethanonejob2=You can view all your Cloud Brain tasks in <a href="/cloudbrains" target="_blank"> Home > Cloudbrain Task </a>.

modelarts.online_infer = Online Inference
modelarts.infer_job_model = Model
modelarts.infer_job_model_file = Model File
modelarts.infer_job = Inference Job
@@ -3176,6 +3179,7 @@ task_c2ent_gcudebugjob=`created GCU type debugging task <a href="%s/grampus/trai
task_c2ent_gcutrainjob=`created GCU type train task <a href="%s/modelarts/train-job/%s">%s</a>`
task_c2ent_mludebugjob=`created MLU type debugging task <a href="%s/grampus/train-job/%s">%s</a>`
task_c2ent_mlutrainjob=`created MLU type train task <a href="%s/modelarts/train-job/%s">%s</a>`
task_c2ent_onlineinferjob=`created GPU type online inference task <a href="%s/grampus/onlineinfer/%s">%s</a>`
task_nputrainjob=`created NPU training task <a href="%s/modelarts/train-job/%s">%s</a>`
task_inferencejob=`created reasoning task <a href="%s/modelarts/inference-job/%s">%s</a>`
task_benchmark=`created profiling task <a href="%s/cloudbrain/benchmark/%s">%s</a>`
@@ -3344,6 +3348,7 @@ SIM2BRAIN_SNN = BENCHMARK
TRAIN = TRAIN
INFERENCE = INFERENCE
BENCHMARK = BENCHMARK
ONLINEINFERENCE = ONLINEINFERENCE
brain_area = Brain Area

Delete_failed=Fail to delete the job, please try again later.
@@ -3363,7 +3368,7 @@ new_debug_gpu_tooltips1 = The code is storaged in <strong style="color:#010101">
new_train_npu_tooltips = The code is storaged in <strong style="color:#010101">%s</strong>, the pre-trained model is storaged in the run parameter <strong style="color:#010101">%s</strong>, and please put your model into <strong style="color:#010101">%s</strong> then you can download it online
new_infer_gpu_tooltips = The dataset is stored in <strong style="color:#010101">%s</strong>, the model file is stored in <strong style="color:#010101">%s</strong>, please store the inference output in <strong style="color:#010101">%s</strong> for subsequent downloads.
code_obs_address = Code OBS address
task_save_most_time = <p><span>*</span>The platform only retains the results of debugge, train, inference and evaluation tasks for nearly<span> 30 </span> days <span>Tasks over 30 days will not be able to download results and view logs, and cannot be debugged or trained again</span></p>
task_save_most_time = <p><span>*</span>The platform only retains the results of debug, train, inference and evaluation tasks for nearly<span> 30 </span> days. <span>Tasks over 30 days will not be able to download results and view logs, and cannot be debugged or trained again</span></p>
query_finetune_fail=Fail to query fine tuning job, please try again later.
finetune_max=The number of fine tuning job you created exceed the limit. please delete some first.
dataset_same_fail=The name of dataset file is used by the fine tune job, please select other dataset file.
@@ -3422,12 +3427,19 @@ multi_task = You have already a running or waiting task, can not create more
job_name_already_used = The job name did already exist
insufficient_point_balance = Insufficient point balance
create_failed = Create AI task failed
restart_failed = Restart AI task failed
restart_failed = Restart AI task failed, please try again later.
stop_failed = Fail to stop the job, please try again later.
can_not_restart = The task was not scheduled successfully before, so it cannot be restarted.
dataset_size_over_limit = The size of dataset exceeds limitation (%dGB)
boot_file_must_python = The boot file must be a python file
boot_file_not_exist= The boot file does not exist.
branch_not_exists= The branch does not exist. Please refresh and select again.

[common_error]
system_error = System error. Please try again later
insufficient_permission = Insufficient permissions
param_error = The parameter you submitted is incorrect
wechat_not_bind = Please scan the code and bind to wechat first

[deployment]
deploy_max = The maximum deployment is %v per user
@@ -3437,4 +3449,4 @@ model_copy_failed = Failed to copy the model files
builidng_fail = Failed to build AI Model, please try again later
deletion_notice_repo = There is a deploying or running service related to this repository, please stop the service before deletion.
deletion_notice_trainjob = There is a deploying or running service related to this task, please stop the service before deletion.
stop_service_failed = Failed to stop deploy service
stop_service_failed = Failed to stop deploy service

+ 16
- 4
options/locale/locale_zh-CN.ini View File

@@ -938,7 +938,7 @@ task.speech_coding=语音编码
task.speech_enhancement=语音增强
task.speech_recognition=语音识别
task.speech_synthesis=语音合成
task.ROS_hmci=开源开放社区
task.ros_hmci_datasets=开源开放社区数据集
category.computer_vision=计算机视觉
category.natural_language_processing=自然语言处理
category.speech_processing=语音处理
@@ -973,7 +973,8 @@ download = 下载
modify_description = 修改描述
set_public = 设为公开
set_private = 设为私有
annotation = 标注
annotation = 图片标注
more_annotation = 更多标注
upload_dataset_file = 上传数据集文件
file_description = 文件描述
data_upload = 数据上传
@@ -1090,6 +1091,7 @@ repo_mirror_add=新增镜像项目
repo_self_add=新增自建项目

debug=调试
online_debug = 在线推理
debug_again=再次调试
stop=停止
delete=删除
@@ -1279,6 +1281,7 @@ cloudbrain.morethanonejob=您已经创建了一个正在等待或运行中的同
cloudbrain.morethanonejob1=您已经有 <span style="color:rgba(242, 113, 28, 1);">同类任务</span> 正在等待或运行中,请等待任务结束再创建;
cloudbrain.morethanonejob2=可以在 “<a href="/cloudbrains" target="_blank" >个人中心 > 云脑任务</a>” 查看您所有的云脑任务。

modelarts.online_infer = 在线推理
modelarts.infer_job_model = 模型名称
modelarts.infer_job_model_file = 模型文件
modelarts.infer_job = 推理任务
@@ -3194,6 +3197,7 @@ task_c2ent_gcudebugjob=`创建了GCU类型调试任务 <a href="%s/grampus/noteb
task_c2ent_gcutrainjob=`创建了GCU类型训练任务 <a href="%s/grampus/train-job/%s">%s</a>`
task_c2ent_mludebugjob=`创建了MLU类型调试任务 <a href="%s/grampus/notebook/%s">%s</a>`
task_c2ent_mlutrainjob=`创建了MLU类型训练任务 <a href="%s/grampus/train-job/%s">%s</a>`
task_c2ent_onlineinferjob=`创建了GPU类型在线推理任务 <a href="%s/grampus/onlineinfer/%s">%s</a>`
task_nputrainjob=`创建了NPU类型训练任务 <a href="%s/modelarts/train-job/%s">%s</a>`
task_inferencejob=`创建了推理任务 <a href="%s/modelarts/inference-job/%s">%s</a>`
task_benchmark=`创建了评测任务 <a href="%s/cloudbrain/benchmark/%s">%s</a>`
@@ -3365,6 +3369,7 @@ SIM2BRAIN_SNN = 评测任务
TRAIN = 训练任务
INFERENCE = 推理任务
BENCHMARK = 评测任务
ONLINEINFERENCE = 在线推理
brain_area = 脑区

Delete_failed=任务删除失败,请稍后再试。
@@ -3444,12 +3449,19 @@ multi_task = 您已经有一个正在等待或运行中的任务,请结束该
job_name_already_used = 任务名已被使用,请换一个名称
insufficient_point_balance = 积分余额不足
create_failed = 创建AI任务失败
restart_failed = 再次调试AI任务失败
restart_failed = 再次调试失败,请稍后再试
stop_failed = 任务停止失败,请稍后再试
can_not_restart = 这个任务之前没有调度成功,不能再次调试。
dataset_size_over_limit = 数据集大小超过限制(%dGB)
boot_file_must_python = 启动文件必须是python文件
boot_file_not_exist =启动文件不存在
branch_not_exists= 代码分支不存在,请刷新后重试

[common_error]
system_error = 当前服务不可用,请稍后再试
insufficient_permission = 权限不足
param_error = 提交的参数有误
wechat_not_bind = 请先扫码绑定微信

[deployment]
deploy_max = 每个用户只能同时创建 %v 个部署任务
@@ -3459,4 +3471,4 @@ model_copy_failed = 模型拷贝失败,请重新部署
builidng_fail = AI应用创建失败
deletion_notice_repo = 此项目有正在部署或正在体验的服务,请先停止服务,然后再删除。
deletion_notice_trainjob = 此任务有正在部署或正在体验的服务,请先停止服务,然后再删除。
stop_service_failed = 停止部署服务失败
stop_service_failed = 停止部署服务失败

+ 48
- 16
package-lock.json View File

@@ -1,5 +1,5 @@
{
"name": "aiforge",
"name": "aiforge1",
"lockfileVersion": 2,
"requires": true,
"packages": {
@@ -21,7 +21,7 @@
"dayjs": "1.10.7",
"domino": "2.1.5",
"dropzone": "5.7.2",
"echarts": "3.8.5",
"echarts": "5.4.2",
"element-ui": "2.15.5",
"esdk-obs-browserjs": "3.22.3",
"esdk-obs-nodejs": "3.20.11",
@@ -5448,13 +5448,19 @@
}
},
"node_modules/echarts": {
"version": "3.8.5",
"resolved": "https://registry.npmmirror.com/echarts/download/echarts-3.8.5.tgz",
"integrity": "sha1-WOSlHSdDxvt1JXsNwKnPn1N4rA4=",
"version": "5.4.2",
"resolved": "https://registry.npmmirror.com/echarts/-/echarts-5.4.2.tgz",
"integrity": "sha512-2W3vw3oI2tWJdyAz+b8DuWS0nfXtSDqlDmqgin/lfzbkB01cuMEN66KWBlmur3YMp5nEDEEt5s23pllnAzB4EA==",
"dependencies": {
"zrender": "3.7.4"
"tslib": "2.3.0",
"zrender": "5.4.3"
}
},
"node_modules/echarts/node_modules/tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
},
"node_modules/editions": {
"version": "1.3.4",
"resolved": "https://registry.npmjs.org/editions/-/editions-1.3.4.tgz",
@@ -20940,9 +20946,17 @@
"integrity": "sha1-6NV3TRwHOKR7z6hynzcS4t7d6yU="
},
"node_modules/zrender": {
"version": "3.7.4",
"resolved": "https://registry.nlark.com/zrender/download/zrender-3.7.4.tgz",
"integrity": "sha1-+EfVOUhIHvbUKQbR6prux6y+/fI="
"version": "5.4.3",
"resolved": "https://registry.npmmirror.com/zrender/-/zrender-5.4.3.tgz",
"integrity": "sha512-DRUM4ZLnoaT0PBVvGBDO9oWIDBKFdAVieNWxWwK0niYzJCMwGchRk21/hsE+RKkIveH3XHCyvXcJDkgLVvfizQ==",
"dependencies": {
"tslib": "2.3.0"
}
},
"node_modules/zrender/node_modules/tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
}
},
"dependencies": {
@@ -25312,11 +25326,19 @@
}
},
"echarts": {
"version": "3.8.5",
"resolved": "https://registry.npmmirror.com/echarts/download/echarts-3.8.5.tgz",
"integrity": "sha1-WOSlHSdDxvt1JXsNwKnPn1N4rA4=",
"version": "5.4.2",
"resolved": "https://registry.npmmirror.com/echarts/-/echarts-5.4.2.tgz",
"integrity": "sha512-2W3vw3oI2tWJdyAz+b8DuWS0nfXtSDqlDmqgin/lfzbkB01cuMEN66KWBlmur3YMp5nEDEEt5s23pllnAzB4EA==",
"requires": {
"zrender": "3.7.4"
"tslib": "2.3.0",
"zrender": "5.4.3"
},
"dependencies": {
"tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
}
}
},
"editions": {
@@ -37726,9 +37748,19 @@
"integrity": "sha1-6NV3TRwHOKR7z6hynzcS4t7d6yU="
},
"zrender": {
"version": "3.7.4",
"resolved": "https://registry.nlark.com/zrender/download/zrender-3.7.4.tgz",
"integrity": "sha1-+EfVOUhIHvbUKQbR6prux6y+/fI="
"version": "5.4.3",
"resolved": "https://registry.npmmirror.com/zrender/-/zrender-5.4.3.tgz",
"integrity": "sha512-DRUM4ZLnoaT0PBVvGBDO9oWIDBKFdAVieNWxWwK0niYzJCMwGchRk21/hsE+RKkIveH3XHCyvXcJDkgLVvfizQ==",
"requires": {
"tslib": "2.3.0"
},
"dependencies": {
"tslib": {
"version": "2.3.0",
"resolved": "https://registry.npmmirror.com/tslib/-/tslib-2.3.0.tgz",
"integrity": "sha512-N82ooyxVNm6h1riLCoyS9e3fuJ3AMG2zIZs2Gd1ATcSFjSA23Q0fzjjZeh0jbJvWVDZ0cJT8yaNNaaXHzueNjg=="
}
}
}
}
}

+ 1
- 1
package.json View File

@@ -20,7 +20,7 @@
"dayjs": "1.10.7",
"domino": "2.1.5",
"dropzone": "5.7.2",
"echarts": "3.8.5",
"echarts": "5.4.2",
"element-ui": "2.15.5",
"esdk-obs-browserjs": "3.22.3",
"esdk-obs-nodejs": "3.20.11",


+ 36
- 11
public/home/home.js View File

@@ -246,11 +246,21 @@ document.onreadystatechange = function () {
else if(record.OpType == "24" || record.OpType == "26" || record.OpType == "27" || record.OpType == "28" || record.OpType == "30"
|| record.OpType == "31" || record.OpType == "32" || record.OpType == "33" || record.OpType == "42" || record.OpType == "44"){
html += recordPrefix + actionName;
html += " <a href=\"" + getTaskLink(record) + "\" rel=\"nofollow\">" + record.RefName + "</a>"
const taskLink = getTaskLink(record);
if (taskLink) {
html += " <a href=\"" + taskLink + "\" rel=\"nofollow\">" + record.RefName + "</a>"
} else {
html += " <span style=\"color: rgba(0,0,0,0.3)\">" + record.RefName + "</span>"
}
}
else if(record.OpType == "25" || record.OpType == "29" || record.OpType == "39" || record.OpType == "40" || record.OpType == "41" || record.OpType == "43"){
else if(record.OpType == "25" || record.OpType == "29" || record.OpType == "39" || record.OpType == "40" || record.OpType == "41" || record.OpType == "43"|| record.OpType == "44"|| record.OpType == "45"){
html += recordPrefix + actionName;
html += " <a href=\"" + getTaskLink(record) + "\" rel=\"nofollow\">" + record.RefName + "</a>"
const taskLink = getTaskLink(record);
if (taskLink) {
html += " <a href=\"" + taskLink + "\" rel=\"nofollow\">" + record.RefName + "</a>"
} else {
html += " <span style=\"color: rgba(0,0,0,0.3)\">" + record.RefName + "</span>"
}
}
else if(record.OpType == "35"){
var datasetLink = "<a href=\"" + getRepoLink(record) + "/datasets" + "\" rel=\"nofollow\">" + record.Content.split('|')[1] + "</a>";
@@ -280,9 +290,17 @@ function getTaskLink(record){
if(record.OpType == 24){
re = re + "/datasets";
}else if(record.OpType == 25){
re = re + "/cloudbrain/" + record.Content;
if (record.Cloudbrain) {
re = re + "/cloudbrain/" + record.Cloudbrain.ID;
} else {
re = '';
}
}else if(record.OpType == 26){
re = re + "/modelarts/notebook/" + record.Content;
if (record.Cloudbrain) {
re = re + "/modelarts/notebook/" + record.Content;
} else {
re = '';
}
}else if(record.OpType == 27){
re = re + "/modelarts/train-job/" + record.Content;
}else if(record.OpType == 28){
@@ -296,9 +314,14 @@ function getTaskLink(record){
}else if(record.OpType == 32 || record.OpType == 33 || record.OpType == 42 || record.OpType == 44){
re = re + "/grampus/train-job/" + record.Content;
}else if(record.OpType == 39 || record.OpType == 40 || record.OpType == 41 || record.OpType == 43){
re = re + "/grampus/notebook/" + record.Content;
if (record.Cloudbrain) {
re = re + "/grampus/notebook/" + record.Cloudbrain.ID;
} else {
re = '';
}
} else if(record.OpType == 45){
re = re + "/grampus/onlineinfer/" + record.Content;
}
re = encodeURI(re);
return re;
}
@@ -455,12 +478,13 @@ var actionNameZH={
"35":"创建的数据集 {dataset} 被设置为推荐数据集",
"36":"提交了镜像 {image}",
"37":"提交的镜像 {image} 被设置为推荐镜像",
"39":"创建了CPU/GPU类型调试任务",
"40":"创建了NPU类型调试任务",
"39":"创建了NPU类型调试任务",
"40":"创建了CPU/GPU类型调试任务",
"41":"创建了GCU类型调试任务",
"42":"创建了GCU类型训练任务",
"43":"创建了MLU类型调试任务",
"44":"创建了MLU类型训练任务",
"45":"创建了GPU在线推理任务",
};

var actionNameEN={
@@ -492,12 +516,13 @@ var actionNameEN={
"35":" created dataset {dataset} was set as recommended dataset",
"36":"committed image {image}",
"37":"committed image {image} was set as recommended image",
"39":" created CPU/GPU type debugging task ",
"40":" created NPU type debugging task ",
"39":" created NPU type debugging task ",
"40":" created CPU/GPU type debugging task ",
"41":" created GCU type debugging task ",
"42":" created GCU type training task ",
"43":" created MLU type debugging task ",
"44":" created MLU type training task ",
"45":" created GPU type online inference task ",
};

var repoAndOrgZH={


+ 2
- 0
public/home/search.js View File

@@ -390,6 +390,7 @@ var taskDesc = {
speech_enhancement: "语音增强",
speech_recognition: "语音识别",
speech_synthesis: "语音合成",
ros_hmci_datasets: "开源开放社区",
};

var taskENDesc = {
@@ -426,6 +427,7 @@ var taskENDesc = {
speech_enhancement: "speech enhancement",
speech_recognition: "speech recognition",
speech_synthesis: "speech synthesis",
ros_hmci_datasets: "ROS-hmci datasets",
};

function getCategoryDesc(isZh, key) {


+ 122
- 72
routers/ai_task/ai_task.go View File

@@ -1,46 +1,22 @@
package ai_task

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"net/http"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
"code.gitea.io/gitea/routers/response"
creation_context "code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/task"
cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
"code.gitea.io/gitea/services/lock"
"net/http"
)

func CreateAITask(ctx *context.Context, form ai_task_entity.CreateReq) {
func CreateAITask(ctx *context.Context, form entity.CreateReq) {
handCreateReq(&form)

t, err := task.GetAITask(form.JobType, form.Cluster)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}

lockOperator, errMsg := cloudbrainService.Lock4CloudbrainCreation(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: form.DisplayJobName, JobType: string(form.JobType)}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg)
ctx.JSON(http.StatusOK, response.OuterServerError(ctx.Tr(errMsg)))
return
}
res, err := t.Create(&creation_context.CreationContext{
Request: form,
GitRepo: ctx.Repo.GitRepo,
Repository: ctx.Repo.Repository,
User: ctx.User,
})
res, err := task.CreateAITask(form, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
if err != nil {
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
@@ -49,7 +25,7 @@ func CreateAITask(ctx *context.Context, form ai_task_entity.CreateReq) {
}
func DelAITask(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, _ := task.GetAITaskByCloudbrainId(id)
t, _ := task.GetAITaskTemplateByCloudbrainId(id)
if t == nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.PARAM_ERROR, ctx))
@@ -65,7 +41,7 @@ func DelAITask(ctx *context.Context) {
}
func StopAITask(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.PARAM_ERROR, ctx))
@@ -81,36 +57,7 @@ func StopAITask(ctx *context.Context) {
}
func RestartAITask(ctx *context.Context) {
id := ctx.QueryInt64("id")
cloudbrain, err := models.GetCloudbrainByCloudbrainID(id)
if err != nil {
log.Error("RestartAITask GetCloudbrainByJobID err.%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.AI_TASK_NOT_EXISTS, ctx))
return
}
t, bizErr := task.GetAITaskFromCloudbrain(cloudbrain)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.PARAM_ERROR, ctx))
return
}

lockOperator, errMsg := cloudbrainService.Lock4CloudbrainRestart(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{DisplayJobName: cloudbrain.DisplayJobName, JobType: cloudbrain.JobType}, User: ctx.User})
defer func() {
if lockOperator != nil {
lockOperator.Unlock()
}
}()
if errMsg != "" {
log.Error("lock processed failed:%s", errMsg)
ctx.JSON(http.StatusOK, response.OuterServerError(ctx.Tr(errMsg)))
return
}
res, bizErr := t.Restart(&creation_context.CreationContext{
GitRepo: ctx.Repo.GitRepo,
Repository: ctx.Repo.Repository,
User: ctx.User,
SourceCloudbrain: cloudbrain,
})
res, bizErr := task.RestartAITask(id, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
if bizErr != nil {
ctx.JSON(http.StatusOK, response.OuterTrBizError(bizErr, ctx))
return
@@ -121,7 +68,7 @@ func RestartAITask(ctx *context.Context) {

func GetAITaskLog(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
@@ -133,18 +80,63 @@ func GetAITaskLog(ctx *context.Context) {

func GetAITaskInfo(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
cloudbrain, bizErr := models.GetCloudbrainByCloudbrainID(id)
if bizErr != nil {
log.Error("GetAITaskInfo GetCloudbrainByCloudbrainID err.%v", bizErr)
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.AI_TASK_NOT_EXISTS, ctx))
return
}
t, err := task.GetAITaskTemplateFromCloudbrain(cloudbrain)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
resultTask, err := t.Query(id)
if err != nil {
log.Error("Query error.id=%d err=%v", id, err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
//国际化
resultTask.Tr(ctx.Language())
//根据权限去掉数据集和模型信息
var operatorId int64
if ctx.User != nil {
operatorId = ctx.User.ID
}
if operatorId == 0 || cloudbrain.UserID != operatorId {
resultTask.RemoveDatasets()
resultTask.RemovePretrainModelList()
}
//加载关联版本
earlyVersionList, bizErr := task.QueryTaskEarlyVersionList(id, operatorId)
if bizErr != nil {
log.Error("QueryTaskEarlyVersionList err.id=%d err=%v", id, err)
ctx.JSON(http.StatusOK, response.OuterResponseError(bizErr))
return
}
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(&entity.QueryAITaskRes{
Task: resultTask,
EarlyVersionList: earlyVersionList,
CanCreateVersion: cloudbrain.CanUserModify(ctx.User),
}))
}
func GetAITaskBriefInfo(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
res, err := t.Query(id)
res, err := t.BriefQuery(id)
if err != nil {
log.Error("Query error.%v", err)
log.Error("BriefQuery error.%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
res.Tr(ctx.Language())
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(res))
}

@@ -154,35 +146,44 @@ func GetAITaskOutput(ctx *context.Context) {

func GetNotebookUrl(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskByCloudbrainId(id)
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
url, err := t.GetDebugUrl(id)
fileName := ctx.QueryTrim("file")
url, err := t.GetDebugUrl(id, fileName)
if err != nil {
log.Error("GetNotebookUrl error.%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}

m := map[string]interface{}{"url": url}
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(m))
}

func GetCreationRequiredInfo(ctx *context.Context) {
jobType := ctx.Query("job_type")
var isOnlineType bool
if models.JobType(jobType) == (models.JobTypeOnlineInference) {
isOnlineType = true
jobType = string(models.JobTypeDebug)
}
log.Info("required jobType=" + jobType)
computeSourceName := ctx.Query("compute_source")
clusterType := ctx.Query("cluster_type")
computeSource := models.GetComputeSourceInstance(computeSourceName)

result, err := task.GetAITaskCreationInfo(ai_task_entity.GetAITaskCreationInfoReq{
result, err := task.GetAITaskCreationInfo(entity.GetAITaskCreationInfoReq{
User: ctx.User,
JobType: models.JobType(jobType),
ClusterType: ai_task_entity.ClusterType(clusterType),
ClusterType: entity.ClusterType(clusterType),
ComputeSource: computeSource,
Repo: ctx.Repo.Repository,
GitRepo: ctx.Repo.GitRepo,
IsOnlineType: isOnlineType,
})
if err != nil {
log.Error("GetAITaskCreationInfo error,err=%v", err)
@@ -192,7 +193,56 @@ func GetCreationRequiredInfo(ctx *context.Context) {
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(result))
}

func handCreateReq(req *ai_task_entity.CreateReq) {
func GetAITaskList(ctx *context.Context) {
jobType := ctx.Query("job_type")
computeSourceName := ctx.Query("compute_source")
page := ctx.QueryInt("page")
computeSource := models.GetComputeSourceInstance(computeSourceName)
if page <= 0 {
page = 1
}
jobTypes := make([]string, 0)
if jobType != "" {
jobTypes = append(jobTypes, jobType)
}
result, err := task.GetAITaskList(entity.GetTaskListReq{
ListOptions: models.ListOptions{
PageSize: setting.UI.IssuePagingNum,
Page: page,
},
ComputeSource: computeSource,
JobTypes: jobTypes,
RepoID: ctx.Repo.Repository.ID,
Operator: ctx.User,
IsRepoOwner: ctx.Repo.IsOwner(),
})
if err != nil {
log.Error("GetAITaskList error,err=%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
result.CanCreateTask = cloudbrain.CanCreateOrDebugJob(ctx)
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(result))
}

func GetAITaskOperationProfile(ctx *context.Context) {
id := ctx.QueryInt64("id")
t, err := task.GetAITaskTemplateByCloudbrainId(id)
if err != nil {
log.Error("param error")
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
r, err := t.GetOperationProfile(id)
if err != nil {
log.Error("GetOperationProfile error.%v", err)
ctx.JSON(http.StatusOK, response.OuterTrBizError(err, ctx))
return
}
ctx.JSON(http.StatusOK, response.OuterSuccessWithData(r))
}

func handCreateReq(req *entity.CreateReq) {
req.JobName = util.ConvertDisplayJobNameToJobName(req.DisplayJobName)
if req.WorkServerNumber == 0 {
req.WorkServerNumber = 1


+ 1
- 0
routers/ai_task/notebook.go View File

@@ -0,0 +1 @@
package ai_task

+ 75
- 13
routers/api/v1/api.go View File

@@ -59,10 +59,15 @@
package v1

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/routers/response"
"net/http"
"strings"

"code.gitea.io/gitea/entity/ai_task_entity"


cloudbrainService "code.gitea.io/gitea/services/cloudbrain"

"code.gitea.io/gitea/routers/ai_task"

"code.gitea.io/gitea/routers/api/v1/finetune"
@@ -123,6 +128,29 @@ func sudo() macaron.Handler {
}
}

func reqAITaskInRepo() macaron.Handler {
return func(ctx *context.APIContext) {
if ctx.Repo == nil {
ctx.Context.Error(http.StatusUnauthorized)
return
}
id := ctx.QueryInt64("id")
if id <= 0 {
ctx.Context.Error(http.StatusUnauthorized)
return
}
t, err := models.GetCloudbrainByCloudbrainID(id)
if err != nil {
ctx.Context.Error(http.StatusUnauthorized)
return
}
if t.RepoID != ctx.Repo.Repository.ID {
ctx.Context.Error(http.StatusUnauthorized)
return
}
}
}

func repoAssignment() macaron.Handler {
return func(ctx *context.APIContext) {
userName := ctx.Params(":username")
@@ -341,6 +369,15 @@ func reqWeChat() macaron.Handler {
}
}

func reqWeChatStandard() macaron.Handler {
return func(ctx *context.Context) {
if setting.WechatAuthSwitch && ctx.User.WechatOpenId == "" {
ctx.JSON(http.StatusOK, response.OuterTrBizError(response.WECHAT_NOT_BIND, ctx))
return
}
}
}

// reqAnyRepoReader user should have any permission to read repository or permissions of site admin
func reqAnyRepoReader() macaron.Handler {
return func(ctx *context.Context) {
@@ -610,17 +647,22 @@ func RegisterRoutes(m *macaron.Macaron) {

m.Group("/:username/:reponame", func() {
m.Group("/ai_task", func() {
m.Post("/create", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), bind(ai_task_entity.CreateReq{}), ai_task.CreateAITask)
m.Get("", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetAITaskInfo)
m.Post("/stop", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), reqAdminOrOwnerAITaskCreator(), ai_task.StopAITask)
m.Post("/del", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), reqAdminOrOwnerAITaskCreator(), ai_task.DelAITask)
m.Post("/restart", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), reqAdminOrAITaskCreator(), ai_task.RestartAITask)
m.Get("/log", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetAITaskLog)
m.Get("/output", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetAITaskOutput)
m.Get("/debug_url", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetNotebookUrl)
m.Get("/creation/required", reqWeChat(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetCreationRequiredInfo)
}, context.RepoRef())
}, reqToken(), repoAssignment())
m.Post("/create", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), bind(entity.CreateReq{}), ai_task.CreateAITask)
m.Post("/stop", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), reqAdminOrOwnerAITaskCreator(), ai_task.StopAITask)
m.Post("/del", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), reqAdminOrOwnerAITaskCreator(), ai_task.DelAITask)
m.Post("/restart", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), reqAdminOrAITaskCreator(), ai_task.RestartAITask)
m.Get("/log", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskLog)
m.Get("/output", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskOutput)
m.Get("/debug_url", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetNotebookUrl)
m.Get("/creation/required", reqWeChatStandard(), reqRepoWriter(models.UnitTypeCloudBrain), ai_task.GetCreationRequiredInfo)
}, reqToken(), context.RepoRef())
m.Group("/ai_task", func() {
m.Get("", reqRepoReader(models.UnitTypeCloudBrain), ai_task.GetAITaskInfo)
m.Get("/brief", reqRepoReader(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskBriefInfo)
m.Get("/list", reqRepoReader(models.UnitTypeCloudBrain), ai_task.GetAITaskList)
m.Get("/operation_profile", reqRepoReader(models.UnitTypeCloudBrain), reqAITaskInRepo(), ai_task.GetAITaskOperationProfile)
})
}, repoAssignment())
// Miscellaneous
if setting.API.EnableSwagger {
m.Get("/swagger", misc.Swagger)
@@ -670,6 +712,12 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/complete_multipart", repo.CompleteMultipart)

}, reqToken())
m.Group("/attachments/model", func() {
m.Get("/get_chunks", repo.GetModelChunks)
m.Get("/new_multipart", repo.NewModelMultipart)
m.Get("/get_multipart_url", repo.GetModelMultipartUploadUrl)
m.Post("/complete_multipart", repo.CompleteModelMultipart)
})
m.Group("/pipeline", func() {
m.Post("/notification", bind(api.PipelineNotification{}), notify.PipelineNotify)

@@ -748,6 +796,9 @@ func RegisterRoutes(m *macaron.Macaron) {

//cloudbrain board
m.Get("/cloudbrainboard/cloudbrain/resource_queues", repo.GetResourceQueues)
m.Get("/cloudbrainboard/ai_center_overview", repo.GetCloubrainOverviewGroupByAiCenter)
m.Get("/cloudbrainboard/location", cloudbrainService.GetCloudbrainLocationInfo)

m.Group("/cloudbrainboard", func() {
m.Get("/downloadAll", repo.DownloadCloudBrainBoard)
m.Group("/cloudbrain", func() {
@@ -870,11 +921,16 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/my_datasets", repo.MyDatasetsMultiple)
m.Get("/public_datasets", repo.PublicDatasetMultiple)
m.Get("/my_favorite", repo.MyFavoriteDatasetMultiple)
m.Group("/model", func() {
m.Get("/getmodelfile", repo.GetDataSetSelectItemByJobId)
m.Get("/getprogress", repo.GetExportDataSetByMsgId)
m.Post("/export_exist_dataset", repo.ExportModelToExistDataSet)
})
}, reqToken(), repoAssignment())

m.Group("/file_notebook", func() {
m.Get("", repo.GetFileNoteBookInfo)
m.Post("/create", reqToken(), reqWeChat(), bind(api.CreateFileNotebookJobOption{}), repo.CreateFileNoteBook)
m.Post("/create", reqToken(), reqWeChatStandard(), bind(api.CreateFileNotebookJobOption{}), repo.CreateFileNoteBook)
m.Post("/status", reqToken(), bind(api.CreateFileNotebookJobOption{}), repo.FileNoteBookStatus)
})

@@ -1179,6 +1235,8 @@ func RegisterRoutes(m *macaron.Macaron) {
}, reqRepoReader(models.UnitTypeCloudBrain))
m.Group("/modelmanage", func() {
m.Post("/create_new_model", repo.CreateNewModel)
m.Post("/create_local_model", repo.SaveLocalModel)
m.Delete("/delete_model_file", repo.DeleteModelFile)
m.Get("/show_model_api", repo.ShowModelManageApi)
m.Delete("/delete_model", repo.DeleteModel)
m.Get("/downloadall", repo.DownloadModel)
@@ -1225,6 +1283,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/del_version", repo.DelTrainJobVersion)
m.Post("/stop_version", repo.StopTrainJobVersion)
m.Get("/result_list", repo.ResultList)
m.Get("/downloadall", repo.DownloadMultiResultFile)
})
})
}, reqRepoReader(models.UnitTypeCloudBrain))
@@ -1239,8 +1298,11 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.GrampusStopJob)
m.Get("/log", repo_ext.GrampusGetLog)
m.Get("/metrics", repo_ext.GrampusMetrics)
m.Get("/metrics/:nodeId", repo_ext.GrampusMetrics)
m.Get("/log/:nodeId", repo_ext.GrampusGetLog)
m.Get("/download_multi_model", cloudbrain.AdminOrJobCreaterRightForTrain, repo.MultiModelDownload)
m.Get("/download_log", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
m.Get("/download_log/:nodeId", cloudbrain.AdminOrJobCreaterRightForTrain, repo_ext.GrampusDownloadLog)
m.Get("/job_event", repo_ext.GrampusTrainJobEvents)
})
})


+ 19
- 2
routers/api/v1/finetune/panguervice.go View File

@@ -199,9 +199,26 @@ func SyncPanguDeployStatus() {

// GetPanguDeployStatus returns the deployment status and finish time of the
// Pangu fine-tune job identified by the ":jobid" URL parameter.
// Model-layer errors are reported as an HTTP 200 response carrying an error
// message body, matching this API's existing error convention.
func GetPanguDeployStatus(ctx *context.APIContext) {
	var jobID = ctx.Params(":jobid")

	status, err := models.GetModelartsDeployStatusByJobID(jobID)
	if err != nil {
		// Two verbs, two args (the original passed three args to two verbs).
		log.Info("panguService: GetPanguDeployStatus, jobID %s, err %v", jobID, err)
		ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
		return
	}

	finishTime, err := models.GetModelartsDeployFinishTimebyJobID(jobID)
	if err != nil {
		log.Info("panguService: GetModelartsDeployFinishTimebyJobID, jobID %s, err %v", jobID, err)
		ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
		return
	}

	log.Info("panguService: GetPanguDeployStatus, jobID %s, status %s, finishTime %s", jobID, status, finishTime)

	// One key per field; the original duplicated "fineTuneDeployStatus".
	ctx.JSON(http.StatusOK, map[string]interface{}{
		"fineTuneDeployStatus":     status,
		"fineTuneDeployFinishTime": finishTime,
	})
}



+ 53
- 2
routers/api/v1/repo/attachments.go View File

@@ -2,6 +2,7 @@ package repo

import (
"net/http"
"sync"

"code.gitea.io/gitea/modules/log"

@@ -10,6 +11,8 @@ import (
routeRepo "code.gitea.io/gitea/routers/repo"
)

var mutex *sync.Mutex = new(sync.Mutex)

func GetSuccessChunks(ctx *context.APIContext) {
if errStr := checkDatasetPermission(ctx); errStr != "" {
ctx.JSON(http.StatusForbidden, ctx.Tr(errStr))
@@ -47,9 +50,34 @@ func checkDatasetPermission(ctx *context.APIContext) string {

// NewMultipart starts a multipart dataset upload through the SDK API.
// All failures are reported as HTTP 200 with result_code "-1" so SDK clients
// can read a uniform JSON payload. The body is serialized by a mutex so the
// per-user flow-control checks cannot race with each other.
func NewMultipart(ctx *context.APIContext) {
	if errStr := checkDatasetPermission(ctx); errStr != "" {
		ctx.JSON(200, map[string]string{
			"result_code": "-1",
			"msg":         ctx.Tr(errStr),
		})
		return
	}
	// Global flow control for SDK-initiated uploads.
	if err := routeRepo.CheckFlowForDatasetSDK(); err != nil {
		ctx.JSON(200, map[string]string{
			"result_code": "-1",
			"msg":         err.Error(),
		})
		return
	}
	mutex.Lock()
	defer mutex.Unlock()
	datasetId := ctx.QueryInt64("dataset_id")
	fileName := ctx.Query("file_name")
	re, err := routeRepo.NewMultipartForApi(ctx.Context, true)
	if err != nil {
		ctx.JSON(200, map[string]string{
			"result_code": "-1",
			"msg":         err.Error(),
		})
		return
	}
	// Track the in-flight upload so flow control can count it; the stray
	// trailing call to routeRepo.NewMultipart left by the merge is removed,
	// as it would start a second upload after the response was written.
	routeRepo.AddFileNameToCache(datasetId, fileName, ctx.User.ID)
	re["result_code"] = "0"
	ctx.JSON(200, re)
}
func GetMultipartUploadUrl(ctx *context.APIContext) {
if errStr := checkDatasetPermission(ctx); errStr != "" {
@@ -62,9 +90,32 @@ func CompleteMultipart(ctx *context.APIContext) {
if errStr := checkDatasetPermission(ctx); errStr != "" {
ctx.JSON(http.StatusForbidden, ctx.Tr(errStr))
}
datasetId := ctx.QueryInt64("dataset_id")
fileName := ctx.Query("file_name")
routeRepo.RemoveFileFromCache(datasetId, fileName, ctx.User.ID)
routeRepo.CompleteMultipart(ctx.Context)

}
// GetAttachment serves an attachment download; delegates to the web handler.
func GetAttachment(ctx *context.APIContext) {
routeRepo.GetAttachment(ctx.Context)
}

// GetModelChunks reports which chunks of a model-file upload have already
// been received; delegates to the web handler.
func GetModelChunks(ctx *context.APIContext) {
log.Info("GetModelChunks by api.")
routeRepo.GetModelChunks(ctx.Context)
}

// NewModelMultipart starts a multipart model-file upload; delegates to the
// web handler.
func NewModelMultipart(ctx *context.APIContext) {
log.Info("NewModelMultipart by api.")
routeRepo.NewModelMultipart(ctx.Context)
}

// GetModelMultipartUploadUrl returns the pre-signed URL for uploading one
// chunk of a model file; delegates to the web handler.
func GetModelMultipartUploadUrl(ctx *context.APIContext) {
log.Info("GetModelMultipartUploadUrl by api.")
routeRepo.GetModelMultipartUploadUrl(ctx.Context)
}

// CompleteModelMultipart finalizes a multipart model-file upload; delegates
// to the web handler.
func CompleteModelMultipart(ctx *context.APIContext) {
log.Info("CompleteModelMultipart by api.")
routeRepo.CompleteModelMultipart(ctx.Context)
}

+ 175
- 0
routers/api/v1/repo/cloudbrain.go View File

@@ -7,7 +7,12 @@ package repo

import (
"bufio"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/util"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/task"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net/http"
@@ -111,7 +116,123 @@ func GeneralCloudBrainJobStop(ctx *context.APIContext) {
}
// CreateFileNoteBook handles the file-notebook creation API. It first runs
// the legacy creation path; when that path has already written a response
// nothing more is done, otherwise the request falls through to the new
// AI-task based creation flow.
// NOTE(review): both cloudbrainTask.FileNotebookCreate and
// CreateFileNotebookTask can create a task here — confirm the legacy call is
// intended to remain as a gate/fallback and cannot create duplicates.
func CreateFileNoteBook(ctx *context.APIContext, option api.CreateFileNotebookJobOption) {
cloudbrainTask.FileNotebookCreate(ctx.Context, option)
if ctx.Written() {
return
}
CreateFileNotebookTask(ctx.Context, option)
}

// CreateFileNotebookTask creates a debug (notebook) AI task that opens a
// single repository file. Defaults target the GPU spec/image on cluster
// CloudbrainOne; option.Type selects CPU (0) or NPU (> GPUType, which
// switches to CloudbrainTwo and, when ModelArts CD is enabled, to the CD
// spec/image). All failures are answered with HTTP 200 plus an error body.
func CreateFileNotebookTask(ctx *context.Context, option api.CreateFileNotebookJobOption) {
displayJobName := cloudbrainService.GetDisplayJobName(ctx.User.Name)
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
jobType := models.JobTypeDebug
// GPU defaults; overridden below for CPU / NPU requests.
specId := setting.FileNoteBook.SpecIdGPU
ComputeSource := models.GPU
imageUrl := setting.FileNoteBook.ImageGPU
imageId := ""
imageName := imageUrl
cluster := entity.OpenICloudbrainOne

// Type 0 means CPU: only the spec changes, the GPU image is reused.
if option.Type == 0 {
specId = setting.FileNoteBook.SpecIdCPU
imageName = imageUrl
}
// Types above GPUType are NPU tasks.
if option.Type > cloudbrainTask.GPUType {
imageId = setting.FileNoteBook.ImageIdNPU
imageName = setting.FileNoteBook.ImageNPUDescription
imageUrl = ""
imageNpu, err := getNpuImageId(option)
if err != nil {
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.parameter_is_wrong")))
return
}
// A user-selected NPU image overrides the configured default.
if imageNpu != nil {
imageId = imageNpu.Id
imageName = imageNpu.Value
}
ComputeSource = models.NPU
specId = setting.FileNoteBook.SpecIdNPU
if setting.ModelartsCD.Enabled {
specId = setting.FileNoteBook.SpecIdNPUCD
imageName = setting.FileNoteBook.ImageNPUCDDescription
}

cluster = entity.OpenICloudbrainTwo
}

// The repository that owns the file to be opened in the notebook.
sourceRepo, err := models.GetRepositoryByOwnerAndName(option.OwnerName, option.ProjectName)
if err != nil {
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.notebook_file_not_exist")))
return
}
// The task itself is attached to the user's dedicated file-notebook repo.
repo, _ := models.GetRepositoryByName(ctx.User.ID, setting.FileNoteBook.ProjectName)
if repo == nil {
log.Error("default file repository not exists")
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("system error"))
return
}

res, bizErr := task.CreateAITask(entity.CreateReq{
JobType: jobType,
DisplayJobName: displayJobName,
JobName: jobName,
SpecId: specId,
ComputeSourceStr: ComputeSource,
Cluster: cluster,
WorkServerNumber: 1,
ImageUrl: imageUrl,
ImageName: imageName,
ImageID: imageId,
BootFile: cloudbrainTask.GetBootFile(option.File, option.OwnerName, option.ProjectName, option.BranchName),
FileRepository: sourceRepo,
FileBranchName: option.BranchName,
IsFileNoteBookRequest: true,
Description: getDescription(option),
}, nil, repo, ctx.User)

code := 0

if bizErr != nil {
// Code 2 signals "a task of this kind already exists"; 1 is a generic failure.
switch bizErr.Code {
case response.MULTI_TASK.Code:
code = 2
default:
code = 1
}
ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: code, Message: ctx.Tr(bizErr.TrCode)})
return
}
ctx.JSON(http.StatusOK, models.BaseMessageApi{
Code: code,
Message: fmt.Sprint(res.ID),
})
}

// CharacterLength caps the length of an auto-generated notebook description.
const CharacterLength = 2550

// getDescription composes "<owner>/<project>/<branch>/<file>" as the task
// description; when that string would exceed CharacterLength, an empty
// description is used instead.
func getDescription(option api.CreateFileNotebookJobOption) string {
	description := option.OwnerName + "/" + option.ProjectName + "/" + option.BranchName + "/" + option.File
	if len(description) > CharacterLength {
		return ""
	}
	return description
}

// getNpuImageId resolves the user-requested NPU image name to its configured
// image entry. It returns (nil, nil) when no image was requested, and an
// error when the request is not an NPU request or names an unknown image.
func getNpuImageId(option api.CreateFileNotebookJobOption) (*setting.ImageInfoModelArts, error) {
	switch {
	case option.Type != cloudbrainTask.NPUType:
		return nil, fmt.Errorf("type is not npu.")
	case option.Image == "":
		return nil, nil
	}
	for _, info := range setting.StImageInfos.ImageInfo {
		if info.Value == option.Image {
			return info, nil
		}
	}
	return nil, fmt.Errorf("invalid image parameter")
}

// FileNoteBookStatus reports the status of a file-notebook job; delegates to
// the cloudbrain task service.
func FileNoteBookStatus(ctx *context.APIContext, option api.CreateFileNotebookJobOption) {
cloudbrainTask.FileNotebookStatus(ctx.Context, option)
}
@@ -224,6 +345,36 @@ func GrampusNoteBookDebug(ctx *context.APIContext) {

}
// GrampusNotebookRestart restarts a Grampus notebook task. Tasks created by
// the new AI-task pipeline are restarted through the task service and
// answered directly (result_code "0" on success, "-1" with a translated
// message on failure); legacy tasks fall through to the old handler.
func GrampusNotebookRestart(ctx *context.APIContext) {
	var id = ctx.Params(":id")
	var resultCode = "-1"
	var errorMsg = ""
	var status = ""

	t := ctx.Cloudbrain
	if t.IsNewAITask() {
		res, bizErr := task.RestartAITask(t.ID, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
		if bizErr != nil {
			// Log message fixed: the original said "lRestartAITask".
			log.Error("RestartAITask failed:task.ID=%d err=%v", t.ID, bizErr.DefaultMsg)
			errorMsg = ctx.Tr(bizErr.TrCode)
		} else {
			id = strconv.FormatInt(res.ID, 10)
			status = res.Status
			resultCode = "0"
		}
		// Single response construction for both outcomes (was duplicated).
		ctx.JSON(200, map[string]string{
			"result_code": resultCode,
			"error_msg":   errorMsg,
			"status":      status,
			"id":          id,
		})
		return
	}
	cloudbrainTask.GrampusNotebookRestart(ctx.Context)
}

@@ -233,6 +384,15 @@ func GrampusStopJob(ctx *context.APIContext) {
}

func GrampusNotebookDel(ctx *context.APIContext) {
if isHandled, err := task.HandleNewAITaskDelete(ctx.Cloudbrain.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", ctx.Cloudbrain.JobName, err, ctx.Data["msgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}
ctx.JSON(http.StatusOK, models.BaseOKMessageApi)
return
}
err := cloudbrainTask.DeleteGrampusJob(ctx.Context)
if err != nil {
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
@@ -293,6 +453,21 @@ func GetCloudbrainTask(ctx *context.APIContext) {
ctx.NotFound(err)
return
}

if job.IsNewAITask() {
jobAfter, _ := task.UpdateCloudbrain(job)
ctx.JSON(http.StatusOK, map[string]interface{}{
"ID": ID,
"JobName": jobAfter.JobName,
"JobStatus": jobAfter.Status,
"SubState": "",
"CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
"CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
"JobDuration": jobAfter.TrainJobDuration,
})
return
}

if job.JobType == string(models.JobTypeModelSafety) {
routerRepo.GetAiSafetyTaskByJob(job)
job, err = models.GetCloudbrainByID(ID)


+ 109
- 0
routers/api/v1/repo/cloudbrain_dashboard.go View File

@@ -4,10 +4,13 @@ import (
"fmt"
"net/http"
"net/url"
"sort"
"strconv"
"strings"
"time"

"code.gitea.io/gitea/modules/setting"

"code.gitea.io/gitea/services/cloudbrain/resource"

"code.gitea.io/gitea/models"
@@ -163,6 +166,112 @@ func GetOverviewDuration(ctx *context.Context) {
})
}

// GetCloubrainOverviewGroupByAiCenter responds with per-AI-center card-time /
// job-count statistics plus the map location data used by the big-screen
// overview page. Detailed numbers are only included when the screen-map
// configuration allows it or the requesting user is a site admin.
func GetCloubrainOverviewGroupByAiCenter(ctx *context.Context) {
	cloudbrainCardTimeAndCountArray, err := models.GetCloudbrainCardTimeAndCountGroupByAICenter()
	if err != nil {
		// Continue with an empty result set: the static location data below
		// can still be rendered even when the statistics query fails.
		log.Error("Can not query CardTimeAndCount.", err)
	}

	cardTimeMap, maxCardTime, _ := getCenterCardTimeInfo(cloudbrainCardTimeAndCountArray)

	const AI_CENTER = "智算中心"
	aiCenterLocationInfos := make(map[string][]*cloudbrainService.AiCenterLocationInfo)

	for _, value := range setting.AiCenterCodeAndNameAndLocMapInfo {
		long, lat := getLongLat(value.Loc)
		// append on a nil slice allocates it, so no make-if-missing is needed.
		switch value.Type {
		case "超算中心", "东数西算":
			// Non-AI centers always get the minimum marker size.
			aiCenterLocationInfos[value.Type] = append(aiCenterLocationInfos[value.Type], &cloudbrainService.AiCenterLocationInfo{
				Name:      cloudbrainService.GetAiCenterShowByAiCenterId(value.Name, ctx),
				Longitude: long,
				Latitude:  lat,
				Value:     setting.ScreenMap.MinValue,
			})
		case AI_CENTER:
			// AI centers are scaled by their accumulated card time.
			aiCenterLocationInfos[value.Type] = append(aiCenterLocationInfos[value.Type], &cloudbrainService.AiCenterLocationInfo{
				Name:      cloudbrainService.GetAiCenterShowByAiCenterId(value.Name, ctx),
				Longitude: long,
				Latitude:  lat,
				Value:     getAiCenterSize(value.Name, cardTimeMap, maxCardTime, 0),
			})
		}
	}

	// Biggest centers first so the front end can highlight them.
	sort.SliceStable(aiCenterLocationInfos[AI_CENTER], func(i, j int) bool {
		return aiCenterLocationInfos[AI_CENTER][i].Value > aiCenterLocationInfos[AI_CENTER][j].Value
	})

	if setting.ScreenMap.ShowData || ctx.IsUserSiteAdmin() {
		// Replace internal center ids with their display names before output.
		for _, cloudbrainCardTimeAndCountMap := range cloudbrainCardTimeAndCountArray {
			centerId := cloudbrainCardTimeAndCountMap["ai_center"]
			cloudbrainCardTimeAndCountMap["ai_center"] = cloudbrainService.GetAiCenterShowByAiCenterId(centerId, ctx)
		}
		ctx.JSON(http.StatusOK, map[string]interface{}{
			"cardAndJobCount": cloudbrainCardTimeAndCountArray,
			"locationInfo":    aiCenterLocationInfos,
		})
		return
	}

	// Statistics hidden: send empty counts but keep the location markers.
	ctx.JSON(http.StatusOK, map[string]interface{}{
		"cardAndJobCount": []map[string]string{},
		"locationInfo":    aiCenterLocationInfos,
	})
}

// getAiCenterSize maps a center's accumulated card time onto the configured
// [MinValue, MaxValue] marker-size range of the screen map.
// Centers with no recorded card time get the minimum size; when all centers
// share the same card time (max == min) the maximum size is used.
func getAiCenterSize(name string, timeMap map[string]int64, MaxCardTime int64, MinCardTime int64) int {
	cardTime := timeMap[name] // missing key yields 0, same as the comma-ok form
	if cardTime == 0 {
		return setting.ScreenMap.MinValue
	}
	if MaxCardTime == MinCardTime {
		return setting.ScreenMap.MaxValue
	}
	// Linear interpolation into the configured marker-size range.
	ratio := float64(cardTime-MinCardTime) / float64(MaxCardTime-MinCardTime)
	return int(ratio*float64(setting.ScreenMap.MaxValue-setting.ScreenMap.MinValue)) + setting.ScreenMap.MinValue
}

// getLongLat splits a "longitude,latitude" pair into its two components.
// Malformed input (anything other than exactly one comma) yields two empty
// strings.
func getLongLat(loc string) (string, string) {
	parts := strings.Split(loc, ",")
	if len(parts) == 2 {
		return parts[0], parts[1]
	}
	return "", ""
}

// getCenterCardTimeInfo turns the raw per-center statistic rows into a
// center-id -> card-time map. It also returns the card time of the first and
// the last row; the rows are expected to arrive sorted by card time in
// descending order, making these the maximum and minimum respectively
// (both are 0 for an empty input).
func getCenterCardTimeInfo(cloudbrainCardTimeAndCountArray []map[string]string) (map[string]int64, int64, int64) {
	centerCardTimeMap := make(map[string]int64, len(cloudbrainCardTimeAndCountArray))
	for _, row := range cloudbrainCardTimeAndCountArray {
		cardTime, _ := strconv.ParseInt(row["card_duration"], 10, 64)
		centerCardTimeMap[row["ai_center"]] = cardTime
	}
	var maxCardTime, minCardTime int64
	if n := len(cloudbrainCardTimeAndCountArray); n > 0 {
		maxCardTime, _ = strconv.ParseInt(cloudbrainCardTimeAndCountArray[0]["card_duration"], 10, 64)
		minCardTime, _ = strconv.ParseInt(cloudbrainCardTimeAndCountArray[n-1]["card_duration"], 10, 64)
	}
	return centerCardTimeMap, maxCardTime, minCardTime
}

func GetCloudbrainCardDuration(task models.Cloudbrain) string {
cardNum := int(0)
spec, err := resource.GetCloudbrainSpec(task.ID)


+ 13
- 0
routers/api/v1/repo/datasets.go View File

@@ -12,6 +12,7 @@ import (
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
routerRepo "code.gitea.io/gitea/routers/repo"
)

func PublicDatasetMultiple(ctx *context.APIContext) {
@@ -143,3 +144,15 @@ func getSearchOrderByInValues(datasetIds []int64) models.SearchOrderBy {
searchOrderBy += " ELSE 0 END"
return models.SearchOrderBy(searchOrderBy)
}

// GetDataSetSelectItemByJobId lists model files selectable for dataset
// export for a given job; delegates to the web handler.
func GetDataSetSelectItemByJobId(ctx *context.APIContext) {
routerRepo.GetDataSetSelectItemByJobId(ctx.Context)
}

// GetExportDataSetByMsgId reports the progress of a dataset export by its
// message id; delegates to the web handler.
func GetExportDataSetByMsgId(ctx *context.APIContext) {
routerRepo.GetExportDataSetByMsgId(ctx.Context)
}

// ExportModelToExistDataSet exports model output files into an existing
// dataset; delegates to the web handler.
func ExportModelToExistDataSet(ctx *context.APIContext) {
routerRepo.ExportModelToExistDataSet(ctx.Context)
}

+ 15
- 4
routers/api/v1/repo/modelarts.go View File

@@ -15,6 +15,7 @@ import (
"time"

"code.gitea.io/gitea/services/ai_task_service/schedule"
"code.gitea.io/gitea/services/ai_task_service/task"

"code.gitea.io/gitea/routers/response"

@@ -51,11 +52,16 @@ func GetModelArtsNotebook2(ctx *context.APIContext) {
return
}
if !job.Cleared {
err = modelarts.HandleNotebookInfo(job)
if err != nil {
ctx.NotFound(err)
return
if job.IsNewAITask() {
job, _ = task.UpdateCloudbrain(job)
} else {
err = modelarts.HandleNotebookInfo(job)
if err != nil {
ctx.NotFound(err)
return
}
}

}
ctx.JSON(http.StatusOK, map[string]interface{}{
"ID": ID,
@@ -652,3 +658,8 @@ func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTr

return result, err
}

// DownloadMultiResultFile downloads multiple training result files at once;
// delegates to the web handler.
func DownloadMultiResultFile(ctx *context.APIContext) {
log.Info("DownloadMultiResultFile by api")
routerRepo.DownloadMultiResultFile(ctx.Context)
}

+ 10
- 0
routers/api/v1/repo/modelmanage.go View File

@@ -187,3 +187,13 @@ func DownloadModeConvertResultFile(ctx *context.APIContext) {
ctx.Context.SetParams("id", ctx.Query("id"))
routerRepo.ModelConvertDownloadModel(ctx.Context)
}

// SaveLocalModel stores a locally uploaded model; delegates to the web
// handler.
func SaveLocalModel(ctx *context.APIContext) {
log.Info("SaveLocalModel by api.")
routerRepo.SaveLocalModel(ctx.Context)
}

// DeleteModelFile removes a single file from a managed model; delegates to
// the web handler.
func DeleteModelFile(ctx *context.APIContext) {
log.Info("DeleteModelFile by api.")
routerRepo.DeleteModelFile(ctx.Context)
}

+ 7
- 0
routers/home.go View File

@@ -55,6 +55,8 @@ const (
tplRepoSquare base.TplName = "explore/repos/square"
tplRepoSearch base.TplName = "explore/repos/search"
tplRoshmci base.TplName = "explore/ros-hmci"

tplExploreCenterMap base.TplName = "explore/center_map"
)

// Home render home page
@@ -541,6 +543,11 @@ func ExploreDatasetsUI(ctx *context.Context) {
ctx.HTML(200, tplExploreDataset)
}

// CenterMapUI renders the AI-center map page of the explore section.
func CenterMapUI(ctx *context.Context) {

ctx.HTML(200, tplExploreCenterMap)
}

func getDatasetOrderBy(ctx *context.Context) models.SearchOrderBy {
var orderBy models.SearchOrderBy
switch ctx.Query("sort") {


+ 1
- 0
routers/private/internal.go View File

@@ -59,6 +59,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/repos/cnt_stat/handle_historical_task", admin.RefreshHistorySpec)
m.Post("/duration_statisctic/history_handle", repo.CloudbrainUpdateHistoryData)
m.Post("/square/repo/stat/refresh", repository.RefreshRepoStatData)
m.Get("/setting/refresh", RefreshSetting)

}, CheckInternalToken)
}

+ 17
- 0
routers/private/setting.go View File

@@ -0,0 +1,17 @@
package private

import (
"code.gitea.io/gitea/modules/setting"
"gitea.com/macaron/macaron"
)

// RefreshSetting reloads the ini configuration from disk and re-parses the
// sections consumed at runtime (screen map, Grampus, ModelArts, ModelArts CD),
// then answers "success". Exposed on the internal API so configuration can be
// refreshed without restarting the service.
// NOTE(review): the error returned by Cfg.Reload (if any) is not checked —
// confirm a failed reload should still re-parse and report success.
func RefreshSetting(ctx *macaron.Context) {

setting.Cfg.Reload()
setting.NewScreenMapConfig()
setting.GetGrampusConfig()
setting.GetModelartsConfig()
setting.GetModelartsCDConfig()
ctx.PlainText(200, []byte("success"))

}

+ 65
- 33
routers/repo/attachment.go View File

@@ -667,23 +667,36 @@ func GetSuccessChunks(ctx *context.Context) {

}

func NewMultipart(ctx *context.Context) {
func NewMultipartForApi(ctx *context.Context, isFlowControl bool) (map[string]string, error) {
if !setting.Attachment.Enabled {
ctx.Error(404, "attachment is not enabled")
return
return nil, errors.New("attachment is not enabled")
}

err := upload.VerifyFileType(ctx.Query("fileType"), strings.Split(setting.Attachment.AllowedTypes, ","))
typeCloudBrain := ctx.QueryInt("type")
fileMD5 := ctx.Query("md5")
fileChunk, err := models.GetFileChunkByMD5AndUser(fileMD5, ctx.User.ID, typeCloudBrain)
if err == nil {
if fileChunk != nil {
log.Info("cannot reupload,name" + ctx.Query("file_name"))
return nil, errors.New("Cannot upload repeatedly,name is " + ctx.Query("file_name"))
}
}
if isFlowControl {
err = CheckFlowForDataset(ctx)
if err != nil {
log.Info("check error," + err.Error())
return nil, err
}
}
err = upload.VerifyFileType(ctx.Query("fileType"), strings.Split(setting.Attachment.AllowedTypes, ","))
if err != nil {
ctx.Error(400, err.Error())
return
log.Info("VerifyFileType error," + err.Error())
return nil, errors.New("Not support file type.")
}

typeCloudBrain := ctx.QueryInt("type")
err = checkTypeCloudBrain(typeCloudBrain)
if err != nil {
ctx.ServerError("checkTypeCloudBrain failed", err)
return
log.Info("checkTypeCloudBrain error," + err.Error())
return nil, err
}

fileName := ctx.Query("file_name")
@@ -691,14 +704,15 @@ func NewMultipart(ctx *context.Context) {
if setting.Attachment.StoreType == storage.MinioStorageType {
totalChunkCounts := ctx.QueryInt("totalChunkCounts")
if totalChunkCounts > minio_ext.MaxPartsCount {
ctx.Error(400, fmt.Sprintf("chunk counts(%d) is too much", totalChunkCounts))
return
log.Info(fmt.Sprintf("chunk counts(%d) is too much", totalChunkCounts))
return nil, errors.New(fmt.Sprintf("chunk counts(%d) is too much", totalChunkCounts))

}

fileSize := ctx.QueryInt64("size")
if fileSize > minio_ext.MaxMultipartPutObjectSize {
ctx.Error(400, fmt.Sprintf("file size(%d) is too big", fileSize))
return
log.Info(fmt.Sprintf("file size(%d) is too big", fileSize))
return nil, errors.New(fmt.Sprintf("file size(%d) is too big", fileSize))
}

uuid := gouuid.NewV4().String()
@@ -706,17 +720,16 @@ func NewMultipart(ctx *context.Context) {
if typeCloudBrain == models.TypeCloudBrainOne {
uploadID, err = storage.NewMultiPartUpload(strings.TrimPrefix(path.Join(setting.Attachment.Minio.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid)), "/"))
if err != nil {
ctx.ServerError("NewMultipart", err)
return
log.Info("NewMultipart " + err.Error())
return nil, err
}
} else {
uploadID, err = storage.NewObsMultiPartUpload(strings.TrimPrefix(path.Join(setting.BasePath, path.Join(uuid[0:1], uuid[1:2], uuid, fileName)), "/"))
if err != nil {
ctx.ServerError("NewObsMultiPartUpload", err)
return
log.Info("NewObsMultiPartUpload " + err.Error())
return nil, err
}
}

_, err = models.InsertFileChunk(&models.FileChunk{
UUID: uuid,
UserID: ctx.User.ID,
@@ -728,18 +741,26 @@ func NewMultipart(ctx *context.Context) {
})

if err != nil {
ctx.Error(500, fmt.Sprintf("InsertFileChunk: %v", err))
return
log.Info(fmt.Sprintf("InsertFileChunk: %v", err))
return nil, err
}

ctx.JSON(200, map[string]string{
return map[string]string{
"uuid": uuid,
"uploadID": uploadID,
})
}, nil
} else {
ctx.Error(404, "storage type is not enabled")
return nil, errors.New("storage type is not enabled")
}

}

// NewMultipart is the web-route entry point for starting a multipart
// attachment upload. It reuses NewMultipartForApi with flow control disabled
// and writes either the upload descriptor (uuid/uploadID) or a server error.
func NewMultipart(ctx *context.Context) {
re, err := NewMultipartForApi(ctx, false)
if err != nil {
ctx.ServerError("NewMultipart failed", err)
return
}
ctx.JSON(200, re)
}

func PutOBSProxyUpload(ctx *context.Context) {
@@ -850,24 +871,31 @@ func CompleteMultipart(ctx *context.Context) {

fileChunk, err := models.GetFileChunkByUUID(uuid)
if err != nil {
if models.IsErrFileChunkNotExist(err) {
ctx.Error(404)
} else {
ctx.ServerError("GetFileChunkByUUID", err)
}
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": "The upload file not found.",
})
return
}

if typeCloudBrain == models.TypeCloudBrainOne {
_, err = storage.CompleteMultiPartUpload(strings.TrimPrefix(path.Join(setting.Attachment.Minio.BasePath, path.Join(fileChunk.UUID[0:1], fileChunk.UUID[1:2], fileChunk.UUID)), "/"), uploadID, fileChunk.TotalChunks)
if err != nil {
ctx.Error(500, fmt.Sprintf("CompleteMultiPartUpload failed: %v", err))
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": fmt.Sprintf("CompleteMultiPartUpload failed: %v", err),
})
//ctx.Error(500, fmt.Sprintf("CompleteMultiPartUpload failed: %v", err))
return
}
} else {
err = storage.CompleteObsMultiPartUpload(strings.TrimPrefix(path.Join(setting.BasePath, path.Join(fileChunk.UUID[0:1], fileChunk.UUID[1:2], fileChunk.UUID, fileName)), "/"), uploadID, fileChunk.TotalChunks)
if err != nil {
ctx.Error(500, fmt.Sprintf("CompleteObsMultiPartUpload failed: %v", err))
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": fmt.Sprintf("CompleteObsMultiPartUpload failed: %v", err),
})
//ctx.Error(500, fmt.Sprintf("CompleteObsMultiPartUpload failed: %v", err))
return
}
}
@@ -876,7 +904,11 @@ func CompleteMultipart(ctx *context.Context) {

err = models.UpdateFileChunk(fileChunk)
if err != nil {
ctx.Error(500, fmt.Sprintf("UpdateFileChunk: %v", err))
ctx.JSON(200, map[string]string{
"result_code": "-1",
"msg": fmt.Sprintf("UpdateFileChunk: %v", err),
})
//ctx.Error(500, fmt.Sprintf("UpdateFileChunk: %v", err))
return
}



+ 10
- 10
routers/repo/attachment_model.go View File

@@ -20,9 +20,9 @@ func GetModelChunks(ctx *context.Context) {
fileMD5 := ctx.Query("md5")
typeCloudBrain := ctx.QueryInt("type")
fileName := ctx.Query("file_name")
scene := ctx.Query("scene")
//scene := ctx.Query("scene")
modeluuid := ctx.Query("modeluuid")
log.Info("scene=" + scene + " typeCloudBrain=" + fmt.Sprint(typeCloudBrain))
log.Info(" typeCloudBrain=" + fmt.Sprint(typeCloudBrain))
var chunks string

err := checkTypeCloudBrain(typeCloudBrain)
@@ -124,14 +124,14 @@ func GetModelChunks(ctx *context.Context) {
})
} else {
ctx.JSON(200, map[string]string{
"uuid": fileChunk.UUID,
"uploaded": strconv.Itoa(fileChunk.IsUploaded),
"uploadID": fileChunk.UploadID,
"chunks": string(chunks),
"attachID": "0",
"datasetID": "0",
"fileName": "",
"datasetName": "",
"uuid": fileChunk.UUID,
"uploaded": strconv.Itoa(fileChunk.IsUploaded),
"uploadID": fileChunk.UploadID,
"chunks": string(chunks),
"attachID": "0",
"datasetID": "0",
"fileName": "",
"modelName": "",
})
}
}


+ 80
- 21
routers/repo/cloudbrain.go View File

@@ -200,14 +200,16 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
}

func CloudBrainNew(ctx *context.Context) {
err := cloudBrainNewDataPrepare(ctx, string(models.JobTypeDebug))
if err != nil {
ctx.ServerError("get new cloudbrain info failed", err)
return
}
ctx.Data["PageIsGPUDebug"] = true
// err := cloudBrainNewDataPrepare(ctx, string(models.JobTypeDebug))
// if err != nil {
// ctx.ServerError("get new cloudbrain info failed", err)
// return
// }
// ctx.Data["PageIsGPUDebug"] = true
ctx.Data["PageIsCloudBrain"] = true
ctx.HTML(200, tplCloudBrainNew)
}

func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
ctx.Data["IsCreate"] = true
cloudBrainCreate(ctx, form)
@@ -703,6 +705,32 @@ func CloudBrainRestart(ctx *context.Context) {
var status = string(models.JobWaiting)
task := ctx.Cloudbrain

if task.IsNewAITask() {
res, bizErr := ai_task.RestartAITask(task.ID, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
if bizErr != nil {
log.Error("RestartAITask failed:task.ID=%d err=%v", task.ID, bizErr.DefaultMsg)
errorMsg = ctx.Tr(bizErr.TrCode)
resultCode = "-1"
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": ID,
})
return
}
id := strconv.FormatInt(res.ID, 10)
status = res.Status
resultCode = "0"
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": id,
})
return
}

lockOperator, errMsg := cloudbrainService.Lock4CloudbrainRestart(&lock.LockContext{Repo: ctx.Repo.Repository, Task: &models.Cloudbrain{JobType: task.JobType}, User: ctx.User})
defer func() {
if lockOperator != nil {
@@ -838,7 +866,9 @@ func CloudBrainBenchMarkShow(ctx *context.Context) {
}

func CloudBrainShow(ctx *context.Context) {
cloudBrainShow(ctx, tplCloudBrainShow, models.JobTypeDebug)
// cloudBrainShow(ctx, tplCloudBrainShow, models.JobTypeDebug)
ctx.Data["PageIsCloudBrain"] = true
ctx.HTML(200, tplCloudBrainShow)
}

func CloudBrainTrainJobShow(ctx *context.Context) {
@@ -871,11 +901,15 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
return
}
if task.Status == string(models.JobWaiting) || task.Status == string(models.JobRunning) {
task, err = cloudbrainTask.SyncCloudBrainOneStatus(task)
if err != nil {
log.Info("error:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
if task.IsNewAITask() {
task, _ = ai_task.UpdateCloudbrain(task)
} else {
task, err = cloudbrainTask.SyncCloudBrainOneStatus(task)
if err != nil {
log.Info("error:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
}

}
@@ -1307,6 +1341,16 @@ func CloudBrainStop(ctx *context.Context) {
resultCode = task.Status
break
}
if res, isHandled, err := ai_task.HandleNewAITaskStop(task.ID); isHandled {
if err != nil {
log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
resultCode = "-1"
errorMsg = "cloudbrain.Stopped_failed"
break
}
status = res.Status
break
}

err := cloudbrain.StopJob(task.JobID)
if err != nil {
@@ -1513,6 +1557,14 @@ func CloudBrainDel(ctx *context.Context) {
func deleteCloudbrainJob(ctx *context.Context) error {
task := ctx.Cloudbrain

if isHandled, err := ai_task.HandleNewAITaskDelete(task.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
return err
}
return nil
}

if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
return errors.New("the job has not been stopped")
@@ -1949,6 +2001,13 @@ func mkPathAndReadMeFile(path string, text string) error {
}

func SyncCloudbrainStatus() {
defer func() {
if err := recover(); err != nil {
combinedErr := fmt.Errorf("%s\n%s", err, log.Stack(2))
log.Error("PANIC:%v", combinedErr)
}
}()

cloudBrains, err := models.GetCloudBrainUnStoppedJob()
if err != nil {
log.Error("GetCloudBrainUnStoppedJob failed:", err.Error())
@@ -1956,12 +2015,18 @@ func SyncCloudbrainStatus() {
}

for _, task := range cloudBrains {

if task.JobType == string(models.JobTypeModelSafety) {
continue
}

if task.IsNewAITask() {
task, _ = ai_task.UpdateCloudbrain(task)
if task.Duration >= setting.MaxDuration && task.JobType == string(models.JobTypeDebug) {
ai_task.StopCloudbrain(task)
}
continue
}
if task.Type == models.TypeCloudBrainOne {

task, err = cloudbrainTask.SyncCloudBrainOneStatus(task)
if err != nil {
log.Error("Sync cloud brain one (%s) failed:%v", task.JobName, err)
@@ -1986,13 +2051,7 @@ func SyncCloudbrainStatus() {
}
} else if task.Type == models.TypeC2Net {
if task.JobType == string(models.JobTypeDebug) {
if task.IsNewAITask() {
ai_task.UpdateCloudbrain(task)
task, _ = models.GetCloudbrainByCloudbrainID(task.ID)
} else {
cloudbrainTask.SyncGrampusNotebookStatus(task)
}

cloudbrainTask.SyncGrampusNotebookStatus(task)
} else {
result, err := grampus.GetJob(task.JobID)
if err != nil {


+ 1
- 0
routers/repo/cloudbrain_statistic.go View File

@@ -15,6 +15,7 @@ import (

func CloudbrainDurationStatisticHour() {
if setting.IsCloudbrainTimingEnabled {
log.Info("CloudbrainDurationStatisticHour start")
var statisticTime time.Time
var count int64
recordDurationUpdateTime, err := models.GetDurationRecordUpdateTime()


+ 5
- 2
routers/repo/dataset.go View File

@@ -173,7 +173,10 @@ func DatasetIndex(ctx *context.Context) {

//load attachment creator
for _, attachment := range pageAttachments {
uploader, _ := models.GetUserByID(attachment.UploaderID)
uploader, err1 := models.GetUserByID(attachment.UploaderID)
if err1 != nil {
log.Info("query dataset user error." + err1.Error())
}
attachment.Uploader = uploader
if !strings.HasSuffix(attachment.Name, ".zip") && !strings.HasSuffix(attachment.Name, ".tar.gz") {
attachment.DecompressState = -1 //非压缩文件
@@ -192,7 +195,7 @@ func DatasetIndex(ctx *context.Context) {
ctx.Data["Type"] = cloudbrainType

renderAttachmentSettings(ctx)
log.Info("dataset index finished.")
ctx.HTML(200, tplIndex)
}



+ 146
- 0
routers/repo/flow_control.go View File

@@ -0,0 +1,146 @@
package repo

import (
"encoding/json"
"errors"
"fmt"
"strconv"
"sync"
"time"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/redis/redis_client"
"code.gitea.io/gitea/modules/setting"
)

const (
REDIS_FLOW_ATTACHMENT_KEY = "flow_attachment_key"
)

var mutex *sync.RWMutex = new(sync.RWMutex)

// CheckFlowForDataset enforces per-user upload quotas before a new dataset
// file upload may start: a cap on files per 24 hours, a cap on uploads
// started within the last 10 minutes, and a cap on total bytes per 24 hours.
// Returns nil when the upload is allowed, otherwise a user-facing error.
func CheckFlowForDataset(ctx *context.Context) error {
if ctx.User == nil {
return errors.New("User not login.")
}
log.Info("start to check flow for upload dataset file.")
fileName := ctx.Query("file_name")
currentTimeNow := time.Now()
currentLongTime := currentTimeNow.Unix()
last24Hour := currentTimeNow.AddDate(0, 0, -1).Unix()
// Quotas are evaluated against the user's file chunks from the last 24h.
// If the query fails, the upload is allowed (fail-open).
filechunks, err := models.GetFileChunksByUserId(ctx.User.ID, last24Hour, true)
if err == nil {
if len(filechunks) > setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR {
log.Info("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR) + " files within the last 24 hours. so " + fileName + " is rejected. user id=" + fmt.Sprint(ctx.User.ID))
return errors.New("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST24HOUR) + " files within the last 24 hours.")
}
var totalSize int64
// The pending upload's own size counts toward the 24h byte quota.
totalSize += ctx.QueryInt64("size")
concurrentUpload := 0
for _, file := range filechunks {
totalSize += file.Size
// Uploads started within the last 10 minutes count as concurrent.
if (currentLongTime - int64(file.CreatedUnix)) < 10*60 {
log.Info("the file " + file.Md5 + " in 10min upload." + file.CreatedUnix.Format("2006-01-02 15:04:05"))
concurrentUpload += 1
} else {
log.Info("the file " + file.Md5 + " not in 10min upload." + file.CreatedUnix.Format("2006-01-02 15:04:05"))
}
}
log.Info("The concurrentUpload is " + fmt.Sprint(concurrentUpload) + " to checked " + fileName + ". user id=" + fmt.Sprint(ctx.User.ID))
if concurrentUpload >= setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M {
log.Info("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M) + " files within the past 10 minutes. so " + fileName + " is rejected. user id=" + fmt.Sprint(ctx.User.ID))
return errors.New("A single user cannot upload more than " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_NUM_A_USER_LAST10M) + " files within the past 10 minutes.")
}
if totalSize >= setting.FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER*1024*1024*1024 {
log.Info("The total file size uploaded by a single user within the past 24 hours cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER) + "G. so " + fileName + " is rejected. user id=" + fmt.Sprint(ctx.User.ID))
return errors.New("The total file size uploaded by a single user within the past 24 hours cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ATTACHEMENT_SIZE_A_USER) + "G.")
}
}
return nil
}

// AddFileNameToCache records an in-progress SDK upload in the shared redis
// cache so concurrent-upload flow control (CheckFlowForDatasetSDK) can count it.
// The cache maps "<datasetId>_<fileName>_<userId>" -> upload start time (unix
// seconds). Entries older than 24 hours are pruned on every call.
func AddFileNameToCache(datasetId int64, fileName string, userId int64) {
	mutex.Lock()
	defer mutex.Unlock()
	cacheMap := getSDKUploadFileMap(REDIS_FLOW_ATTACHMENT_KEY)
	currentTime := time.Now().Unix()
	// Prune expired entries in a single pass. Deleting a map entry during
	// range is safe in Go (the Go spec guarantees a removed, not-yet-reached
	// entry simply is not produced).
	// NOTE: the loop variable was previously named "time", shadowing the
	// time package; renamed to uploadTime.
	for tmpKey, tmpValue := range cacheMap {
		uploadTime, err := strconv.ParseInt(tmpValue, 10, 64)
		if err == nil && currentTime-uploadTime > 24*3600 {
			delete(cacheMap, tmpKey)
		}
	}
	key := fmt.Sprint(datasetId) + "_" + fileName + "_" + fmt.Sprint(userId)
	value := fmt.Sprint(currentTime)
	cacheMap[key] = value
	log.Info("set key=" + key + " value=" + value + " to cache.")
	setSDKUploadFileCache(REDIS_FLOW_ATTACHMENT_KEY, cacheMap)
}

// RemoveFileFromCache drops the tracking entry for a finished (or aborted)
// SDK upload from the shared redis cache, under the same mutex used by
// AddFileNameToCache.
func RemoveFileFromCache(datasetId int64, fileName string, userId int64) {
	mutex.Lock()
	defer mutex.Unlock()
	entryKey := fmt.Sprint(datasetId) + "_" + fileName + "_" + fmt.Sprint(userId)
	uploads := getSDKUploadFileMap(REDIS_FLOW_ATTACHMENT_KEY)
	delete(uploads, entryKey)
	log.Info("remove key=" + entryKey + " from cache.")
	setSDKUploadFileCache(REDIS_FLOW_ATTACHMENT_KEY, uploads)
}

// getSDKUploadFileMap loads the upload-tracking map stored under msgKey in
// redis. It always returns a non-nil map: empty when the key is absent, the
// stored value is empty, or loading/decoding fails, so callers never need a
// nil check before writing into it.
func getSDKUploadFileMap(msgKey string) map[string]string {
	msgMap := make(map[string]string)
	valueStr, err := redis_client.Get(msgKey)
	if err != nil {
		// fix: message previously misspelled "redis" as "reids"
		log.Info("Failed to load from redis. " + err.Error())
		return msgMap
	}
	if valueStr != "" {
		// Best-effort decode: on malformed JSON we log and fall through with
		// an empty map, matching the original behavior.
		if err1 := json.Unmarshal([]byte(valueStr), &msgMap); err1 != nil {
			log.Info("unmarshal json failed. " + err1.Error())
		}
	}
	return msgMap
}

// setSDKUploadFileCache serializes msgMap to JSON and stores it in redis
// under msgKey with a 24-hour expiration.
// The Marshal error is ignored: a map[string]string cannot fail to encode.
func setSDKUploadFileCache(msgKey string, msgMap map[string]string) {
	encoded, _ := json.Marshal(msgMap)
	redisValue := string(encoded)
	log.Info("set redis key=" + msgKey + " value=" + redisValue)
	if re, err := redis_client.Setex(msgKey, redisValue, 24*3600*time.Second); err != nil {
		log.Info("set redis error:" + err.Error())
	} else {
		log.Info("re =" + fmt.Sprint(re))
	}
}

// CheckFlowForDatasetSDK counts the SDK uploads tracked in the shared redis
// cache that started within the last 24 hours and returns an error when the
// count reaches the configured simultaneous-upload limit.
// Entries with an unparseable timestamp are counted (conservative), matching
// the original behavior.
func CheckFlowForDatasetSDK() error {
	cacheMap := getSDKUploadFileMap(REDIS_FLOW_ATTACHMENT_KEY)
	currentTime := time.Now().Unix()
	count := 0
	for _, tmpValue := range cacheMap {
		// NOTE: the parsed value was previously named "time", shadowing the
		// time package; renamed to uploadTime.
		uploadTime, err := strconv.ParseInt(tmpValue, 10, 64)
		if err == nil && currentTime-uploadTime > 24*3600 {
			// expired entry: not an active upload
			continue
		}
		count += 1
	}
	log.Info("total find " + fmt.Sprint(count) + " uploading files.")
	if count >= setting.FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK {
		log.Info("The number of datasets uploaded using the SDK simultaneously cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK))
		return errors.New("The number of datasets uploaded using the SDK simultaneously cannot exceed " + fmt.Sprint(setting.FLOW_CONTROL.ALL_ATTACHEMENT_NUM_SDK))
	}
	return nil
}

+ 214
- 115
routers/repo/grampus.go View File

@@ -8,6 +8,7 @@ import (
"net/http"
"os"
"path"
"strconv"
"strings"
"time"
"unicode/utf8"
@@ -70,31 +71,33 @@ const (
)

func GrampusNotebookNew(ctx *context.Context) {
ctx.Data["IsCreate"] = true
ctx.Data["PageIsCloudBrain"] = true
notebookType := ctx.QueryInt("type")
processType := grampus.ProcessorTypeGPU
if notebookType == 1 {
processType = grampus.ProcessorTypeNPU
} else if notebookType == 2 {
processType = grampus.ProcessorTypeGCU
} else if notebookType == 3 {
processType = grampus.ProcessorTypeMLU
ctx.HTML(http.StatusOK, tplGrampusNotebookMLUNew)
return
}
err := grampusNotebookNewDataPrepare(ctx, processType)
if err != nil {
ctx.ServerError("get new notebook-job info failed", err)
return
}
if processType == grampus.ProcessorTypeGPU {
ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
} else if processType == grampus.ProcessorTypeNPU {
ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
} else if processType == grampus.ProcessorTypeGCU {
ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
}
ctx.HTML(http.StatusOK, tplGrampusNotebookNew)
// ctx.Data["IsCreate"] = true
// ctx.Data["PageIsCloudBrain"] = true
// notebookType := ctx.QueryInt("type")
// processType := grampus.ProcessorTypeGPU
// if notebookType == 1 {
// processType = grampus.ProcessorTypeNPU
// } else if notebookType == 2 {
// processType = grampus.ProcessorTypeGCU
// } else if notebookType == 3 {
// processType = grampus.ProcessorTypeMLU
// ctx.HTML(http.StatusOK, tplGrampusNotebookMLUNew)
// return
// }
// err := grampusNotebookNewDataPrepare(ctx, processType)
// if err != nil {
// ctx.ServerError("get new notebook-job info failed", err)
// return
// }
// if processType == grampus.ProcessorTypeGPU {
// ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
// } else if processType == grampus.ProcessorTypeNPU {
// ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
// } else if processType == grampus.ProcessorTypeGCU {
// ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
// }
}

func GrampusTrainJobGPUNew(ctx *context.Context) {
@@ -1207,12 +1210,12 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
}

//todo: upload code (send to file_server todo this work?)
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
/**if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
return
}
}*/

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
@@ -1339,8 +1342,7 @@ func GetGrampusNotebook(ctx *context.APIContext) {

var jobAfter *models.Cloudbrain
if job.IsNewAITask() {
ai_task.UpdateCloudbrain(job)
jobAfter, _ = models.GetCloudbrainByCloudbrainID(job.ID)
jobAfter, _ = ai_task.UpdateCloudbrain(job)
} else {
jobAfter, err = cloudbrainTask.SyncGrampusNotebookStatus(job)
}
@@ -1365,15 +1367,45 @@ func GetGrampusNotebook(ctx *context.APIContext) {
}

func GrampusStopJob(ctx *context.Context) {
if res, isHandled, err := ai_task.HandleNewAITaskStop(ctx.Cloudbrain.ID); isHandled {
if err != nil {
log.Error("StopJob(%s) failed:%v", ctx.Cloudbrain.JobName, err, ctx.Data["msgID"])
ctx.JSON(200, map[string]interface{}{
"result_code": "-1",
"error_msg": ctx.Tr("cloudbrain.Stopped_failed"),
"status": "",
"id": ctx.Params(":id"),
"StatusOK": 0,
})
return
}
ctx.JSON(200, map[string]interface{}{
"result_code": "0",
"error_msg": "",
"status": res.Status,
"id": ctx.Params(":id"),
"StatusOK": 0,
})
return
}
cloudbrainTask.GrampusStopJob(ctx)
}

func GrampusNotebookDel(ctx *context.Context) {
var listType = ctx.Query("listType")
if err := cloudbrainTask.DeleteGrampusJob(ctx); err != nil {
log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
ctx.ServerError(err.Error(), err)
return

if isHandled, err := ai_task.HandleNewAITaskDelete(ctx.Cloudbrain.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", ctx.Cloudbrain.JobName, err, ctx.Data["msgID"])
ctx.ServerError(err.Error(), err)
return
}
} else {
if err := cloudbrainTask.DeleteGrampusJob(ctx); err != nil {
log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
ctx.ServerError(err.Error(), err)
return
}
}

var isAdminPage = ctx.Query("isadminpage")
@@ -1412,94 +1444,96 @@ type NotebookDataset struct {

func GrampusNotebookShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true

var task *models.Cloudbrain
task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
if err != nil {
log.Error("GetCloudbrainByID failed:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
task.ContainerIp = ""

if task.IsNewAITask() {
ai_task.UpdateCloudbrain(task)
task, _ = models.GetCloudbrainByCloudbrainID(task.ID)
} else if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
result, err := grampus.GetNotebookJob(task.JobID)
if err != nil {
log.Error("GetJob failed:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}

if result != nil {
if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
return
/*
var task *models.Cloudbrain
task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
if err != nil {
log.Error("GetCloudbrainByID failed:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
oldStatus := task.Status
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
task.Duration = result.JobInfo.RunSec
if task.Duration < 0 {
task.Duration = 0
task.ContainerIp = ""

if task.IsNewAITask() {
task, _ = ai_task.UpdateCloudbrain(task)
} else if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
result, err := grampus.GetNotebookJob(task.JobID)
if err != nil {
log.Error("GetJob failed:" + err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)

if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
}
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
if result != nil {
if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
}
oldStatus := task.Status
task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
task.Duration = result.JobInfo.RunSec
if task.Duration < 0 {
task.Duration = 0
}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)

if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
}
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
if oldStatus != task.Status {
notification.NotifyChangeCloudbrainStatus(task, oldStatus)
}
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob failed:" + err.Error())
}
}
}
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob failed:" + err.Error())
}
}
}

if len(task.Parameters) > 0 {
var parameters models.Parameters
err := json.Unmarshal([]byte(task.Parameters), &parameters)
if err != nil {
log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
ctx.ServerError("system error", err)
return
}
if len(task.Parameters) > 0 {
var parameters models.Parameters
err := json.Unmarshal([]byte(task.Parameters), &parameters)
if err != nil {
log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
ctx.ServerError("system error", err)
return
}

if len(parameters.Parameter) > 0 {
paramTemp := ""
for _, Parameter := range parameters.Parameter {
param := Parameter.Label + " = " + Parameter.Value + "; "
paramTemp = paramTemp + param
if len(parameters.Parameter) > 0 {
paramTemp := ""
for _, Parameter := range parameters.Parameter {
param := Parameter.Label + " = " + Parameter.Value + "; "
paramTemp = paramTemp + param
}
task.Parameters = paramTemp[:len(paramTemp)-2]
} else {
task.Parameters = ""
}
}
user, err := models.GetUserByID(task.UserID)
if err == nil {
task.User = user
}
task.Parameters = paramTemp[:len(paramTemp)-2]
} else {
task.Parameters = ""
}
}
user, err := models.GetUserByID(task.UserID)
if err == nil {
task.User = user
}

prepareSpec4Show(ctx, task)

ctx.Data["task"] = task
ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
ctx.Data["canDownload"] = cloudbrain.CanDownloadJob(ctx, task)
ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
ctx.Data["code_path"] = cloudbrain.CodeMountPath
ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
ctx.Data["model_path"] = cloudbrain.ModelMountPath
ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
prepareSpec4Show(ctx, task)

ctx.Data["task"] = task
ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
ctx.Data["canDownload"] = cloudbrain.CanDownloadJob(ctx, task)
ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
ctx.Data["code_path"] = cloudbrain.CodeMountPath
ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
ctx.Data["model_path"] = cloudbrain.ModelMountPath
ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
*/
}

func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
@@ -1659,13 +1693,26 @@ func GrampusDownloadLog(ctx *context.Context) {
ctx.ServerError(err.Error(), err)
return
}
fileName := job.JobName + "-log.txt"

content, err := grampus.GetTrainJobLog(job.JobID)
nodeIdStr := ctx.Params(":nodeId")
var content string
if nodeIdStr != "" {
nodeId, _ := strconv.Atoi(nodeIdStr)
fileName = job.JobName + "-" + strconv.Itoa(nodeId+1) + "-log.txt"
if job.WorkServerNumber < 1 || nodeId > job.WorkServerNumber-1 {
ctx.NotFound("query parameter is wrong", nil)
return
}
content, err = grampus.GetTrainJobLog(job.JobID, nodeId)
} else {
content, err = grampus.GetTrainJobLog(job.JobID)
}
if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
content = ""
}
fileName := job.JobName + "-log.txt"
ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
var b []byte = []byte(content)
@@ -1696,7 +1743,19 @@ func GrampusGetLog(ctx *context.Context) {
exitDiagnostics = result.ExitDiagnostics
}

content, err := grampus.GetTrainJobLog(job.JobID)
nodeIdStr := ctx.Params(":nodeId")
var content string
if nodeIdStr != "" {
nodeId, _ := strconv.Atoi(nodeIdStr)
if job.WorkServerNumber < 1 || nodeId > job.WorkServerNumber-1 {
ctx.NotFound("query parameter is wrong", nil)
return
}
content, err = grampus.GetTrainJobLog(job.JobID, nodeId)
} else {
content, err = grampus.GetTrainJobLog(job.JobID)
}

if err != nil {
log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, map[string]interface{}{
@@ -1734,7 +1793,17 @@ func GrampusMetrics(ctx *context.Context) {
}
var result models.NewModelArtsMetricStatisticResult
if job.IsNPUTask() {
result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0)
nodeIdStr := ctx.Params(":nodeId")
if nodeIdStr != "" {
nodeId, _ := strconv.Atoi(nodeIdStr)
if job.WorkServerNumber < 1 || nodeId > job.WorkServerNumber-1 {
ctx.NotFound("query parameter is wrong", nil)
return
}
result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0, nodeId)
} else {
result, err = grampus.GetGrampusMetrics(job.JobID, 0, 0)
}
} else if job.IsGPUTask() {
startTime := int64(job.StartTime)
if startTime == 0 {
@@ -2016,6 +2085,36 @@ func GrampusNotebookDebug(ctx *context.Context) {
}

func GrampusNotebookRestart(ctx *context.Context) {
var id = ctx.Params(":id")
var resultCode = "-1"
var errorMsg = ""
var status = ""

t := ctx.Cloudbrain
if t.IsNewAITask() {
res, bizErr := ai_task.RestartAITask(t.ID, ctx.Repo.GitRepo, ctx.Repo.Repository, ctx.User)
if bizErr != nil {
log.Error("lRestartAITask failed:task.ID=%d err=%v", t.ID, bizErr.DefaultMsg)
errorMsg = ctx.Tr(bizErr.TrCode)
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": id,
})
return
}
id = strconv.FormatInt(res.ID, 10)
status = res.Status
resultCode = "0"
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": id,
})
return
}
cloudbrainTask.GrampusNotebookRestart(ctx)
}



+ 30
- 0
routers/repo/grampus_onlineinfer.go View File

@@ -0,0 +1,30 @@
package repo

import (
"net/http"

"code.gitea.io/gitea/modules/base"
"code.gitea.io/gitea/modules/context"
)

const (
tplGrampusOnlineInferIndex base.TplName = "repo/grampus/onlineinfer/list"
tplGrampusOnlineInferShow base.TplName = "repo/grampus/onlineinfer/show"
tplGrampusOnlineInferNew base.TplName = "repo/grampus/onlineinfer/new"
)

// GrampusOnlineInferNew renders the creation page for a Grampus
// online-inference task.
func GrampusOnlineInferNew(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusOnlineInferNew)
}

// GrampusOnlineInferShow renders the detail page of a Grampus
// online-inference task (the :id route parameter is consumed by the template
// layer; this handler only sets page flags).
func GrampusOnlineInferShow(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusOnlineInferShow)
}

// GrampusOnlineInferIndex renders the list page of Grampus online-inference
// tasks for the current repository.
func GrampusOnlineInferIndex(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusOnlineInferIndex)

}

+ 85
- 51
routers/repo/modelarts.go View File

@@ -2,6 +2,7 @@ package repo

import (
"archive/zip"
ai_task "code.gitea.io/gitea/services/ai_task_service/task"
"encoding/json"
"errors"
"fmt"
@@ -124,8 +125,8 @@ func MustEnableModelArts(ctx *context.Context) {
}

func NotebookNew(ctx *context.Context) {
notebookNewDataPrepare(ctx)
// notebookNewDataPrepare(ctx)
ctx.Data["PageIsCloudBrain"] = true
ctx.HTML(200, tplModelArtsNotebookNew)
}

@@ -305,63 +306,67 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm

func NotebookShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true
debugListType := ctx.Query("debugListType")
if debugListType == "" {
debugListType = "all"
}
var ID = ctx.Params(":id")
task, err := models.GetCloudbrainByIDWithDeleted(ID)
if err != nil {
log.Error("GET job error", err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}

if task.DeletedAt.IsZero() && !task.Cleared { //normal record
err := modelarts.HandleNotebookInfo(task)
ctx.HTML(200, tplModelArtsNotebookShow)
return
/*
debugListType := ctx.Query("debugListType")
if debugListType == "" {
debugListType = "all"
}
var ID = ctx.Params(":id")
task, err := models.GetCloudbrainByIDWithDeleted(ID)
if err != nil {
ctx.Data["error"] = err.Error()
ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
log.Error("GET job error", err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
}
datasetDownload := make([]*models.DatasetDownload, 0)
var modelDownload models.ModelDownload
if ctx.IsSigned {
if task.Uuid != "" && task.UserID == ctx.User.ID {
datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, true)
}
if task.ModelName != "" && task.UserID == ctx.User.ID {
modelDownload = GetModelDownload(task)

if task.DeletedAt.IsZero() && !task.Cleared { //normal record
err := modelarts.HandleNotebookInfo(task)
if err != nil {
ctx.Data["error"] = err.Error()
ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
return
}
}
datasetDownload := make([]*models.DatasetDownload, 0)
var modelDownload models.ModelDownload
if ctx.IsSigned {
if task.Uuid != "" && task.UserID == ctx.User.ID {
datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, true)
}
if task.ModelName != "" && task.UserID == ctx.User.ID {
modelDownload = GetModelDownload(task)

}
user, err := models.GetUserByID(task.UserID)
if err == nil {
task.User = user
}
prepareSpec4Show(ctx, task)
if task.TrainJobDuration == "" {
if task.Duration == 0 {
var duration int64
if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.CreatedUnix)
} else {
duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
}
task.Duration = duration

}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
}
ctx.Data["duration"] = task.TrainJobDuration
ctx.Data["datasetDownload"] = datasetDownload
ctx.Data["modelDownload"] = modelDownload
ctx.Data["task"] = task
ctx.Data["ID"] = ID
ctx.Data["jobName"] = task.JobName
ctx.Data["debugListType"] = debugListType
ctx.HTML(200, tplModelArtsNotebookShow)
user, err := models.GetUserByID(task.UserID)
if err == nil {
task.User = user
}
prepareSpec4Show(ctx, task)
if task.TrainJobDuration == "" {
if task.Duration == 0 {
var duration int64
if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.CreatedUnix)
} else {
duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)
}
task.Duration = duration
}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
}
ctx.Data["duration"] = task.TrainJobDuration
ctx.Data["datasetDownload"] = datasetDownload
ctx.Data["modelDownload"] = modelDownload
ctx.Data["task"] = task
ctx.Data["ID"] = ID
ctx.Data["jobName"] = task.JobName
ctx.Data["debugListType"] = debugListType
ctx.HTML(200, tplModelArtsNotebookShow)
*/
}

func GetModelDownload(task *models.Cloudbrain) models.ModelDownload {
@@ -673,6 +678,19 @@ func NotebookStop(ctx *context.Context) {
errorMsg = ctx.Tr("cloudbrain.Already_stopped")
break
}
if res, isHandled, err := ai_task.HandleNewAITaskStop(task.ID); isHandled {
if err != nil {
log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = err.Error()
if strings.Contains(err.Error(), modelarts.NotebookNotFound) {
errorMsg = "the job's version is too old and can not be restarted"
}
break
}
status = res.Status
break
}

err, res := StopModelArtsNotebook(task)

@@ -734,6 +752,22 @@ func NotebookDel(ctx *context.Context) {
var listType = ctx.Query("debugListType")
task := ctx.Cloudbrain

if isHandled, err := ai_task.HandleNewAITaskDelete(task.ID); isHandled {
if err != nil {
log.Error("DeleteJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
ctx.RenderWithErr("DeleteJob failed", tplDebugJobIndex, nil)
}
var isAdminPage = ctx.Query("isadminpage")
var isHomePage = ctx.Query("ishomepage")
if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
} else if isHomePage == "true" {
ctx.Redirect(setting.AppSubURL + "/cloudbrains")
} else {
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
}
}

if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsDeleted) {
log.Error("the job(%s) has not been stopped", task.JobName)
ctx.RenderWithErr("the job has not been stopped", tplDebugJobIndex, nil)


+ 12
- 12
routers/repo/setting.go View File

@@ -489,20 +489,20 @@ func SettingsPost(ctx *context.Context, form auth.RepoSettingForm) {
ctx.RenderWithErr(ctx.Tr("form.enterred_invalid_repo_name"), tplSettingsOptions, nil)
return
}
deployments, err := models.GetRunningServiceByUser(ctx.User.ID)
if err != nil {
ctx.ServerError("GetRunningServiceByUser", err)
return
}
if deployments != nil {
if len(deployments) > 0 {
ctx.Data["Err_RepoName"] = nil
log.Error("盘古部署删除项目失败,repo id %v, 用户 id%v", repo.ID, ctx.User.ID)
ctx.Flash.Error(ctx.Tr("deployment.deletion_notice_repo"))
ctx.Redirect(ctx.Repo.RepoLink + "/settings")
return

// finetune: openi-notebook repo can not be deleted if it has running service
if repo.Name == "openi-notebook" {
if deployments, err := models.GetRunningServiceByUser(ctx.User.ID); deployments != nil && err == nil {
if len(deployments) > 0 {
ctx.Data["Err_RepoName"] = nil
log.Error("panguService: delete repo failed, repo %s, user %s", repo.ID, ctx.User.ID)
ctx.Flash.Error(ctx.Tr("deployment.deletion_notice_repo"))
ctx.Redirect(ctx.Repo.RepoLink + "/settings")
return
}
}
}

count, err := models.GetCloudbrainRunCountByRepoID(repo.ID)
if err != nil {
ctx.ServerError("GetCloudbrainCountByRepoID failed", err)


+ 5
- 1
routers/response/api_response.go View File

@@ -29,7 +29,11 @@ type TrFunc func(string, ...interface{}) string
func OuterTrBizError(err *BizError, locale macaron.Locale) *AiforgeOuterResponse {
msg := err.DefaultMsg
if locale != nil && err.TrCode != "" {
msg = locale.Tr(err.TrCode)
if err.TrParams == nil || len(err.TrParams) == 0 {
msg = locale.Tr(err.TrCode)
} else {
msg = locale.Tr(err.TrCode, err.TrParams...)
}
}
return &AiforgeOuterResponse{Code: err.Code, Msg: msg}
}


+ 26
- 1
routers/response/error.go View File

@@ -4,10 +4,28 @@ type BizError struct {
Code int
DefaultMsg string
TrCode string
TrParams []interface{}
}

// WithParams returns a copy of the BizError carrying the given translation
// parameters (used when the TrCode message contains placeholders).
// A new object is returned instead of mutating the receiver to avoid
// concurrent modification of the shared sentinel error values.
func (e *BizError) WithParams(params ...interface{}) *BizError {
	newErr := &BizError{
		Code:       e.Code,
		DefaultMsg: e.DefaultMsg,
		TrCode:     e.TrCode,
	}
	// fix: the original did append(e.TrParams, params) — missing the "..."
	// spread, so the whole params slice was appended as a single element —
	// and appending onto e.TrParams could alias the receiver's backing array,
	// mutating the shared error despite the copy. Clone then spread instead.
	newErr.TrParams = append(append([]interface{}{}, e.TrParams...), params...)
	return newErr
}

func NewBizError(err error) *BizError {
return &BizError{Code: RESPONSE_CODE_ERROR_DEFAULT, DefaultMsg: err.Error()}
return &BizError{Code: RESPONSE_CODE_ERROR_DEFAULT, DefaultMsg: err.Error(), TrCode: err.Error()}
}

func BuildBizError(code int, defaultMsg string, trCode ...string) *BizError {
@@ -17,3 +35,10 @@ func BuildBizError(code int, defaultMsg string, trCode ...string) *BizError {
}
return &BizError{Code: code, DefaultMsg: defaultMsg, TrCode: t}
}
// BuildDefaultBizError builds a BizError with the default error code.
// trCode is optional; when omitted the translation code is left empty and
// callers fall back to DefaultMsg.
func BuildDefaultBizError(defaultMsg string, trCode ...string) *BizError {
	t := ""
	// fix: the original tested len(t) == 0 — always true for the freshly
	// initialized empty string — and then indexed trCode[0] unconditionally,
	// panicking whenever no trCode argument was supplied.
	if len(trCode) > 0 {
		t = trCode[0]
	}
	return &BizError{Code: RESPONSE_CODE_ERROR_DEFAULT, DefaultMsg: defaultMsg, TrCode: t}
}

+ 10
- 1
routers/response/response_list.go View File

@@ -11,6 +11,7 @@ var BADGES_STILL_HAS_USERS = &BizError{Code: 1005, DefaultMsg: "Please delete us
var SYSTEM_ERROR = &BizError{Code: 9009, DefaultMsg: "System error.Please try again later", TrCode: "common_error.system_error"}
var INSUFFICIENT_PERMISSION = &BizError{Code: 9003, DefaultMsg: "insufficient permissions", TrCode: "common_error.insufficient_permission"}
var PARAM_ERROR = &BizError{Code: 9001, DefaultMsg: "param error", TrCode: "common_error.param_error"}
var WECHAT_NOT_BIND = &BizError{Code: 9002, DefaultMsg: "Please scan the code and bind to wechat first", TrCode: "common_error.wechat_not_bind"}

//云脑任务相关错误
var AI_TASK_NOT_EXISTS = &BizError{Code: 2001, DefaultMsg: "AI task not exists", TrCode: "ai_task.task_not_exists"}
@@ -23,4 +24,12 @@ var DATASET_NOT_EXISTS = &BizError{Code: 2007, DefaultMsg: "The part of datasets
var MODEL_NOT_EXISTS = &BizError{Code: 2008, DefaultMsg: "The model in the task does not exist or has been deleted, please create a new debug job.", TrCode: "repo.debug.manage.model_not_exist"}
var RESULT_CLEARD = &BizError{Code: 2009, DefaultMsg: "The files of the task have been cleared, can not restart any more, please create a new debug task instead.", TrCode: "cloudbrain.result_cleared"}
var CREATE_FAILED = &BizError{Code: 2010, DefaultMsg: "Create AI task failed", TrCode: "ai_task.create_failed"}
var RESTART_FAILED = &BizError{Code: 2010, DefaultMsg: "Restart AI task failed", TrCode: "ai_task.restart_failed"}
var RESTART_FAILED = &BizError{Code: 2011, DefaultMsg: "Restart AI task failed", TrCode: "ai_task.restart_failed"}
var STOP_FAILED = &BizError{Code: 2012, DefaultMsg: "Stop AI task failed", TrCode: "ai_task.stop_failed"}
var DATASET_SIZE_OVER_LIMIT = &BizError{Code: 2013, DefaultMsg: "The size of dataset exceeds limitation", TrCode: "ai_task.dataset_size_over_limit"}
var BOOT_FILE_MUST_BE_PYTHON = &BizError{Code: 2013, DefaultMsg: "The boot file must be a python file", TrCode: "ai_task.boot_file_must_python"}
var BOOT_FILE_NOT_EXIST = &BizError{Code: 2014, DefaultMsg: "The boot file not exist", TrCode: "ai_task.boot_file_not_exist"}
var DATASET_SELECT_ERROR = &BizError{Code: 2017, DefaultMsg: "Dataset select error: the count exceed the limit or has same name", TrCode: "cloudbrain.error.dataset_select"}
var PARTIAL_DATASETS_NOT_AVAILABLE = &BizError{Code: 2018, DefaultMsg: "There are non-existent or deleted files in the selected dataset file, please select again", TrCode: "cloudbrain.error.partial_datasets_not_available"}
var LOAD_CODE_FAILED = &BizError{Code: 2019, DefaultMsg: "Fail to load code, please check if the right branch is selected.", TrCode: "cloudbrain.load_code_failed"}
var BRANCH_NOT_EXISTS = &BizError{Code: 2020, DefaultMsg: "The branch does not exist", TrCode: "ai_task.branch_not_exists"}

+ 10
- 1
routers/routes/routes.go View File

@@ -385,7 +385,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("", modelapp.ModelBaseUI)
m.Group("/pangufinetune", func() {
m.Get("", modelapp.PanguFinetuneUI)
m.Get("/create", reqSignIn, modelapp.PanguFinetuneCreateUI)
m.Get("/create", reqSignIn, reqWechatBind, modelapp.PanguFinetuneCreateUI)
m.Get("/inference", reqSignIn, modelapp.PanguInferenceUI)
})

@@ -428,6 +428,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/data_analysis/ProTrend", routers.ExploreDataAnalysisProTrend)
m.Get("/data_analysis/Overview", routers.ExploreDataAnalysisOverview)
m.Get("/data_analysis/BrainAnalysis", routers.ExploreDataAnalysisBrainAnalysis)
m.Get("/center_map", reqSignIn, routers.CenterMapUI)

}, ignSignIn)
m.Combo("/install", routers.InstallInit).Get(routers.Install).
@@ -1318,6 +1319,14 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusNotebookForm{}), context.PointAccount(), repo.GrampusNotebookCreate)
})

m.Group("/onlineinfer", func() {
m.Get("", reqRepoCloudBrainReader, repo.GrampusOnlineInferIndex)
m.Group("/:id", func() {
m.Get("", reqRepoCloudBrainReader, repo.GrampusOnlineInferShow)
})
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.GrampusOnlineInferNew)
})

m.Group("/train-job", func() {
m.Group("/:jobid", func() {
m.Get("", reqRepoCloudBrainReader, repo.GrampusTrainJobShow)


+ 1
- 0
routers/user/home.go View File

@@ -85,6 +85,7 @@ func retrieveFeeds(ctx *context.Context, options models.GetFeedsOptions) {
if act.ActUser != nil {
userCache[act.ActUserID] = act.ActUser
}
act.FilterCloudbrainInfo()
}

for _, act := range actions {


+ 167
- 37
services/ai_task_service/cluster/c2net.go View File

@@ -1,14 +1,18 @@
package cluster

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"errors"
"fmt"
"strings"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/manager/client/grampus"
"code.gitea.io/gitea/models"
model_grampus "code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"errors"
"fmt"
"strings"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
)

type C2NetClusterAdapter struct {
@@ -16,10 +20,10 @@ type C2NetClusterAdapter struct {

func init() {
//注册到一个Map
AddCluster(ai_task_entity.C2Net, new(C2NetClusterAdapter))
AddCluster(entity.C2Net, new(C2NetClusterAdapter))
}

func (c C2NetClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNoteBookTaskRequest) (*ai_task_entity.CreateNoteBookTaskResponse, error) {
func (c C2NetClusterAdapter) CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
jobResult, err := grampus.CreateNotebookJob(convertNoteBookReq2Grampus(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
@@ -32,7 +36,20 @@ func (c C2NetClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNoteBookTas
return convertGrampus2NoteBookRes(jobResult), nil
}

func (c C2NetClusterAdapter) GetImages(req ai_task_entity.GetImageReq) ([]ai_task_entity.ClusterImage, bool, error) {
func (c C2NetClusterAdapter) CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
jobResult, err := grampus.CreateNotebookJob(convertOnlineInfer2Grampus(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
return nil, err
}
if jobResult.ErrorCode > 0 {
log.Error("CreateNotebookJob err.req.Name = %s ErrorCode = %d ErrorMsg = %s", req.Name, jobResult.ErrorCode, jobResult.ErrorMsg)
return nil, errors.New(fmt.Sprintf("CreateNotebookJob err[%d%s]", jobResult.ErrorCode, jobResult.ErrorMsg))
}
return convertGrampus2NoteBookRes(jobResult), nil
}

func (c C2NetClusterAdapter) GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error) {
processType := req.ComputeSource.FullName
images, err := grampus.GetImages(processType, string(req.JobType))
if err != nil {
@@ -42,23 +59,48 @@ func (c C2NetClusterAdapter) GetImages(req ai_task_entity.GetImageReq) ([]ai_tas
if images == nil || images.Infos == nil || len(images.Infos) == 0 {
return nil, true, err
}
r := make([]ai_task_entity.ClusterImage, len(images.Infos))
r := make([]entity.ClusterImage, len(images.Infos))
for i, v := range images.Infos {
r[i] = ConvertGrampusImageToStandard(v)
}
return r, false, nil
}

func ConvertGrampusImageToStandard(image models.GrampusImage) ai_task_entity.ClusterImage {
return ai_task_entity.ClusterImage{
func ConvertGrampusImageToStandard(image models.GrampusImage) entity.ClusterImage {
return entity.ClusterImage{
ImageId: image.ID,
ImageName: image.Name,
}
}

func convertNoteBookReq2Grampus(req ai_task_entity.CreateNoteBookTaskRequest) models.CreateGrampusNotebookRequest {
var commandGpuDebug = "mkdir -p /dataset;! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='/code' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
command := fmt.Sprintf(commandGpuDebug, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval)
// convertNoteBookReq2Grampus maps a cluster-agnostic notebook request onto the
// Grampus notebook-job request, building the jupyter-lab startup command.
// The notebook working directory is the parent directory of the first code
// mount's container path, defaulting to "/code"; NPU tasks get no command.
func convertNoteBookReq2Grampus(req entity.CreateNoteBookTaskRequest) models.CreateGrampusNotebookRequest {
	firstTask := req.Tasks[0]

	notebookDir := "/code"
	if len(firstTask.Code) > 0 {
		notebookDir = firstTask.Code[0].ContainerPath
		if idx := strings.LastIndex(notebookDir, "/"); idx >= 0 {
			notebookDir = notebookDir[:idx]
		}
	}

	const commandGpuDebug = "mkdir -p /dataset;! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='%s' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
	command := fmt.Sprintf(commandGpuDebug, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval, notebookDir)
	// NPU notebooks take no startup command.
	if models.NPU == firstTask.Spec.ComputeResource {
		command = ""
	}
	log.Info("debug cmd=" + command)

	tasks := make([]models.GrampusNotebookTask, 0, len(req.Tasks))
	for _, t := range req.Tasks {
		tasks = append(tasks, convertNoteBookTask2Grampus(t, command))
	}

	return models.CreateGrampusNotebookRequest{Name: req.Name, Tasks: tasks}
}

func convertOnlineInfer2Grampus(req entity.CreateNoteBookTaskRequest) models.CreateGrampusNotebookRequest {

command := generateCommand(req.RepoName, req.Tasks[0].BootFile, req.PrimitiveDatasetName)

tasks := make([]models.GrampusNotebookTask, len(req.Tasks))
for i := 0; i < len(req.Tasks); i++ {
@@ -69,7 +111,51 @@ func convertNoteBookReq2Grampus(req ai_task_entity.CreateNoteBookTaskRequest) mo
return models.CreateGrampusNotebookRequest{Name: req.Name, Tasks: tasks}
}

func convertNoteBookTask2Grampus(t ai_task_entity.NoteBookTask, command string) models.GrampusNotebookTask {
func generateCommand(repoName, bootFile, datasetName string) string {

//prepare
//command := "bash && cd /code && unzip master.zip && cd test-export-data && uvicorn train:app --host 0.0.0.0 --port $OCTOPUS_NOTEBOOK_PORT"
workDir := "/"
command := "pip install gradio fastapi -i https://pypi.tuna.tsinghua.edu.cn/simple;"
command += "pwd; cd " + workDir + fmt.Sprintf(model_grampus.CommandPrepareScriptGpu)

//unzip code & dataset
unZipDatasetCommand := cloudbrainTask.GenerateDatasetUnzipCommand(datasetName)
bootFile = strings.ReplaceAll(bootFile, "\\", "/")
bootfilepath := ""
bootonlyfile := bootFile
if strings.Index(bootFile, "/") != -1 {
bootfilepath = bootFile[0:strings.LastIndex(bootFile, "/")]
if strings.HasPrefix(bootfilepath, "/") {
bootfilepath = bootfilepath[1:]
}
bootonlyfile = bootFile[strings.LastIndex(bootFile, "/")+1:]
}
log.Info("bootfilepath=" + bootfilepath + " bootonlyfile=" + bootonlyfile)
copyDatasetCmd := getCopyCmd(datasetName, repoName, bootfilepath)
copyDatasetPath := "/code/" + strings.ToLower(repoName) + "/" + bootfilepath
commandUnzip := "export OPENI_GRADIO_URL=$OCTOPUS_NOTEBOOK_BASE_URL;" + "cd " + workDir + "code;echo \"start unzip code\";unzip -q master.zip; " + copyDatasetCmd + " echo \"start to unzip dataset\";cd " + copyDatasetPath + "; " + unZipDatasetCommand
//commandUnzip := "cd " + workDir + "code;echo \"start unzip code\";unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
command += commandUnzip
command += "echo \"unzip finished;start to exec code;\";"
if strings.HasSuffix(bootonlyfile, ".py") {
bootonlyfile = bootonlyfile[0 : len(bootonlyfile)-3]
}
command += "cd " + copyDatasetPath + ";uvicorn " + bootonlyfile + ":app --host 0.0.0.0 --port $OCTOPUS_NOTEBOOK_PORT "

log.Info("comand=" + command)
return command
}
// getCopyCmd builds the shell commands that copy each uploaded dataset archive
// from /dataset into the repository directory the boot file lives in.
//
// datasetName is a ";"-separated list of archive names; repoName is lowercased
// to match the checkout path. Empty segments (empty datasetName, or a leading/
// trailing ";") are skipped so no malformed "cp /dataset/ ..." command is
// emitted — previously an empty list still produced one broken cp.
func getCopyCmd(datasetName, repoName, bootfilepath string) string {
	targetDir := "/code/" + strings.ToLower(repoName) + "/" + bootfilepath
	var b strings.Builder
	for _, name := range strings.Split(datasetName, ";") {
		if name == "" {
			continue
		}
		b.WriteString("cp /dataset/" + name + " " + targetDir + ";")
	}
	return b.String()
}

func convertNoteBookTask2Grampus(t entity.NoteBookTask, command string) models.GrampusNotebookTask {

code := models.GrampusDataset{}
codeArray := convertContainerArray2Grampus(t.Code)
@@ -90,7 +176,7 @@ func convertNoteBookTask2Grampus(t ai_task_entity.NoteBookTask, command string)
}
}

func convertContainerArray2Grampus(containerDatas []ai_task_entity.ContainerData) []models.GrampusDataset {
func convertContainerArray2Grampus(containerDatas []entity.ContainerData) []models.GrampusDataset {
res := make([]models.GrampusDataset, len(containerDatas))
for i := 0; i < len(containerDatas); i++ {
d := containerDatas[i]
@@ -99,7 +185,7 @@ func convertContainerArray2Grampus(containerDatas []ai_task_entity.ContainerData
return res
}

func convertContainer2Grampus(d ai_task_entity.ContainerData) models.GrampusDataset {
func convertContainer2Grampus(d entity.ContainerData) models.GrampusDataset {
return models.GrampusDataset{
Name: d.Name,
Bucket: d.Bucket,
@@ -110,9 +196,9 @@ func convertContainer2Grampus(d ai_task_entity.ContainerData) models.GrampusData
}
}

func convertGrampus2NoteBookRes(res *models.GrampusNotebookResponse) *ai_task_entity.CreateNoteBookTaskResponse {
func convertGrampus2NoteBookRes(res *models.GrampusNotebookResponse) *entity.CreateNoteBookTaskResponse {
jobInfo := res.JobInfo
return &ai_task_entity.CreateNoteBookTaskResponse{
return &entity.CreateNoteBookTaskResponse{
StartedAt: jobInfo.StartedAt,
RunSec: jobInfo.RunSec,
CompletedAt: jobInfo.CompletedAt,
@@ -126,7 +212,7 @@ func convertGrampus2NoteBookRes(res *models.GrampusNotebookResponse) *ai_task_en
}
}

func (c C2NetClusterAdapter) RestartNoteBook(jobId string) (*ai_task_entity.RestartNoteBookTaskResponse, error) {
func (c C2NetClusterAdapter) RestartNoteBook(jobId string) (*entity.RestartNoteBookTaskResponse, error) {
res, err := grampus.RestartNotebookJob(jobId)
if err != nil {
log.Error("RestartNotebookJob err jobId=%s .%v", jobId, err)
@@ -134,13 +220,16 @@ func (c C2NetClusterAdapter) RestartNoteBook(jobId string) (*ai_task_entity.Rest
}
if res.ErrorCode > 0 {
log.Error("RestartNotebookJob err.jobId = %s ErrorCode = %d ErrorMsg = %s", jobId, res.ErrorCode, res.ErrorMsg)
return nil, errors.New(fmt.Sprintf("RestartNotebookJob err[%d%s]", res.ErrorCode, res.ErrorMsg))
if entity.GrampusJobCanNotRestart.IsMatch(res.ErrorCode) {
return nil, errors.New(entity.GrampusJobCanNotRestart.CodeTrCode)
}
return nil, errors.New(response.RESTART_FAILED.TrCode)
}
return convertToCreateNoteBookTaskResponse(res), nil
}

func convertToCreateNoteBookTaskResponse(res *models.GrampusNotebookRestartResponse) *ai_task_entity.RestartNoteBookTaskResponse {
return &ai_task_entity.RestartNoteBookTaskResponse{
func convertToCreateNoteBookTaskResponse(res *models.GrampusNotebookRestartResponse) *entity.RestartNoteBookTaskResponse {
return &entity.RestartNoteBookTaskResponse{
JobId: res.NewId,
Status: res.Status,
}
@@ -159,7 +248,10 @@ func (c C2NetClusterAdapter) StopNoteBook(jobId string) error {
return nil
}

func (c C2NetClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entity.QueryTaskResponse, error) {
func (c C2NetClusterAdapter) QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error) {
if jobId == "" {
return nil, errors.New("jobID is empty")
}
result, err := grampus.GetNotebookJob(jobId)
if err != nil {
return nil, err
@@ -167,19 +259,19 @@ func (c C2NetClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entity.QueryT
if result == nil {
return nil, nil
}
return ai_task_entity.ConvertGrampusNotebookResponse(result.JobInfo), nil
return entity.ConvertGrampusNotebookResponse(result.JobInfo), nil
}

func (c C2NetClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*ai_task_entity.QueryTaskResponse, error) {
func (c C2NetClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error) {
res, err := grampus.GetJobListByJobName(jobName)
if err != nil {
return nil, err
}
result := make([]*ai_task_entity.QueryTaskResponse, 0)
result := make([]*entity.QueryTaskResponse, 0)
if res != nil {
for i := 0; i < len(res.JobInfos); i++ {
if res.JobInfos[i].Name == jobName {
result = append(result, ai_task_entity.ConvertGrampusTrainResponse(res.JobInfos[i]))
result = append(result, entity.ConvertGrampusTrainResponse(res.JobInfos[i]))
}

}
@@ -187,7 +279,7 @@ func (c C2NetClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*ai_task_
return result, nil
}

func (c C2NetClusterAdapter) GetNoteBookLog(jobId string) (*ai_task_entity.ClusterLog, error) {
func (c C2NetClusterAdapter) GetNoteBookLog(jobId string) (*entity.ClusterLog, error) {
return nil, nil
}

@@ -201,8 +293,46 @@ func (c C2NetClusterAdapter) GetNoteBookUrl(jobId string) (string, error) {
}
return res.Url + "?token=" + res.Token, nil
}
// GetNoteBookOperationProfile returns the operation/event history of a Grampus
// notebook job.
//
// It fetches the job's notebook events and, when a secondary job query also
// reports non-empty ExitDiagnostics, appends that text as an extra event with
// reason "Exit". A failure of the secondary GetJob call is deliberately
// ignored: the event list is still useful without the exit diagnostics.
func (c C2NetClusterAdapter) GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error) {
	if jobId == "" {
		log.Error("jobid is empty")
		return nil, errors.New("jobid is empty")
	}
	jobResult, err := grampus.GetDebugJobEvents(jobId)
	if err != nil {
		log.Error("GetDebugJobEvents failed:%v", err)
		return nil, err
	}

	r := parseC2NetEventsToOperationProfile(jobResult.NotebookEvents)
	// Best effort: enrich with exit diagnostics when available.
	getJobResult, err := grampus.GetJob(jobId)
	if err == nil && getJobResult != nil && getJobResult.ExitDiagnostics != "" {
		r.Events = append(r.Events, entity.ProfileEvent{
			Message: getJobResult.ExitDiagnostics,
			Reason:  "Exit",
		})
	}
	return r, nil
}

// parseC2NetEventsToOperationProfile converts raw Grampus notebook events into
// the cluster-agnostic OperationProfile, dropping events without a message.
func parseC2NetEventsToOperationProfile(notebookEvents []models.GrampusJobEvents) *entity.OperationProfile {
	events := make([]entity.ProfileEvent, 0, len(notebookEvents))
	for _, ev := range notebookEvents {
		if ev.Message == "" {
			continue
		}
		events = append(events, entity.ProfileEvent{
			Message:   ev.Message,
			Reason:    ev.Reason,
			Name:      ev.Name,
			Timestamp: ev.Timestamp,
		})
	}
	return &entity.OperationProfile{Events: events}
}

func (c C2NetClusterAdapter) CreateTrainJob(req ai_task_entity.CreateTrainTaskRequest) (*ai_task_entity.CreateTrainTaskResponse, error) {
func (c C2NetClusterAdapter) CreateTrainJob(req entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error) {
jobResult, err := grampus.CreateJob(convertTrainReq2Grampus(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
@@ -211,7 +341,7 @@ func (c C2NetClusterAdapter) CreateTrainJob(req ai_task_entity.CreateTrainTaskRe
return convertGrampus2TrainRes(jobResult), nil
}

func convertTrainReq2Grampus(req ai_task_entity.CreateTrainTaskRequest) models.CreateGrampusJobRequest {
func convertTrainReq2Grampus(req entity.CreateTrainTaskRequest) models.CreateGrampusJobRequest {
command := generateGrampusTrainCommand(req)

tasks := make([]models.GrampusTasks, len(req.Tasks))
@@ -223,7 +353,7 @@ func convertTrainReq2Grampus(req ai_task_entity.CreateTrainTaskRequest) models.C
return models.CreateGrampusJobRequest{Name: req.Name, Tasks: tasks}
}

func generateGrampusTrainCommand(req ai_task_entity.CreateTrainTaskRequest) string {
func generateGrampusTrainCommand(req entity.CreateTrainTaskRequest) string {
var command string
t := req.Tasks[0]
computeResource := t.Spec.ComputeResource
@@ -298,7 +428,7 @@ func getNpuModelObjectKey(jobName string) string {
return setting.CodePathPrefix + jobName + RemoteModelPath + "/" + models.ModelSuffix
}

func convertTrainTask2Grampus(t ai_task_entity.TrainTask, command string) models.GrampusTasks {
func convertTrainTask2Grampus(t entity.TrainTask, command string) models.GrampusTasks {
return models.GrampusTasks{
Name: t.Name,
ResourceSpecId: t.ResourceSpecId,
@@ -315,9 +445,9 @@ func convertTrainTask2Grampus(t ai_task_entity.TrainTask, command string) models
}
}

func convertGrampus2TrainRes(res *models.CreateGrampusJobResponse) *ai_task_entity.CreateTrainTaskResponse {
func convertGrampus2TrainRes(res *models.CreateGrampusJobResponse) *entity.CreateTrainTaskResponse {
jobInfo := res.JobInfo
return &ai_task_entity.CreateTrainTaskResponse{
return &entity.CreateTrainTaskResponse{
StartedAt: jobInfo.StartedAt,
RunSec: jobInfo.RunSec,
CompletedAt: jobInfo.CompletedAt,
@@ -337,13 +467,13 @@ func (c C2NetClusterAdapter) DeleteTrainJob(string) error {
func (c C2NetClusterAdapter) StopTrainJob(string) error {
return nil
}
func (c C2NetClusterAdapter) QueryTrainJob(string) (*ai_task_entity.QueryTaskResponse, error) {
func (c C2NetClusterAdapter) QueryTrainJob(string) (*entity.QueryTaskResponse, error) {
return nil, nil
}
func (c C2NetClusterAdapter) RestartTrainJob(string) (*ai_task_entity.CreateTrainTaskResponse, error) {
func (c C2NetClusterAdapter) RestartTrainJob(string) (*entity.CreateTrainTaskResponse, error) {
return nil, nil
}

func (c C2NetClusterAdapter) GetTrainLog(jobId string) (*ai_task_entity.ClusterLog, error) {
func (c C2NetClusterAdapter) GetTrainLog(jobId string) (*entity.ClusterLog, error) {
return nil, nil
}

+ 91
- 24
services/ai_task_service/cluster/cloudbrain_one.go View File

@@ -2,12 +2,14 @@ package cluster

import "C"
import (
"code.gitea.io/gitea/entity/ai_task_entity"
"encoding/json"
"errors"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"errors"
)

type CloudbrainOneClusterAdapter struct {
@@ -15,10 +17,10 @@ type CloudbrainOneClusterAdapter struct {

func init() {
//注册到一个Map
AddCluster(ai_task_entity.OpenICloudbrainOne, new(CloudbrainOneClusterAdapter))
AddCluster(entity.OpenICloudbrainOne, new(CloudbrainOneClusterAdapter))
}

func (c CloudbrainOneClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNoteBookTaskRequest) (*ai_task_entity.CreateNoteBookTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
jobResult, err := cloudbrain.CreateJob(req.Name, convertNoteBookReq2CloudbrainOne(req))
if err != nil {
log.Error("CreateNoteBook failed: %v", err.Error())
@@ -27,13 +29,17 @@ func (c CloudbrainOneClusterAdapter) CreateNoteBook(req ai_task_entity.CreateNot
return convertCloudbrainOne2NoteBookRes(jobResult), nil
}

func (c CloudbrainOneClusterAdapter) GetImages(req ai_task_entity.GetImageReq) ([]ai_task_entity.ClusterImage, bool, error) {
// CreateOnlineInfer is not implemented on cloudbrain one; it always returns
// (nil, nil), so callers must treat a nil response as "unsupported".
func (c CloudbrainOneClusterAdapter) CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
	return nil, nil
}

// GetImages has no cluster-side image list on cloudbrain one; the "true"
// second result tells callers that image listing is not supported here.
func (c CloudbrainOneClusterAdapter) GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error) {
	return nil, true, nil
}

// SubTaskName is the fixed sub-task name used when building cloudbrain-one
// job parameters.
var SubTaskName = "task1"

func convertNoteBookReq2CloudbrainOne(req ai_task_entity.CreateNoteBookTaskRequest) models.CreateJobParams {
func convertNoteBookReq2CloudbrainOne(req entity.CreateNoteBookTaskRequest) models.CreateJobParams {
var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;pip3 install -U "nbclassic>=0.2.8" -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --LabApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" `
t := req.Tasks[0]

@@ -58,11 +64,11 @@ func convertNoteBookReq2CloudbrainOne(req ai_task_entity.CreateNoteBookTaskReque
UseNNI: false,
},
},
Volumes: convertContainerDataArray2Volume(t.Code, t.Datasets, t.PreTrainModel),
Volumes: convertContainerDataArray2Volume(t.Code, t.Datasets, t.PreTrainModel, t.OutPut),
}
}

func convertContainerDataArray2Volume(containerDataArray ...[]ai_task_entity.ContainerData) []models.Volume {
func convertContainerDataArray2Volume(containerDataArray ...[]entity.ContainerData) []models.Volume {
r := make([]models.Volume, 0)
for _, array := range containerDataArray {
for _, d := range array {
@@ -72,7 +78,7 @@ func convertContainerDataArray2Volume(containerDataArray ...[]ai_task_entity.Con
return r
}

func convertContainerData2Volume(d ai_task_entity.ContainerData) models.Volume {
func convertContainerData2Volume(d entity.ContainerData) models.Volume {
return models.Volume{
HostPath: models.StHostPath{
Path: d.RealPath,
@@ -82,15 +88,15 @@ func convertContainerData2Volume(d ai_task_entity.ContainerData) models.Volume {
}
}

func convertCloudbrainOne2NoteBookRes(res *models.CreateJobResult) *ai_task_entity.CreateNoteBookTaskResponse {
func convertCloudbrainOne2NoteBookRes(res *models.CreateJobResult) *entity.CreateNoteBookTaskResponse {
playload := res.Payload
return &ai_task_entity.CreateNoteBookTaskResponse{
return &entity.CreateNoteBookTaskResponse{
JobID: playload["jobId"].(string),
Status: string(models.JobWaiting),
}
}

func (c CloudbrainOneClusterAdapter) RestartNoteBook(string) (*ai_task_entity.RestartNoteBookTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) RestartNoteBook(string) (*entity.RestartNoteBookTaskResponse, error) {

return nil, nil
}
@@ -99,10 +105,15 @@ func (c CloudbrainOneClusterAdapter) DeleteNoteBook(string) error {
}

// StopNoteBook stops the cloudbrain-one job identified by jobId, logging and
// propagating any error from the underlying stop call.
func (c CloudbrainOneClusterAdapter) StopNoteBook(jobId string) error {
	if err := cloudbrain.StopJob(jobId); err != nil {
		log.Error("StopNoteBook(%s) failed:%v", jobId, err)
		return err
	}
	return nil
}

func (c CloudbrainOneClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entity.QueryTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error) {
if jobId == "" {
log.Error("jobid is empty")
return nil, errors.New("jobid is empty")
@@ -112,15 +123,14 @@ func (c CloudbrainOneClusterAdapter) QueryNoteBook(jobId string) (*ai_task_entit
log.Error("QueryNoteBook failed:%v", err)
return nil, err
}
result, err := models.ConvertToJobResultPayload(jobResult.Payload)
if err != nil {
log.Error("ConvertToJobResultPayload failed:%v", err)
return nil, err
}
return ai_task_entity.ConvertCloudbrainOneNotebookResponse(result), nil
return entity.ConvertCloudbrainOneNotebookResponse(jobResult.Payload)
}

func (c CloudbrainOneClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*ai_task_entity.QueryTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error) {
jobResult, err := cloudbrain.GetJobListByName(jobName)
if err != nil {
log.Error("GetJobListByName failed:%v", err)
@@ -131,23 +141,80 @@ func (c CloudbrainOneClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*
log.Error("ConvertToJobListResultPayload failed:%v", err)
return nil, err
}
r := make([]*ai_task_entity.QueryTaskResponse, 0)
r := make([]*entity.QueryTaskResponse, 0)
for i := 0; i < len(result.Jobs); i++ {
if result.Jobs[i].Name == jobName {
r = append(r, ai_task_entity.ConvertCloudbrainOneQueryNotebookByNameResponse(result.Jobs[i]))
r = append(r, entity.ConvertCloudbrainOneQueryNotebookByNameResponse(result.Jobs[i]))
}
}
return r, nil
}

func (c CloudbrainOneClusterAdapter) GetNoteBookLog(jobId string) (*ai_task_entity.ClusterLog, error) {
func (c CloudbrainOneClusterAdapter) GetNoteBookLog(jobId string) (*entity.ClusterLog, error) {
return nil, nil
}

func (c CloudbrainOneClusterAdapter) GetNoteBookUrl(jobId string) (string, error) {
return "", nil
return setting.DebugServerHost + "jpylab_" + jobId + "_" + models.SubTaskName, nil
}
func (c CloudbrainOneClusterAdapter) CreateTrainJob(ai_task_entity.CreateTrainTaskRequest) (*ai_task_entity.CreateTrainTaskResponse, error) {

// GetNoteBookOperationProfile returns the pod/operation events of a
// cloudbrain-one notebook job, parsed from the job's AppExitDiagnostics
// payload. It returns (nil, nil) when the job carries no parsable diagnostics
// (see parseDiagnosticsToOperationProfile).
func (c CloudbrainOneClusterAdapter) GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error) {
	if jobId == "" {
		log.Error("jobid is empty")
		return nil, errors.New("jobid is empty")
	}
	jobResult, err := cloudbrain.GetJob(jobId)
	if err != nil {
		// Fixed log context: these messages previously said "QueryNoteBook"
		// (copy-pasted from the sibling method), which made log triage
		// misleading.
		log.Error("GetNoteBookOperationProfile GetJob failed:%v", err)
		return nil, err
	}
	result, err := models.ConvertToJobResultPayload(jobResult.Payload)
	if err != nil {
		log.Error("GetNoteBookOperationProfile ConvertToJobResultPayload failed:%v", err)
		return nil, err
	}
	return parseDiagnosticsToOperationProfile(result.JobStatus.AppExitDiagnostics), nil
}

// parseDiagnosticsToOperationProfile decodes a cloudbrain-one
// AppExitDiagnostics JSON document into an OperationProfile, collecting the
// task pod events followed by the "extras" entries; events with an empty
// message are skipped. Returns nil when the input is empty or not valid JSON.
func parseDiagnosticsToOperationProfile(appExitDiagnostics string) *entity.OperationProfile {
	if appExitDiagnostics == "" {
		return nil
	}
	var diagnostics entity.CloudbrainOneAppExitDiagnostics
	if err := json.Unmarshal([]byte(appExitDiagnostics), &diagnostics); err != nil {
		log.Error("json.Unmarshal appExitDiagnostics err.%v", err)
		return nil
	}

	events := make([]entity.ProfileEvent, 0)
	// NOTE(review): the field is named Task10 — presumably the "task1-0" pod
	// of the single-task job; confirm against the entity definition.
	for _, ev := range diagnostics.PodEvents.Task10 {
		if ev.Message == "" {
			continue
		}
		events = append(events, entity.ProfileEvent{
			Message: ev.Message,
			Reason:  ev.Reason,
			Action:  ev.Action,
		})
	}
	for _, ev := range diagnostics.Extras {
		if ev.Message == "" {
			continue
		}
		events = append(events, entity.ProfileEvent{
			Message: ev.Message,
			Reason:  ev.Reason,
			Action:  ev.Action,
		})
	}
	return &entity.OperationProfile{Events: events}
}

func (c CloudbrainOneClusterAdapter) CreateTrainJob(entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error) {
return nil, nil
}
func (c CloudbrainOneClusterAdapter) DeleteTrainJob(string) error {
@@ -156,12 +223,12 @@ func (c CloudbrainOneClusterAdapter) DeleteTrainJob(string) error {
func (c CloudbrainOneClusterAdapter) StopTrainJob(string) error {
return nil
}
func (c CloudbrainOneClusterAdapter) QueryTrainJob(string) (*ai_task_entity.QueryTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) QueryTrainJob(string) (*entity.QueryTaskResponse, error) {
return nil, nil
}
func (c CloudbrainOneClusterAdapter) RestartTrainJob(string) (*ai_task_entity.CreateTrainTaskResponse, error) {
func (c CloudbrainOneClusterAdapter) RestartTrainJob(string) (*entity.CreateTrainTaskResponse, error) {
return nil, nil
}
func (c CloudbrainOneClusterAdapter) GetTrainLog(string) (*ai_task_entity.ClusterLog, error) {
func (c CloudbrainOneClusterAdapter) GetTrainLog(string) (*entity.ClusterLog, error) {
return nil, nil
}

+ 297
- 0
services/ai_task_service/cluster/cloudbrain_two.go View File

@@ -0,0 +1,297 @@
package cluster

import "C"
import (
"encoding/json"
"fmt"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/manager/client/cloudbrain_two"
"code.gitea.io/gitea/manager/client/cloudbrain_two_cd"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
)

type CloudbrainTwoClusterAdapter struct {
}

func init() {
AddCluster(entity.OpenICloudbrainTwo, new(CloudbrainTwoClusterAdapter))
}

// CreateNoteBook creates a notebook instance on cloudbrain two. When the
// ModelartsCD endpoint is enabled the pool-less API is used; otherwise the job
// is submitted to the first resource pool parsed from setting.PoolInfos.
func (c CloudbrainTwoClusterAdapter) CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
	t := req.Tasks[0]

	var jobResult *models.CreateNotebookResult
	var err error
	if setting.ModelartsCD.Enabled {
		jobResult, err = cloudbrain_two_cd.CreateNotebook(models.CreateNotebookWithoutPoolParams{
			JobName:     req.Name,
			Description: req.Description,
			Flavor:      t.Spec.SourceSpecId,
			Duration:    t.AutoStopDuration,
			ImageID:     t.ImageId,
			Feature:     models.NotebookFeature,
			Volume: models.VolumeReq{
				Capacity:  setting.Capacity,
				Category:  models.EVSCategory,
				Ownership: models.ManagedOwnership,
			},
			WorkspaceID: "0",
		})
	} else {
		// Lazily parse the configured pool list. Previously the Unmarshal
		// error was ignored, and a bad or empty configuration panicked below
		// on poolInfos.PoolInfo[0]; both cases now return an error instead.
		if poolInfos == nil {
			if uErr := json.Unmarshal([]byte(setting.PoolInfos), &poolInfos); uErr != nil {
				log.Error("Unmarshal PoolInfos failed: %v", uErr)
				return nil, uErr
			}
		}
		if poolInfos == nil || len(poolInfos.PoolInfo) == 0 {
			return nil, fmt.Errorf("no cloudbrain two resource pool configured")
		}
		jobResult, err = cloudbrain_two.CreateNotebook2(models.CreateNotebook2Params{
			JobName:     req.Name,
			Description: req.Description,
			Flavor:      t.Spec.SourceSpecId,
			Duration:    t.AutoStopDuration,
			ImageID:     t.ImageId,
			PoolID:      poolInfos.PoolInfo[0].PoolId,
			Feature:     models.NotebookFeature,
			Volume: models.VolumeReq{
				Capacity:  setting.Capacity,
				Category:  models.EVSCategory,
				Ownership: models.ManagedOwnership,
			},
			WorkspaceID: "0",
		})
	}

	if err != nil {
		log.Error("CreateNoteBook failed: %v", err.Error())
		return nil, err
	}
	return convertCloudbrainTwo2NoteBookRes(jobResult), nil
}

// CreateOnlineInfer is not implemented on cloudbrain two; it always returns
// (nil, nil), so callers must treat a nil response as "unsupported".
func (c CloudbrainTwoClusterAdapter) CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error) {
	return nil, nil
}

// cloudbrainTwoImages caches the image list built from configuration; the
// list is static per process, so it is computed once on first use.
// NOTE(review): the lazy init is not goroutine-safe; concurrent first calls
// may build the slice twice (same content, so likely harmless) — confirm the
// call pattern.
var cloudbrainTwoImages []entity.ClusterImage

// GetImages returns the cloudbrain-two image list taken from static
// configuration; the request parameters are not consulted. The bool result
// ("not supported") is always false for this cluster.
func (c CloudbrainTwoClusterAdapter) GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error) {
	// len() of a nil slice is 0, so the previous explicit nil check was
	// redundant (staticcheck-style simplification).
	if len(cloudbrainTwoImages) == 0 {
		images := setting.StImageInfos.ImageInfo
		cloudbrainTwoImages = make([]entity.ClusterImage, len(images))
		for i, img := range images {
			cloudbrainTwoImages[i] = entity.ClusterImage{
				ImageId:   img.Id,
				ImageName: img.Value,
			}
		}
	}

	return cloudbrainTwoImages, false, nil
}

// poolInfos holds the resource-pool configuration parsed lazily from
// setting.PoolInfos on first notebook creation.
var poolInfos *models.PoolInfos

// convertCloudbrainTwo2NoteBookRes maps a cloudbrain-two creation result onto
// the cluster-agnostic creation response.
func convertCloudbrainTwo2NoteBookRes(res *models.CreateNotebookResult) *entity.CreateNoteBookTaskResponse {
	resp := entity.CreateNoteBookTaskResponse{
		JobID:  res.ID,
		Status: res.Status,
	}
	return &resp
}

// RestartNoteBook restarts the notebook behind jobId, dispatching to the CD
// or standard cloudbrain-two endpoint by the stored task type.
func (c CloudbrainTwoClusterAdapter) RestartNoteBook(jobId string) (*entity.RestartNoteBookTaskResponse, error) {
	param := models.NotebookAction{
		Action: models.ActionStart,
	}
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return nil, err
	}

	var res *models.NotebookActionResult
	switch task.Type {
	case models.TypeCloudBrainTwo:
		res, err = cloudbrain_two.ManageNotebook2(task.JobID, param)
	case models.TypeCDCenter:
		res, err = cloudbrain_two_cd.ManageNotebook(task.JobID, param)
	default:
		// Previously an unexpected task type fell through with res == nil and
		// panicked in convertCloudbrainTwo2NoteBookRestartRes.
		return nil, fmt.Errorf("unsupported cloudbrain type %v for jobId %s", task.Type, jobId)
	}
	if err != nil {
		log.Error("ManageNotebook err.jobID=%s err=%v", jobId, err)
		return nil, err
	}
	return convertCloudbrainTwo2NoteBookRestartRes(jobId, res), nil
}

// convertCloudbrainTwo2NoteBookRestartRes builds the restart response for
// jobId from the raw notebook-action result.
func convertCloudbrainTwo2NoteBookRestartRes(jobId string, res *models.NotebookActionResult) *entity.RestartNoteBookTaskResponse {
	out := &entity.RestartNoteBookTaskResponse{JobId: jobId}
	out.Status = res.Status
	return out
}

// DeleteNoteBook deletes the notebook instance behind jobId on the endpoint
// (standard or CD) matching the stored task type. An unrecognized task type
// is a silent no-op, mirroring the original behavior.
func (c CloudbrainTwoClusterAdapter) DeleteNoteBook(jobId string) error {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return err
	}

	switch task.Type {
	case models.TypeCloudBrainTwo:
		_, err = cloudbrain_two.DelNotebook2(task.JobID)
	case models.TypeCDCenter:
		_, err = cloudbrain_two_cd.DelNotebook(task.JobID)
	}
	if err != nil {
		log.Error("DeleteNoteBook err.jobID=%s err=%v", jobId, err)
		return err
	}
	return nil
}

// StopNoteBook stops the notebook instance behind jobId on the endpoint
// (standard or CD) matching the stored task type. An unrecognized task type
// is a silent no-op, mirroring the original behavior.
func (c CloudbrainTwoClusterAdapter) StopNoteBook(jobId string) error {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return err
	}

	param := models.NotebookAction{Action: models.ActionStop}
	switch task.Type {
	case models.TypeCloudBrainTwo:
		_, err = cloudbrain_two.ManageNotebook2(task.JobID, param)
	case models.TypeCDCenter:
		_, err = cloudbrain_two_cd.ManageNotebook(task.JobID, param)
	}
	if err != nil {
		log.Error("StopNoteBook err.jobID=%s err=%v", jobId, err)
		return err
	}
	return nil
}

// QueryNoteBook fetches the current state of the notebook behind jobId from
// the endpoint (standard or CD) matching the stored task type.
func (c CloudbrainTwoClusterAdapter) QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error) {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return nil, err
	}

	var result *models.GetNotebook2Result
	switch task.Type {
	case models.TypeCloudBrainTwo:
		result, err = cloudbrain_two.GetNotebook2(task.JobID)
	case models.TypeCDCenter:
		result, err = cloudbrain_two_cd.GetNotebook(task.JobID)
	}
	if err != nil {
		log.Error("GetNotebook(%s) failed:%v", task.DisplayJobName, err)
		return nil, err
	}
	if result == nil {
		// An unrecognized task type previously left result nil and the
		// converter dereferenced it; return an explicit error instead.
		return nil, fmt.Errorf("no notebook result for jobId %s (type %v)", jobId, task.Type)
	}
	return convertCloudbrainTwo2QueryRes(result), nil
}

// convertCloudbrainTwo2QueryRes maps a cloudbrain-two notebook query result
// onto the cluster-agnostic response. The lease update time (milliseconds) is
// used as the start time; jobs in a terminal status are stamped as completed
// "now" because the remote API reports no completion time.
func convertCloudbrainTwo2QueryRes(res *models.GetNotebook2Result) *entity.QueryTaskResponse {
	var startedAt, completedAt timeutil.TimeStamp
	if res.Lease.UpdateTime > 0 {
		startedAt = timeutil.TimeStamp(res.Lease.UpdateTime / 1000)
	}
	if models.IsCloudbrainTerminalStatus(res.Status) {
		completedAt = timeutil.TimeStampNow()
	}
	return &entity.QueryTaskResponse{
		StartedAt:   startedAt,
		CompletedAt: completedAt,
		JobId:       res.ID,
		Status:      res.Status,
		Url:         res.Url,
		Token:       res.Token,
	}
}

// QueryNoteBookByJobName lists cloudbrain-two notebooks (newest first, up to
// 1000) and returns only those whose name matches jobName exactly, since the
// remote search may match loosely.
func (c CloudbrainTwoClusterAdapter) QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error) {
	result, err := cloudbrain_two.GetNotebookList(1000, 0, "createTime", "DESC", jobName)
	if err != nil {
		log.Error("QueryNoteBookByJobName failed:jobName=%s err=%v", jobName, err)
		return nil, err
	}
	matches := make([]*entity.QueryTaskResponse, 0)
	for _, nb := range result.NotebookList {
		if nb.JobName == jobName {
			matches = append(matches, convertCloudbrainTwoQueryNotebookByNameResponse(nb))
		}
	}
	return matches, nil
}

// convertCloudbrainTwoQueryNotebookByNameResponse maps one notebook list entry
// onto the cluster-agnostic response; the lease create time is reported by the
// remote API in milliseconds.
func convertCloudbrainTwoQueryNotebookByNameResponse(notebook models.NotebookList) *entity.QueryTaskResponse {
	resp := &entity.QueryTaskResponse{
		Status: notebook.Status,
		JobId:  notebook.JobID,
	}
	resp.StartedAt = timeutil.TimeStamp(notebook.Lease.CreateTime / 1000)
	return resp
}

// GetNoteBookLog is not implemented for cloudbrain two; it always returns
// (nil, nil).
func (c CloudbrainTwoClusterAdapter) GetNoteBookLog(jobId string) (*entity.ClusterLog, error) {
	return nil, nil
}

// GetNoteBookUrl returns the browser URL (with its access token appended) of
// the notebook behind jobId, obtained by querying its current state.
func (c CloudbrainTwoClusterAdapter) GetNoteBookUrl(jobId string) (string, error) {
	res, err := c.QueryNoteBook(jobId)
	if err != nil {
		return "", err
	}
	url := res.Url + "?token=" + res.Token
	return url, nil
}

// GetNoteBookOperationProfile returns the provisioning progress events of a
// cloudbrain-two notebook, taken from the notebook query's ActionProgress.
func (c CloudbrainTwoClusterAdapter) GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error) {
	task, err := models.GetNewestCloudbrainByJobId(jobId)
	if err != nil {
		return nil, err
	}

	var result *models.GetNotebook2Result
	switch task.Type {
	case models.TypeCloudBrainTwo:
		result, err = cloudbrain_two.GetNotebook2(task.JobID)
	case models.TypeCDCenter:
		result, err = cloudbrain_two_cd.GetNotebook(task.JobID)
	}
	if err != nil {
		log.Error("GetNotebook(%s) failed:%v", task.DisplayJobName, err)
		return nil, err
	}
	if result == nil {
		// An unrecognized task type previously left result nil, and the
		// parser dereferenced it; return an explicit error instead.
		return nil, fmt.Errorf("no notebook result for jobId %s (type %v)", jobId, task.Type)
	}
	return parseCloudbrainTwoEventsToOperationProfile(result), nil
}

// parseCloudbrainTwoEventsToOperationProfile converts the ActionProgress steps
// of a notebook query result into an OperationProfile; steps without a
// description are skipped. It returns nil when there is nothing to report.
func parseCloudbrainTwoEventsToOperationProfile(result *models.GetNotebook2Result) *entity.OperationProfile {
	// Guard the receiver too: a nil result previously panicked here. A nil
	// slice already has len 0, so the former separate "== nil" slice check
	// was redundant.
	if result == nil || len(result.ActionProgress) == 0 {
		return nil
	}
	events := make([]entity.ProfileEvent, 0, len(result.ActionProgress))
	for _, step := range result.ActionProgress {
		if step.Description == "" {
			continue
		}
		events = append(events, entity.ProfileEvent{
			Message: step.Description,
			Reason:  fmt.Sprint(step.Step),
			Name:    step.Status,
		})
	}
	return &entity.OperationProfile{Events: events}
}

// CreateTrainJob is not supported by this adapter; it always returns (nil, nil).
func (c CloudbrainTwoClusterAdapter) CreateTrainJob(entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error) {
	return nil, nil
}

// DeleteTrainJob is not supported by this adapter; it is a no-op returning nil.
func (c CloudbrainTwoClusterAdapter) DeleteTrainJob(string) error {
	return nil
}

// StopTrainJob is not supported by this adapter; it is a no-op returning nil.
func (c CloudbrainTwoClusterAdapter) StopTrainJob(string) error {
	return nil
}

// QueryTrainJob is not supported by this adapter; it always returns (nil, nil).
func (c CloudbrainTwoClusterAdapter) QueryTrainJob(string) (*entity.QueryTaskResponse, error) {
	return nil, nil
}

// RestartTrainJob is not supported by this adapter; it always returns (nil, nil).
func (c CloudbrainTwoClusterAdapter) RestartTrainJob(string) (*entity.CreateTrainTaskResponse, error) {
	return nil, nil
}

// GetTrainLog is not supported by this adapter; it always returns (nil, nil).
// NOTE(review): callers receive a nil *entity.ClusterLog with no error —
// confirm the cluster dispatch layer tolerates this.
func (c CloudbrainTwoClusterAdapter) GetTrainLog(string) (*entity.ClusterLog, error) {
	return nil, nil
}

+ 18
- 14
services/ai_task_service/cluster/cluster_base.go View File

@@ -1,17 +1,18 @@
package cluster

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"errors"

"code.gitea.io/gitea/entity"
)

var clusterMap = map[ai_task_entity.ClusterType]ClusterAdapter{}
var clusterMap = map[entity.ClusterType]ClusterAdapter{}

func AddCluster(t ai_task_entity.ClusterType, cluster ClusterAdapter) {
func AddCluster(t entity.ClusterType, cluster ClusterAdapter) {
clusterMap[t] = cluster
}

func GetCluster(t ai_task_entity.ClusterType) (ClusterAdapter, error) {
func GetCluster(t entity.ClusterType) (ClusterAdapter, error) {
if t == "" {
return nil, errors.New("ClusterType is empty")
}
@@ -23,22 +24,25 @@ func GetCluster(t ai_task_entity.ClusterType) (ClusterAdapter, error) {
}

type ClusterAdapter interface {
CreateNoteBook(req ai_task_entity.CreateNoteBookTaskRequest) (*ai_task_entity.CreateNoteBookTaskResponse, error)
RestartNoteBook(jobId string) (*ai_task_entity.RestartNoteBookTaskResponse, error)
CreateNoteBook(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error)
RestartNoteBook(jobId string) (*entity.RestartNoteBookTaskResponse, error)
DeleteNoteBook(jobId string) error
StopNoteBook(jobId string) error
QueryNoteBook(jobId string) (*ai_task_entity.QueryTaskResponse, error)
QueryNoteBookByJobName(jobName string) ([]*ai_task_entity.QueryTaskResponse, error)
GetNoteBookLog(jobId string) (*ai_task_entity.ClusterLog, error)
QueryNoteBook(jobId string) (*entity.QueryTaskResponse, error)
QueryNoteBookByJobName(jobName string) ([]*entity.QueryTaskResponse, error)
GetNoteBookLog(jobId string) (*entity.ClusterLog, error)
GetNoteBookUrl(jobId string) (string, error)
CreateTrainJob(req ai_task_entity.CreateTrainTaskRequest) (*ai_task_entity.CreateTrainTaskResponse, error)
GetNoteBookOperationProfile(jobId string) (*entity.OperationProfile, error)
CreateTrainJob(req entity.CreateTrainTaskRequest) (*entity.CreateTrainTaskResponse, error)
DeleteTrainJob(jobId string) error
StopTrainJob(string) error
RestartTrainJob(jobId string) (*ai_task_entity.CreateTrainTaskResponse, error)
QueryTrainJob(jobId string) (*ai_task_entity.QueryTaskResponse, error)
GetTrainLog(jobId string) (*ai_task_entity.ClusterLog, error)
RestartTrainJob(jobId string) (*entity.CreateTrainTaskResponse, error)
QueryTrainJob(jobId string) (*entity.QueryTaskResponse, error)
GetTrainLog(jobId string) (*entity.ClusterLog, error)

//GetImages return available list of clusters
//The second parameter will return true if image is no limit
GetImages(req ai_task_entity.GetImageReq) ([]ai_task_entity.ClusterImage, bool, error)
GetImages(req entity.GetImageReq) ([]entity.ClusterImage, bool, error)

CreateOnlineInfer(req entity.CreateNoteBookTaskRequest) (*entity.CreateNoteBookTaskResponse, error)
}

+ 81
- 0
services/ai_task_service/container_builder/code_builder.go View File

@@ -0,0 +1,81 @@
package container_builder

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"strings"
)

type CodeBuilder struct {
Opts *entity.ContainerBuildOpts
}

func init() {
o := &CodeBuilder{}
RegisterContainerBuilder(o)
}

func (b *CodeBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *CodeBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerCode
}

func (b *CodeBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
opts := b.Opts
if opts.Disable {
return nil, nil
}
storageTypes := opts.AcceptStorageType
if storageTypes == nil || len(storageTypes) == 0 {
return nil, response.SYSTEM_ERROR
}

jobName := ctx.Request.JobName
repo := ctx.Repository
codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
uploader := upload.SelectUploaderFromStorageType(storageTypes[0])

remoteDir := uploader.GetJobDefaultObjectKeyPrefix(jobName) + cloudbrain.CodeMountPath
//再次调试和在线运行notebook不需要下载、上传代码
if !ctx.Request.IsRestartRequest && !ctx.Request.IsFileNoteBookRequest {
if err := DownloadCode(ctx, codeLocalPath, b.Opts.NotArchive); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
return nil, response.LOAD_CODE_FAILED
}

if err := uploader.UploadDir(codeLocalPath, remoteDir); err != nil {
log.Error("Failed to UploadDir: %s (%v)", repo.FullName(), err)
return nil, response.LOAD_CODE_FAILED
}
}

codeArchiveName := ""
//如果代码是压缩包形式,以默认分支命名压缩包(继承原有逻辑)
if !b.Opts.NotArchive {
codeArchiveName = cloudbrain.DefaultBranchName + ".zip"
}

containerPath := ""
if opts.ContainerPath != "" {
containerPath = opts.ContainerPath + "/" + codeArchiveName
}
objectKey := remoteDir + "/" + codeArchiveName
codeData := entity.ContainerData{
Name: strings.ToLower(repo.Name),
Bucket: uploader.GetBucket(),
EndPoint: uploader.GetEndpoint(),
ObjectKey: objectKey,
ReadOnly: opts.ReadOnly,
ContainerPath: containerPath,
RealPath: uploader.GetRealPath(objectKey),
}
return []entity.ContainerData{codeData}, nil
}

+ 95
- 0
services/ai_task_service/container_builder/common.go View File

@@ -0,0 +1,95 @@
package container_builder

import (
"bufio"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"errors"
"io"
"io/ioutil"
"os"
"strings"
)

func DownloadCode(ctx *context.CreationContext, codeLocalPath string, notArchive bool) error {
dir, err := ioutil.ReadDir(codeLocalPath)
//ReqCommitID为空时需要下载最新的代码,把旧的删掉
if len(dir) != 0 && ctx.Request.ReqCommitID == "" {
if err == nil {
os.RemoveAll(codeLocalPath)
}
}
var commitId string

//目录为空时需要下载代码
if len(dir) == 0 {
if notArchive {
commitId, err = upload.DownloadCode(ctx.GitRepo, ctx.Repository, codeLocalPath, ctx.Request.BranchName)
} else {
commitId, err = upload.DownloadZipCode(ctx.GitRepo, codeLocalPath, ctx.Request.BranchName)
}
if err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", ctx.Repository.FullName(), err)
return errors.New("cloudbrain.load_code_failed")
}
}
ctx.CommitID = commitId
return nil
}

var obsUploader = &upload.OBSUploader{}
var minioUploader = &upload.MinioUploader{}

const CLONE_FILE_PREFIX = "file:///"

func DownloadBranch(repo *models.Repository, codePath, branchName string) error {
//add "file:///" prefix to make the depth valid
if err := git.Clone(CLONE_FILE_PREFIX+repo.RepoPath(), codePath, git.CloneRepoOptions{Branch: branchName, Depth: 1}); err != nil {
log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err)
return err
}

configFile, err := os.OpenFile(codePath+"/.git/config", os.O_RDWR, 0666)
if err != nil {
log.Error("open file(%s) failed:%v", codePath+"/,git/config", err)
return err
}

defer configFile.Close()

pos := int64(0)
reader := bufio.NewReader(configFile)
for {
line, err := reader.ReadString('\n')
if err != nil {
if err == io.EOF {
log.Error("not find the remote-url")
return nil
} else {
log.Error("read error: %v", err)
return err
}
}

if strings.Contains(line, "url") && strings.Contains(line, ".git") {
originUrl := "\turl = " + repo.CloneLink().HTTPS + "\n"
if len(line) > len(originUrl) {
originUrl += strings.Repeat(" ", len(line)-len(originUrl))
}
bytes := []byte(originUrl)
_, err := configFile.WriteAt(bytes, pos)
if err != nil {
log.Error("WriteAt failed:%v", err)
return err
}
break
}

pos += int64(len(line))
}

return nil
}

+ 40
- 12
services/ai_task_service/container_builder/container_builder.go View File

@@ -1,24 +1,52 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"strings"
"fmt"
"reflect"
)

type ContainerBuilder interface {
Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error)
GetContainerType() ai_task_entity.ContainerDataType
Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError)
GetContainerType() entity.ContainerDataType
SetOpts(opts *entity.ContainerBuildOpts)
}

type ContainerBuildOpts struct {
ContainerPath string
ReadOnly bool
var containerBuilderMap = map[entity.ContainerDataType]reflect.Type{}

func RegisterContainerBuilder(builder ContainerBuilder) {
containerBuilderMap[builder.GetContainerType()] = reflect.TypeOf(builder)
}

func CreateContainerBuilder(containerType entity.ContainerDataType, opts *entity.ContainerBuildOpts) ContainerBuilder {
defer func() {
if err := recover(); err != nil {
combinedErr := fmt.Errorf("%s\n%s", err, log.Stack(2))
log.Error("PANIC:%v", combinedErr)
}
}()
t := containerBuilderMap[containerType]
if t == nil {
return nil
}
b := reflect.New(t.Elem()).Interface().(ContainerBuilder)
//.Interface().(ContainerBuilder)
//b.SetOpts(opts)
b.SetOpts(opts)
return b
}

func GetEndPoint() string {
index := strings.Index(setting.Endpoint, "//")
endpoint := setting.Endpoint[index+2:]
return endpoint
func BuildContainerDataChain(configMap map[entity.ContainerDataType]*entity.ContainerBuildOpts) *BuilderChain {
c := NewBuilderChain()
for k, v := range configMap {
b := CreateContainerBuilder(k, v)
if b == nil {
continue
}
c.Next(b)
}
return c
}

+ 7
- 1
services/ai_task_service/container_builder/container_builder_chan.go View File

@@ -1,6 +1,7 @@
package container_builder

import (
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
)

@@ -17,8 +18,13 @@ func (c *BuilderChain) Next(b ContainerBuilder) *BuilderChain {
return c
}

func (c *BuilderChain) Run(ctx *context.CreationContext) error {
func (c *BuilderChain) Run(ctx *context.CreationContext) *response.BizError {
for _, builder := range c.builderList {
current := ctx.GetContainerDataArray(builder.GetContainerType())
//如果已经存在则不需要再构建
if current != nil && len(current) > 0 {
continue
}
d, err := builder.Build(ctx)
if err != nil {
return err


+ 55
- 32
services/ai_task_service/container_builder/dataset_builder.go View File

@@ -1,70 +1,93 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"errors"
"strings"
)

type DatasetBuilder struct {
Opts ContainerBuildOpts
Opts *entity.ContainerBuildOpts
}

func (b DatasetBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
func init() {
o := &DatasetBuilder{}
RegisterContainerBuilder(o)
}

func (b *DatasetBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *DatasetBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}
uuid := ctx.Request.DatasetUUIDStr
if uuid == "" {
return nil, nil
}
var attachSize int64
datasetInfos, _, err := models.GetDatasetInfo(uuid, ctx.Request.ComputeSource.Name)
var datasetInfos map[string]models.DatasetInfo
var datasetNames string
var err error
// models.GetDatasetInfo 是使用的以前的方法,所以此处按集群类型适配
if ctx.Request.Cluster == models.C2NetCluster {
datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, ctx.Request.ComputeSource.Name)
} else {
datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid)
}
if err != nil {
log.Error("GetDatasetInfo failed: %v", err)
return nil, errors.New("cloudbrain.error.dataset_select")
return nil, response.DATASET_SELECT_ERROR
}
uuidArray := strings.Split(uuid, ";")
if datasetInfos == nil || len(datasetInfos) < len(uuidArray) {
return nil, errors.New("cloudbrain.error.partial_datasets_not_available")
}
for _, infos := range datasetInfos {
attachSize += infos.Size
return nil, response.PARTIAL_DATASETS_NOT_AVAILABLE
}
if attachSize > int64(setting.DebugAttachSize*1000*1000*1000) {
log.Error("The DatasetSize exceeds the limit (%dGB)", setting.DebugAttachSize) // GB
return nil, errors.New("cloudbrain.error.debug_datasetsize")
}
var data []ai_task_entity.ContainerData
obsEndPoint := GetEndPoint()
var data []entity.ContainerData
for _, datasetInfo := range datasetInfos {
name := datasetInfo.FullName
//如果不是压缩包,那么文件名是去掉后缀以后的数据集名称
if b.Opts.NotArchive {
name = datasetInfo.Name
}
if datasetInfo.Type == models.TypeCloudBrainOne {
data = append(data, ai_task_entity.ContainerData{
Name: datasetInfo.FullName,
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
ObjectKey: datasetInfo.DataLocalPath,
//如果返回的localPath已经带了实际路径的前缀,需要去除掉以后才是在minio上的objectKey
objectKey := datasetInfo.DataLocalPath
objectKey = strings.TrimPrefix(objectKey, setting.Attachment.Minio.RealPath)
objectKey = strings.TrimPrefix(objectKey, setting.Attachment.Minio.Bucket)
objectKey = strings.TrimPrefix(objectKey, "/")
data = append(data, entity.ContainerData{
Name: name,
Bucket: minioUploader.GetBucket(),
EndPoint: minioUploader.GetEndpoint(),
ObjectKey: objectKey,
ReadOnly: b.Opts.ReadOnly,
ContainerPath: b.Opts.ContainerPath + "/" + datasetInfo.FullName,
RealPath: setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + datasetInfo.DataLocalPath,
ContainerPath: b.Opts.ContainerPath + "/" + name,
RealPath: minioUploader.GetRealPath(objectKey),
})

} else {
data = append(data, ai_task_entity.ContainerData{
Name: datasetInfo.FullName,
Bucket: setting.Bucket,
EndPoint: obsEndPoint,
ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
objectKey := datasetInfo.DataLocalPath + datasetInfo.FullName
data = append(data, entity.ContainerData{
Name: name,
Bucket: obsUploader.GetBucket(),
EndPoint: obsUploader.GetEndpoint(),
ObjectKey: objectKey,
ReadOnly: b.Opts.ReadOnly,
ContainerPath: b.Opts.ContainerPath + "/" + datasetInfo.FullName,
ContainerPath: b.Opts.ContainerPath + "/" + name,
})
}

}
ctx.Request.DatasetNames = datasetNames
return data, nil
}

func (b DatasetBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerDataset
func (b *DatasetBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerDataset
}

+ 47
- 0
services/ai_task_service/container_builder/file_notebook_code_builder.go View File

@@ -0,0 +1,47 @@
package container_builder

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
)

type FileNoteBookCodeBuilder struct {
Opts *entity.ContainerBuildOpts
}

func init() {
o := &FileNoteBookCodeBuilder{}
RegisterContainerBuilder(o)
}

func (b *FileNoteBookCodeBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *FileNoteBookCodeBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerFileNoteBookCode
}

func (b *FileNoteBookCodeBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}
repo := ctx.Request.FileRepository
if repo == nil {
return nil, nil
}
err := DownloadBranch(repo, getCodePath(ctx.Request.JobName, repo, ctx.Request.FileBranchName), ctx.Request.FileBranchName)
if err != nil {
log.Error("download code failed", err)
return nil, response.LOAD_CODE_FAILED
}
return nil, nil
}

func getCodePath(jobName string, repo *models.Repository, branchName string) string {
return setting.JobPath + jobName + "/code" + "/" + repo.OwnerName + "/" + repo.Name + "/" + branchName
}

+ 0
- 59
services/ai_task_service/container_builder/minio_code_builder.go View File

@@ -1,59 +0,0 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"errors"
"io/ioutil"
"os"
"strings"
)

type MinioCodeBuilder struct {
Opts ContainerBuildOpts
}

func (b MinioCodeBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerCode
}

func (b MinioCodeBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
opts := b.Opts
var err error
jobName := ctx.Request.JobName
repo := ctx.Repository
codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
os.RemoveAll(codeLocalPath)
}

commitId, err := upload.DownloadZipCode(ctx.GitRepo, codeLocalPath, ctx.Request.BranchName)
if err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
return nil, errors.New("cloudbrain.load_code_failed")
}
if err := upload.UploadDirToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err)
return nil, errors.New("cloudbrain.load_code_failed")
}

codeArchiveName := cloudbrain.DefaultBranchName + ".zip"
codeData := ai_task_entity.ContainerData{
Name: strings.ToLower(repo.Name),
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
ObjectKey: setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" + codeArchiveName,
ReadOnly: opts.ReadOnly,
ContainerPath: opts.ContainerPath + "/" + codeArchiveName,
RealPath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"+codeArchiveName),
}
//todo 更好的方法?
ctx.CommitID = commitId
return []ai_task_entity.ContainerData{codeData}, nil
}

+ 0
- 18
services/ai_task_service/container_builder/obs_code_builder.go View File

@@ -1,18 +0,0 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/services/ai_task_service/context"
)

type ObsCodeBuilder struct {
Opts ContainerBuildOpts
}

func (b ObsCodeBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerCode
}

func (b ObsCodeBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
return nil, nil
}

+ 40
- 6
services/ai_task_service/container_builder/output_path_builder.go View File

@@ -1,21 +1,55 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
)

type OutputPathBuilder struct {
Opts ContainerBuildOpts
Opts *entity.ContainerBuildOpts
}

func (b OutputPathBuilder) Build(*context.CreationContext) ([]ai_task_entity.ContainerData, error) {
return []ai_task_entity.ContainerData{{
func init() {
o := &OutputPathBuilder{}
RegisterContainerBuilder(o)
}

func (b *OutputPathBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *OutputPathBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}
storageTypes := b.Opts.AcceptStorageType
if storageTypes == nil || len(storageTypes) == 0 {
return nil, response.SYSTEM_ERROR
}

jobName := ctx.Request.JobName

uploader := upload.SelectUploaderFromStorageType(storageTypes[0])
remoteDir := uploader.GetJobDefaultObjectKeyPrefix(jobName) + cloudbrain.ModelMountPath
err := uploader.MKDIR(remoteDir)
if err != nil {
log.Error("MKDIR err.displayJobName = %s err=%v", ctx.Request.DisplayJobName, err)
return nil, response.NewBizError(err)
}
return []entity.ContainerData{{
ContainerPath: b.Opts.ContainerPath,
ReadOnly: b.Opts.ReadOnly,
ObjectKey: remoteDir,
RealPath: uploader.GetRealPath(remoteDir),
Bucket: uploader.GetBucket(),
EndPoint: uploader.GetEndpoint(),
}}, nil
}

func (b OutputPathBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerOutPutPath
func (b *OutputPathBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerOutPutPath
}

+ 0
- 59
services/ai_task_service/container_builder/output_readme_builder.go View File

@@ -1,59 +0,0 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"errors"
"os"
)

type CloudbrainOneOutputReadmeBuilder struct {
Opts ContainerBuildOpts
}

const README = "README"

func (b CloudbrainOneOutputReadmeBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
modelPath := setting.JobPath + ctx.Request.JobName + b.Opts.ContainerPath + "/"
text := "You can put the files into this directory and download the files by the web page."
err := os.MkdirAll(modelPath, os.ModePerm)
if err != nil {
log.Error("MkdirAll(%s) failed:%v", modelPath, err)
return nil, err
}
fileName := modelPath + README
f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
if err != nil {
log.Error("OpenFile failed", err.Error())
return nil, err
}

defer f.Close()

_, err = f.WriteString(text)
if err != nil {
log.Error("WriteString failed", err.Error())
return nil, err
}

if err := upload.UploadDirToMinio(modelPath, ctx.Request.JobName, b.Opts.ContainerPath+"/"); err != nil {
log.Error("Failed to UploadDirToMinio: %s (%v)", ctx.Request.JobName, err)
return nil, errors.New("cloudbrain.load_code_failed")
}

return []ai_task_entity.ContainerData{{
Name: README,
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
ObjectKey: setting.CBCodePathPrefix + ctx.Request.JobName + b.Opts.ContainerPath + "/" + README,
ContainerPath: b.Opts.ContainerPath,
ReadOnly: b.Opts.ReadOnly,
}}, nil
}

func (b CloudbrainOneOutputReadmeBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerCloudbrainOneOutPutReadMe
}

+ 120
- 36
services/ai_task_service/container_builder/pre_model_builder.go View File

@@ -1,66 +1,113 @@
package container_builder

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/routers/response"
"fmt"
"strings"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/ai_task_service/upload"
"code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
"errors"
"strings"
)

type PretrainModelBuilder struct {
Opts ContainerBuildOpts
Opts *entity.ContainerBuildOpts
}

func (b PretrainModelBuilder) Build(ctx *context.CreationContext) ([]ai_task_entity.ContainerData, error) {
form := ctx.Request
func init() {
o := &PretrainModelBuilder{}
RegisterContainerBuilder(o)
}

if form.ModelName == "" {
func (b *PretrainModelBuilder) SetOpts(opts *entity.ContainerBuildOpts) {
b.Opts = opts
}

func (b *PretrainModelBuilder) Build(ctx *context.CreationContext) ([]entity.ContainerData, *response.BizError) {
if b.Opts.Disable {
return nil, nil
}

m, err := models.QueryModelByPath(form.PreTrainModelUrl)
form := ctx.Request
storageTypes := b.Opts.AcceptStorageType
if storageTypes == nil || len(storageTypes) == 0 {
return nil, response.SYSTEM_ERROR
}
//未选择预训练模型,跳过此步
if form.PretrainModelName == "" {
return nil, nil
}
if form.PretrainModelId == "" {
//异常数据,理论上应该都有modelId
return nil, response.RESULT_CLEARD
}
//查出模型数据
m, err := models.QueryModelById(form.PretrainModelId)
if err != nil {
log.Error("Can not find model", err)
return nil, errors.New("repo.modelconvert.manage.model_not_exist")
return nil, response.MODEL_NOT_EXISTS
}
if !cloudbrainTask.IsModelFileExists(m, form.CkptName) {
log.Error("model file not exist.name = %s", form.CkptName)
return nil, errors.New("repo.modelconvert.manage.model_file_not_exist")
preTrainModelUrl := m.Path
if err != nil {
log.Error("Can not find model", err)
return nil, response.MODEL_NOT_EXISTS
}
//模型文件存储方式
oldStorageType := entity.GetStorageTypeFromCloudbrainType(m.Type)
if oldStorageType == "" {
log.Error("model storage type error.modelId=%d", m.ID)
return nil, response.SYSTEM_ERROR
}
preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)

var modelData ai_task_entity.ContainerData
switch m.Type {
case models.TypeCloudBrainOne:
modelData = ai_task_entity.ContainerData{
Name: form.ModelName,
Bucket: setting.Attachment.Minio.Bucket,
EndPoint: setting.Attachment.Minio.Endpoint,
ObjectKey: preTrainModelPath,
ReadOnly: b.Opts.ReadOnly,
ContainerPath: b.Opts.ContainerPath + "/" + form.CkptName,
RealPath: setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + preTrainModelPath,
var preTrainModelPath string
var preTrainModelEntity []entity.ContainerData
storageType := oldStorageType
ckptNames := strings.Split(form.PretrainModelCkptName, ";")
for _, ckptName := range ckptNames {
if !cloudbrainTask.IsModelFileExists(m, ckptName) {
log.Error("model file not exist.name = %s", ckptName)
return nil, response.MODEL_NOT_EXISTS
}
case models.TypeCloudBrainTwo:
modelData = ai_task_entity.ContainerData{
Name: form.ModelName,
Bucket: setting.Bucket,
EndPoint: GetEndPoint(),
ReadOnly: b.Opts.ReadOnly,
preTrainModelPath = getPreTrainModelPath(preTrainModelUrl, ckptName)
if !b.Opts.IsStorageTypeIn(oldStorageType) {
//意味着模型之前存储的位置不符合要求,需要转存到指定存储
newStorageType := b.Opts.AcceptStorageType[0]
//todo 可优化
if newStorageType == entity.MINIO && oldStorageType == entity.OBS {
//复用以前代码
minioPreModelURL, err := dealModelInfo(form.PretrainModelId, form.JobName, ckptName)
if err != nil {
log.Error("Can not find model,modelId=%d err=%v", form.PretrainModelId, err)
return nil, response.MODEL_NOT_EXISTS
}
preTrainModelUrl = minioPreModelURL
preTrainModelPath = getPreTrainModelPath(minioPreModelURL, ckptName)
storageType = entity.MINIO
}
}
uploader := upload.SelectUploaderFromStorageType(storageType)
modelData := entity.ContainerData{
Name: form.PretrainModelName,
Bucket: uploader.GetBucket(),
EndPoint: uploader.GetEndpoint(),
ObjectKey: preTrainModelPath,
ContainerPath: b.Opts.ContainerPath + "/" + form.CkptName,
ReadOnly: b.Opts.ReadOnly,
ContainerPath: b.Opts.ContainerPath + "/" + ckptName,
RealPath: uploader.GetRealPath(preTrainModelPath),
}
preTrainModelEntity = append(preTrainModelEntity, modelData)
}

return []ai_task_entity.ContainerData{modelData}, nil
form.PreTrainModelUrl = preTrainModelUrl
return preTrainModelEntity, nil
}

func (b PretrainModelBuilder) GetContainerType() ai_task_entity.ContainerDataType {
return ai_task_entity.ContainerPreTrainModel
func (b *PretrainModelBuilder) GetContainerType() entity.ContainerDataType {
return entity.ContainerPreTrainModel
}

func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
@@ -73,3 +120,40 @@ func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
}

}

func dealModelInfo(modelId string, jobName string, ckptName string) (string, error) {
preModel, err := models.QueryModelById(modelId)
if err != nil || preModel == nil || preModel.ID == "" {
log.Error("Can not find model", err)
return "", fmt.Errorf("Can not find model: %v", ckptName)
}
minioPreModelURL, err := downloadModelFromObs(preModel, jobName, cloudbrain.PretrainModelMountPath, ckptName)
if err != nil {
log.Error("Can not find model", err)

return "", err
}
return minioPreModelURL, nil
}

func downloadModelFromObs(preModel *models.AiModelManage, jobName, suffixPath string, ckptFileName string) (string, error) {
destPath := setting.CBCodePathPrefix + jobName + suffixPath + "/"
destFile := destPath + ckptFileName
returnStr := setting.Attachment.Minio.Bucket + "/" + destPath
srcUrl := preModel.Path[len(setting.Bucket)+1:] + ckptFileName
log.Info("dest model Path=" + returnStr + " src path=" + preModel.Path + ckptFileName)
body, err := storage.ObsDownloadAFile(setting.Bucket, srcUrl)
if err == nil {
defer body.Close()
_, err = storage.Attachments.UploadContent(setting.Attachment.Minio.Bucket, destFile, body)
if err != nil {
log.Error("UploadObject(%s) failed: %s", preModel.Path+ckptFileName, err.Error())
return "", err
}
} else {
log.Info("download model failed. as " + err.Error())
return "", err
}
log.Info("download model from obs succeed")
return returnStr, nil
}

+ 12
- 12
services/ai_task_service/context/context.go View File

@@ -1,48 +1,48 @@
package context

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
)

type CreationContext struct {
Request ai_task_entity.CreateReq
ContainerData map[ai_task_entity.ContainerDataType][]ai_task_entity.ContainerData
Request *entity.CreateReq
ContainerData map[entity.ContainerDataType][]entity.ContainerData
GitRepo *git.Repository
Repository *models.Repository
Spec *models.Specification
User *models.User
Datasets map[string]models.DatasetInfo
CommitID string
Response *ai_task_entity.CreationResponse
Response *entity.CreationResponse
SourceCloudbrain *models.Cloudbrain
NewCloudbrain *models.Cloudbrain
AITaskConfig entity.AITaskConfig
}

func (ctx *CreationContext) AddContainerData(t ai_task_entity.ContainerDataType, d []ai_task_entity.ContainerData) {
func (ctx *CreationContext) AddContainerData(t entity.ContainerDataType, d []entity.ContainerData) {
if ctx.ContainerData == nil {
ctx.ContainerData = make(map[ai_task_entity.ContainerDataType][]ai_task_entity.ContainerData, 0)
ctx.ContainerData = make(map[entity.ContainerDataType][]entity.ContainerData, 0)
}
ctx.ContainerData[t] = d
}
func (ctx *CreationContext) GetContainerDataArray(t ai_task_entity.ContainerDataType) []ai_task_entity.ContainerData {
func (ctx *CreationContext) GetContainerDataArray(t entity.ContainerDataType) []entity.ContainerData {
if ctx.ContainerData == nil {
return nil
}
return ctx.ContainerData[t]
}
func (ctx *CreationContext) GetContainerData(t ai_task_entity.ContainerDataType) ai_task_entity.ContainerData {
func (ctx *CreationContext) GetContainerData(t entity.ContainerDataType) entity.ContainerData {
a := ctx.GetContainerDataArray(t)
if a == nil || len(a) == 0 {
return ai_task_entity.ContainerData{}
return entity.ContainerData{}
}
return a[0]
}
func (ctx *CreationContext) WriteResponse(t ai_task_entity.ContainerDataType) ai_task_entity.ContainerData {
func (ctx *CreationContext) WriteResponse(t entity.ContainerDataType) entity.ContainerData {
a := ctx.GetContainerDataArray(t)
if a == nil || len(a) == 0 {
return ai_task_entity.ContainerData{}
return entity.ContainerData{}
}
return a[0]
}

+ 52
- 8
services/ai_task_service/schedule/model_schedule.go View File

@@ -2,6 +2,18 @@ package schedule

import (
"bytes"
"encoding/json"
"errors"
"fmt"
"os/exec"
"path"
"strings"
"time"

"code.gitea.io/gitea/modules/modelarts"

"code.gitea.io/gitea/modules/obs"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/labelmsg"
@@ -11,14 +23,7 @@ import (
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/storage"
"code.gitea.io/gitea/modules/util"
"encoding/json"
"errors"
"fmt"
"github.com/minio/minio-go"
"os/exec"
"path"
"strings"
"time"
)

const NPUModelDefaultName = "models.zip"
@@ -205,7 +210,33 @@ func LocalMigrateOperate(jobName, computeSource string, r *models.ModelMigrateRe
}
if computeSource == models.NPUResource {
//因为NPU的输出会被压缩,因此需要解压+移桶
decompress(r.DestBucket+"/"+r.DestObjectKey, setting.Bucket+"/"+strings.TrimSuffix(r.DestObjectKey, models.ModelSuffix))
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", jobName, err)

return err
}
log.Info("DestObjectKey", r.DestObjectKey)
if strings.Contains(r.DestObjectKey, ".") {
decompress(r.DestBucket+"/"+r.DestObjectKey, setting.Bucket+"/"+strings.TrimSuffix(r.DestObjectKey, models.ModelSuffix))

} else { //如果是文件夹,遍历文件
fileInfos, err := storage.GetOneLevelObjectsUnderDir(r.DestBucket, "", r.DestObjectKey)
if err != nil {
log.Error("UpdateModelMigrateStatusByStep err. r.ID=%d step=%d err=%v", r.ID, models.BucketMoveFailed, err)
return err
}

for _, fileInfo := range fileInfos {
log.Info("decompress file:", fileInfo.FileName)
sourceFilPath := r.DestBucket + "/" + r.DestObjectKey + fileInfo.FileName
if !strings.HasSuffix(r.DestObjectKey, "/") {
sourceFilPath = r.DestBucket + "/" + r.DestObjectKey + "/" + fileInfo.FileName
}
decompress(sourceFilPath, setting.Bucket+"/"+strings.TrimSuffix(r.DestObjectKey, models.ModelSuffix))
}

}

} else {
//因为调度无法指定桶,所以调度成功后我们还需要移桶
if setting.UseLocalMinioMigrate {
@@ -233,6 +264,19 @@ func LocalMigrateOperate(jobName, computeSource string, r *models.ModelMigrateRe
return nil
}

// obsMkdir creates a "directory" placeholder object in the configured OBS
// bucket. OBS has no real directories; putting an empty object whose key is
// the directory path makes the path visible to listings. Returns the
// PutObject error, if any.
func obsMkdir(dir string) error {
	input := &obs.PutObjectInput{}
	input.Bucket = setting.Bucket
	// Key is the directory path itself; no body is set, so an empty
	// object acts as the directory marker.
	input.Key = dir
	_, err := storage.ObsCli.PutObject(input)
	if err != nil {
		log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
		return err
	}

	return nil
}

func TryToUpdateNPUMoveBucketResult(record *models.ModelMigrateRecord, jobName, versionName string) error {
if IsNPUModelDirHasFile(jobName, versionName) {
if err := models.UpdateModelMigrateStatusByStep(record, models.BucketMoveSuccess); err != nil {


+ 95
- 77
services/ai_task_service/task/cloudbrain_one_notebook_task.go View File

@@ -1,14 +1,13 @@
package task

import (
"code.gitea.io/gitea/entity/ai_task_entity"
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/notification"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/cluster"
"code.gitea.io/gitea/services/ai_task_service/container_builder"
"code.gitea.io/gitea/services/ai_task_service/context"
"strconv"
@@ -16,106 +15,139 @@ import (
)

type CloudbrainOneNotebookTaskTemplate struct {
DefaultCreationHandler
DefaultAITaskTemplate
}

func init() {
t := &CloudbrainOneNotebookTaskTemplate{
DefaultAITaskTemplate: DefaultAITaskTemplate{
ClusterType: ai_task_entity.OpenICloudbrainOne,
ClusterType: entity.OpenICloudbrainOne,
JobType: models.JobTypeDebug,
},
}
RegisterTask(models.JobTypeDebug, ai_task_entity.OpenICloudbrainOne, t)
RegisterTask(models.JobTypeDebug, entity.OpenICloudbrainOne, t)
}

func (g CloudbrainOneNotebookTaskTemplate) MyClusterType() ai_task_entity.ClusterType {
return ""
}

func (t CloudbrainOneNotebookTaskTemplate) Create(ctx *context.CreationContext) (*ai_task_entity.CreateTaskRes, *response.BizError) {
func (t CloudbrainOneNotebookTaskTemplate) Create(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
c := &CreateOperator{}
err := c.Next(t.CheckParam).
Next(t.CheckMulti).
Next(t.CheckDisplayJobName).
Next(t.LoadSpec).
Next(t.CheckPointBalance).
Next(t.CheckDatasetSize).
Next(t.CheckDatasetExists).
Next(t.CheckBranchExists).
Next(t.InsertCloudbrainRecord4Async).
AsyncNext(t.BuildContainerData, t.CallCreationAPI, t.AfterCallCreationAPI4Async, t.NotifyCreation).
AsyncNextWithErrFun(t.BuildContainerData, t.CallCreationAPI, t.AfterCallCreationAPI4Async, t.NotifyCreation, t.HandleErr4Async).
Operate(ctx)
if err != nil {
log.Error("create CloudbrainOneNotebookTask err.%v", err)
return nil, err
}
return &ai_task_entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID}, nil

}
return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID}, nil

func (g CloudbrainOneNotebookTaskTemplate) Restart(*context.CreationContext) (*ai_task_entity.CreateTaskRes, *response.BizError) {
return nil, nil
}

func (g CloudbrainOneNotebookTaskTemplate) CallCreationAPI(ctx *context.CreationContext) *response.BizError {
c, err := cluster.GetCluster(ai_task_entity.OpenICloudbrainOne)
if err != nil {
return response.SYSTEM_ERROR
func (g CloudbrainOneNotebookTaskTemplate) GetConfig(opts entity.GetAITaskConfigOpts) entity.AITaskConfig {
if opts.IsFileNoteBookRequest {
return entity.AITaskConfig{
ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
entity.ContainerFileNoteBookCode: {},
entity.ContainerCode: {
ContainerPath: "/code",
ReadOnly: false,
AcceptStorageType: []entity.StorageType{entity.MINIO},
NotArchive: true,
},
},
}
}
form := ctx.Request
req := ai_task_entity.CreateNoteBookTaskRequest{
Name: form.JobName,
Tasks: []ai_task_entity.NoteBookTask{
{
Name: form.JobName,
ResourceSpecId: ctx.Spec.SourceSpecId,
ImageId: form.ImageID,
ImageUrl: strings.TrimSpace(form.ImageUrl),
Datasets: ctx.GetContainerDataArray(ai_task_entity.ContainerDataset),
Code: ctx.GetContainerDataArray(ai_task_entity.ContainerCode),
PreTrainModel: ctx.GetContainerDataArray(ai_task_entity.ContainerPreTrainModel),
AutoStopDuration: autoStopDurationMs,
Capacity: setting.Capacity,
CenterID: ctx.Spec.GetAvailableCenterIds(ctx.User.ID, form.JobType),
Spec: ctx.Spec,
return entity.AITaskConfig{
DatasetMaxSize: setting.DebugAttachSize * 1000 * 1000 * 1000,
ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
entity.ContainerCode: {
ContainerPath: "/code",
ReadOnly: false,
AcceptStorageType: []entity.StorageType{entity.MINIO},
NotArchive: true,
},
entity.ContainerDataset: {
ContainerPath: "/dataset",
ReadOnly: true,
AcceptStorageType: []entity.StorageType{entity.MINIO},
NotArchive: true,
},
entity.ContainerPreTrainModel: {
ContainerPath: "/pretrainmodel",
ReadOnly: true,
AcceptStorageType: []entity.StorageType{entity.MINIO},
},
entity.ContainerOutPutPath: {
ContainerPath: "/model",
ReadOnly: false,
AcceptStorageType: []entity.StorageType{entity.MINIO},
},
},
}
createTime := timeutil.TimeStampNow()
res, err := c.CreateNoteBook(req)
}

func (t CloudbrainOneNotebookTaskTemplate) Restart(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
c := &CreateOperator{}
err := c.Next(t.BuildRequest4Restart).
Next(t.CheckOutput4Restart).
Next(t.CheckModel).
Next(t.CheckDatasetExists).
Next(t.CheckParam).
Next(t.CheckMulti).
Next(t.LoadSpec).
Next(t.CheckPointBalance).
Next(t.BuildContainerData).
Next(t.CallRestartAPI).
Next(t.CreateCloudbrainRecord4Restart).
Next(t.NotifyCreation).
Operate(ctx)
if err != nil {
log.Error("CloudbrainOneNotebookTask CreateNoteBook err.req=%+v err=%v", req, err)
ctx.Response = &ai_task_entity.CreationResponse{
Error: err,
}
return nil
} else {
ctx.Response = &ai_task_entity.CreationResponse{
JobID: res.JobID,
Status: res.Status,
CreateTime: createTime,
}
log.Error("Restart GrampusNoteBookTask err.%v", err)
return nil, err
}
if err != nil {
log.Error("Restart GrampusNoteBookTask err.%v", err)
return nil, err
}
return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID, Status: ctx.NewCloudbrain.Status}, nil

return nil
}

func (g CloudbrainOneNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationContext) *response.BizError {
c, err := cluster.GetCluster(ai_task_entity.OpenICloudbrainOne)
// BuildContainerData runs the container-data build chain described by
// GetConfig for this request (keyed on compute source and file-notebook
// mode) and stores the built container data on the creation context.
func (c CloudbrainOneNotebookTaskTemplate) BuildContainerData(ctx *context.CreationContext) *response.BizError {
	err := container_builder.BuildContainerDataChain(c.GetConfig(entity.GetAITaskConfigOpts{
		ComputeSource:         ctx.Request.ComputeSource.Name,
		IsFileNoteBookRequest: ctx.Request.IsFileNoteBookRequest,
	}).ContainerSteps).Run(ctx)
	if err != nil {
		return err
	}
	return nil
}

func (g CloudbrainOneNotebookTaskTemplate) CallCreationAPI(ctx *context.CreationContext) *response.BizError {
c := g.GetMyCluster()
if c == nil {
return response.SYSTEM_ERROR
}
form := ctx.Request
req := ai_task_entity.CreateNoteBookTaskRequest{
req := entity.CreateNoteBookTaskRequest{
Name: form.JobName,
Tasks: []ai_task_entity.NoteBookTask{
Tasks: []entity.NoteBookTask{
{
Name: form.JobName,
ResourceSpecId: ctx.Spec.SourceSpecId,
ImageId: form.ImageID,
ImageUrl: strings.TrimSpace(form.ImageUrl),
Datasets: ctx.GetContainerDataArray(ai_task_entity.ContainerDataset),
Code: ctx.GetContainerDataArray(ai_task_entity.ContainerCode),
PreTrainModel: ctx.GetContainerDataArray(ai_task_entity.ContainerPreTrainModel),
Datasets: ctx.GetContainerDataArray(entity.ContainerDataset),
Code: ctx.GetContainerDataArray(entity.ContainerCode),
PreTrainModel: ctx.GetContainerDataArray(entity.ContainerPreTrainModel),
OutPut: ctx.GetContainerDataArray(entity.ContainerOutPutPath),
AutoStopDuration: autoStopDurationMs,
Capacity: setting.Capacity,
CenterID: ctx.Spec.GetAvailableCenterIds(ctx.User.ID, form.JobType),
@@ -129,33 +161,19 @@ func (g CloudbrainOneNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationC
log.Error("CloudbrainOneNotebookTask CreateNoteBook err.req=%+v err=%v", req, err)
return response.NewBizError(err)
}
ctx.Response = &ai_task_entity.CreationResponse{

ctx.Response = &entity.CreationResponse{
JobID: res.JobID,
Status: res.Status,
CreateTime: createTime,
}

return nil
}

func (CloudbrainOneNotebookTaskTemplate) BuildContainerData(ctx *context.CreationContext) *response.BizError {
err := container_builder.NewBuilderChain().
Next(container_builder.ObsCodeBuilder{Opts: container_builder.ContainerBuildOpts{
ContainerPath: "/code",
ReadOnly: false,
}}).
Next(container_builder.DatasetBuilder{Opts: container_builder.ContainerBuildOpts{
ContainerPath: "/dataset",
ReadOnly: true,
}}).
Next(container_builder.PretrainModelBuilder{Opts: container_builder.ContainerBuildOpts{
ContainerPath: "/pretrainmodel",
ReadOnly: false,
}}).
Run(ctx)
if err != nil {
return response.NewBizError(err)
}
return nil
func (g CloudbrainOneNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationContext) *response.BizError {
//云脑一没有再次调试接口,通过使用同样的参数新建接口来模拟
return g.CallCreationAPI(ctx)
}

func (CloudbrainOneNotebookTaskTemplate) NotifyCreation(ctx *context.CreationContext) *response.BizError {


+ 217
- 0
services/ai_task_service/task/cloudbrain_two_notebook_task.go View File

@@ -0,0 +1,217 @@
package task

import (
"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/convert"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/notification"
"code.gitea.io/gitea/modules/setting"
api "code.gitea.io/gitea/modules/structs"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/container_builder"
"code.gitea.io/gitea/services/ai_task_service/context"
"code.gitea.io/gitea/services/cloudbrain/resource"
"strconv"
"strings"
)

// CloudbrainTwoNotebookTaskTemplate implements the notebook (debug) task
// workflow for the OpenI cloudbrain-two cluster. It embeds
// DefaultAITaskTemplate for the shared check/operator steps.
type CloudbrainTwoNotebookTaskTemplate struct {
	DefaultAITaskTemplate
}

// init registers this template as the handler for debug (notebook) jobs
// running on the OpenI cloudbrain-two cluster.
func init() {
	tpl := &CloudbrainTwoNotebookTaskTemplate{
		DefaultAITaskTemplate: DefaultAITaskTemplate{
			ClusterType: entity.OpenICloudbrainTwo,
			JobType:     models.JobTypeDebug,
		},
	}
	RegisterTask(models.JobTypeDebug, entity.OpenICloudbrainTwo, tpl)
}

// Create creates a new cloudbrain-two notebook task. Synchronous checks
// (params, balance, dataset, branch) run first and insert the local
// cloudbrain record; container-data building, the remote creation call,
// and notification then run asynchronously, with HandleErr4Async invoked
// on async failure.
func (t CloudbrainTwoNotebookTaskTemplate) Create(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
	c := &CreateOperator{}
	err := c.Next(t.CheckParam).
		Next(t.CheckMulti).
		Next(t.CheckDisplayJobName).
		Next(t.LoadSpec).
		Next(t.CheckPointBalance).
		Next(t.CheckDatasetSize).
		Next(t.CheckDatasetExists).
		Next(t.CheckBranchExists).
		Next(t.InsertCloudbrainRecord4Async).
		AsyncNextWithErrFun(t.BuildContainerData, t.CallCreationAPI, t.AfterCallCreationAPI4Async, t.NotifyCreation, t.HandleErr4Async).
		Operate(ctx)
	if err != nil {
		// Log message fixed: this is the cloudbrain TWO template; the
		// original said "CloudbrainOneNotebookTask" (copy-paste slip).
		log.Error("create CloudbrainTwoNotebookTask err.%v", err)
		return nil, err
	}
	return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID}, nil
}

// GetConfig returns the container build configuration for a cloudbrain-two
// notebook task. File-notebook requests need only the notebook-code step;
// regular requests get code/dataset/pretrain-model steps, all disabled and
// restricted to OBS storage, plus a dataset size cap.
func (g CloudbrainTwoNotebookTaskTemplate) GetConfig(opts entity.GetAITaskConfigOpts) entity.AITaskConfig {
	if opts.IsFileNoteBookRequest {
		return entity.AITaskConfig{
			ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
				entity.ContainerFileNoteBookCode: {},
			},
		}
	}

	// Each step gets its own opts value (fresh slice per step, as in the
	// original) — disabled, OBS-only.
	disabledOBSStep := func() *entity.ContainerBuildOpts {
		return &entity.ContainerBuildOpts{
			Disable:           true,
			AcceptStorageType: []entity.StorageType{entity.OBS},
		}
	}
	return entity.AITaskConfig{
		DatasetMaxSize: setting.DebugAttachSize * 1000 * 1000 * 1000,
		ContainerSteps: map[entity.ContainerDataType]*entity.ContainerBuildOpts{
			entity.ContainerCode:          disabledOBSStep(),
			entity.ContainerDataset:       disabledOBSStep(),
			entity.ContainerPreTrainModel: disabledOBSStep(),
		},
	}
}

// Restart restarts an existing cloudbrain-two notebook task: it rebuilds
// the creation request from the source task, re-runs the synchronous
// checks, calls the cluster restart API, records the new cloudbrain row,
// and sends the creation notification.
func (t CloudbrainTwoNotebookTaskTemplate) Restart(ctx *context.CreationContext) (*entity.CreateTaskRes, *response.BizError) {
	c := &CreateOperator{}
	err := c.Next(t.BuildRequest4Restart).
		Next(t.CheckOutput4Restart).
		Next(t.CheckModel).
		Next(t.CheckDatasetExists).
		Next(t.CheckIsCleared).
		Next(t.CheckParam).
		Next(t.CheckMulti).
		Next(t.LoadSpec).
		Next(t.CheckPointBalance).
		Next(t.CallRestartAPI).
		Next(t.CreateCloudbrainRecord4Restart).
		Next(t.NotifyCreation).
		Operate(ctx)
	if err != nil {
		// Fixed: the duplicated, unreachable second `if err != nil` block
		// is removed, and the log names this template instead of
		// "GrampusNoteBookTask" (copy-paste slip).
		log.Error("Restart CloudbrainTwoNotebookTask err.%v", err)
		return nil, err
	}
	return &entity.CreateTaskRes{ID: ctx.NewCloudbrain.ID, Status: ctx.NewCloudbrain.Status}, nil
}

// CallCreationAPI submits the notebook creation request to the
// cloudbrain-two cluster and stores the resulting job id/status and
// creation timestamp on the context.
func (g CloudbrainTwoNotebookTaskTemplate) CallCreationAPI(ctx *context.CreationContext) *response.BizError {
	cl := g.GetMyCluster()
	if cl == nil {
		return response.SYSTEM_ERROR
	}
	form := ctx.Request
	task := entity.NoteBookTask{
		Name:             form.JobName,
		ResourceSpecId:   ctx.Spec.SourceSpecId,
		ImageId:          form.ImageID,
		ImageUrl:         strings.TrimSpace(form.ImageUrl),
		AutoStopDuration: autoStopDurationMs,
		Spec:             ctx.Spec,
	}
	req := entity.CreateNoteBookTaskRequest{
		Name:        form.JobName,
		Description: form.Description,
		Tasks:       []entity.NoteBookTask{task},
	}
	// Capture the timestamp before the remote call so the recorded
	// creation time is not skewed by API latency.
	createTime := timeutil.TimeStampNow()
	res, err := cl.CreateNoteBook(req)
	if err != nil {
		log.Error("CloudbrainTwoNotebookTaskTemplate CreateNoteBook err.req=%+v err=%v", req, err)
		return response.NewBizError(err)
	}
	ctx.Response = &entity.CreationResponse{
		JobID:      res.JobID,
		Status:     res.Status,
		CreateTime: createTime,
	}

	return nil
}

// CallRestartAPI asks the cluster to restart the source notebook task and
// stores the new job id/status and restart timestamp on the context. An
// empty JobId in the response is treated as a restart failure.
func (g CloudbrainTwoNotebookTaskTemplate) CallRestartAPI(ctx *context.CreationContext) *response.BizError {
	cl := g.GetMyCluster()
	if cl == nil {
		log.Error("Get cluster failed")
		return response.SYSTEM_ERROR
	}
	// Timestamp is taken before the remote call, matching CallCreationAPI.
	createTime := timeutil.TimeStampNow()
	res, err := cl.RestartNoteBook(ctx.SourceCloudbrain.JobID)
	switch {
	case err != nil:
		log.Error("CloudbrainTwoNotebookTaskTemplate RestartNoteBook err.Cloudbrain.JobID=%s err=%v", ctx.SourceCloudbrain.JobID, err)
		return response.NewBizError(err)
	case res.JobId == "":
		log.Error("CloudbrainTwoNotebookTaskTemplate RestartNoteBook failed.Cloudbrain.JobID=%s", ctx.SourceCloudbrain.JobID)
		return response.RESTART_FAILED
	}
	ctx.Response = &entity.CreationResponse{
		JobID:      res.JobId,
		Status:     res.Status,
		CreateTime: createTime,
	}
	return nil
}

// BuildContainerData runs the container-data build chain described by
// GetConfig for this request (keyed on compute source and file-notebook
// mode) and stores the built container data on the creation context.
func (c CloudbrainTwoNotebookTaskTemplate) BuildContainerData(ctx *context.CreationContext) *response.BizError {
	cfg := c.GetConfig(entity.GetAITaskConfigOpts{
		ComputeSource:         ctx.Request.ComputeSource.Name,
		IsFileNoteBookRequest: ctx.Request.IsFileNoteBookRequest,
	})
	// Run already returns *response.BizError; the previous
	// `if err != nil { return err }; return nil` was redundant.
	return container_builder.BuildContainerDataChain(cfg.ContainerSteps).Run(ctx)
}

// NotifyCreation loads the freshly created cloudbrain record by the job id
// stored on the context response and emits the "create debug NPU task"
// notification for it.
func (CloudbrainTwoNotebookTaskTemplate) NotifyCreation(ctx *context.CreationContext) *response.BizError {
	task, err := models.GetCloudbrainByJobID(ctx.Response.JobID)
	if err != nil {
		log.Error("GetCloudbrainByJobID failed: %v", err.Error())
		return response.NewBizError(err)
	}

	taskID := strconv.FormatInt(task.ID, 10)
	notification.NotifyOtherTask(ctx.User, ctx.Repository, taskID, ctx.Request.DisplayJobName, models.ActionCreateDebugNPUTask)
	return nil
}

// GetSpecs lists the resource specifications available to the user for
// this template's job type on the cloudbrain-two parent cluster, converted
// to their API representation. When the Chengdu ModelartsCD deployment is
// enabled, specs are looked up for that AI center instead.
func (g CloudbrainTwoNotebookTaskTemplate) GetSpecs(userId int64, computeSource models.ComputeSource) ([]*api.SpecificationShow, *response.BizError) {
	aiCenterCode := models.AICenterOfCloudBrainTwo
	if setting.ModelartsCD.Enabled {
		aiCenterCode = models.AICenterOfChengdu
	}
	// Fixed idiom: the redundant `var specs`/`var err` pre-declarations
	// are replaced by a direct short variable declaration.
	specs, err := resource.FindAvailableSpecs(userId, models.FindSpecsOptions{
		JobType:         g.JobType,
		ComputeResource: computeSource.Name,
		Cluster:         g.ClusterType.GetParentCluster(),
		AiCenterCode:    aiCenterCode,
	})
	if err != nil {
		log.Error("GetSpecs err.%v", err)
		return nil, response.SPEC_NOT_AVAILABLE
	}
	r := make([]*api.SpecificationShow, len(specs))
	for i, v := range specs {
		r[i] = convert.ToSpecification(v)
	}
	return r, nil
}

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save