#5168 新增GPGPU训练任务

Merged
ychao_1983 merged 6 commits from fix-5096 into V20240129 3 months ago
  1. +3
    -1
      models/action.go
  2. +2
    -1
      models/task_config.go
  3. +2
    -0
      modules/grampus/grampus.go
  4. +1
    -0
      options/locale/locale_en-US.ini
  5. +1
    -0
      options/locale/locale_zh-CN.ini
  6. +5
    -3
      public/home/home.js
  7. +12
    -41
      routers/repo/grampus.go
  8. +5
    -0
      routers/routes/routes.go
  9. +4
    -1
      services/ai_task_service/task/grampus_train_task.go
  10. +1
    -1
      services/socketwrap/clientManager.go
  11. +1
    -0
      templates/repo/grampus/trainjob/iluvatar-gpgpu/new.tmpl
  12. +8
    -0
      templates/user/dashboard/feeds.tmpl
  13. +59
    -2
      web_src/vuepages/pages/cloudbrain/configs.js

+ 3
- 1
models/action.go View File

@@ -78,6 +78,7 @@ const (
ActionCreateGrampusMETAXDebugTask //49
ActionCreateGrampusGPUInferenceTask //50
ActionCreateGrampusILUVATARInferenceTask //51
ActionCreateGrampusILUVATARTrainTask
)

// Action represents user operation type and other information to
@@ -435,7 +436,8 @@ func (a *Action) IsCloudbrainAction() bool {
ActionCreateGrampusMETAXDebugTask,
ActionCreateSuperComputeTask,
ActionCreateGrampusILUVATARInferenceTask,
ActionCreateGrampusGPUInferenceTask:
ActionCreateGrampusGPUInferenceTask,
ActionCreateGrampusILUVATARTrainTask:
return true
}
return false


+ 2
- 1
models/task_config.go View File

@@ -82,7 +82,8 @@ func GetTaskTypeFromAction(a ActionType) TaskType {
ActionCreateGrampusGPUOnlineInferTask,
ActionCreateGrampusGPUTrainTask,
ActionCreateGrampusGPUInferenceTask,
ActionCreateGrampusILUVATARInferenceTask:
ActionCreateGrampusILUVATARInferenceTask,
ActionCreateGrampusILUVATARTrainTask:
return TaskCreateCloudbrainTask
case ActionCreateRepo:
return TaskCreatePublicRepo


+ 2
- 0
modules/grampus/grampus.go View File

@@ -514,6 +514,8 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId str
actionType = models.ActionCreateGrampusGPUTrainTask
} else if req.ComputeResource == models.GCUResource {
actionType = models.ActionCreateGrampusGCUTrainTask
} else if req.ComputeResource == models.ILUVATAR {
actionType = models.ActionCreateGrampusILUVATARTrainTask
}
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)



+ 1
- 0
options/locale/locale_en-US.ini View File

@@ -3236,6 +3236,7 @@ task_c2net_cpusupercomputejob=`created CPU type HPC task <a href="%s/supercomput
task_nputrainjob=`created NPU training task <a href="%s/modelarts/train-job/%s">%s</a>`
task_inferencejob=`created reasoning task <a href="%s/modelarts/inference-job/%s">%s</a>`
task_c2net_gpu_inferencejob=`created GPU type inference task <a href="%s/grampus/inference-job/%s">%s</a>`
task_c2net_gpgpu_iluvatar_trainjob=`created ILUVATAR-GPGPU type training task <a href="%s/grampus/train-job/%s">%s</a>`
task_c2net_gpgpu_iluvatar_inferencejob=`created ILUVATAR-GPGPU type inference task <a href="%s/grampus/inference-job/%s">%s</a>`
task_benchmark=`created profiling task <a href="%s/cloudbrain/benchmark/%s">%s</a>`
task_createmodel=`created new model <a href="%s/modelmanage/model_readme_tmpl?name=%s">%s</a>`


+ 1
- 0
options/locale/locale_zh-CN.ini View File

@@ -3255,6 +3255,7 @@ task_c2net_cpusupercomputejob=`创建了CPU类型超算任务 <a href="%s/superc
task_nputrainjob=`创建了NPU类型训练任务 <a href="%s/modelarts/train-job/%s">%s</a>`
task_inferencejob=`创建了推理任务 <a href="%s/modelarts/inference-job/%s">%s</a>`
task_c2net_gpu_inferencejob=`创建了GPU类型推理任务 <a href="%s/grampus/inference-job/%s">%s</a>`
task_c2net_gpgpu_iluvatar_trainjob=`创建了ILUVATAR-GPGPU类型训练任务 <a href="%s/grampus/train-job/%s">%s</a>`
task_c2net_gpgpu_iluvatar_inferencejob=`创建了ILUVATAR-GPGPU类型推理任务 <a href="%s/grampus/inference-job/%s">%s</a>`
task_benchmark=`创建了评测任务 <a href="%s/cloudbrain/benchmark/%s">%s</a>`
task_createmodel=`导入了新模型 <a href="%s/modelmanage/model_readme_tmpl?name=%s">%s</a>`


+ 5
- 3
public/home/home.js View File

@@ -243,7 +243,7 @@ document.onreadystatechange = function () {
html += recordPrefix + actionName;
html += " <a href=\"" + getRepoLink(record) + "\" rel=\"nofollow\">" + getRepotext(record) + "</a>"
}
else if(record.OpType == "24" || record.OpType == "26" || record.OpType == "27" || record.OpType == "28" || record.OpType == "50" || record.OpType == "51"
else if(record.OpType == "24" || record.OpType == "26" || record.OpType == "27" || record.OpType == "28" || record.OpType == "50" || record.OpType == "51"
|| record.OpType == "30" || record.OpType == "31" || record.OpType == "32" || record.OpType == "33" || record.OpType == "42" || record.OpType == "44"){
html += recordPrefix + actionName;
const taskLink = getTaskLink(record);
@@ -255,7 +255,7 @@ document.onreadystatechange = function () {
}
else if(record.OpType == "25" || record.OpType == "29" || record.OpType == "39" || record.OpType == "40" || record.OpType == "41"
|| record.OpType == "43"|| record.OpType == "44"|| record.OpType == "45"|| record.OpType == "46"|| record.OpType == "47"
|| record.OpType == "48"|| record.OpType == "49"
|| record.OpType == "48"|| record.OpType == "49" || record.OpType == "52"
){
html += recordPrefix + actionName;
const taskLink = getTaskLink(record);
@@ -332,7 +332,7 @@ function getTaskLink(record){
} else {
re = '';
}
}else if(record.OpType == 32 || record.OpType == 33 || record.OpType == 42 || record.OpType == 44){
}else if(record.OpType == 32 || record.OpType == 33 || record.OpType == 42 || record.OpType == 44 || record.OpType == 52){
if (record.Cloudbrain) {
re = re + "/grampus/train-job/" + record.Cloudbrain.ID;
} else {
@@ -518,6 +518,7 @@ var actionNameZH={
"47":"创建了CPU类型超算任务",
"48":"创建了ILUVATAR-GPGPU类型调试任务",
"49":"创建了METAX-GPGPU类型调试任务",
"52":"创建了ILUVATAR-GPGPU类型训练任务",
};

var actionNameEN={
@@ -562,6 +563,7 @@ var actionNameEN={
"47":" created CPU type super compute task ",
"48":" created ILUVATAR-GPGPU type debugging task ",
"49":" created METAX-GPGPU type debugging task ",
"52":" created ILUVATAR-GPGPU type training task ",
};

var repoAndOrgZH={


+ 12
- 41
routers/repo/grampus.go View File

@@ -66,6 +66,9 @@ const (
tplGrampusNotebookMLUNew base.TplName = "repo/grampus/notebook/mlu/new"
tplGrampusTrainJobMLUNew base.TplName = "repo/grampus/trainjob/mlu/new"

//IluvatarGPGPU
tplGrampusTrainJobIluvatarGPGPUNew base.TplName = "repo/grampus/trainjob/iluvatar-gpgpu/new"

//C2NET notebook
tplGrampusNotebookNew base.TplName = "repo/grampus/notebook/new"

@@ -87,65 +90,33 @@ func GrampusInferenceShow(ctx *context.Context) {
// GrampusNotebookNew renders the C2Net notebook creation page.
//
// It flags the page as a cloudbrain page and serves the
// tplGrampusNotebookNew template. The per-processor-type branching that was
// previously kept here as commented-out code has been removed: it was never
// executed, and version control preserves it if it is needed again.
func GrampusNotebookNew(ctx *context.Context) {
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusNotebookNew)
}

// GrampusTrainJobGPUNew renders the tplGrampusTrainJobGPUNew template for
// creating a GPU training job.
//
// Page data is prepared by grampusTrainJobNewDataPrepare for
// grampus.ProcessorTypeGPU. If that step fails, a server error is reported
// and the template is not rendered.
func GrampusTrainJobGPUNew(ctx *context.Context) {
	ctx.Data["IsCreate"] = true

	if prepareErr := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU); prepareErr != nil {
		ctx.ServerError("get new train-job info failed", prepareErr)
		return
	}

	// Only flag the page once the data preparation has succeeded.
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
}

// GrampusTrainJobNPUNew renders the tplGrampusTrainJobNPUNew template for
// creating an NPU training job.
//
// Page data is prepared by grampusTrainJobNewDataPrepare for
// grampus.ProcessorTypeNPU. If that step fails, a server error is reported
// and the template is not rendered.
func GrampusTrainJobNPUNew(ctx *context.Context) {
	ctx.Data["IsCreate"] = true

	if err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU); err != nil {
		ctx.ServerError("get new train-job info failed", err)
		return
	}

	ctx.Data["PageIsCloudBrain"] = true
	// Use the named constant, as the sibling handlers do, instead of a bare 200.
	ctx.HTML(http.StatusOK, tplGrampusTrainJobNPUNew)
}

// GrampusTrainJobGCUNew renders the tplGrampusTrainJobGCUNew template for
// creating a GCU training job.
//
// Page data is prepared by grampusTrainJobNewDataPrepare for
// grampus.ProcessorTypeGCU. If that step fails, a server error is reported
// and the template is not rendered.
func GrampusTrainJobGCUNew(ctx *context.Context) {
	ctx.Data["IsCreate"] = true

	if prepareErr := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGCU); prepareErr != nil {
		ctx.ServerError("get new train-job info failed", prepareErr)
		return
	}

	// Only flag the page once the data preparation has succeeded.
	ctx.Data["PageIsCloudBrain"] = true
	ctx.HTML(http.StatusOK, tplGrampusTrainJobGCUNew)
}

// GrampusTrainJobIluvatarGPGPUNew renders the
// tplGrampusTrainJobIluvatarGPGPUNew template for creating an ILUVATAR-GPGPU
// training job.
//
// No server-side data preparation happens here: the handler only sets the
// "PageIsCloudBrain" and "IsCreate" page flags before rendering.
func GrampusTrainJobIluvatarGPGPUNew(ctx *context.Context) {
	// The two flags are independent entries in ctx.Data.
	ctx.Data["PageIsCloudBrain"] = true
	ctx.Data["IsCreate"] = true

	ctx.HTML(http.StatusOK, tplGrampusTrainJobIluvatarGPGPUNew)
}

func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) {
ctx.Data["IsCreate"] = true
displayJobName := form.DisplayJobName


+ 5
- 0
routers/routes/routes.go View File

@@ -453,6 +453,8 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/my_list", card_request.GetMyCardRequestList)
m.Get("/admin_list", operationReq, card_request.GetAdminCardRequestList)
m.Get("/specification/list", operationReq, admin.GetAllResourceSpecificationList)
m.Get("/resources/queue/centers", operationReq, admin.GetResourceAiCenters)
m.Get("/resources/queue/codes", operationReq, admin.GetResourceQueueCodes)
m.Put("/update/:id", binding.Bind(structs.CardReq{}), card_request.UpdateCardRequest)
m.Put("/admin/update/:id", operationReq, bindIgnErr(structs.CardReq{}), card_request.UpdateCardRequestAndSpec)

@@ -1388,6 +1390,9 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.GrampusTrainJobGCUNew)
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateGrampusTrainJobForm{}), context.PointAccount(), repo.GrampusTrainJobGcuCreate)
})
m.Group("/iluvatar-gpgpu", func() {
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, context.PointAccount(), repo.GrampusTrainJobIluvatarGPGPUNew)
})
})

m.Group("/inference-job", func() {


+ 4
- 1
services/ai_task_service/task/grampus_train_task.go View File

@@ -1,6 +1,8 @@
package task

import (
"strings"

"code.gitea.io/gitea/entity"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
@@ -9,7 +11,6 @@ import (
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/ai_task_service/context"
"strings"
)

type GrampusTrainTaskTemplate struct {
@@ -95,6 +96,8 @@ func GetGrampusTrainTaskConfig(opts entity.AITaskConfigKey) *entity.AITaskBaseCo
config.ActionType = models.ActionCreateGrampusGPUTrainTask
case models.GCU:
config.ActionType = models.ActionCreateGrampusGCUTrainTask
case models.ILUVATAR:
config.ActionType = models.ActionCreateGrampusILUVATARTrainTask
}
config.IsActionUseJobId = true
return config


+ 1
- 1
services/socketwrap/clientManager.go View File

@@ -10,7 +10,7 @@ import (
"github.com/elliotchance/orderedmap"
)

var opTypes = []int{1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51}
// opTypes is a list of integer operation-type codes; 52 is the newest entry.
// NOTE(review): the values look like the action OpType codes used by the
// dashboard feed (52 being the ILUVATAR-GPGPU training task) — how this list
// is consumed is not visible here, confirm against its users.
var opTypes = []int{1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52}

type ClientsManager struct {
Clients *orderedmap.OrderedMap


+ 1
- 0
templates/repo/grampus/trainjob/iluvatar-gpgpu/new.tmpl View File

@@ -0,0 +1 @@
{{ template "repo/cloudbrain/cloudbraincreate" .}}

+ 8
- 0
templates/user/dashboard/feeds.tmpl View File

@@ -191,6 +191,12 @@
{{$.i18n.Tr "action.task_c2ent_mlutrainjob" .GetRepoLink (Printf "%d" .Cloudbrain.ID) .RefName | Str2html}}
{{else}}
{{$.i18n.Tr "action.task_c2ent_mlutrainjob" "" "" "" | Str2html}}<span style="">{{.RefName}}{{$.i18n.Tr "repo.issues.deleted_milestone"}}</span>
{{end}}
{{else if eq .GetOpType 52}}
{{if .Cloudbrain}}
{{$.i18n.Tr "action.task_c2net_gpgpu_iluvatar_trainjob" .GetRepoLink (Printf "%d" .Cloudbrain.ID) .RefName | Str2html}}
{{else}}
{{$.i18n.Tr "action.task_c2net_gpgpu_iluvatar_trainjob" "" "" "" | Str2html}}<span style="">{{.RefName}}{{$.i18n.Tr "repo.issues.deleted_milestone"}}</span>
{{end}}
{{else if eq .GetOpType 45}}
{{$.i18n.Tr "action.task_c2ent_onlineinferjob" .GetRepoLink .Content .RefName | Str2html}}
@@ -261,6 +267,8 @@
<span class="text grey"><i class="ri-haze-2-line icon big"></i></span>
{{else if eq .GetOpType 51}}
<span class="text grey"><i class="ri-haze-2-line icon big"></i></span>
{{else if eq .GetOpType 52}}
<span class="text grey"><i class="ri-voice-recognition-line icon big"></i></span>
{{else if eq .GetOpType 29}}
<span class="text grey"><i class="ri-vip-crown-line icon big"></i></span>
{{else if eq .GetOpType 30}}


+ 59
- 2
web_src/vuepages/pages/cloudbrain/configs.js View File

@@ -286,7 +286,7 @@ export const CreatePageConfigs = {
}],
'C2Net': [{
url: 'grampus/train-job/gpu/create',
computerResouces: ['GPU', 'NPU', 'GCU'],
computerResouces: ['GPU', 'NPU', 'GCU', 'ILUVATAR-GPGPU'],
'GPU': [{
url: 'grampus/train-job/gpu/create',
clusterType: 'C2Net',
@@ -380,6 +380,28 @@ export const CreatePageConfigs = {
},
modify: { showIsContinue: false, },
}],
// Create-page entry for the 'ILUVATAR-GPGPU' compute resource on the C2Net cluster.
'ILUVATAR-GPGPU': [{
// Relative URL of the create page for this resource type.
url: 'grampus/train-job/iluvatar-gpgpu/create',
clusterType: 'C2Net',
// Translated hint text; the parameters supply the code, dataset and model paths.
tips2: i18n.t('cloudbrainObj.pathTips3', {
code: '/code',
dataset: '/dataset',
model: '/pretrainmodel',
}),
// Form fields of the create page and whether each one is required.
form: {
taskName: { required: true, },
taskDescr: { required: false, },
branchName: { required: true, },
model: { required: false, multiple: false },
// NOTE(review): relatedSpec presumably ties the image choice to the selected spec — confirm.
imagev2: { required: true, relatedSpec: true },
// The sample URL for the boot file is left empty for this resource type.
bootFile: { required: true, sampleUrl: '' },
dataset: { required: true },
runParameters: { required: false },
networkType: { required: true },
spec: { required: true },
},
// NOTE(review): presumably hides the "continue" option when modifying a task — confirm.
modify: { showIsContinue: false, },
}],
}],
}],
// 推理任务
@@ -584,7 +606,7 @@ export const ListPageConfigs = {
emptyTip3: i18n.t('cloudbrainObj.debugTaskEmptyTip3', { url: 'https://openi.pcl.ac.cn/docs/index.html#/cloudbrain/debug/debug' }),
}, {
jobType: 'TRAIN',
sortList: getSortList(['', 'GPU', 'NPU', 'GCU']),
sortList: getSortList(['', 'GPU', 'NPU', 'GCU', 'ILUVATAR-GPGPU']),
jobTypeName: getListValueWithKey(JOB_TYPE, 'TRAIN'),
url: 'modelarts/train-job',
createUrl: 'grampus/train-job/npu/create',
@@ -1071,6 +1093,41 @@ export const DetailPageConfigs = {
name: 'resultDownload'
}],
}],
// Detail-page entry for the 'ILUVATAR-GPGPU' compute resource.
'ILUVATAR-GPGPU': [{
// Relative URL prefix of the task detail page.
detailUrl: 'grampus/train-job/',
summary: [],
// Operations offered on the detail page.
operations: ['saveModel', 'exportDataset'],
// Tabs of the detail page, in the order listed.
tabs: [{
name: 'configInfo',
// Field keys shown on the configuration tab.
// NOTE(review): the keys appear to be laid out two per row — confirm with the renderer.
fields: [
'taskName', 'imagev2',
'status', 'spec',
'creator', 'aiCenter',
'branch', 'hasInternet',
'computerRes', 'modelName',
'runVersion', 'modelVersion',
'createTime', 'modelFiles',
'startTime', 'bootFile',
'endTime', 'runParameters',
'duration', 'workServerNum',
// NOTE(review): the empty string looks like a placeholder that keeps the pairing aligned — confirm.
'descr', '',
'failedReason',
'dataset',
'modelList',
]
}, {
name: 'operationProfile'
}, {
// Log tab; flagged as multi-node and non-scrolling.
name: 'logs',
noScroll: true,
multiNodes: true,
}, {
// Resource-usage tab; flagged as multi-node.
name: 'resourceUseage',
multiNodes: true,
}, {
name: 'resultDownload'
}],
}],
}]
}],
// 推理任务


Loading…
Cancel
Save