#217 #64,#182

Merged
liwei03 merged 14 commits from openioctopus/octopus:master into master 2 years ago
  1. +7
    -1
      admin-portal/src/views/dataManager/components/versionList.vue
  2. +7
    -1
      admin-portal/src/views/devManager/components/algorithm/versionList.vue
  3. +4
    -0
      admin-portal/src/views/modelManager/components/versionList.vue
  4. +5
    -0
      admin-portal/src/views/platformManager/platformTrainingTaskList.vue
  5. +26
    -11
      admin-portal/src/views/traningManager/components/taskInfo.vue
  6. +4
    -0
      openai-portal/src/views/dataManager/components/versionList.vue
  7. +7
    -1
      openai-portal/src/views/modelDev/components/algorithm/versionList.vue
  8. +26
    -11
      openai-portal/src/views/trainingManager/components/detailDialog/taskInfo.vue
  9. +4
    -4
      server/admin-server/api/v1/develop.proto
  10. +12
    -10
      server/admin-server/api/v1/platform.proto
  11. +4
    -4
      server/admin-server/api/v1/trainJob.proto
  12. +4
    -4
      server/base-server/api/v1/develop.proto
  13. +12
    -10
      server/base-server/api/v1/platformtrainJob.proto
  14. +4
    -4
      server/base-server/api/v1/trainJob.proto
  15. +1
    -1
      server/base-server/internal/data/dao/platform/platform_train_job.go
  16. +8
    -2
      server/base-server/internal/data/dao/train_job.go
  17. +4
    -4
      server/base-server/internal/service/lable/lable.go
  18. +20
    -7
      server/base-server/internal/service/platform/train_job.go
  19. +9
    -1
      server/common/utils/protoc.go
  20. +4
    -4
      server/openai-server/api/v1/develop.proto
  21. +4
    -4
      server/openai-server/api/v1/trainJob.proto
  22. +2
    -2
      server/platform-server/api/v1/trainJob.proto

+ 7
- 1
admin-portal/src/views/dataManager/components/versionList.vue View File

@@ -208,4 +208,10 @@
}
}
}
</script>
</script>
<style lang="scss" scoped>
.block {
float: right;
margin: 20px;
}
</style>

+ 7
- 1
admin-portal/src/views/devManager/components/algorithm/versionList.vue View File

@@ -284,4 +284,10 @@
}

}
</script>
</script>
<style lang="scss" scoped>
.block {
float: right;
margin: 20px;
}
</style>

+ 4
- 0
admin-portal/src/views/modelManager/components/versionList.vue View File

@@ -274,4 +274,8 @@
}
</script>
<style lang="scss" scoped>
.block {
float: right;
margin: 20px;
}
</style>

+ 5
- 0
admin-portal/src/views/platformManager/platformTrainingTaskList.vue View File

@@ -11,6 +11,11 @@
<span>{{ scope.row.name }}</span>
</template>
</el-table-column>
<el-table-column label="平台名称" align="center">
<template slot-scope="scope">
<span>{{ scope.row.platformName }}</span>
</template>
</el-table-column>
<el-table-column label="镜像" align="center">
<template slot-scope="scope">
<span>{{ scope.row.image.name+":"+scope.row.image.version }}</span>


+ 26
- 11
admin-portal/src/views/traningManager/components/taskInfo.vue View File

@@ -9,7 +9,7 @@
</el-col>
</el-row>
<el-row>
<el-col v-if="data.isDistributed" :span="12">
<el-col v-if="showInfo" :span="12">
<el-form ref="ruleForm" :model="ruleForm">
<el-form-item prop="subTaskItem">
<div style="font-size: 15px">子任务名:
@@ -45,7 +45,6 @@

<div class="block">
<el-pagination
v-if="showInfo"
:current-page="pageIndex"
:page-sizes="[10, 20, 50, 80]"
:page-size="pageSize"
@@ -94,10 +93,7 @@
})
}
}
if (!this.data.isDistributed) {
this.isDistributed = !this.data.isDistributed
this.selectedSubTaskOption()
}
this.selectedSubTaskOption()
},
methods: {
selectedSubTaskOption() {
@@ -110,7 +106,7 @@
}
getTempalteInfo(param).then(response => {
if (response.success) {
this.showInfo = response.payload.jobEvents.length
this.showInfo = !this.data.isDistributed ? this.data.isDistributed : response.payload.jobEvents.length
this.total = response.payload.totalSize
let infoMessage = ""
response.payload.jobEvents.forEach(function(element) {
@@ -119,12 +115,31 @@
infoMessage += "\n" + "[" + title + "]"
infoMessage += "\n" + "[" + message + "]" + "\n"
})
this.ruleForm.subTaskItem = this.row.config[0].replicaStates[0].key
this.subTaskInfo = infoMessage
} else {
this.$message({
message: "暂无相关运行信息",
type: 'warning'
});
const data = {
id: this.row.id,
pageIndex: this.pageIndex,
pageSize: this.pageSize,
taskIndex: 0,
replicaIndex: 0
}
getTempalteInfo(data).then(response => {
if (response.success) {
this.total = response.payload.totalSize
let infoMessage = ""
response.payload.jobEvents.forEach(function(element) {
const title = element.reason
const message = element.message
infoMessage += "\n" + "[" + title + "]"
infoMessage += "\n" + "[" + message + "]" + "\n"
})
this.subTaskInfo = infoMessage
} else {
this.subTaskInfo = "暂无相关运行信息"
}
})
}
}).catch(err => {
console.log("err:", err)


+ 4
- 0
openai-portal/src/views/dataManager/components/versionList.vue View File

@@ -300,4 +300,8 @@
</script>

<style lang="scss" scoped>
.block {
float: right;
margin: 20px;
}
</style>

+ 7
- 1
openai-portal/src/views/modelDev/components/algorithm/versionList.vue View File

@@ -393,4 +393,10 @@
}
}
}
</script>
</script>
<style lang="scss" scoped>
.block {
float: right;
margin: 20px;
}
</style>

+ 26
- 11
openai-portal/src/views/trainingManager/components/detailDialog/taskInfo.vue View File

@@ -9,7 +9,7 @@
</el-col>
</el-row>
<el-row>
<el-col v-if="data.isDistributed" :span="12">
<el-col v-if="showInfo" :span="12">
<el-form ref="ruleForm" :model="ruleForm">
<el-form-item prop="subTaskItem">
<div style="font-size: 15px">子任务名:
@@ -45,7 +45,6 @@

<div class="block">
<el-pagination
v-if="showInfo"
:current-page="pageIndex"
:page-sizes="[10, 20, 50, 80]"
:page-size="pageSize"
@@ -94,10 +93,7 @@
})
}
}
if (!this.data.isDistributed) {
this.isDistributed = !this.data.isDistributed
this.selectedSubTaskOption()
}
this.selectedSubTaskOption()
},
methods: {
selectedSubTaskOption() {
@@ -110,7 +106,7 @@
}
getTempalteInfo(param).then(response => {
if (response.success) {
this.showInfo = response.payload.jobEvents.length
this.showInfo = !this.data.isDistributed ? this.data.isDistributed : response.payload.jobEvents.length
this.total = response.payload.totalSize
let infoMessage = ""
response.payload.jobEvents.forEach(function(element) {
@@ -119,12 +115,31 @@
infoMessage += "\n" + "[" + title + "]"
infoMessage += "\n" + "[" + message + "]" + "\n"
})
this.ruleForm.subTaskItem = this.row.config[0].replicaStates[0].key
this.subTaskInfo = infoMessage
} else {
this.$message({
message: "暂无相关运行信息",
type: 'warning'
});
const data = {
id: this.row.id,
pageIndex: this.pageIndex,
pageSize: this.pageSize,
taskIndex: 0,
replicaIndex: 0
}
getTempalteInfo(data).then(response => {
if (response.success) {
this.total = response.payload.totalSize
let infoMessage = ""
response.payload.jobEvents.forEach(function(element) {
const title = element.reason
const message = element.message
infoMessage += "\n" + "[" + title + "]"
infoMessage += "\n" + "[" + message + "]" + "\n"
})
this.subTaskInfo = infoMessage
} else {
this.subTaskInfo = "暂无相关运行信息"
}
})
}
}).catch(err => {
console.log("err:", err)


+ 4
- 4
server/admin-server/api/v1/develop.proto View File

@@ -88,10 +88,10 @@ message NotebookEventListRequest {
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lt:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lt:100}];
}

message NotebookEventListReply {


+ 12
- 10
server/admin-server/api/v1/platform.proto View File

@@ -304,25 +304,27 @@ message PlatformTrainJob{
string name = 2;
//platformId
string platformId = 3;
//platformName
string platformName = 4;
//job描述
string desc = 4;
string desc = 5;
//数据集
repeated PlatformDataset datasets = 5;
repeated PlatformDataset datasets = 6;
//镜像
PlatformImage image = 6;
PlatformImage image = 7;
//子任务配置信息
repeated PlatformTask tasks = 7;
repeated PlatformTask tasks = 8;
//创建时间
int64 createdAt = 8;
int64 createdAt = 9;
//更新时间
int64 updatedAt = 9;
int64 updatedAt = 10;
//任务状态
string status = 10;
string status = 11;
//job完成时间
int64 completedAt = 11;
int64 completedAt = 12;
//运行时
int64 runSec = 12;
int64 runSec = 13;
//启动时间
int64 startedAt = 13;
int64 startedAt = 14;
}


+ 4
- 4
server/admin-server/api/v1/trainJob.proto View File

@@ -189,10 +189,10 @@ message JobEventListRequest {
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lt:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lt:100}];
}

message JobEventListReply {


+ 4
- 4
server/base-server/api/v1/develop.proto View File

@@ -124,10 +124,10 @@ message NotebookEventListRequest {
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lt:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lt:100}];
}

message NotebookEventListReply {


+ 12
- 10
server/base-server/api/v1/platformtrainJob.proto View File

@@ -147,26 +147,28 @@ message PlatformTrainJob{
string name = 2;
//platformId
string platformId = 3;
//platformName
string platformName = 4;
//job描述
string desc = 4;
string desc = 5;
//数据集
repeated PlatformDataset datasets = 5;
repeated PlatformDataset datasets = 6;
//镜像
PlatformImage image = 6;
PlatformImage image = 7;
//子任务配置信息
repeated PlatformTask tasks = 7;
repeated PlatformTask tasks = 8;
//创建时间
int64 createdAt = 8;
int64 createdAt = 9;
//更新时间
int64 updatedAt = 9;
int64 updatedAt = 10;
//任务状态
string status = 10;
string status = 11;
//job完成时间
int64 completedAt = 11;
int64 completedAt = 12;
//运行时
int64 runSec = 12;
int64 runSec = 13;
//启动时间
int64 startedAt = 13;
int64 startedAt = 14;
}




+ 4
- 4
server/base-server/api/v1/trainJob.proto View File

@@ -245,10 +245,10 @@ message JobEventListRequest {
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lt:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lt:100}];
}

message JobEventListReply {


+ 1
- 1
server/base-server/internal/data/dao/platform/platform_train_job.go View File

@@ -128,7 +128,7 @@ func (d *platformTrainJobDao) GetTrainJobList(ctx context.Context, query *model.
}

if query.OrderBy != "" {
sortBy = query.OrderBy
orderBy = query.OrderBy
}

db = db.Order(fmt.Sprintf("%s %s", sortBy, orderBy))


+ 8
- 2
server/base-server/internal/data/dao/train_job.go View File

@@ -366,7 +366,12 @@ func (d *trainJobDao) GetTrainJobEvents(jobEventQuery *model.JobEventQuery) ([]*
ReplicaIndex := jobEventQuery.ReplicaIndex
events := make([]*model.TrainJobEvent, 0)

objectName := fmt.Sprintf("%s-task%d-%d", jobEventQuery.Id, TaskIndex-1, ReplicaIndex-1)
objectName := ""
if TaskIndex > 0 && ReplicaIndex > 0 {
objectName = fmt.Sprintf("%s-task%d-%d", jobEventQuery.Id, TaskIndex-1, ReplicaIndex-1)
} else {
objectName = jobEventQuery.Id
}

countQuery := fmt.Sprintf("SELECT COUNT(%s) FROM octopus..events where object_name = '%s'", keyMessage, objectName)
res, err := d.influxdb.Query(countQuery)
@@ -384,8 +389,9 @@ func (d *trainJobDao) GetTrainJobEvents(jobEventQuery *model.JobEventQuery) ([]*
return events, 0, errors.Errorf(err, errors.ErroInfluxdbFindFailed)
}

query := fmt.Sprintf("select %s, %s, %s from octopus..events where object_name = '%s' and kind = 'Pod' LIMIT %d OFFSET %d",
query := fmt.Sprintf("select %s, %s, %s from octopus..events where object_name = '%s' LIMIT %d OFFSET %d",
keyName, keyReason, keyMessage, objectName, PageSize, (PageIndex-1)*PageSize)

res, err = d.influxdb.Query(query)

if err != nil {


+ 4
- 4
server/base-server/internal/service/lable/lable.go View File

@@ -17,12 +17,12 @@ import (

var LABLE_DEFAULT_DESC = map[api.Relegation]map[api.Type][]string{
api.Relegation_LABLE_RELEGATION_DATASET: {
api.Type_LABLE_TYPE_DATASET_TYPE: {"图像", "视频", "音频", "文本"},
api.Type_LABLE_TYPE_DATASET_APPLY: {"图像分类", "目标检测", "目标跟踪", "语义分割", "文本分类", "中文分词", "音频分类", "数据增强"},
api.Type_LABLE_TYPE_DATASET_TYPE: {"图像", "视频", "音频", "文本", "其他"},
api.Type_LABLE_TYPE_DATASET_APPLY: {"图像分类", "目标检测", "目标跟踪", "语义分割", "文本分类", "中文分词", "音频分类", "数据增强", "其他"},
},
api.Relegation_LABLE_RELEGATION_ALGORITHM: {
api.Type_LABLE_TYPE_ALGORITHM_APPLY: {"图像分类", "目标检测", "目标跟踪", "语义分割", "文本分类", "中文分词", "音频分类", "模型优化"},
api.Type_LABLE_TYPE_ALGORITHM_FRAMEWORK: {"TensorFlow", "Pytorch", "MindSpore", "Keras"},
api.Type_LABLE_TYPE_ALGORITHM_APPLY: {"图像分类", "目标检测", "目标跟踪", "语义分割", "文本分类", "中文分词", "音频分类", "模型优化", "其他"},
api.Type_LABLE_TYPE_ALGORITHM_FRAMEWORK: {"TensorFlow", "Pytorch", "MindSpore", "Keras", "其他"},
},
}



+ 20
- 7
server/base-server/internal/service/platform/train_job.go View File

@@ -588,7 +588,11 @@ func (s *platformTrainJobService) TrainJobList(ctx context.Context, req *api.Pla
if err != nil {
return nil, err
}

platform, err := s.getPlatformInfo(ctx, job.PlatformId)
if err != nil {
return nil, err
}
trainJob.PlatformName = platform.Name
trainJobs = append(trainJobs, trainJob)
}

@@ -1032,15 +1036,10 @@ func (s *platformTrainJobService) updatePlatfromJobStatus(ctx context.Context, p
return err
}
if url, ok := reply.Config[common.PlatformKeyJobStatusCallbackAddr]; ok {
platformReply, err := s.platformService.BatchGetPlatform(ctx, &api.BatchGetPlatformRequest{Ids: []string{platformId}})
platform, err := s.getPlatformInfo(ctx, platformId)
if err != nil {
return err
}
if len(platformReply.Platforms) <= 0 {
s.log.Info(ctx, "updatePlatfromJobStatus failed, cannot find platform ClientSecret:"+info.JobId)
return errors.Errorf(err, errors.ErrorDBFindEmpty)
}
platform := platformReply.Platforms[0]
err = s.data.Platform.UpdateJobStatus(ctx, url, platform.ClientSecret, info)
if err != nil {
return err
@@ -1050,3 +1049,17 @@ func (s *platformTrainJobService) updatePlatfromJobStatus(ctx context.Context, p
}
return nil
}

// getPlatformInfo looks up a single platform by id through
// platformService.BatchGetPlatform and returns the first entry of the reply.
//
// It returns the lookup error unchanged when BatchGetPlatform fails, and an
// ErrorDBFindEmpty error when the reply contains no platforms.
func (s *platformTrainJobService) getPlatformInfo(ctx context.Context, platformId string) (*api.Platform, error) {
	platformReply, err := s.platformService.BatchGetPlatform(ctx, &api.BatchGetPlatformRequest{Ids: []string{platformId}})
	if err != nil {
		return nil, err
	}
	if len(platformReply.Platforms) == 0 {
		// This helper has more than one caller, so the log names the helper
		// itself and the id that could not be resolved instead of a single
		// caller's operation.
		s.log.Info(ctx, "getPlatformInfo failed, cannot find platform:"+platformId)
		// NOTE(review): err is always nil here (the non-nil case returned
		// above), so no underlying cause is attached to the returned error —
		// confirm errors.Errorf is meant to be called with a nil cause.
		return nil, errors.Errorf(err, errors.ErrorDBFindEmpty)
	}
	return platformReply.Platforms[0], nil
}

+ 9
- 1
server/common/utils/protoc.go View File

@@ -8,6 +8,7 @@ import (
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"

jsonpatch "github.com/evanphx/json-patch"
@@ -162,6 +163,8 @@ func GenSwagger() error {
"./",
"--openapiv2_opt",
"logtostderr=true",
"--openapiv2_opt",
"enums_as_ints=true",
name,
}

@@ -210,7 +213,12 @@ func GenSwagger() error {
}

swaggerStr := strings.ReplaceAll(string(swaggerBytes), `,"default":{"description":"An unexpected error response.","schema":{"$ref":"#/definitions/rpcStatus"}}`, "")

reg := regexp.MustCompile(`{[^{]*"format":"(int64|uint64)"[\s\S]*?"type":"string"[^}]*}`)
swaggerStr = reg.ReplaceAllStringFunc(swaggerStr, func(s string) string { // proto json序列化64位序列化为字符串,octopus用的是标准库json序列化,这个修改为整型
s = strings.ReplaceAll(s, `"type":"string"`, `"type":"number"`)
r := regexp.MustCompile(`"format":"(int64|uint64)",`)
return r.ReplaceAllString(s, "")
})
err = ioutil.WriteFile(filepath.Join(dir, swaggerFileName), []byte(swaggerStr), 0755)
if err != nil {
return err


+ 4
- 4
server/openai-server/api/v1/develop.proto View File

@@ -149,10 +149,10 @@ message NotebookEventListRequest {
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lt:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lt:100}];
}

message NotebookEventListReply {


+ 4
- 4
server/openai-server/api/v1/trainJob.proto View File

@@ -385,10 +385,10 @@ message JobEventListRequest {
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
//子任务序号,从1开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:1,lt:100}];
//副本序号,从1开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:1,lt:100}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lt:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lt:100}];
}

message JobEventListReply {


+ 2
- 2
server/platform-server/api/v1/trainJob.proto View File

@@ -140,9 +140,9 @@ message StopJobReply {
}

message TrainJobListRequest{
// 页码,从1开始,必填
// 页码,从1开始,必填
int64 pageIndex = 1[(validate.rules).int64 = {gte:1}];
// 页大小,最小1条,最大100条,必填
// 页大小,最小1条,最大100条,必填
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lt:100}];
// 分组依据,非必填
string sortBy = 3;


Loading…
Cancel
Save