#728 master

Merged
yangxzh1 merged 35 commits from openioctopus/octopus:master into master 1 year ago
  1. +3
    -2
      .golangci.yaml
  2. +3
    -0
      admin-portal/src/components/search/index.vue
  3. +1
    -3
      admin-portal/src/views/modelManager/components/createDialog.vue
  4. +14
    -4
      admin-portal/src/views/timeManager/component/time.vue
  5. +3
    -4
      admin-portal/src/views/userManager/components/addDialog.vue
  6. +6
    -6
      admin-portal/src/views/userManager/user.vue
  7. +6
    -6
      deploy/charts/octopus/templates/grafana.yaml
  8. +65
    -0
      deploy/charts/octopus/templates/xilinx-device-plugin.yaml
  9. +7
    -2
      deploy/charts/octopus/values.yaml
  10. +8
    -0
      deploy/single_master_k8s_install/comm.sh
  11. +2
    -0
      deploy/single_master_k8s_install/master_install_k8s.sh
  12. +2
    -0
      deploy/single_master_k8s_install/node_install_k8s.sh
  13. +1
    -1
      openai-portal/src/permission.js
  14. +1
    -3
      openai-portal/src/views/cloudInterconnection/trainingTaskCreate.vue
  15. +2
    -1
      openai-portal/src/views/trainingManager/components/createDialog/traningList.vue
  16. +12
    -23
      openai-portal/src/views/trainingManager/components/editDialog/traningList.vue
  17. +2
    -2
      openai-portal/vue.config.js
  18. +40
    -0
      server/73pipeline
  19. +2
    -0
      server/admin-server/api/v1/billing.proto
  20. +1
    -0
      server/admin-server/api/v1/user.proto
  21. +12
    -1
      server/admin-server/internal/service/billing.go
  22. +13
    -0
      server/admin-server/internal/service/user.go
  23. +3
    -1
      server/base-server/api/v1/develop.proto
  24. +1
    -0
      server/base-server/api/v1/trainJob.proto
  25. +1
    -1
      server/base-server/internal/data/dao/model/user.go
  26. +26
    -2
      server/base-server/internal/service/develop/develop.go
  27. +36
    -2
      server/base-server/internal/service/trainjob/train_job.go
  28. +6
    -0
      server/common/api/v1/common.proto
  29. +3
    -1
      server/openai-server/api/v1/develop.proto
  30. +2
    -0
      server/openai-server/api/v1/trainJob.proto

+ 3
- 2
.golangci.yaml View File

@@ -27,7 +27,7 @@ linters:
- gosimple # 简化代码工具
- govet # 检查Go 代码正确性的工具
- lll # 每行代码长度检查,不超过150字符
- typecheck # 类型检查
# - typecheck # 类型检查 #误报,暂时屏蔽
#- revive # 不同于fmt,主要检查代码风格问题
#- misspell # 拼写检查,防止线上出现拼写出错导致的bug
#- gomnd # 检测幻数的工具
@@ -75,4 +75,5 @@ run:
- develop.go
- train_job.go
- api/v1/configkey.go
- base-server/internal/data/dao/model_deploy.go
- base-server/internal/data/dao/model_deploy.go
- admin-server/internal/service/billing.go

+ 3
- 0
admin-portal/src/components/search/index.vue View File

@@ -151,6 +151,9 @@
.el-select {
width: 188px;
}
.el-input {
width: 260px !important;
}

.el-select>.el-input {
max-width: 185px !important;


+ 1
- 3
admin-portal/src/views/modelManager/components/createDialog.vue View File

@@ -64,10 +64,8 @@
inserted: function(el, binding) {
const SELECTWRAP_DOM = el.querySelector('.el-select-dropdown .el-select-dropdown__wrap');
SELECTWRAP_DOM.addEventListener('scroll', function() {
const CONDITION = this.scrollHeight - this.scrollTop <= this.clientHeight;
if (CONDITION) {
const CONDITION = this.scrollHeight - this.scrollTop <= this.clientHeight;
binding.value();
}
})
}
}


+ 14
- 4
admin-portal/src/views/timeManager/component/time.vue View File

@@ -3,8 +3,8 @@
<div>
<el-select v-model="userId" filterable :filter-method="getUserOptions" v-loadmore="loadUserName"
@focus='userClick' v-if="type=='user'" placeholder="用户 搜索">
<el-option v-for="op in userOptions" :key="op.id" :label="op.fullName+'('+op.email+')'"
:value="op.id" />
<el-option v-for="op in userOptions" :key="op.id" :label="op.fullName+'('+op.email+')'"
:value="op.id" :title="op.bind.length ? op.bind[0].userName : ''" />
</el-select>
<el-select v-model="spaceId" v-loadmore="loadGroupName"
@focus='groupClick' v-if="type=='group'" placeholder="群组 搜索">
@@ -18,9 +18,19 @@
<el-table-column :label="type==='user'?'用户名称':'群组名称'" align="center">
<template slot-scope="scope">
<el-tooltip trigger="hover" :content="scope.row.userEmail" placement="top">
<span v-if="type==='user'" style="margin-left: 10px">{{ scope.row.userName }}</span>
<span v-if="type==='user'">{{ scope.row.userName }}</span>
</el-tooltip>
<span v-if="type==='group'" style="margin-left: 10px">{{ scope.row.spaceName }}</span>
<span v-if="type==='group'">{{ scope.row.spaceName }}</span>
</template>
</el-table-column>
<el-table-column label="用户邮箱" align="center">
<template slot-scope="scope">
<span>{{ scope.row.userEmail}}</span>
</template>
</el-table-column>
<el-table-column label="第三方账号" align="center">
<template slot-scope="scope">
<span style="margin-left: 10px">{{ scope.row.bind ? scope.row.bind[0].userName : "" }}</span>
</template>
</el-table-column>
<el-table-column label="当前机时剩余(小时)" align="center">


+ 3
- 4
admin-portal/src/views/userManager/components/addDialog.vue View File

@@ -33,7 +33,7 @@
<el-input v-model.trim="ruleForm.fullname" />
</el-form-item>
<el-form-item label="备注" prop="desc" v-if="user" :label-width="formLabelWidth">
<el-input type="textarea" v-model="ruleForm.desc" maxlength="100" show-word-limit="true"></el-input>
<el-input type="textarea" v-model="ruleForm.desc" maxlength="100" :show-word-limit="true"></el-input>
</el-form-item>
</el-form>
<div slot="footer" class="dialog-footer">
@@ -55,9 +55,7 @@
const SELECTWRAP_DOM = el.querySelector('.el-select-dropdown .el-select-dropdown__wrap');
SELECTWRAP_DOM.addEventListener('scroll', function () {
const CONDITION = this.scrollHeight - this.scrollTop <= this.clientHeight;
if (CONDITION) {
binding.value();
}
binding.value();
})
}
}
@@ -296,6 +294,7 @@
this.$emit('close', false)
},
loadUser() {
console.log("OPPO")
this.userCount = this.userCount + 1
if (this.userOptions.length < this.total) {
this.getUserList()


+ 6
- 6
admin-portal/src/views/userManager/user.vue View File

@@ -1,6 +1,6 @@
<template>
<div>
<searchForm :search-form="searchForm" class="searchForm" :blur-name="user?'用户/邮箱 搜索':'群组 搜索'"
<searchForm :search-form="searchForm" class="searchForm" :blur-name="user?'用户/邮箱/第三方账号/备注 搜索':'群组 搜索'"
@searchData="getSearchData" />
<div class="create">
<el-button v-if="user" type="primary" @click="create">创建用户</el-button>
@@ -18,6 +18,11 @@
<span>{{ scope.row.email }}</span>
</template>
</el-table-column>
<el-table-column label="第三方账号" align="center">
<template slot-scope="scope">
<span>{{ scope.row.bind.length ? scope.row.bind[0].userName : "" }}</span>
</template>
</el-table-column>
<el-table-column v-if="user" label="电话" align="center">
<template slot-scope="scope">
<span>{{ scope.row.phone }}</span>
@@ -43,11 +48,6 @@
<span>{{ scope.row.desc }}</span>
</template>
</el-table-column>
<el-table-column label="创建时间" align="center">
<template slot-scope="scope">
<span>{{ scope.row.createdAt | parseTime }}</span>
</template>
</el-table-column>
<el-table-column label="修改时间" align="center">
<template slot-scope="scope">
<span>{{ scope.row.updatedAt | parseTime }}</span>


+ 6
- 6
deploy/charts/octopus/templates/grafana.yaml View File

@@ -2716,7 +2716,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "avg(enflame_gcu_usage)",
"expr": "count(enflame_gcu_usage{pod_name=~\".+\"}) / sum(enflame_gcu_count) * 100 or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -2725,7 +2725,7 @@ data:
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "GCU Avg Usage",
"title": "GCU Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -3134,7 +3134,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "avg(enflame_vgcu_usage)",
"expr": "count(enflame_vgcu_usage{pod_name=~\".+\"}) / sum(enflame_vgcu_count) * 100 or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -3143,7 +3143,7 @@ data:
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "VGCU Avg Usage",
"title": "VGCU Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -3552,7 +3552,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "avg(mlu_usage)",
"expr": "sum(mlu_allocated) / count(mlu_usage) * 100 or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -3561,7 +3561,7 @@ data:
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "MLU Avg Usage",
"title": "MLU Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [


+ 65
- 0
deploy/charts/octopus/templates/xilinx-device-plugin.yaml View File

@@ -0,0 +1,65 @@
#Copyright 2018-2022 Xilinx Corporation. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

{{- if .Values.xilinx.enabled }}
apiVersion: apps/v1
#if run with k8s v1.16-, replace the above line with
#apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: xilinx-device-plugin-daemonset
namespace: kube-system
spec:
#if run with k8s v1.16-, the following 3 lines are not required
selector:
matchLabels:
name: xilinx-device-plugin-ds
updateStrategy:
type: RollingUpdate
template:
metadata:
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ""
labels:
name: xilinx-device-plugin-ds
spec:
tolerations:
- key: CriticalAddonsOnly
operator: Exists
- key: xilinx.com/fpga
operator: Exists
effect: NoSchedule
nodeSelector:
hardware-type: XILINXFPGA
containers:
- image: {{ .Values.xilinx.fpgaDevicePluginImage }}
name: xilinx-device-plugin-ctr
imagePullPolicy: IfNotPresent
env:
- name: U30NameConvention
value: CommonName
- name: U30AllocUnit
value: Card
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ["ALL"]
volumeMounts:
- name: device-plugin
mountPath: /var/lib/kubelet/device-plugins
volumes:
- name: device-plugin
hostPath:
path: /var/lib/kubelet/device-plugins
{{- end }}

+ 7
- 2
deploy/charts/octopus/values.yaml View File

@@ -40,7 +40,7 @@ ingress:
openaiserverPath: /openaiserver
adminportalPath: /admin
openaiportalPath: /openai
loggerHttpdPath: /log
loggerHttpdPath: /log/user/trainjob/*/*/*/*.log
ambassadorPath: /seldon
minioPath:
web: /minio
@@ -520,4 +520,9 @@ nvidia:

ascend:
# ascend节点需要打标签hardware-type=ASCENDNPU
enabled: false
enabled: false

xilinx:
# xilinx节点需要打标签hardware-type=XILINXFPGA
enabled: false
fpgaDevicePluginImage: public.ecr.aws/xilinx_dcg/k8s-device-plugin:1.1.0

+ 8
- 0
deploy/single_master_k8s_install/comm.sh View File

@@ -248,6 +248,14 @@ cambricon_mlu_label() {
echo -e "---------------------\033[31m cambricon mlu label success \033[0m---------------------"
}

# Label the current node as a Xilinx FPGA node.
xilinx_fpga_label() {
    # Stop at the first failing command. `set -e` is a shell option, not a
    # function-local setting, so it remains active after this function returns.
    set -e
    echo -e "---------------------\033[31m xilinx fpga label \033[0m---------------------"
    # Attach hardware-type=XILINXFPGA to the node named by `hostname`.
    kubectl label nodes $(hostname) hardware-type=XILINXFPGA
    echo -e "---------------------\033[31m xilinx fpga label success \033[0m---------------------"
}

# huawei a910节点打标签
huawei_a910_label() {
kubectl label nodes `hostname` hardware-type=ASCENDNPU


+ 2
- 0
deploy/single_master_k8s_install/master_install_k8s.sh View File

@@ -120,6 +120,8 @@ main() {
enflame_gcu_label
elif [[ $node_type == "cambricon_mlu" ]];then
cambricon_mlu_label
elif [[ $node_type == "xilinx_fpga" ]];then
xilinx_fpga_label
fi
# 验证


+ 2
- 0
deploy/single_master_k8s_install/node_install_k8s.sh View File

@@ -88,6 +88,8 @@ main() {
enflame_gcu_label
elif [[ $node_type == "cambricon_mlu" ]];then
cambricon_mlu_label
elif [[ $node_type == "xilinx_fpga" ]];then
xilinx_fpga_label
fi
}



+ 1
- 1
openai-portal/src/permission.js View File

@@ -25,7 +25,7 @@ router.beforeEach(async (to, from, next) => {
if (hasToken) {
try {
// eslint-disable-next-line eqeqeq
if (store.getters.name === '') { await store.dispatch('user/getInfo') }
await store.dispatch('user/getInfo')
if (store.getters.workspaces.length === 0) { await store.dispatch('user/getSpace') }
} catch (error) {
await store.dispatch('user/resetToken')


+ 1
- 3
openai-portal/src/views/cloudInterconnection/trainingTaskCreate.vue View File

@@ -336,10 +336,8 @@ export default {
inserted: function (el, binding) {
const SELECTWRAP_DOM = el.querySelector('.el-select-dropdown .el-select-dropdown__wrap');
SELECTWRAP_DOM.addEventListener('scroll', function () {
const CONDITION = this.scrollHeight - this.scrollTop <= this.clientHeight;
if (CONDITION) {
const CONDITION = this.scrollHeight - this.scrollTop <= this.clientHeight;
binding.value();
}
})
}
}


+ 2
- 1
openai-portal/src/views/trainingManager/components/createDialog/traningList.vue View File

@@ -118,10 +118,11 @@
val.taskNumber = parseInt(val.taskNumber)
val.minFailedTaskCount = parseInt(val.minFailedTaskCount)
val.minSucceededTaskCount = parseInt(val.minSucceededTaskCount)
this.tableData[this.currentIndex] = val
// this.tableData[this.currentIndex] = val
// flag为true新增
// flag为false编辑
if (this.flag) { this.tableData.push(val); }
else{ this.tableData[this.currentIndex] = val}
},
showResource(row) {
let name = ''


+ 12
- 23
openai-portal/src/views/trainingManager/components/editDialog/traningList.vue View File

@@ -2,12 +2,8 @@
<div>
<div class="index">
<el-button type="primary" class="add" @click="add">添加</el-button>
<el-table
:data="tableData"
style="width: 100%"
:header-cell-style="{'text-align':'left','color':'black'}"
:cell-style="{'text-align':'left'}"
>
<el-table :data="tableData" style="width: 100%" :header-cell-style="{'text-align':'left','color':'black'}"
:cell-style="{'text-align':'left'}">
<el-table-column prop="name" label="任务名称" align="center" />
<el-table-column label="是否是主任务" align="center">
<template slot-scope="scope">
@@ -37,17 +33,9 @@
</el-table>
</div>
<!-- 分布式任务对话框 -->
<distributedTask
v-if="FormVisible"
:row="row"
:flag="flag"
:resourcePool="resourcePool"
:disResourceOptions="resourceOptions"
@cancel="cancel"
@confirm="confirm"
@close="close"
@subTasks="getsubTasksList"
/>
<distributedTask v-if="FormVisible" :row="row" :flag="flag" :resourcePool="resourcePool"
:disResourceOptions="resourceOptions" @cancel="cancel" @confirm="confirm" @close="close"
@subTasks="getsubTasksList" />
</div>
</template>

@@ -65,8 +53,8 @@
default: () => []
},
resourcePool: {
type: String,
default: () => ""
type: String,
default: () => ""
}
},
data() {
@@ -84,7 +72,7 @@
this.$emit('tableData', this.tableData)
},
resourcePool() {
this.getResourceList()
this.getResourceList()
}
},
created() {
@@ -97,7 +85,7 @@
this.flag = true
this.row = { parameters: [] }
},
handleEdit(index,row) {
handleEdit(index, row) {
this.currentIndex = index
this.FormVisible = true
this.row = row
@@ -120,11 +108,12 @@
val.taskNumber = parseInt(val.taskNumber)
val.minFailedTaskCount = parseInt(val.minFailedTaskCount)
val.minSucceededTaskCount = parseInt(val.minSucceededTaskCount)
this.tableData[this.currentIndex] = val
// flag为true新增
// flag为false编辑

if (this.flag) { this.tableData.push(val); }
else { this.tableData[this.currentIndex] = val }
},
showResource(row) {
let name = ''
@@ -152,7 +141,7 @@
}
})
},
command: function(data) {
command: function (data) {
let command = data.command
if (data.parameters != null && data.parameters.length != 0) {
data.parameters.forEach(


+ 2
- 2
openai-portal/vue.config.js View File

@@ -42,14 +42,14 @@ module.exports = {
},
proxy: {
[process.env.VUE_APP_BASE_API]: {
target: 'http://192.168.202.71/',
target: 'http://192.168.202.73/',
changeOrigin: true,
pathRewrite: {
['^' + process.env.VUE_APP_BASE_API]: '/openaiserver'
}
},
[process.env.VUE_APP_BASE_API2]: {
target: 'http://192.168.202.71/',
target: 'http://192.168.202.73/',
changeOrigin: true,
pathRewrite: {
['^' + process.env.VUE_APP_BASE_API]: ''


+ 40
- 0
server/73pipeline View File

@@ -0,0 +1,40 @@
#默认admin用户的token
adminToken: your-token
#任务流水线的配置
pipeline:
#设置流水线上有多少个工作线程
workerAmount: 10
#mysql配置项
mysql:
#最大空闲链接数
maxIdleConns: 10
#最大打开链接数
maxOpenConns: 5
#mysql权限校验
authStr: "root:root@(192.168.202.73:30336)/core?charset=utf8mb4&parseTime=True&loc=Local"
#kubernetes配置
kubernetes:
#api-server地址
apiServer: "https://192.168.202.73:6443"
#kubernetes登录文件地址
kubeFilePath: "/Users/wenlong/Desktop/启智社区/octopus/server/73kubeconfig"
#kubeFilePath: "/media/sf_project/config/config"
#lifehook相关配置
lifehook:
#推送超时时间
requestTimeOutSec: 30
#最大并行处理数
maxParallelProcessRequest: 5
#最大重试次数
maxRetryOnFail: 3
#http服务器端口
server:
port: 8080
evict:
waitMinutes: 120

+ 2
- 0
server/admin-server/api/v1/billing.proto View File

@@ -8,6 +8,7 @@ option go_package = "server/base-server/api/v1;v1";

import "google/api/annotations.proto";
import "validate/validate.proto";
import "server/common/api/v1/common.proto";

service BillingService {
// 计费用户列表
@@ -78,6 +79,7 @@ message BillingUser {
string userId = 4;
string userName = 5;
string userEmail = 6;
repeated common.api.v1.Bind bind = 7;
}

message ListBillingUserReply {


+ 1
- 0
server/admin-server/api/v1/user.proto View File

@@ -95,6 +95,7 @@ message UserItem {
common.api.v1.UserPermission permission = 11;
repeated string buckets = 12;
string minioUserName = 13;
repeated common.api.v1.Bind bind = 14;
}

message ListUserRequest {


+ 12
- 1
server/admin-server/internal/service/billing.go View File

@@ -2,14 +2,16 @@ package service

import (
"context"
"github.com/jinzhu/copier"
api "server/admin-server/api/v1"
"server/admin-server/internal/conf"
"server/admin-server/internal/data"
innerapi "server/base-server/api/v1"
commapi "server/common/api/v1"
"server/common/errors"
"server/common/log"
"server/common/utils/collections/set"

"github.com/jinzhu/copier"
)

type billingService struct {
@@ -80,6 +82,15 @@ func (s *billingService) assignUser(ctx context.Context, billingUsers []*api.Bil
for _, i := range billingUsers {

if v, ok := userMap[i.UserId]; ok {
if v.Bind != nil {
for _, b := range v.Bind {
i.Bind = append(i.Bind, &commapi.Bind{
Platform: b.Platform,
UserId: b.UserId,
UserName: b.UserName,
})
}
}
i.UserName = v.FullName
i.UserEmail = v.Email
}


+ 13
- 0
server/admin-server/internal/service/user.go View File

@@ -11,6 +11,8 @@ import (
"server/common/log"
"time"

commapi "server/common/api/v1"

"github.com/jinzhu/copier"
)

@@ -48,6 +50,16 @@ func (s *UserService) ListUser(ctx context.Context, req *pb.ListUserRequest) (*p

users := make([]*pb.UserItem, len(listUserReply.Users))
for idx, user := range listUserReply.Users {
bind := make([]*commapi.Bind, 0)
if user.Bind != nil {
for _, b := range user.Bind {
bind = append(bind, &commapi.Bind{
Platform: b.Platform,
UserId: b.UserId,
UserName: b.UserName,
})
}
}
users[idx] = &pb.UserItem{
Id: user.Id,
FullName: user.FullName,
@@ -62,6 +74,7 @@ func (s *UserService) ListUser(ctx context.Context, req *pb.ListUserRequest) (*p
Permission: user.Permission,
MinioUserName: user.MinioUserName,
Buckets: user.Buckets,
Bind: bind,
}
}



+ 3
- 1
server/base-server/api/v1/develop.proto View File

@@ -143,11 +143,13 @@ message NotebookEventListRequest {
// 页大小,最小1条,最大100条
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lte:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
string id = 3[(validate.rules).string = {min_len: 0}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lte:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lte:100}];
//Notebook ID
string NotebookId = 6[(validate.rules).string = {min_len: 0}];
}

message NotebookEventListReply {


+ 1
- 0
server/base-server/api/v1/trainJob.proto View File

@@ -301,4 +301,5 @@ message GetJobMetricReply {
repeated double memUsage = 2;
repeated double gpuUtil = 3;
repeated double gpuMemUsage = 4;
repeated double memUsagePercent = 5;
}

+ 1
- 1
server/base-server/internal/data/dao/model/user.go View File

@@ -128,7 +128,7 @@ func (u UserList) Where(db *gorm.DB) *gorm.DB {
func (u UserList) Or(db *gorm.DB) *gorm.DB {
if u.SearchKey != "" {
searchKeyLike := "%" + u.SearchKey + "%"
db = db.Where("full_name like ? or email like ?", searchKeyLike, searchKeyLike)
db = db.Where("full_name like ? or email like ? or bind->'$[*].userName' like ? or `desc` like ?", searchKeyLike, searchKeyLike, searchKeyLike, searchKeyLike)
}
return db
}


+ 26
- 2
server/base-server/internal/service/develop/develop.go View File

@@ -12,6 +12,7 @@ import (
"server/common/constant"
"server/common/errors"
"server/common/utils"
"server/common/utils/collections/set"
"strconv"
"strings"
"time"
@@ -532,12 +533,12 @@ func (s *developService) StartNotebook(ctx context.Context, req *api.StartNotebo
}
}()

err = s.data.DevelopDao.CreateNotebookEventRecord(ctx, &model.NotebookEventRecord{
err1 := s.data.DevelopDao.CreateNotebookEventRecord(ctx, &model.NotebookEventRecord{
Time: time.Now(),
NotebookId: nb.Id,
Type: commapi.NotebookEventRecordType_START,
})
if err != nil { // 插入事件记录出错只打印
if err1 != nil { // 插入事件记录出错只打印
s.log.Error(ctx, "create notebook event record error:", err)
}

@@ -964,6 +965,29 @@ func (s *developService) GetNotebookEventList(ctx context.Context, req *api.Note
return nil, err
}

if query.Id == "" {
if req.NotebookId == "" {
s.log.Errorf(ctx, "job id and notebook id empty")
return nil, errors.Errorf(nil, errors.ErrorInvalidRequestParameter)
}
nbIds := make([]string, 0)
nbIds = append(nbIds, req.NotebookId)
nbIds = set.NewStrings(nbIds...).Values()
nbs, _, err := s.data.DevelopDao.ListNotebook(ctx, &model.NotebookQuery{Ids: nbIds})
if err != nil {
s.log.Errorf(ctx, "ListNotebook err: %s", err)
return nil, err
}
if len(nbs) > 0 {
for _, nb := range nbs {
query.Id = nb.NotebookJobId
}
} else {
s.log.Errorf(ctx, "no notebook job found")
return nil, fmt.Errorf("no notebook job found")
}
}

events, totalSize, err := s.data.DevelopDao.GetNotebookEvents(query)
if err != nil {
return nil, err


+ 36
- 2
server/base-server/internal/service/trainjob/train_job.go View File

@@ -1325,10 +1325,44 @@ func (s *trainJobService) GetJobMetric(ctx context.Context, req *api.GetJobMetri
if err != nil {
return nil, err
}
return &api.GetJobMetricReply{

res := &api.GetJobMetricReply{
CpuUsage: cpuUsage,
MemUsage: memUsage,
GpuUtil: gpuUtil,
GpuMemUsage: gpuMemUtil,
}, nil
}
memUsagePercent, err := s.getMemUsagePercent(ctx, req, memUsage)
if err == nil { //忽略err
res.MemUsagePercent = memUsagePercent
}

return res, nil
}

// memUsagePercentError describes why memory usage percentages could not be
// derived for a job metric request.
type memUsagePercentError string

// Error implements the error interface.
func (e memUsagePercentError) Error() string { return string(e) }

// getMemUsagePercent converts raw memory usage samples into percentages of the
// "memory" quantity configured in the resource spec of the requested task.
//
// It loads the train job identified by req.Id, fetches the resource spec
// referenced by the task config at req.TaskIndex, and parses that spec's
// "memory" resource quantity. Each sample is truncated to an integer and
// expressed as a whole-number percentage of the parsed quantity. Samples equal
// to -1 are passed through unchanged (presumably a "no data" marker; confirm
// against the metric producer).
//
// An error is returned when the job or the resource spec cannot be loaded,
// when req.TaskIndex does not address an existing task config, when the memory
// quantity cannot be parsed, or when the parsed quantity is not positive.
func (s *trainJobService) getMemUsagePercent(ctx context.Context, req *api.GetJobMetricRequest, memUsage []float64) ([]float64, error) {
	trainJob, err := s.data.TrainJobDao.GetTrainJob(ctx, req.Id)
	if err != nil {
		return nil, err
	}

	// An out-of-range task index would panic on the slice access below.
	if req.TaskIndex < 0 || int(req.TaskIndex) >= len(trainJob.Config) {
		return nil, memUsagePercentError("task index out of range")
	}

	resourceSpec, err := s.resourceSpecService.GetResourceSpec(ctx, &api.GetResourceSpecRequest{Id: trainJob.Config[req.TaskIndex].ResourceSpecId})
	if err != nil {
		return nil, err
	}

	quantity, err := resource.ParseQuantity(resourceSpec.ResourceSpec.ResourceQuantity["memory"])
	if err != nil {
		return nil, err
	}

	// Integer division by zero panics, so reject a non-positive total up front.
	total := quantity.Value()
	if total <= 0 {
		return nil, memUsagePercentError("memory quantity must be positive")
	}

	res := make([]float64, 0, len(memUsage))
	for _, v := range memUsage {
		if v == -1 {
			res = append(res, v)
			continue
		}
		// Same integer arithmetic as before: the percentage is truncated.
		res = append(res, float64(int64(v)*100/total))
	}

	return res, nil
}

+ 6
- 0
server/common/api/v1/common.proto View File

@@ -21,4 +21,10 @@ message Mount {

message UserPermission {
bool mountExternalStorage = 1;
}

// Bind carries one account binding of a user: a platform name together with
// the user's id and user name on that platform.
// NOTE(review): the admin portal labels this as a "third-party account";
// confirm the exact set of platform values with the producer of this data.
message Bind {
// Name of the platform the binding belongs to.
string platform = 1;
// User id on that platform.
string userId = 2;
// User name on that platform.
string userName = 3;
}

+ 3
- 1
server/openai-server/api/v1/develop.proto View File

@@ -175,11 +175,13 @@ message NotebookEventListRequest {
// 页大小,最小1条,最大100条
int64 pageSize = 2[(validate.rules).int64 = {gte:1,lte:100}];
//任务ID
string id = 3[(validate.rules).string = {min_len: 1}];
string id = 3[(validate.rules).string = {min_len: 0}];
//子任务序号,从0开始
int64 taskIndex = 4[(validate.rules).int64 = {gte:0,lt:100}];
//副本序号,从0开始
int64 replicaIndex = 5[(validate.rules).int64 = {gte:0,lt:100}];
//Notebook ID
string notebookId = 6[(validate.rules).string = {min_len: 0}];
}

message NotebookEventListReply {


+ 2
- 0
server/openai-server/api/v1/trainJob.proto View File

@@ -453,4 +453,6 @@ message GetJobMetricReply {
repeated double gpuUtil = 3;
//百分比
repeated double gpuMemUsage = 4;
//百分比
repeated double memUsagePercent = 5;
}

Loading…
Cancel
Save