#148 master

Merged
yangxzh1 merged 8 commits from OpenI/octopus:master into master 1 year ago
  1. +1
    -1
      admin-portal/src/views/timeManager/component/consumption.vue
  2. +1
    -1
      admin-portal/src/views/timeManager/component/recharge.vue
  3. BIN
      deploy/charts/octopus/charts/prometheus-node-exporter-4.17.2.tgz
  4. +228
    -1245
      deploy/charts/octopus/templates/grafana.yaml

+ 1
- 1
admin-portal/src/views/timeManager/component/consumption.vue View File

@@ -145,7 +145,7 @@
},
getSearchData(val) {
let data = {}
data = Object.assign(val, { pageIndex: this.searchData.pageIndex, pageSize: this.searchData.pageSize })
data = Object.assign(val, { pageIndex: 1, pageSize: this.searchData.pageSize })
this.getPay(data)
if (val.searchKey) {
this.searchKey = val.searchKey


+ 1
- 1
admin-portal/src/views/timeManager/component/recharge.vue View File

@@ -114,7 +114,7 @@
},
getSearchData(val) {
let data = {}
data = Object.assign(val, { pageIndex: this.searchData.pageIndex, pageSize: this.searchData.pageSize })
data = Object.assign(val, { pageIndex: 1, pageSize: this.searchData.pageSize })
this.Recharge(data)
if (val.searchKey) {
this.searchKey = val.searchKey


BIN
deploy/charts/octopus/charts/prometheus-node-exporter-4.17.2.tgz View File


+ 228
- 1245
deploy/charts/octopus/templates/grafana.yaml View File

@@ -761,183 +761,6 @@ data:
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 30
},
"id": 10,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "enflame_vgcu_usage{pod_name=\"$pod_name\"} or enflame_vgcu_usage{pod=\"$pod\"}",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
"legendFormat": "VGCU utilization",
"metric": "enflame_vgcu_utilization",
"refId": "A",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "VGCU utilization",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 30
},
"id": 11,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "enflame_vgcu_memory_usage{pod_name=\"$pod_name\"} * 100 or enflame_vgcu_memory_usage{pod=\"$pod\"} * 100",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "VGCU memory utilization",
"metric": "enflame_vgcu_mem_utilization",
"refId": "A",
"step": 2
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "VGCU memory utilization",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
@@ -2265,7 +2088,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(npu_chip_info_hbm_used_memory) / sum(npu_chip_info_hbm_total_memory) or vector(0)",
"expr": "sum(npu_chip_info_hbm_used_memory) / sum(npu_chip_info_hbm_total_memory) * 100 or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Cluster NPU memory utilization(avg)",
@@ -2910,9 +2733,9 @@ data:
"h": 6,
"w": 8,
"x": 0,
"y": 22
"y": 28
},
"id": 20,
"id": 25,
"legend": {
"avg": false,
"current": false,
@@ -2937,10 +2760,10 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum((sum(enflame_vgcu_usage) / count(enflame_vgcu_usage)) or vector(0))",
"expr": "sum((sum(mlu_utilization) / count(mlu_utilization)) or vector(0))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Cluster VGCU utilization (avg)",
"legendFormat": "Cluster MLU utilization (avg)",
"refId": "A"
}
],
@@ -2948,7 +2771,7 @@ data:
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Cluster VGCU utilization (avg)",
"title": "Cluster MLU utilization (avg)",
"tooltip": {
"shared": true,
"sort": 0,
@@ -2996,9 +2819,9 @@ data:
"h": 6,
"w": 8,
"x": 8,
"y": 22
"y": 28
},
"id": 21,
"id": 26,
"legend": {
"avg": false,
"current": false,
@@ -3023,10 +2846,10 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum((sum(enflame_vgcu_memory_usage) / count(enflame_vgcu_memory_usage))*100 or vector(0))",
"expr": "sum((sum(mlu_memory_utilization) / count(mlu_memory_utilization)) or vector(0))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Cluster VGCU memory utilization(avg)",
"legendFormat": "Cluster MLU memory utilization(avg)",
"refId": "A"
}
],
@@ -3034,7 +2857,7 @@ data:
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Cluster VGCU memory utilization (avg)",
"title": "Cluster MLU memory utilization (avg)",
"tooltip": {
"shared": true,
"sort": 0,
@@ -3093,9 +2916,9 @@ data:
"h": 4,
"w": 8,
"x": 16,
"y": 24
"y": 30
},
"id": 22,
"id": 27,
"interval": null,
"links": [],
"mappingType": 1,
@@ -3134,7 +2957,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "count(enflame_vgcu_usage{pod_name=~\".+\"}) / sum(enflame_vgcu_count) * 100 or vector(0)",
"expr": "sum(mlu_allocated) / count(mlu_usage) * 100 or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -3143,7 +2966,7 @@ data:
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "VGCU Usage",
"title": "MLU Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -3177,9 +3000,9 @@ data:
"h": 2,
"w": 4,
"x": 16,
"y": 26
"y": 32
},
"id": 23,
"id": 28,
"interval": null,
"links": [],
"mappingType": 1,
@@ -3217,14 +3040,14 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "count(enflame_vgcu_usage{pod_name=~\".+\"} or vector(0))-1",
"expr": "sum(mlu_allocated)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "Used VGCUs",
"title": "Used MLUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -3258,9 +3081,9 @@ data:
"h": 2,
"w": 4,
"x": 20,
"y": 26
"y": 32
},
"id": 24,
"id": 29,
"interval": null,
"links": [],
"mappingType": 1,
@@ -3298,14 +3121,14 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum(enflame_vgcu_count)",
"expr": "count(mlu_usage)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "VGCU Count",
"title": "MLU Count",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -3318,452 +3141,34 @@ data:
"valueName": "current"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"fill": 1,
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 6,
"h": 4,
"w": 8,
"x": 0,
"y": 28
},
"id": 25,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
"y": 36
},
"lines": true,
"linewidth": 1,
"id": 30,
"interval": null,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum((sum(mlu_utilization) / count(mlu_utilization)) or vector(0))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Cluster MLU utilization (avg)",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Cluster MLU utilization (avg)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 28
},
"id": 26,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum((sum(mlu_memory_utilization) / count(mlu_memory_utilization)) or vector(0))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Cluster MLU memory utilization(avg)",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Cluster MLU memory utilization (avg)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 30
},
"id": 27,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"pluginVersion": "6.2.0",
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(mlu_allocated) / count(mlu_usage) * 100 or vector(0)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "MLU Usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 4,
"x": 16,
"y": 32
},
"id": 28,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum(mlu_allocated)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "Used MLUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 4,
"x": 20,
"y": 32
},
"id": 29,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "count(mlu_usage)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "MLU Count",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"format": "percent",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 4,
"w": 8,
"x": 0,
"y": 36
},
"id": 30,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
@@ -3797,7 +3202,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100",
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum(count without(cpu, mode) (node_cpu_seconds_total{mode=\"idle\"})) * 100",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -3879,7 +3284,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100",
"expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum(node_memory_MemTotal_bytes) * 100",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -3967,254 +3372,8 @@ data:
"refId": "A"
}
],
"thresholds": "65,90",
"title": "Cluster filesystem usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"decimals": 2,
"format": "short",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": true,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 4,
"x": 0,
"y": 40
},
"id": 33,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": " cores",
"postfixFontSize": "30%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "",
"title": "Used",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"decimals": 2,
"format": "short",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 4,
"x": 4,
"y": 40
},
"id": 34,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": " cores",
"postfixFontSize": "30%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "Total",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
},
{
"cacheTimeout": null,
"colorBackground": false,
"colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": "Prometheus",
"decimals": 2,
"format": "decbytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 4,
"x": 8,
"y": 40
},
"id": 35,
"interval": null,
"links": [],
"mappingType": 1,
"mappingTypes": [
{
"name": "value to text",
"value": 1
},
{
"name": "range to text",
"value": 2
}
],
"maxDataPoints": 100,
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": "",
"postfixFontSize": "50%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
{
"from": "null",
"text": "N/A",
"to": "null"
}
],
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "",
"title": "Used",
"thresholds": "65,90",
"title": "Cluster filesystem usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -4237,21 +3396,21 @@ data:
],
"datasource": "Prometheus",
"decimals": 2,
"format": "decbytes",
"format": "short",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": false,
"thresholdLabels": false,
"thresholdLabels": true,
"thresholdMarkers": true
},
"gridPos": {
"h": 2,
"w": 4,
"x": 12,
"x": 0,
"y": 40
},
"id": 36,
"id": 33,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4269,8 +3428,8 @@ data:
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": "",
"postfixFontSize": "50%",
"postfix": " cores",
"postfixFontSize": "30%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
@@ -4289,14 +3448,14 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})",
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "Total",
"thresholds": "",
"title": "Used",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -4319,7 +3478,7 @@ data:
],
"datasource": "Prometheus",
"decimals": 2,
"format": "decbytes",
"format": "short",
"gauge": {
"maxValue": 100,
"minValue": 0,
@@ -4330,10 +3489,10 @@ data:
"gridPos": {
"h": 2,
"w": 4,
"x": 16,
"x": 4,
"y": 40
},
"id": 37,
"id": 34,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4351,8 +3510,8 @@ data:
"nullPointMode": "connected",
"nullText": null,
"options": {},
"postfix": "",
"postfixFontSize": "50%",
"postfix": " cores",
"postfixFontSize": "30%",
"prefix": "",
"prefixFontSize": "50%",
"rangeMaps": [
@@ -4371,14 +3530,14 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (container_fs_usage_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})",
"expr": "sum(count without(cpu, mode) (node_cpu_seconds_total{mode=\"idle\"}))",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "",
"title": "Used",
"thresholds": "100000000000000",
"title": "Total",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -4412,10 +3571,10 @@ data:
"gridPos": {
"h": 2,
"w": 4,
"x": 20,
"x": 8,
"y": 40
},
"id": 38,
"id": 35,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4453,14 +3612,14 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})",
"expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "Total",
"thresholds": "",
"title": "Used",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -4471,96 +3630,7 @@ data:
}
],
"valueName": "current"
}
],
"refresh": "10s",
"schemaVersion": 18,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"datasource": "Prometheus",
"definition": "",
"hide": 2,
"includeAll": true,
"label": null,
"multi": false,
"name": "Node",
"options": [],
"query": "label_values(kubernetes_io_hostname)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "ClusterMetrics",
"uid": "ft1oaQnWk",
"version": 1
}
openinodes.json: |
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"iteration": 1575963441215,
"links": [],
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
@@ -4571,21 +3641,22 @@ data:
"#d44a3a"
],
"datasource": "Prometheus",
"format": "percent",
"decimals": 2,
"format": "decbytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
"show": true,
"show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
"gridPos": {
"h": 4,
"w": 8,
"x": 16,
"y": 0
"h": 2,
"w": 4,
"x": 12,
"y": 40
},
"id": 3,
"id": 36,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4623,16 +3694,14 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "(count(dcgm_gpu_utilization{container_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1)/ (count(dcgm_gpu_temp{uuid=~\".+\",name=~\"^$Node$\"} or vector(0)) - 1) * 100",
"expr": "sum(node_memory_MemTotal_bytes)",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "Node GPU usage",
"thresholds": "100000000000000",
"title": "Total",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -4654,7 +3723,8 @@ data:
"#d44a3a"
],
"datasource": "Prometheus",
"format": "none",
"decimals": 2,
"format": "decbytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
@@ -4666,9 +3736,9 @@ data:
"h": 2,
"w": 4,
"x": 16,
"y": 0
"y": 40
},
"id": 4,
"id": 37,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4706,16 +3776,14 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "count(dcgm_gpu_utilization{container_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1",
"expr": "sum (container_fs_usage_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "",
"timeFrom": null,
"timeShift": null,
"title": "Used GPUs",
"title": "Used",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -4737,8 +3805,8 @@ data:
"#d44a3a"
],
"datasource": "Prometheus",
"decimals": null,
"format": "none",
"decimals": 2,
"format": "decbytes",
"gauge": {
"maxValue": 100,
"minValue": 0,
@@ -4750,9 +3818,9 @@ data:
"h": 2,
"w": 4,
"x": 20,
"y": 0
"y": 40
},
"id": 5,
"id": 38,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4787,30 +3855,117 @@ data:
"lineColor": "rgb(31, 120, 193)",
"show": false
},
"tableColumn": "",
"targets": [
{
"expr": "count(dcgm_gpu_temp{uuid=~\".+\",name=~\"^$Node$\"} or vector(0)) - 1",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
"tableColumn": "",
"targets": [
{
"expr": "sum (container_fs_limit_bytes{device=~\"^/dev/.*$\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"thresholds": "100000000000000",
"title": "Total",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
"text": "N/A",
"value": "null"
}
],
"valueName": "current"
}
],
"refresh": "10s",
"schemaVersion": 18,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"datasource": "Prometheus",
"definition": "",
"hide": 2,
"includeAll": true,
"label": null,
"multi": false,
"name": "Node",
"options": [],
"query": "label_values(kubernetes_io_hostname)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "ClusterMetrics",
"uid": "ft1oaQnWk",
"version": 1
}
],
"thresholds": "100000000000",
"timeFrom": null,
"timeShift": null,
"title": "Total GPUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
openinodes.json: |
{
"op": "=",
"text": "N/A",
"value": "null"
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
],
"valueName": "current"
]
},
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"iteration": 1575963441215,
"links": [],
"panels": [
{
"cacheTimeout": null,
"colorBackground": false,
@@ -4833,9 +3988,9 @@ data:
"h": 4,
"w": 8,
"x": 16,
"y": 12
"y": 0
},
"id": 13,
"id": 3,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4873,7 +4028,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "(count(enflame_gcu_usage{pod_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1)/ (count(enflame_gcu_usage{uuid=~\".+\",name=~\"^$Node$\"} or vector(0))-1) * 100",
"expr": "(count(dcgm_gpu_utilization{container_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1)/ (count(dcgm_gpu_temp{uuid=~\".+\",name=~\"^$Node$\"} or vector(0)) - 1) * 100",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -4882,7 +4037,7 @@ data:
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "Node GCU usage",
"title": "Node GPU usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -4916,9 +4071,9 @@ data:
"h": 2,
"w": 4,
"x": 16,
"y": 16
"y": 0
},
"id": 14,
"id": 4,
"interval": null,
"links": [],
"mappingType": 1,
@@ -4956,7 +4111,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "count(enflame_gcu_usage{pod_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1",
"expr": "count(dcgm_gpu_utilization{container_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -4965,7 +4120,7 @@ data:
"thresholds": "",
"timeFrom": null,
"timeShift": null,
"title": "Used GCUs",
"title": "Used GPUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -5000,9 +4155,9 @@ data:
"h": 2,
"w": 4,
"x": 20,
"y": 16
"y": 0
},
"id": 15,
"id": 5,
"interval": null,
"links": [],
"mappingType": 1,
@@ -5040,7 +4195,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum(enflame_gcu_count{name=~\"^$Node$\"} or vector(0))",
"expr": "count(dcgm_gpu_temp{uuid=~\".+\",name=~\"^$Node$\"} or vector(0)) - 1",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -5049,7 +4204,7 @@ data:
"thresholds": "100000000000",
"timeFrom": null,
"timeShift": null,
"title": "Total GCUs",
"title": "Total GPUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -5083,9 +4238,9 @@ data:
"h": 4,
"w": 8,
"x": 16,
"y": 18
"y": 12
},
"id": 18,
"id": 13,
"interval": null,
"links": [],
"mappingType": 1,
@@ -5123,7 +4278,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "(count(enflame_vgcu_usage{pod_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1)/ (count(enflame_vgcu_usage{uuid=~\".+\",name=~\"^$Node$\"} or vector(0))-1) * 100",
"expr": "(count(enflame_gcu_usage{pod_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1)/ (count(enflame_gcu_usage{uuid=~\".+\",name=~\"^$Node$\"} or vector(0))-1) * 100",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -5132,7 +4287,7 @@ data:
"thresholds": "65,90",
"timeFrom": null,
"timeShift": null,
"title": "Node VGCU usage",
"title": "Node GCU usage",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -5166,9 +4321,9 @@ data:
"h": 2,
"w": 4,
"x": 16,
"y": 18
"y": 16
},
"id": 19,
"id": 14,
"interval": null,
"links": [],
"mappingType": 1,
@@ -5206,7 +4361,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "count(enflame_vgcu_usage{pod_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1",
"expr": "count(enflame_gcu_usage{pod_name=~\".+\",name=~\"^$Node$\"} or vector(0))-1",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -5215,7 +4370,7 @@ data:
"thresholds": "",
"timeFrom": null,
"timeShift": null,
"title": "Used VGCUs",
"title": "Used GCUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -5250,9 +4405,9 @@ data:
"h": 2,
"w": 4,
"x": 20,
"y": 18
"y": 16
},
"id": 20,
"id": 15,
"interval": null,
"links": [],
"mappingType": 1,
@@ -5290,7 +4445,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum(enflame_vgcu_count{name=~\"^$Node$\"} or vector(0))",
"expr": "sum(enflame_gcu_count{name=~\"^$Node$\"} or vector(0))",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -5299,7 +4454,7 @@ data:
"thresholds": "100000000000",
"timeFrom": null,
"timeShift": null,
"title": "Total VGCUs",
"title": "Total GCUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -6107,7 +5262,7 @@ data:
"steppedLine": false,
"targets": [
{
"expr": "sum(npu_chip_info_hbm_used_memory{name=\"$Node\"}) / sum(npu_chip_info_hbm_total_memory{name=\"$Node\"})",
"expr": "sum(npu_chip_info_hbm_used_memory{name=\"$Node\"}) / sum(npu_chip_info_hbm_total_memory{name=\"$Node\"}) * 100",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Node NPU hbm memory utilization(avg)",
@@ -6328,178 +5483,6 @@ data:
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 6,
"w": 8,
"x": 0,
"y": 18
},
"id": 16,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum((sum(enflame_vgcu_usage{name=~\"^$Node$\"}) / count(enflame_vgcu_usage{name=~\"^$Node$\"}) ) or vector(0))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Node VGCU utilization (avg)",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node VGCU utilization (avg)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fill": 1,
"gridPos": {
"h": 6,
"w": 8,
"x": 8,
"y": 18
},
"id": 17,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {},
"percentage": false,
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum((sum(enflame_vgcu_memory_usage{name=~\"^$Node$\"}) / count(enflame_vgcu_memory_usage{name=~\"^$Node$\"})) * 100 or vector(0))",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "Node VGCU memory utilization(avg)",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Node VGCU memory utilization (avg)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
@@ -6734,7 +5717,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100",
"expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum(count without(cpu, mode) (node_cpu_seconds_total{mode=\"idle\",name=~\"^$Node$\"})) * 100",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -6818,7 +5801,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100",
"expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum(node_memory_MemTotal_bytes{name=~\"^$Node$\"}) * 100",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -7069,7 +6052,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})",
"expr": "sum(count without(cpu, mode) (node_cpu_seconds_total{mode=\"idle\",name=~\"^$Node$\"}))",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
@@ -7236,7 +6219,7 @@ data:
"tableColumn": "",
"targets": [
{
"expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})",
"expr": "sum(node_memory_MemTotal_bytes{name=~\"^$Node$\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"


Loading…
Cancel
Save