From 96771968ce6533d0ee709842be47f12114a107cf Mon Sep 17 00:00:00 2001
From: ashergaga <1214443299@qq.com>
Date: Thu, 23 Apr 2026 13:10:29 +0000
Subject: [PATCH 1/3] add ascend-device-node-configmap
Signed-off-by: ashergaga <1214443299@qq.com>
---
ascend-device-node-configmap.yaml | 15 +++++++++++++++
ascend-device-plugin.yaml | 7 +++++++
cmd/main.go | 7 +++++++
internal/manager/manager.go | 20 ++++++++++++++++++++
internal/vnpu.go | 24 ++++++++++++++++++++++++
5 files changed, 73 insertions(+)
create mode 100644 ascend-device-node-configmap.yaml
diff --git a/ascend-device-node-configmap.yaml b/ascend-device-node-configmap.yaml
new file mode 100644
index 0000000..ca6ee57
--- /dev/null
+++ b/ascend-device-node-configmap.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ labels:
+ app.kubernetes.io/component: hami-scheduler
+ app.kubernetes.io/name: hami
+ app.kubernetes.io/instance: hami
+ name: hami-device-node-config # referenced by the ascend-node-config volume in ascend-device-plugin.yaml
+ namespace: kube-system
+data: # NOTE(review): the node name below looks environment-specific; it is compared for equality with the plugin's node_name (NODE_NAME env by default)
+ node-config.yaml: |- # this key is mounted into the plugin container at /node-config.yaml via subPath
+ nodes:
+ - name: "cnst-dev-w2"
+ hami-vnpu-core: true
+ vDeviceCount: 8
diff --git a/ascend-device-plugin.yaml b/ascend-device-plugin.yaml
index ff3a9c0..cabf60b 100644
--- a/ascend-device-plugin.yaml
+++ b/ascend-device-plugin.yaml
@@ -89,6 +89,10 @@ spec:
mountPath: /device-config.yaml
subPath: device-config.yaml
readOnly: true
+ - mountPath: /node-config.yaml
+ name: ascend-node-config
+ readOnly: true
+ subPath: node-config.yaml
env:
- name: NODE_NAME
valueFrom:
@@ -122,5 +126,8 @@ spec:
- name: ascend-config
configMap:
name: hami-scheduler-device
+ - name: ascend-node-config
+ configMap:
+ name: hami-device-node-config
nodeSelector:
ascend: "on"
diff --git a/cmd/main.go b/cmd/main.go
index 99b4f3e..849e40c 100644
--- a/cmd/main.go
+++ b/cmd/main.go
@@ -36,6 +36,7 @@ import (
var (
hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
configFile = flag.String("config_file", "", "config file path")
+ nodeConfigFile = flag.String("node_config_file", "", "node specific config file path")
nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them")
)
@@ -136,6 +137,12 @@ func main() {
if err != nil {
klog.Fatalf("load config failed, error is %v", err)
}
+ if *nodeConfigFile != "" {
+ err = mgr.LoadNodeConfig(*nodeConfigFile, *nodeName)
+ if err != nil {
+ klog.Errorf("load node config failed: %v", err)
+ }
+ }
server, err := server.NewPluginServer(mgr, *nodeName, *checkIdleVNPUInterval)
if err != nil {
klog.Fatalf("init PluginServer failed, error is %v", err)
diff --git a/internal/manager/manager.go b/internal/manager/manager.go
index 19222f7..eab833f 100644
--- a/internal/manager/manager.go
+++ b/internal/manager/manager.go
@@ -43,6 +43,7 @@ type AscendManager struct {
//nodeName string
config internal.VNPUConfig
devs []*Device
+ nodeConfig *internal.NodeConfig
}
func NewAscendManager() (*AscendManager, error) {
@@ -56,6 +57,25 @@ func NewAscendManager() (*AscendManager, error) {
}, nil
}
+// LoadNodeConfig loads the node list at nodePath and stores the entry named nodeName; no match is not an error.
+func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error {
+	nodeConfigList, err := internal.LoadNodeConfig(nodePath)
+	if err != nil {
+		klog.Warningf("Failed to load node config from %s: %v", nodePath, err)
+		return err
+	}
+	for i := range nodeConfigList.Nodes {
+		// Copy the element: the stored pointer must not alias a loop variable or pin the whole list.
+		if matched := nodeConfigList.Nodes[i]; matched.Name == nodeName {
+			am.nodeConfig = &matched
+			klog.Infof("Successfully matched node config for %s: %+v", nodeName, matched)
+			return nil
+		}
+	}
+	klog.Infof("No specific config found for node %s, will use default settings", nodeName)
+	return nil
+}
+
func (am *AscendManager) LoadConfig(path string) error {
config, err := internal.LoadConfig(path)
if err != nil {
diff --git a/internal/vnpu.go b/internal/vnpu.go
index ce8a92e..b726a48 100644
--- a/internal/vnpu.go
+++ b/internal/vnpu.go
@@ -57,3 +57,27 @@ func LoadConfig(path string) (*Config, error) {
}
return &yamlData, nil
}
+
+// NodeConfig is one per-node entry of the node config file. NOTE(review): the json tags assume a JSON-tag-aware YAML decoder — confirm which yaml package this file imports.
+type NodeConfig struct {
+ Name string `json:"name"` // compared for equality with the plugin's node name
+ HamiVnpuCore bool `json:"hami-vnpu-core"` // when true, registerHAMi sets the "hami-vnpu-core" annotation to "true", otherwise "false"
+ VDeviceCount int `json:"vDeviceCount"` // per README: virtual devices reported per physical chip; no consumer is visible in this diff
+}
+// NodeListConfig is the top-level document of the node config file: a list of per-node entries under "nodes".
+type NodeListConfig struct {
+ Nodes []NodeConfig `json:"nodes"`
+}
+
+// LoadNodeConfig reads the YAML file at path and decodes it into a NodeListConfig; read and decode errors are returned unchanged.
+func LoadNodeConfig(path string) (*NodeListConfig, error) {
+	data, err := os.ReadFile(path)
+	if err != nil {
+		return nil, err
+	}
+	var nodeList NodeListConfig
+	if err = yaml.Unmarshal(data, &nodeList); err != nil {
+		return nil, err
+	}
+	return &nodeList, nil
+}
From 06e47605dff55c236d4f90642fb31adccb032eec Mon Sep 17 00:00:00 2001
From: ashergaga <1214443299@qq.com>
Date: Thu, 23 Apr 2026 13:11:07 +0000
Subject: [PATCH 2/3] modify Readme
Signed-off-by: ashergaga <1214443299@qq.com>
---
README.md | 22 ++++++++++++++++++++++
README_cn.md | 18 ++++++++++++++++++
2 files changed, 40 insertions(+)
diff --git a/README.md b/README.md
index bd60387..f7788eb 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,18 @@ hami-vnpu-core Soft Slicing Requirements:
- **Ascend Driver Version**: ≥ 25.5
- **Chip Mode**: enable `device-share` mode on Ascend chips for virtualization
+The steps below describe how to enable `device-share` mode with `npu-smi`:
+
+**Enabling `device-share` Mode**
+
+**npu-smi set -t device-share -i** *id* **-d** *value*: sets the container sharing mode for all chips on the specified device.
+
+**Parameter Description**
+
+| Type | Description |
+| :--- | :--- |
+| *id* | **Device ID**. The NPU ID found by running the **npu-smi info -l** command is the device ID. |
+| *value* | **Container Enable Status**: Options are disabled or enabled. The default is disabled.<br>0: Disabled<br>1: Enabled |
## Compile
@@ -57,6 +69,16 @@ kubectl label node {ascend-node} ascend=on
kubectl apply -f ascend-device-configmap.yaml
```
+#### **Node Custom Configuration Description**
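+The `hami-device-node-config` ConfigMap enables hami-vnpu-core on specific nodes of the cluster. The device plugin reads it only when started with `--node_config_file` set to the mounted file path (`/node-config.yaml` in `ascend-device-plugin.yaml`).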
+* By setting `hami-vnpu-core: true`, the specified node will enable soft-partitioning based on `hami-vnpu-core`.
+* Specify the number of virtual devices reported to Kubernetes for each physical chip via the `vDeviceCount` field.
+* Nodes without specific configurations will default to template-based hard-partitioning.
+
+```bash
+kubectl apply -f ascend-device-node-configmap.yaml
+```
+
### Deply RuntimeClass
```bash
diff --git a/README_cn.md b/README_cn.md
index f33ba64..e770bee 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -29,6 +29,17 @@ git submodule update --init --recursive
- Ascend 驱动版本:≥ 25.5
- 芯片模式:在昇腾芯片上开启 `device-share` 模式以支持虚拟化。
+**开启 `device-share`模式**
+
+**npu-smi set -t device-share -i** *id* **-d** *value* 用于设置指定设备的所有芯片的容器共享模式。
+
+**参数说明**
+
+| 类型 | 描述 |
+| ------- | ----------------------------------------------------------- |
+| *id* | 设备ID。通过**npu-smi info -l**命令查出的NPU ID即为设备ID。 |
+| *value* | 容器使能状态:分为禁用、使能。默认禁用。<br>0:禁用<br>1:使能 |
+
## 编译
```bash
@@ -55,6 +66,13 @@ kubectl label node {ascend-node} ascend=on
kubectl apply -f ascend-device-configmap.yaml
```
+#### 节点自定义配置说明
+hami-device-node-config 用于对集群中特定节点的显卡虚拟化策略进行精细化控制。
+通过设置 hami-vnpu-core: true,指定节点将启用基于 hami-vnpu-core 的软切分,通过 vDeviceCount 字段,手动定义每个物理芯片上报给 Kubernetes 的虚拟设备数量;否则走基于模板的硬切分。
+```bash
+kubectl apply -f ascend-device-node-configmap.yaml
+```
+
### 部署 RuntimeClass
```bash
From 933ca38f5b0f2946eeb7f09f3cef924aafc63f3f Mon Sep 17 00:00:00 2001
From: ashergaga <1214443299@qq.com>
Date: Mon, 27 Apr 2026 07:57:07 +0000
Subject: [PATCH 3/3] feat: add Node Annotation hami-vnpu-core
Signed-off-by: ashergaga <1214443299@qq.com>
---
internal/manager/manager.go | 5 +++++
internal/server/server.go | 9 +++++++++
2 files changed, 14 insertions(+)
diff --git a/internal/manager/manager.go b/internal/manager/manager.go
index eab833f..0b553e9 100644
--- a/internal/manager/manager.go
+++ b/internal/manager/manager.go
@@ -252,3 +252,8 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
klog.Infof("Cleanup completed, destroyed %d idle vNPUs", totalCleaned)
return nil
}
+
+
+func (am *AscendManager) GetNodeConfig() *internal.NodeConfig {
+ return am.nodeConfig
+}
\ No newline at end of file
diff --git a/internal/server/server.go b/internal/server/server.go
index 8623ab7..fa3d3f8 100644
--- a/internal/server/server.go
+++ b/internal/server/server.go
@@ -52,6 +52,7 @@ const (
Ascend910CType = "Ascend910C"
VNPUModeAnnotation = "huawei.com/vnpu-mode"
VNPUModeHamiCore = "hami-core"
+ VNPUNodeSelectorAnnotation = "hami-vnpu-core"
)
var (
@@ -387,6 +388,14 @@ func (ps *PluginServer) registerHAMi() error {
annos := make(map[string]string)
annos[ps.registerAnno] = device.MarshalNodeDevices(apiDevices)
annos[ps.handshakeAnno] = "Reported_" + time.Now().Add(time.Duration(*reportTimeOffset)*time.Second).Format("2006.01.02 15:04:05")
+
+ if ps.mgr.GetNodeConfig() != nil && ps.mgr.GetNodeConfig().HamiVnpuCore {
+ annos[VNPUNodeSelectorAnnotation] = "true"
+ klog.V(4).Infof("Node %s has HamiVnpuCore enabled, patching annotation %s: true", ps.nodeName, VNPUNodeSelectorAnnotation)
+ } else {
+ annos[VNPUNodeSelectorAnnotation] = "false"
+ }
+
node, err := util.GetNode(ps.nodeName)
if err != nil {
return fmt.Errorf("get node %s error: %v", ps.nodeName, err)