From 96771968ce6533d0ee709842be47f12114a107cf Mon Sep 17 00:00:00 2001 From: ashergaga <1214443299@qq.com> Date: Thu, 23 Apr 2026 13:10:29 +0000 Subject: [PATCH 1/3] add ascend-device-node-configmap Signed-off-by: ashergaga <1214443299@qq.com> --- ascend-device-node-configmap.yaml | 15 +++++++++++++++ ascend-device-plugin.yaml | 7 +++++++ cmd/main.go | 7 +++++++ internal/manager/manager.go | 20 ++++++++++++++++++++ internal/vnpu.go | 24 ++++++++++++++++++++++++ 5 files changed, 73 insertions(+) create mode 100644 ascend-device-node-configmap.yaml diff --git a/ascend-device-node-configmap.yaml b/ascend-device-node-configmap.yaml new file mode 100644 index 0000000..ca6ee57 --- /dev/null +++ b/ascend-device-node-configmap.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: hami-scheduler + app.kubernetes.io/name: hami + app.kubernetes.io/instance: hami + name: hami-device-node-config + namespace: kube-system +data: + node-config.yaml: |- + nodes: + - name: "cnst-dev-w2" + hami-vnpu-core: true + vDeviceCount: 8 diff --git a/ascend-device-plugin.yaml b/ascend-device-plugin.yaml index ff3a9c0..cabf60b 100644 --- a/ascend-device-plugin.yaml +++ b/ascend-device-plugin.yaml @@ -89,6 +89,10 @@ spec: mountPath: /device-config.yaml subPath: device-config.yaml readOnly: true + - mountPath: /node-config.yaml + name: ascend-node-config + readOnly: true + subPath: node-config.yaml env: - name: NODE_NAME valueFrom: @@ -122,5 +126,8 @@ spec: - name: ascend-config configMap: name: hami-scheduler-device + - name: ascend-node-config + configMap: + name: hami-device-node-config nodeSelector: ascend: "on" diff --git a/cmd/main.go b/cmd/main.go index 99b4f3e..849e40c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -36,6 +36,7 @@ import ( var ( hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0") configFile = flag.String("config_file", "", "config file path") + 
nodeConfigFile = flag.String("node_config_file", "", "node specific config file path") nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name") checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them") ) @@ -136,6 +137,12 @@ func main() { if err != nil { klog.Fatalf("load config failed, error is %v", err) } + if *nodeConfigFile != "" { + err = mgr.LoadNodeConfig(*nodeConfigFile, *nodeName) + if err != nil { + klog.Errorf("load node config failed: %v", err) + } + } server, err := server.NewPluginServer(mgr, *nodeName, *checkIdleVNPUInterval) if err != nil { klog.Fatalf("init PluginServer failed, error is %v", err) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index 19222f7..eab833f 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -43,6 +43,7 @@ type AscendManager struct { //nodeName string config internal.VNPUConfig devs []*Device + nodeConfig *internal.NodeConfig } func NewAscendManager() (*AscendManager, error) { @@ -56,6 +57,25 @@ func NewAscendManager() (*AscendManager, error) { }, nil } +func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error { + nodeConfigList, err := internal.LoadNodeConfig(nodePath) + if err != nil { + klog.Warningf("Failed to load node config from %s: %v", nodePath, err) + return err + } + + for _, n := range nodeConfigList.Nodes { + if n.Name == nodeName { + am.nodeConfig = &n + klog.Infof("Successfully matched node config for %s: %+v", nodeName, n) + return nil + } + } + + klog.Infof("No specific config found for node %s, will use default settings", nodeName) + return nil +} + func (am *AscendManager) LoadConfig(path string) error { config, err := internal.LoadConfig(path) if err != nil { diff --git a/internal/vnpu.go b/internal/vnpu.go index ce8a92e..b726a48 100644 --- a/internal/vnpu.go +++ b/internal/vnpu.go @@ -57,3 +57,27 @@ func LoadConfig(path string) (*Config, error) 
{ } return &yamlData, nil } + + +type NodeConfig struct { + Name string `json:"name"` + HamiVnpuCore bool `json:"hami-vnpu-core"` + VDeviceCount int `json:"vDeviceCount"` +} + +type NodeListConfig struct { + Nodes []NodeConfig `json:"nodes"` +} + +func LoadNodeConfig(path string) (*NodeListConfig, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, err + } + var yamlData NodeListConfig + err = yaml.Unmarshal(data, &yamlData) + if err != nil { + return nil, err + } + return &yamlData, nil +} \ No newline at end of file From 06e47605dff55c236d4f90642fb31adccb032eec Mon Sep 17 00:00:00 2001 From: ashergaga <1214443299@qq.com> Date: Thu, 23 Apr 2026 13:11:07 +0000 Subject: [PATCH 2/3] modify Readme Signed-off-by: ashergaga <1214443299@qq.com> --- README.md | 22 ++++++++++++++++++++++ README_cn.md | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/README.md b/README.md index bd60387..f7788eb 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,18 @@ hami-vnpu-core Soft Slicing Requirements: - **Ascend Driver Version**: ≥ 25.5 - **Chip Mode**: enable `device-share` mode on Ascend chips for virtualization +The following instructions describe how to enable `device-share` mode on the node: + +**Enabling `device-share` Mode** + +**npu-smi set -t device-share -i** *id* **-d** *value* This command is used to set the container sharing mode for all chips on a specified device. + +**Parameter Description** + +| Type | Description | +| :--- | :--- | +| *id* | **Device ID**. The NPU ID found by running the **npu-smi info -l** command is the device ID. | +| *value* | **Container Enable Status**: Options are disabled or enabled. The default is disabled.
0: Disabled
1: Enabled | ## Compile @@ -57,6 +69,16 @@ kubectl label node {ascend-node} ascend=on kubectl apply -f ascend-device-configmap.yaml ``` +#### **Node Custom Configuration Description** +The `hami-device-node-config` is used to enable hami-vnpu-core for specific nodes within the cluster. +* By setting `hami-vnpu-core: true`, the specified node will enable soft-partitioning based on `hami-vnpu-core`. +* Specify the number of virtual devices reported to Kubernetes for each physical chip via the `vDeviceCount` field. +* Nodes without specific configurations will default to template-based hard-partitioning. + +```bash +kubectl apply -f ascend-device-node-configmap.yaml +``` + ### Deply RuntimeClass ```bash diff --git a/README_cn.md b/README_cn.md index f33ba64..e770bee 100644 --- a/README_cn.md +++ b/README_cn.md @@ -29,6 +29,17 @@ git submodule update --init --recursive - Ascend 驱动版本:≥ 25.5 - 芯片模式:在昇腾芯片上开启 `device-share` 模式以支持虚拟化。 +**开启 `device-share`模式** + +**npu-smi set -t device-share -i** *id* **-d** *value* 用于设置指定设备的所有芯片的容器共享模式。 + +**参数说明** + +| 类型 | 描述 | +| ------- | ----------------------------------------------------------- | +| *id* | 设备ID。通过**npu-smi info -l**命令查出的NPU ID即为设备ID。 | +| *value* | 容器使能状态:分为禁用、使能。默认禁用。0:禁用;1:使能 | + ## 编译 ```bash @@ -55,6 +66,13 @@ kubectl label node {ascend-node} ascend=on kubectl apply -f ascend-device-configmap.yaml ``` +#### 节点自定义配置说明 +hami-device-node-config 用于对集群中特定节点的 NPU 虚拟化策略进行精细化控制。 +通过设置 hami-vnpu-core: true,指定节点将启用基于 hami-vnpu-core 的软切分,通过 vDeviceCount 字段,手动定义每个物理芯片上报给 Kubernetes 的虚拟设备数量;否则走基于模板的硬切分。 +```bash +kubectl apply -f ascend-device-node-configmap.yaml +``` + ### 部署 RuntimeClass ```bash From 933ca38f5b0f2946eeb7f09f3cef924aafc63f3f Mon Sep 17 00:00:00 2001 From: ashergaga <1214443299@qq.com> Date: Mon, 27 Apr 2026 07:57:07 +0000 Subject: [PATCH 3/3] feat: add Node Annotation hami-vnpu-core Signed-off-by: ashergaga <1214443299@qq.com> --- internal/manager/manager.go | 5 +++++ internal/server/server.go | 9 +++++++++ 2
files changed, 14 insertions(+) diff --git a/internal/manager/manager.go b/internal/manager/manager.go index eab833f..0b553e9 100644 --- a/internal/manager/manager.go +++ b/internal/manager/manager.go @@ -252,3 +252,8 @@ func (am *AscendManager) CleanupIdleVNPUs() error { klog.Infof("Cleanup completed, destroyed %d idle vNPUs", totalCleaned) return nil } + + +func (am *AscendManager) GetNodeConfig() *internal.NodeConfig { + return am.nodeConfig +} \ No newline at end of file diff --git a/internal/server/server.go b/internal/server/server.go index 8623ab7..fa3d3f8 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -52,6 +52,7 @@ const ( Ascend910CType = "Ascend910C" VNPUModeAnnotation = "huawei.com/vnpu-mode" VNPUModeHamiCore = "hami-core" + VNPUNodeSelectorAnnotation = "hami-vnpu-core" ) var ( @@ -387,6 +388,14 @@ func (ps *PluginServer) registerHAMi() error { annos := make(map[string]string) annos[ps.registerAnno] = device.MarshalNodeDevices(apiDevices) annos[ps.handshakeAnno] = "Reported_" + time.Now().Add(time.Duration(*reportTimeOffset)*time.Second).Format("2006.01.02 15:04:05") + + if ps.mgr.GetNodeConfig() != nil && ps.mgr.GetNodeConfig().HamiVnpuCore { + annos[VNPUNodeSelectorAnnotation] = "true" + klog.V(4).Infof("Node %s has HamiVnpuCore enabled, patching annotation %s: true", ps.nodeName, VNPUNodeSelectorAnnotation) + } else { + annos[VNPUNodeSelectorAnnotation] = "false" + } + node, err := util.GetNode(ps.nodeName) if err != nil { return fmt.Errorf("get node %s error: %v", ps.nodeName, err)