Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ hami-vnpu-core Soft Slicing Requirements:

- **Ascend Driver Version**: ≥ 25.5
- **Chip Mode**: enable `device-share` mode on Ascend chips for virtualization
Instructions for enabling `device-share` mode:

**Enabling `device-share` Mode**

**npu-smi set -t device-share -i** *id* **-d** *value* This command is used to set the container sharing mode for all chips on a specified device.

**Parameter Description**

| Type | Description |
| :--- | :--- |
| *id* | **Device ID**. The NPU ID found by running the **npu-smi info -l** command is the device ID. |
| *value* | **Container Enable Status**: Options are disabled or enabled. The default is disabled.<br>0: Disabled<br>1: Enabled |

## Compile

Expand Down Expand Up @@ -57,6 +69,16 @@ kubectl label node {ascend-node} ascend=on
kubectl apply -f ascend-device-configmap.yaml
```

#### **Node Custom Configuration Description**
The `hami-device-node-config` ConfigMap provides fine-grained control of the NPU virtualization strategy for specific nodes within the cluster.
* By setting `hami-vnpu-core: true`, the specified node will enable soft-partitioning based on `hami-vnpu-core`.
* Specify the number of virtual devices reported to Kubernetes for each physical chip via the `vDeviceCount` field.
* Nodes without specific configurations will default to template-based hard-partitioning.

```bash
kubectl apply -f ascend-device-node-configmap.yaml
```

### Deploy RuntimeClass

```bash
Expand Down
18 changes: 18 additions & 0 deletions README_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ git submodule update --init --recursive
- Ascend 驱动版本:≥ 25.5
- 芯片模式:在昇腾芯片上开启 `device-share` 模式以支持虚拟化。

**开启 `device-share`模式**

**npu-smi set -t device-share -i** *id* **-d** *value* 用于设置指定设备的所有芯片的容器共享模式。

**参数说明**

| 类型 | 描述 |
| ------- | ----------------------------------------------------------- |
| *id* | 设备ID。通过**npu-smi info -l**命令查出的NPU ID即为设备ID。 |
| *value* | 容器使能状态:分为禁用、使能。默认禁用。0:禁用;1:使能 |

## 编译

```bash
Expand All @@ -55,6 +66,13 @@ kubectl label node {ascend-node} ascend=on
kubectl apply -f ascend-device-configmap.yaml
```

#### 节点自定义配置说明
hami-device-node-config 用于对集群中特定节点的显卡虚拟化策略进行精细化控制。
通过设置 hami-vnpu-core: true,指定节点将启用基于 hami-vnpu-core 的软切分,通过 vDeviceCount 字段,手动定义每个物理芯片上报给 Kubernetes 的虚拟设备数量;否则走基于模板的硬切分。
```bash
kubectl apply -f ascend-device-node-configmap.yaml
```

### 部署 RuntimeClass

```bash
Expand Down
15 changes: 15 additions & 0 deletions ascend-device-node-configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: v1
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: hami-scheduler
app.kubernetes.io/name: hami
app.kubernetes.io/instance: hami
name: hami-device-node-config
namespace: kube-system
data:
node-config.yaml: |-
nodes:
- name: "cnst-dev-w2"
hami-vnpu-core: true
vDeviceCount: 8
7 changes: 7 additions & 0 deletions ascend-device-plugin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,10 @@ spec:
mountPath: /device-config.yaml
subPath: device-config.yaml
readOnly: true
- mountPath: /node-config.yaml
name: ascend-node-config
readOnly: true
subPath: node-config.yaml
env:
- name: NODE_NAME
valueFrom:
Expand Down Expand Up @@ -122,5 +126,8 @@ spec:
- name: ascend-config
configMap:
name: hami-scheduler-device
- name: ascend-node-config
configMap:
name: hami-device-node-config
nodeSelector:
ascend: "on"
7 changes: 7 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
var (
hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
configFile = flag.String("config_file", "", "config file path")
nodeConfigFile = flag.String("node_config_file", "", "node specific config file path")
nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them")
)
Expand Down Expand Up @@ -136,6 +137,12 @@ func main() {
if err != nil {
klog.Fatalf("load config failed, error is %v", err)
}
if *nodeConfigFile != "" {
err = mgr.LoadNodeConfig(*nodeConfigFile, *nodeName)
if err != nil {
klog.Errorf("load node config failed: %v", err)
}
}
server, err := server.NewPluginServer(mgr, *nodeName, *checkIdleVNPUInterval)
if err != nil {
klog.Fatalf("init PluginServer failed, error is %v", err)
Expand Down
25 changes: 25 additions & 0 deletions internal/manager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type AscendManager struct {
//nodeName string
config internal.VNPUConfig
devs []*Device
nodeConfig *internal.NodeConfig
}

func NewAscendManager() (*AscendManager, error) {
Expand All @@ -56,6 +57,25 @@ func NewAscendManager() (*AscendManager, error) {
}, nil
}

// LoadNodeConfig reads the per-node configuration file at nodePath and, if an
// entry whose Name matches nodeName exists, stores a pointer to it on the
// manager. When no entry matches, am.nodeConfig stays nil and callers fall
// back to default settings (see GetNodeConfig usage). A read/parse failure is
// returned to the caller without logging it here — the caller (cmd/main.go)
// already logs the returned error, and logging in both places would record
// the same failure twice.
func (am *AscendManager) LoadNodeConfig(nodePath string, nodeName string) error {
	nodeConfigList, err := internal.LoadNodeConfig(nodePath)
	if err != nil {
		return err
	}

	for i := range nodeConfigList.Nodes {
		n := &nodeConfigList.Nodes[i]
		if n.Name == nodeName {
			// Point at the slice element rather than at a range loop
			// variable, so the stored pointer is unambiguous on every Go
			// version (pre-1.22 range variables are reused per iteration).
			am.nodeConfig = n
			klog.Infof("Successfully matched node config for %s: %+v", nodeName, *n)
			return nil
		}
	}

	klog.Infof("No specific config found for node %s, will use default settings", nodeName)
	return nil
}

func (am *AscendManager) LoadConfig(path string) error {
config, err := internal.LoadConfig(path)
if err != nil {
Expand Down Expand Up @@ -232,3 +252,8 @@ func (am *AscendManager) CleanupIdleVNPUs() error {
klog.Infof("Cleanup completed, destroyed %d idle vNPUs", totalCleaned)
return nil
}


// GetNodeConfig returns the node-specific configuration matched by
// LoadNodeConfig, or nil when no entry for this node was loaded. Callers
// check for nil to decide whether node-specific settings apply (see the
// registerHAMi annotation logic in internal/server/server.go).
func (am *AscendManager) GetNodeConfig() *internal.NodeConfig {
	return am.nodeConfig
}
9 changes: 9 additions & 0 deletions internal/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ const (
Ascend910CType = "Ascend910C"
VNPUModeAnnotation = "huawei.com/vnpu-mode"
VNPUModeHamiCore = "hami-core"
VNPUNodeSelectorAnnotation = "hami-vnpu-core"
)

var (
Expand Down Expand Up @@ -387,6 +388,14 @@ func (ps *PluginServer) registerHAMi() error {
annos := make(map[string]string)
annos[ps.registerAnno] = device.MarshalNodeDevices(apiDevices)
annos[ps.handshakeAnno] = "Reported_" + time.Now().Add(time.Duration(*reportTimeOffset)*time.Second).Format("2006.01.02 15:04:05")

if ps.mgr.GetNodeConfig() != nil && ps.mgr.GetNodeConfig().HamiVnpuCore {
annos[VNPUNodeSelectorAnnotation] = "true"
klog.V(4).Infof("Node %s has HamiVnpuCore enabled, patching annotation %s: true", ps.nodeName, VNPUNodeSelectorAnnotation)
} else {
annos[VNPUNodeSelectorAnnotation] = "false"
}

node, err := util.GetNode(ps.nodeName)
if err != nil {
return fmt.Errorf("get node %s error: %v", ps.nodeName, err)
Expand Down
24 changes: 24 additions & 0 deletions internal/vnpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,27 @@ func LoadConfig(path string) (*Config, error) {
}
return &yamlData, nil
}


// NodeConfig describes the virtualization settings for one node as declared
// in the node config file (see ascend-device-node-configmap.yaml).
//
// NOTE(review): the struct tags are json tags but the file is decoded with
// yaml.Unmarshal (see LoadNodeConfig below). sigs.k8s.io/yaml honors json
// tags, while gopkg.in/yaml.v2/v3 ignores them (the hyphenated key
// "hami-vnpu-core" would then never match) — confirm which yaml package this
// file imports.
type NodeConfig struct {
	// Name is the Kubernetes node name this entry applies to.
	Name string `json:"name"`
	// HamiVnpuCore enables hami-vnpu-core soft partitioning on the node.
	HamiVnpuCore bool `json:"hami-vnpu-core"`
	// VDeviceCount is the number of virtual devices reported to Kubernetes
	// per physical chip (per README's node custom configuration section).
	VDeviceCount int `json:"vDeviceCount"`
}

// NodeListConfig is the top-level shape of the node config file: a list of
// per-node entries under the "nodes" key.
type NodeListConfig struct {
	Nodes []NodeConfig `json:"nodes"`
}

// LoadNodeConfig reads the node list configuration file at path and returns
// the decoded per-node entries. Read and parse errors are returned as-is.
func LoadNodeConfig(path string) (*NodeListConfig, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}
	cfg := &NodeListConfig{}
	if err := yaml.Unmarshal(raw, cfg); err != nil {
		return nil, err
	}
	return cfg, nil
}
Loading