Skip to content

Commit 6415c27

Browse files
authored
Merge pull request #54 from dartagnanli/main
fix: after the machine reboot, vNPU cannot be deleted, resulting in idle resource occupation
2 parents d9e7e17 + 09ab29d commit 6415c27

6 files changed

Lines changed: 114 additions & 32 deletions

File tree

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@ docker:
1414
--build-arg GOPROXY=https://goproxy.cn,direct \
1515
-t ${IMG_NAME}:${VERSION} .
1616

17-
lint:
17+
lint:
1818
$(GO) install github.com/golangci/golangci-lint/cmd/golangci-lint@v1.61.0
1919
golangci-lint run
2020

21-
ascend-device-plugin:
21+
ascend-device-plugin:
2222
$(GO) build $(BUILDARGS) -o ./ascend-device-plugin ./cmd/main.go
2323

2424
clean:

README_cn.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ Ascend device plugin 是用来支持在 [HAMi](https://github.com/Project-HAMi/H
1111
部署 [ascend-docker-runtime](https://gitcode.com/Ascend/mind-cluster/tree/master/component/ascend-docker-runtime)
1212

1313
克隆子模块 mind-cluster
14+
1415
```bash
1516
git submodule add https://gitcode.com/Ascend/mind-cluster.git
1617
```
@@ -31,7 +32,6 @@ docker buildx build -t $IMAGE_NAME .
3132

3233
### 给 Node 打 ascend 标签
3334

34-
3535
```
3636
kubectl label node {ascend-node} ascend=on
3737
```
@@ -48,7 +48,7 @@ kubectl apply -f ascend-device-configmap.yaml
4848
kubectl apply -f ascend-device-plugin.yaml
4949
```
5050

51-
如果要在HAMi中使用升腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 https://github.com/Project-HAMi/HAMi/blob/master/charts/hami/README.md#huawei-ascend
51+
如果要在HAMi中使用昇腾NPU, 在部署HAMi时设置 `devices.ascend.enabled` 为 true 会自动部署 ConfigMap 和 `ascend-device-plugin`。 参考 <https://github.com/Project-HAMi/HAMi/blob/master/charts/hami/README.md#huawei-ascend>
5252

5353
## 使用
5454

@@ -67,9 +67,10 @@ kubectl apply -f ascend-device-plugin.yaml
6767
# 如果不指定显存大小, 就会使用整张卡
6868
huawei.com/Ascend910B-memory: "4096"
6969
```
70+
7071
For more examples, see [examples](./examples/)
7172
72-
### 在 volcano 中使用
73+
### 在 volcano 中使用
7374
7475
在 volcano 中使用时需要提前部署好 volcano, 更多信息请[参考这里](https://github.com/volcano-sh/volcano/tree/master/docs/user-guide/how_to_use_vnpu.md)
7576
@@ -89,4 +90,4 @@ spec:
8990
limits:
9091
huawei.com/Ascend310P: "1"
9192
huawei.com/Ascend310P-memory: "4096"
92-
```
93+
```

cmd/main.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@ import (
3434
)
3535

3636
var (
37-
hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
38-
configFile = flag.String("config_file", "", "config file path")
39-
nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
37+
hwLoglevel = flag.Int("hw_loglevel", 0, "huawei log level, -1-debug, 0-info, 1-warning, 2-error 3-critical default value: 0")
38+
configFile = flag.String("config_file", "", "config file path")
39+
nodeName = flag.String("node_name", os.Getenv("NODE_NAME"), "node name")
40+
checkIdleVNPUInterval = flag.Int("check_idle_vnpu_interval", 60, "the interval (in seconds) to check idle vNPU and release them")
4041
)
4142

4243
func checkFlags() {
@@ -72,6 +73,9 @@ restart:
7273
}
7374
}
7475
restarting = true
76+
if err := ps.CleanupIdleVNPUs(); err != nil {
77+
klog.Errorf("Failed to cleanup idle vNPUs: %v", err)
78+
}
7579
klog.Info("Starting Plugins.")
7680
err = ps.Start()
7781
if err != nil {
@@ -132,14 +136,13 @@ func main() {
132136
if err != nil {
133137
klog.Fatalf("load config failed, error is %v", err)
134138
}
135-
server, err := server.NewPluginServer(mgr, *nodeName)
139+
server, err := server.NewPluginServer(mgr, *nodeName, *checkIdleVNPUInterval)
136140
if err != nil {
137141
klog.Fatalf("init PluginServer failed, error is %v", err)
138142
}
139143
client.InitGlobalClient()
140144

141-
err = start(server)
142-
if err != nil {
145+
if err = start(server); err != nil {
143146
klog.Fatalf("start PluginServer failed, error is %v", err)
144147
}
145148
}

internal/manager/manager.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,53 @@ func (am *AscendManager) GetUnHealthIDs() []int32 {
182182
}
183183
return unhealthy
184184
}
185+
186+
func (am *AscendManager) CleanupIdleVNPUs() error {
187+
klog.Info("Starting cleanup of idle vNPUs...")
188+
189+
_, IDs, err := am.mgr.GetDeviceList()
190+
if err != nil {
191+
return fmt.Errorf("failed to get device list: %v", err)
192+
}
193+
klog.Infof("Found %d devices to check for idle vNPUs,%+v", len(IDs), IDs)
194+
195+
totalCleaned := 0
196+
for _, logicID := range IDs {
197+
cardID, deviceID, err := am.mgr.GetCardIDDeviceID(logicID)
198+
if err != nil {
199+
klog.Warningf("failed to get card/device ID for logic ID %d: %v", logicID, err)
200+
continue
201+
}
202+
// Obtain all vNPU information on this device
203+
vDevInfos, err := am.mgr.GetVirtualDeviceInfo(logicID)
204+
if err != nil {
205+
klog.Infof("no vNPU found on device %d or query failed: %v", logicID, err)
206+
continue
207+
}
208+
209+
klog.V(1).Infof("Device logicID=%d, cardID=%d,deviceID=%d has %d vNPUs", logicID, cardID, deviceID, len(vDevInfos.VDevInfo))
210+
211+
for _, vDev := range vDevInfos.VDevInfo {
212+
klog.V(1).Infof("vNPU CardId=%d, VDevID(Vnpu ID)=%d,template=%s,IsContainerUsed=%d", cardID, vDev.VDevID, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed)
213+
214+
if vDev.QueryInfo.IsContainerUsed == 0 {
215+
klog.V(1).Infof("Found idle vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d",
216+
cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed)
217+
218+
err := am.mgr.DestroyVirtualDevice(logicID, uint32(vDev.VDevID))
219+
if err != nil {
220+
klog.Errorf("failed to destroy vNPU %d on device %d: %v", vDev.VDevID, logicID, err)
221+
} else {
222+
klog.Infof("Successfully destroyed idle vNPU: vnpuID=%d", vDev.VDevID)
223+
totalCleaned++
224+
}
225+
} else {
226+
klog.Infof("Skipping active vNPU: cardID=%d, deviceID=%d, vnpuID=%d, status=%d, template=%s,IsContainerUsed=%d",
227+
cardID, deviceID, vDev.VDevID, vDev.QueryInfo.Status, vDev.QueryInfo.Name, vDev.QueryInfo.IsContainerUsed)
228+
}
229+
}
230+
}
231+
232+
klog.Infof("Cleanup completed, destroyed %d idle vNPUs", totalCleaned)
233+
return nil
234+
}

internal/server/server.go

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -52,28 +52,30 @@ var (
5252
)
5353

5454
type PluginServer struct {
55-
nodeName string
56-
registerAnno string
57-
handshakeAnno string
58-
allocAnno string
59-
grpcServer *grpc.Server
60-
mgr *manager.AscendManager
61-
socket string
62-
stopCh chan interface{}
63-
healthCh chan int32
55+
nodeName string
56+
registerAnno string
57+
handshakeAnno string
58+
allocAnno string
59+
grpcServer *grpc.Server
60+
mgr *manager.AscendManager
61+
socket string
62+
stopCh chan interface{}
63+
healthCh chan int32
64+
checkIdleVNPUInterval int
6465
}
6566

66-
func NewPluginServer(mgr *manager.AscendManager, nodeName string) (*PluginServer, error) {
67+
func NewPluginServer(mgr *manager.AscendManager, nodeName string, checkIdleVNPUInterval int) (*PluginServer, error) {
6768
return &PluginServer{
68-
nodeName: nodeName,
69-
registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()),
70-
handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()),
71-
allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()),
72-
grpcServer: grpc.NewServer(),
73-
mgr: mgr,
74-
socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())),
75-
stopCh: make(chan interface{}),
76-
healthCh: make(chan int32),
69+
nodeName: nodeName,
70+
registerAnno: fmt.Sprintf("hami.io/node-register-%s", mgr.CommonWord()),
71+
handshakeAnno: fmt.Sprintf("hami.io/node-handshake-%s", mgr.CommonWord()),
72+
allocAnno: fmt.Sprintf("huawei.com/%s", mgr.CommonWord()),
73+
grpcServer: grpc.NewServer(),
74+
mgr: mgr,
75+
socket: path.Join(v1beta1.DevicePluginPath, fmt.Sprintf("%s.sock", mgr.CommonWord())),
76+
stopCh: make(chan interface{}),
77+
healthCh: make(chan int32),
78+
checkIdleVNPUInterval: checkIdleVNPUInterval,
7779
}, nil
7880
}
7981

@@ -91,16 +93,42 @@ func (ps *PluginServer) Start() error {
9193
if err != nil {
9294
return err
9395
}
96+
go ps.startPeriodicCheckIdleVNPUs()
9497
go ps.watchAndRegister()
9598
return nil
9699
}
97100

101+
func (ps *PluginServer) startPeriodicCheckIdleVNPUs() {
102+
ticker := time.NewTicker(time.Duration(ps.checkIdleVNPUInterval) * time.Second)
103+
defer ticker.Stop()
104+
for {
105+
select {
106+
case <-ticker.C:
107+
klog.Info("Running scheduled idle vNPU cleanup")
108+
if err := ps.CleanupIdleVNPUs(); err != nil {
109+
klog.Errorf("Failed to cleanup idle vNPUs: %v", err)
110+
}
111+
case <-ps.stopCh:
112+
klog.Info("Stopping cleanup goroutine")
113+
return
114+
}
115+
}
116+
}
117+
98118
// Stop shuts the plugin server down. It closes stopCh, which ends the periodic
// idle-vNPU cleanup loop that selects on it, and then stops the gRPC server.
// It always returns nil.
//
// NOTE(review): close panics on an already-closed channel, so a second call to
// Stop on the same PluginServer would panic — confirm callers stop it once.
func (ps *PluginServer) Stop() error {
	close(ps.stopCh)
	ps.grpcServer.Stop()
	return nil
}
103123

124+
// StopCh returns the server's stop channel as a receive-only channel. Stop
// closes that channel, so a receive on the returned value unblocks once Stop
// has been called.
func (ps *PluginServer) StopCh() <-chan interface{} {
	return ps.stopCh
}
127+
128+
// CleanupIdleVNPUs delegates to the underlying AscendManager's
// CleanupIdleVNPUs and returns its error unchanged.
func (ps *PluginServer) CleanupIdleVNPUs() error {
	return ps.mgr.CleanupIdleVNPUs()
}
131+
104132
func (ps *PluginServer) dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
105133
ctx, cancel := context.WithTimeout(context.Background(), timeout)
106134
defer cancel()

mind-cluster

Submodule mind-cluster updated from c9cf42d to 2ee74e4

0 commit comments

Comments
 (0)