Environment: 8 × H20 GPUs (140 GB VRAM)
E0116 16:52:03.819472 175650 client.go:88] "ListAndWatch ended unexpectedly for device plugin" err="rpc error: code = ResourceExhausted desc = grpc: received message larger than max (23264648 vs. 4194304)" resource="volcano.sh/vgpu-memory"
Jan 16 16:52:00 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:00.741553 175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"3f9654c6630b680fa042ae272bca22ce87382280b0f79d1d7301e0511b675045"}
Jan 16 16:52:02 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:02.751785 175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"ec2bcca5403f868f3ca82a1e91e51f3edeefd4bc959c68024d0b7311cd3eadac"}
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.110671 175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.110748 175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.112444 175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.113084 175650 client.go:91] "State pushed for device plugin" resource="volcano.sh/vgpu-number" resourceCapacity=80
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.114406 175650 manager.go:278] "Processed device updates for resource" resourceName="volcano.sh/vgpu-number" totalCount=80 healthyCount=80
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.724440 175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.724516 175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.725480 175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: E0116 16:52:03.819472 175650 client.go:88] "ListAndWatch ended unexpectedly for device plugin" err="rpc error: code = ResourceExhausted desc = grpc: received message larger than max (23264648 vs. 4194304)" resource="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819513 175650 handler.go:102] "Deregistered client" name="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819585 175650 manager.go:367] "Mark all resources Unhealthy for resource" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819634 175650 manager.go:241] "Endpoint became unhealthy" resourceName="volcano.sh/vgpu-memory" endpoint={}
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.333209 175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.333312 175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.334513 175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.336098 175650 client.go:91] "State pushed for device plugin" resource="volcano.sh/vgpu-cores" resourceCapacity=800
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.338885 175650 manager.go:278] "Processed device updates for resource" resourceName="volcano.sh/vgpu-cores" totalCount=800 healthyCount=800
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.760619 175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"6749ea57b240501eea95ed2b912eb6c4f92f0b319874bedfa22287181f9808a0"}
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.789566 175650 pod_startup_latency_tracker.go:102] "Observed pod startup duration" pod="kube-system/volcano-device-plugin-4rcx5" podStartSLOduration=307.7894862 podCreationTimestamp="2026-01-16 16:46:57 +0800 CST" firstStartedPulling="0001-01-01 00:00:00 +0000 UTC" lastFinishedPulling="0001-01-01 00:00:00 +0000 UTC" observedRunningTime="2026-01-16 16:52:04.788576564 +0800 CST m=+93059.863248142" watchObservedRunningTime="2026-01-16 16:52:04.7894862 +0800 CST m=+93059.864157777"
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528473 175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-number" capacity=80
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528515 175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-cores" capacity=800
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528539 175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-memory" capacity=0
Jan 16 16:57:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:57:04.904530 175650 setters.go:323] "Set capacity for removed resource to 0 on device removal" device="volcano.sh/vgpu-memory"
^C
@archlitchi
How can I fix this error?
Environment: 8 × H20 GPUs (140 GB VRAM)
E0116 16:52:03.819472 175650 client.go:88] "ListAndWatch ended unexpectedly for device plugin" err="rpc error: code = ResourceExhausted desc = grpc: received message larger than max (23264648 vs. 4194304)" resource="volcano.sh/vgpu-memory"
Jan 16 16:52:00 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:00.741553 175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"3f9654c6630b680fa042ae272bca22ce87382280b0f79d1d7301e0511b675045"}
Jan 16 16:52:02 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:02.751785 175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"ec2bcca5403f868f3ca82a1e91e51f3edeefd4bc959c68024d0b7311cd3eadac"}
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.110671 175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.110748 175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.112444 175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.113084 175650 client.go:91] "State pushed for device plugin" resource="volcano.sh/vgpu-number" resourceCapacity=80
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.114406 175650 manager.go:278] "Processed device updates for resource" resourceName="volcano.sh/vgpu-number" totalCount=80 healthyCount=80
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.724440 175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.724516 175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.725480 175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: E0116 16:52:03.819472 175650 client.go:88] "ListAndWatch ended unexpectedly for device plugin" err="rpc error: code = ResourceExhausted desc = grpc: received message larger than max (23264648 vs. 4194304)" resource="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819513 175650 handler.go:102] "Deregistered client" name="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819585 175650 manager.go:367] "Mark all resources Unhealthy for resource" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819634 175650 manager.go:241] "Endpoint became unhealthy" resourceName="volcano.sh/vgpu-memory" endpoint={}
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.333209 175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.333312 175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.334513 175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.336098 175650 client.go:91] "State pushed for device plugin" resource="volcano.sh/vgpu-cores" resourceCapacity=800
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.338885 175650 manager.go:278] "Processed device updates for resource" resourceName="volcano.sh/vgpu-cores" totalCount=800 healthyCount=800
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.760619 175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"6749ea57b240501eea95ed2b912eb6c4f92f0b319874bedfa22287181f9808a0"}
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.789566 175650 pod_startup_latency_tracker.go:102] "Observed pod startup duration" pod="kube-system/volcano-device-plugin-4rcx5" podStartSLOduration=307.7894862 podCreationTimestamp="2026-01-16 16:46:57 +0800 CST" firstStartedPulling="0001-01-01 00:00:00 +0000 UTC" lastFinishedPulling="0001-01-01 00:00:00 +0000 UTC" observedRunningTime="2026-01-16 16:52:04.788576564 +0800 CST m=+93059.863248142" watchObservedRunningTime="2026-01-16 16:52:04.7894862 +0800 CST m=+93059.864157777"
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528473 175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-number" capacity=80
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528515 175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-cores" capacity=800
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528539 175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-memory" capacity=0
Jan 16 16:57:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:57:04.904530 175650 setters.go:323] "Set capacity for removed resource to 0 on device removal" device="volcano.sh/vgpu-memory"
^C
@archlitchi
How can I fix this error?