Skip to content

received message larger than max #99

@V-yw

Description

@V-yw

8*H20 (140G VRAM)

E0116 16:52:03.819472 175650 client.go:88] "ListAndWatch ended unexpectedly for device plugin" err="rpc error: code = ResourceExhausted desc = grpc: received message larger than max (23264648 vs. 4194304)" resource="volcano.sh/vgpu-memory"

Jan 16 16:52:00 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:00.741553  175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"3f9654c6630b680fa042ae272bca22ce87382280b0f79d1d7301e0511b675045"}
Jan 16 16:52:02 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:02.751785  175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"ec2bcca5403f868f3ca82a1e91e51f3edeefd4bc959c68024d0b7311cd3eadac"}
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.110671  175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.110748  175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.112444  175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-number"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.113084  175650 client.go:91] "State pushed for device plugin" resource="volcano.sh/vgpu-number" resourceCapacity=80
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.114406  175650 manager.go:278] "Processed device updates for resource" resourceName="volcano.sh/vgpu-number" totalCount=80 healthyCount=80
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.724440  175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.724516  175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.725480  175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: E0116 16:52:03.819472  175650 client.go:88] "ListAndWatch ended unexpectedly for device plugin" err="rpc error: code = ResourceExhausted desc = grpc: received message larger than max (23264648 vs. 4194304)" resource="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819513  175650 handler.go:102] "Deregistered client" name="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819585  175650 manager.go:367] "Mark all resources Unhealthy for resource" resourceName="volcano.sh/vgpu-memory"
Jan 16 16:52:03 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:03.819634  175650 manager.go:241] "Endpoint became unhealthy" resourceName="volcano.sh/vgpu-memory" endpoint={}
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.333209  175650 server.go:144] "Got registration request from device plugin with resource" resourceName="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.333312  175650 handler.go:94] "Registered client" name="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.334513  175650 manager.go:229] "Device plugin connected" resourceName="volcano.sh/vgpu-cores"
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.336098  175650 client.go:91] "State pushed for device plugin" resource="volcano.sh/vgpu-cores" resourceCapacity=800
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.338885  175650 manager.go:278] "Processed device updates for resource" resourceName="volcano.sh/vgpu-cores" totalCount=800 healthyCount=800
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.760619  175650 kubelet.go:2421] "SyncLoop (PLEG): event for pod" pod="kube-system/volcano-device-plugin-4rcx5" event={"ID":"d963e7ec-eff1-478f-9eda-22cdf70b42f6","Type":"ContainerStarted","Data":"6749ea57b240501eea95ed2b912eb6c4f92f0b319874bedfa22287181f9808a0"}
Jan 16 16:52:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:04.789566  175650 pod_startup_latency_tracker.go:102] "Observed pod startup duration" pod="kube-system/volcano-device-plugin-4rcx5" podStartSLOduration=307.7894862 podCreationTimestamp="2026-01-16 16:46:57 +0800 CST" firstStartedPulling="0001-01-01 00:00:00 +0000 UTC" lastFinishedPulling="0001-01-01 00:00:00 +0000 UTC" observedRunningTime="2026-01-16 16:52:04.788576564 +0800 CST m=+93059.863248142" watchObservedRunningTime="2026-01-16 16:52:04.7894862 +0800 CST m=+93059.864157777"
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528473  175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-number" capacity=80
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528515  175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-cores" capacity=800
Jan 16 16:52:08 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:52:08.528539  175650 setters.go:317] "Updated capacity for device plugin" plugin="volcano.sh/vgpu-memory" capacity=0
Jan 16 16:57:04 prod-ncyd-slyt-h20 kubelet[175650]: I0116 16:57:04.904530  175650 setters.go:323] "Set capacity for removed resource to 0 on device removal" device="volcano.sh/vgpu-memory"
C

@archlitchi

How can I fix this error?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions