Skip to content

Commit 4c6029a

Browse files
committed
19879 nvidia_smi: don't crash when no GPU is found
The bash agent plugin emits the section header before running nvidia-smi -q -x, so any non-XML output (Linux "No devices were found", Windows "ERROR: nvidia-smi.exe was not found in: ...") was passed verbatim to ElementTree.fromstring and crashed parse_nvidia_smi with ParseError. Treat ParseError as an empty section.

Crash-Group-ID: 3616
Jira: SUP-28831
AI-Generated: true
Change-Id: I6bf22854809ab0ba4c1d5598a6162a53a8ac9fb0
1 parent 2c40681 commit 4c6029a

3 files changed

Lines changed: 57 additions & 26 deletions

File tree

.werks/19879.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
[//]: # (werk v3)
2+
# nvidia_smi: don't crash when no GPU is found
3+
4+
key | value
5+
---------- | ---
6+
date | 2026-05-13T08:23:12.201670+00:00
7+
version | 2.6.0b1
8+
class | fix
9+
edition | community
10+
component | checks
11+
level | 1
12+
compatible | yes
13+
14+
When the agent host had `nvidia-smi` installed but no GPU was currently
15+
visible to the driver (the Linux plugin then forwarded `No devices were
16+
found`), or when `nvidia-smi.exe` was not installed at the path the
17+
Windows agent plugin searches (the plugin forwarded a `not found in:`
18+
error block), the agent emitted a plain-text message instead of the
19+
expected XML. The `nvidia_smi` section then crashed with `ParseError:
20+
syntax error: line 1, column 0` and produced a crash report on every
21+
check cycle.
22+
23+
The check now treats a non-XML body as an empty section: no `nvidia_smi`
24+
services are reported for the affected host, and no crash report is
25+
generated.

cmk/plugins/nvidia/agent_based/nvidia_smi.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,16 @@ def _let_pydantic_check_power_state(value: str | None) -> PowerState:
146146

147147

148148
def parse_nvidia_smi(string_table: StringTable) -> Section:
149-
xml = ElementTree.fromstring("".join([element[0] for element in string_table]))
149+
try:
150+
xml = ElementTree.fromstring("".join([element[0] for element in string_table]))
151+
except ElementTree.ParseError:
152+
return Section(
153+
timestamp=None,
154+
driver_version=None,
155+
cuda_version=None,
156+
attached_gpus=None,
157+
gpus={},
158+
)
150159
# find the element name for power_readings
151160
power_readings_element = "gpu_power_readings"
152161
if xml.find(f"gpu/{power_readings_element}") is None:

tests/unit/cmk/plugins/nvidia/agent_based/test_nvidia_smi.py

Lines changed: 22 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -273,47 +273,44 @@ def empty_value_store(monkeypatch: pytest.MonkeyPatch) -> None:
273273
)
274274

275275

276-
@pytest.mark.parametrize(
277-
"string_table, expected_result",
278-
[
279-
(
280-
STRING_TABLE,
281-
SECTION,
282-
),
283-
],
276+
EMPTY_SECTION = nvidia_smi.Section(
277+
timestamp=None,
278+
driver_version=None,
279+
cuda_version=None,
280+
attached_gpus=None,
281+
gpus={},
284282
)
285-
def test_parse_nvidia_smi(
286-
string_table: StringTable,
287-
expected_result: nvidia_smi.Section,
288-
) -> None:
289-
assert nvidia_smi.parse_nvidia_smi(string_table) == expected_result
290283

291284

292-
# Inputs captured verbatim from real agents that emit non-XML on the nvidia_smi
293-
# section. They must be tolerated, not crash the parser with xml.etree.ParseError.
294-
# * linux_no_devices: SUP-28831 — RHEL host with installed nvidia-smi but no
295-
# visible GPU returns "No devices were found".
296-
# * windows_nvidia_smi_missing: crash group 3616 — Windows agent plugin emits
297-
# a "not found in:" error block when nvidia-smi.exe is absent.
298-
@pytest.mark.xfail(strict=True, reason="Crash group 3616: ParseError on non-XML nvidia-smi output")
285+
# linux_no_devices and windows_nvidia_smi_missing capture verbatim agent
286+
# outputs that today reach parse_nvidia_smi as non-XML and used to crash it
287+
# with xml.etree.ElementTree.ParseError:
288+
# * linux_no_devices: SUP-28831 — RHEL host with installed nvidia-smi but
289+
# no visible GPU returns "No devices were found".
290+
# * windows_nvidia_smi_missing: crash group 3616 — Windows agent plugin
291+
# emits a "not found in:" error block when nvidia-smi.exe is absent.
299292
@pytest.mark.parametrize(
300-
"string_table",
293+
"string_table, expected_result",
301294
[
302-
pytest.param([["No devices were found"]], id="linux_no_devices"),
295+
pytest.param(STRING_TABLE, SECTION, id="valid_xml"),
296+
pytest.param([["No devices were found"]], EMPTY_SECTION, id="linux_no_devices"),
303297
pytest.param(
304298
[
305299
["ERROR: nvidia-smi.exe was not found in:"],
306300
["- (configured path)"],
307301
["- C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe (default path)"],
308302
["- system PATH"],
309303
],
304+
EMPTY_SECTION,
310305
id="windows_nvidia_smi_missing",
311306
),
312307
],
313308
)
314-
def test_parse_nvidia_smi_non_xml_output(string_table: StringTable) -> None:
315-
section = nvidia_smi.parse_nvidia_smi(string_table)
316-
assert section.gpus == {}
309+
def test_parse_nvidia_smi(
310+
string_table: StringTable,
311+
expected_result: nvidia_smi.Section,
312+
) -> None:
313+
assert nvidia_smi.parse_nvidia_smi(string_table) == expected_result
317314

318315

319316
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)