Skip to content

Commit 2c41ffc

Browse files
Merge pull request #211 from run-ai/bzilber/RUN-39984-nvidia-smi-error-messages
fix(nvidia-smi): use nvidia-smi-style failure messages per error path (#206)
2 parents 8666247 + b9c7bbf commit 2c41ffc

2 files changed

Lines changed: 11 additions & 8 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
2323
`release-helm`, which then never published the chart the next run needed.
2424
Mirrored in the `make e2e-upgrade-from-main` target. (RUN-40080)
2525
- Fake `nvidia-smi` exits gracefully instead of panicking on errors. ([#206](https://github.com/run-ai/fake-gpu-operator/issues/206))
26+
- Fake `nvidia-smi` failure output now mirrors real `nvidia-smi`, printing a distinct message for each error path instead of one generic line.
2627

2728

2829
## [0.0.81] - 2026-05-27

cmd/nvidia-smi/main.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@ func main() {
4040
fmt.Println("Debug mode enabled")
4141
}
4242

43-
args, errs := getNvidiaSmiArgs()
43+
args, summary, errs := getNvidiaSmiArgs()
4444

4545
if len(args) == 0 {
46-
fmt.Println("no devices found")
46+
fmt.Println(summary)
4747
printErrors(errs)
4848
os.Exit(1)
4949
}
@@ -58,8 +58,10 @@ func printErrors(errs []error) {
5858
}
5959
}
6060

61-
func getNvidiaSmiArgs() ([]nvidiaSmiArgs, []error) {
62-
var errs []error
61+
// getNvidiaSmiArgs returns the per-GPU rows to render. When no devices can be
62+
// obtained it returns an empty slice plus a user-facing summary line describing
63+
// the failure (printed to stdout), with the underlying causes in errs (stderr).
64+
func getNvidiaSmiArgs() (args []nvidiaSmiArgs, summary string, errs []error) {
6365

6466
nodeName := os.Getenv(constants.EnvNodeName)
6567
if conf.Debug {
@@ -74,7 +76,7 @@ func getNvidiaSmiArgs() ([]nvidiaSmiArgs, []error) {
7476
// are fatal: accumulate the error and return no devices.
7577
resp, err := http.Get(topologyUrl)
7678
if err != nil {
77-
return nil, append(errs, fmt.Errorf("fetching topology from %s: %w", topologyUrl, err))
79+
return nil, "NVIDIA-SMI has failed because it couldn't communicate with the topology server.", append(errs, fmt.Errorf("fetching topology from %s: %w", topologyUrl, err))
7880
}
7981
defer func() {
8082
if err := resp.Body.Close(); err != nil {
@@ -84,12 +86,12 @@ func getNvidiaSmiArgs() ([]nvidiaSmiArgs, []error) {
8486

8587
if resp.StatusCode != http.StatusOK {
8688
body, _ := io.ReadAll(resp.Body)
87-
return nil, append(errs, fmt.Errorf("topology server %s returned %s: %s", topologyUrl, resp.Status, strings.TrimSpace(string(body))))
89+
return nil, "No devices were found", append(errs, fmt.Errorf("topology server %s returned %s: %s", topologyUrl, resp.Status, strings.TrimSpace(string(body))))
8890
}
8991

9092
var nodeTopology topology.NodeTopology
9193
if err := json.NewDecoder(resp.Body).Decode(&nodeTopology); err != nil {
92-
return nil, append(errs, fmt.Errorf("decoding topology from %s: %w", topologyUrl, err))
94+
return nil, "Failed to parse devices data", append(errs, fmt.Errorf("decoding topology from %s: %w", topologyUrl, err))
9395
}
9496
if conf.Debug {
9597
fmt.Printf("Received topology: %+v\n", nodeTopology)
@@ -160,7 +162,7 @@ func getNvidiaSmiArgs() ([]nvidiaSmiArgs, []error) {
160162
})
161163
}
162164

163-
return allArgs, errs
165+
return allArgs, "", errs
164166
}
165167

166168
func readProcessName() (string, error) {

0 commit comments

Comments
 (0)