Skip to content

Commit 28012cf

Browse files
authored
Add NVIDIA GPU passive health checks (#2952)
* shim: Integrate libdcgm and add a new endpoint returning overall GPU health with a list of incidents.
* Periodically pull instance health from shim and store the raw response in a new DB table. Infer overall instance health and store it in a new column of the "instances" table.
* Don't consider failed instances for submitted jobs. Note: instances with warnings are still considered for jobs.
* API: add a new method returning a list of instance health checks with a unified structure.
* CLI: display "warning" and "failure" health statuses in the same way as "unreachable", below the instance status.

Closes: #2930
1 parent 6c492df commit 28012cf

File tree

37 files changed

+1207
-150
lines changed

37 files changed

+1207
-150
lines changed

.github/workflows/build.yml

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ jobs:
126126
runs-on: ${{ matrix.os }}
127127
strategy:
128128
matrix:
129-
os: [ubuntu-latest, macos-latest]
129+
os: [ubuntu-latest]
130130
steps:
131131
- uses: actions/checkout@v4
132132
- name: Set up Go
@@ -167,9 +167,9 @@ jobs:
167167
strategy:
168168
matrix:
169169
include:
170-
- { goos: "linux", goarch: "amd64" }
171-
- { goos: "linux", goarch: "arm64" }
172-
runs-on: ubuntu-latest
170+
- { runs-on: "ubuntu-24.04", goos: "linux", goarch: "amd64" }
171+
- { runs-on: "ubuntu-24.04-arm", goos: "linux", goarch: "arm64" }
172+
runs-on: ${{ matrix.runs-on }}
173173
steps:
174174
- uses: actions/checkout@v4
175175
- name: Set up Go
@@ -181,11 +181,10 @@ jobs:
181181
env:
182182
GOOS: ${{ matrix.goos }}
183183
GOARCH: ${{ matrix.goarch }}
184-
CGO_ENABLED: 0
185184
run: |
186185
VERSION=$((${{ github.run_number }} + ${{ env.BUILD_INCREMENT }}))
187-
go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-runner-$GOOS-$GOARCH $REPO_NAME/runner/cmd/runner
188-
go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-shim-$GOOS-$GOARCH $REPO_NAME/runner/cmd/shim
186+
CGO_ENABLED=0 go build -ldflags "-X 'main.Version=$VERSION' -extldflags '-static'" -o dstack-runner-$GOOS-$GOARCH $REPO_NAME/runner/cmd/runner
187+
CGO_ENABLED=1 go build -ldflags "-X 'main.Version=$VERSION'" -o dstack-shim-$GOOS-$GOARCH $REPO_NAME/runner/cmd/shim
189188
echo $VERSION
190189
- uses: actions/upload-artifact@v4
191190
with:

docs/docs/reference/environment-variables.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ For more details on the options below, refer to the [server deployment](../guide
126126
- `DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED`{ #DSTACK_SERVER_BACKGROUND_PROCESSING_DISABLED } - Disables background processing if set to any value. Useful to run only web frontend and API server.
127127
- `DSTACK_SERVER_MAX_PROBES_PER_JOB`{ #DSTACK_SERVER_MAX_PROBES_PER_JOB } - Maximum number of probes allowed in a run configuration. Validated at apply time.
128128
- `DSTACK_SERVER_MAX_PROBE_TIMEOUT`{ #DSTACK_SERVER_MAX_PROBE_TIMEOUT } - Maximum allowed timeout for a probe. Validated at apply time.
129+
- `DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS`{ #DSTACK_SERVER_METRICS_RUNNING_TTL_SECONDS } - Maximum age of metrics samples for running jobs.
130+
- `DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS`{ #DSTACK_SERVER_METRICS_FINISHED_TTL_SECONDS } - Maximum age of metrics samples for finished jobs.
131+
- `DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_TTL_SECONDS } - Maximum age of instance health checks.
132+
- `DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS`{ #DSTACK_SERVER_INSTANCE_HEALTH_MIN_COLLECT_INTERVAL_SECONDS } - Minimum time interval between consecutive health checks of the same instance.
129133

130134
??? info "Internal environment variables"
131135
The following environment variables are intended for development purposes:

runner/cmd/shim/main.go

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,14 @@ func main() {
112112
Destination: &args.DCGMExporter.Interval,
113113
EnvVars: []string{"DSTACK_DCGM_EXPORTER_INTERVAL"},
114114
},
115+
/* DCGM Parameters */
116+
&cli.StringFlag{
117+
Name: "dcgm-address",
118+
Usage: "nv-hostengine `hostname`, e.g., `localhost`",
119+
DefaultText: "start libdcgm in embedded mode",
120+
Destination: &args.DCGM.Address,
121+
EnvVars: []string{"DSTACK_DCGM_ADDRESS"},
122+
},
115123
/* Docker Parameters */
116124
&cli.BoolFlag{
117125
Name: "privileged",
@@ -196,6 +204,7 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error)
196204
}
197205

198206
var dcgmExporter *dcgm.DCGMExporter
207+
var dcgmWrapper *dcgm.DCGMWrapper
199208

200209
if common.GetGpuVendor() == common.GpuVendorNvidia {
201210
dcgmExporterPath, err := dcgm.GetDCGMExporterExecPath(ctx)
@@ -207,16 +216,32 @@ func start(ctx context.Context, args shim.CLIArgs, serviceMode bool) (err error)
207216
if err == nil {
208217
log.Info(ctx, "using DCGM Exporter")
209218
defer func() {
210-
_ = dcgmExporter.Stop(ctx)
219+
if err := dcgmExporter.Stop(ctx); err != nil {
220+
log.Error(ctx, "failed to stop DCGM Exporter", "err", err)
221+
}
211222
}()
212223
} else {
213224
log.Warning(ctx, "not using DCGM Exporter", "err", err)
214-
dcgmExporter = nil
225+
}
226+
227+
dcgmWrapper, err = dcgm.NewDCGMWrapper(args.DCGM.Address)
228+
if err == nil {
229+
log.Info(ctx, "using libdcgm")
230+
defer func() {
231+
if err := dcgmWrapper.Shutdown(); err != nil {
232+
log.Error(ctx, "failed to shut down libdcgm", "err", err)
233+
}
234+
}()
235+
if err := dcgmWrapper.EnableHealthChecks(); err != nil {
236+
log.Error(ctx, "failed to enable libdcgm health checks", "err", err)
237+
}
238+
} else {
239+
log.Warning(ctx, "not using libdcgm", "err", err)
215240
}
216241
}
217242

218243
address := fmt.Sprintf(":%d", args.Shim.HTTPPort)
219-
shimServer := api.NewShimServer(ctx, address, dockerRunner, dcgmExporter, Version)
244+
shimServer := api.NewShimServer(ctx, address, Version, dockerRunner, dcgmExporter, dcgmWrapper)
220245

221246
defer func() {
222247
shutdownCtx, cancelShutdown := context.WithTimeout(ctx, 5*time.Second)

runner/docs/shim.openapi.yaml

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ openapi: 3.1.1
22

33
info:
44
title: dstack-shim API
5-
version: v2/0.18.34
5+
version: v2/0.19.22
66
x-logo:
77
url: https://avatars.githubusercontent.com/u/54146142?s=260
88
description: >
@@ -50,10 +50,25 @@ paths:
5050
schema:
5151
$ref: "#/components/schemas/HealthcheckResponse"
5252

53+
/instance/health:
54+
get:
55+
summary: Get instance health
56+
57+
description: (since [0.19.22](https://github.com/dstackai/dstack/releases/tag/0.19.22)) Returns an object with the results of optional passive system health checks
58+
tags: [Instance]
59+
responses:
60+
"200":
61+
description: ""
62+
content:
63+
application/json:
64+
schema:
65+
$ref: "#/components/schemas/InstanceHealthResponse"
66+
5367
/tasks:
5468
get:
5569
summary: Get task list
5670
description: Returns a list of all tasks known to shim, including terminated ones
71+
tags: [Tasks]
5772
responses:
5873
"200":
5974
description: ""
@@ -63,6 +78,7 @@ paths:
6378
$ref: "#/components/schemas/TaskListResponse"
6479
post:
6580
summary: Submit and run new task
81+
tags: [Tasks]
6682
requestBody:
6783
required: true
6884
content:
@@ -86,6 +102,7 @@ paths:
86102
/tasks/{id}:
87103
get:
88104
summary: Get task info
105+
tags: [Tasks]
89106
parameters:
90107
- $ref: "#/parameters/taskId"
91108
responses:
@@ -102,6 +119,7 @@ paths:
102119
Stops the task, that is, cancels image pulling if in progress,
103120
stops the container if running, and sets the status to `terminated`.
104121
No-op if the task is already terminated
122+
tags: [Tasks]
105123
parameters:
106124
- in: path
107125
name: id
@@ -131,6 +149,7 @@ paths:
131149
description: >
132150
Removes the task from in-memory storage and destroys its associated
133151
resources: a container, logs, etc.
152+
tags: [Tasks]
134153
parameters:
135154
- $ref: "#/parameters/taskId"
136155
responses:
@@ -270,7 +289,7 @@ components:
270289
type: string
271290
default: ""
272291
description: Mount point inside container
273-
292+
274293
GPUDevice:
275294
title: shim.GPUDevice
276295
type: object
@@ -284,6 +303,72 @@ components:
284303
default: ""
285304
description: Path inside container
286305

306+
DCGMHealth:
307+
title: shim.dcgm.Health
308+
type: object
309+
properties:
310+
overall_health:
311+
type: integer
312+
description: >
313+
[dcgmHealthWatchResult_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv426dcgmHealthWatchResult_enum)
314+
examples:
315+
- 10
316+
incidents:
317+
type: array
318+
items:
319+
$ref: "#/components/schemas/DCGMHealthIncident"
320+
required:
321+
- overall_health
322+
- incidents
323+
additionalProperties: false
324+
325+
DCGMHealthIncident:
326+
title: shim.dcgm.HealthIncident
327+
type: object
328+
properties:
329+
system:
330+
type: integer
331+
description: >
332+
[dcgmHealthSystems_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv422dcgmHealthSystems_enum)
333+
examples:
334+
- 1
335+
health:
336+
type: integer
337+
description: >
338+
[dcgmHealthWatchResult_enum](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-data-structs.html#_CPPv426dcgmHealthWatchResult_enum)
339+
examples:
340+
- 10
341+
error_message:
342+
type: string
343+
examples:
344+
- >
345+
Detected more than 16 PCIe replays per minute for GPU 0 : 99 Reconnect PCIe card.
346+
Run system side PCIE diagnostic utilities to verify hops off the GPU board. If issue is on the board, run the field diagnostic.
347+
error_code:
348+
type: integer
349+
description: >
350+
[dcgmError_enum](https://github.com/NVIDIA/DCGM/blob/master/dcgmlib/dcgm_errors.h)
351+
examples:
352+
- 3
353+
entity_group_id:
354+
type: integer
355+
description: >
356+
[dcgm_field_entity_group_t](https://docs.nvidia.com/datacenter/dcgm/latest/dcgm-api/dcgm-api-field-entity.html#_CPPv425dcgm_field_entity_group_t)
357+
examples:
358+
- 1
359+
entity_id:
360+
type: integer
361+
examples:
362+
- 0
363+
required:
364+
- system
365+
- health
366+
- error_message
367+
- error_code
368+
- entity_group_id
369+
- entity_id
370+
additionalProperties: false
371+
287372
HealthcheckResponse:
288373
title: shim.api.HealthcheckResponse
289374
type: object
@@ -299,6 +384,14 @@ components:
299384
- version
300385
additionalProperties: false
301386

387+
InstanceHealthResponse:
388+
title: shim.api.InstanceHealthResponse
389+
type: object
390+
properties:
391+
dcgm:
392+
$ref: "#/components/schemas/DCGMHealth"
393+
additionalProperties: false
394+
302395
TaskListResponse:
303396
title: shim.api.TaskListResponse
304397
type: object

runner/go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ module github.com/dstackai/dstack/runner
33
go 1.23.8
44

55
require (
6+
github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b
67
github.com/alexellis/go-execute/v2 v2.2.1
78
github.com/bluekeyes/go-gitdiff v0.7.2
89
github.com/codeclysm/extract/v4 v4.0.0
@@ -29,6 +30,7 @@ require (
2930
dario.cat/mergo v1.0.0 // indirect
3031
github.com/Microsoft/go-winio v0.6.1 // indirect
3132
github.com/ProtonMail/go-crypto v1.0.0 // indirect
33+
github.com/bits-and-blooms/bitset v1.22.0 // indirect
3234
github.com/cloudflare/circl v1.3.7 // indirect
3335
github.com/containerd/log v0.1.0 // indirect
3436
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect

runner/go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03
77
github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY=
88
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
99
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
10+
github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b h1:FL0NJYUNMX1ezl2Dv0azgedHPBXDuqHnqGDtqj6aqZM=
11+
github.com/NVIDIA/go-dcgm v0.0.0-20250707210631-823394f2bd9b/go.mod h1:cA0Bv7+JtAd8sqCCZizhAQjj4+Z47x/d8KD60iYBT+g=
1012
github.com/ProtonMail/go-crypto v1.0.0 h1:LRuvITjQWX+WIfr930YHG2HNfjR1uOfyf5vE0kC2U78=
1113
github.com/ProtonMail/go-crypto v1.0.0/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0=
1214
github.com/alexellis/go-execute/v2 v2.2.1 h1:4Ye3jiCKQarstODOEmqDSRCqxMHLkC92Bhse743RdOI=
@@ -17,6 +19,8 @@ github.com/arduino/go-paths-helper v1.12.1 h1:WkxiVUxBjKWlLMiMuYy8DcmVrkxdP7aKxQ
1719
github.com/arduino/go-paths-helper v1.12.1/go.mod h1:jcpW4wr0u69GlXhTYydsdsqAjLaYK5n7oWHfKqOG6LM=
1820
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio=
1921
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
22+
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
23+
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
2024
github.com/bluekeyes/go-gitdiff v0.7.2 h1:42jrcVZdjjxXtVsFNYTo/I6T1ZvIiQL+iDDLiH904hw=
2125
github.com/bluekeyes/go-gitdiff v0.7.2/go.mod h1:QpfYYO1E0fTVHVZAZKiRjtSGY9823iCdvGXBcEzHGbM=
2226
github.com/bradfitz/gomemcache v0.0.0-20170208213004-1952afaa557d/go.mod h1:PmM6Mmwb0LSuEubjR8N7PtNe1KxZLtOUHtbeikc5h60=

runner/internal/shim/api/handlers.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,20 @@ func (s *ShimServer) HealthcheckHandler(w http.ResponseWriter, r *http.Request)
2121
}, nil
2222
}
2323

24+
// InstanceHealthHandler builds the response for the instance health endpoint.
//
// The DCGM field of the response is filled in only when a DCGM wrapper is
// configured (s.dcgmWrapper is non-nil) and its GetHealth call succeeds.
// A GetHealth failure is logged and otherwise ignored, so the handler always
// returns a response together with a nil error; in that case DCGM stays nil.
func (s *ShimServer) InstanceHealthHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) {
25+
ctx := r.Context()
26+
response := InstanceHealthResponse{}
27+
if s.dcgmWrapper != nil {
28+
if dcgmHealth, err := s.dcgmWrapper.GetHealth(); err != nil {
29+
// Best effort: report the failure in the log only and leave response.DCGM unset.
log.Error(ctx, "failed to get health from DCGM", "err", err)
30+
} else {
31+
response.DCGM = &dcgmHealth
32+
}
33+
}
34+
35+
return &response, nil
36+
}
37+
2438
// TaskListHandler returns a TaskListResponse whose IDs are whatever
// s.runner.TaskIDs() reports. It never returns an error.
func (s *ShimServer) TaskListHandler(w http.ResponseWriter, r *http.Request) (interface{}, error) {
2539
return &TaskListResponse{IDs: s.runner.TaskIDs()}, nil
2640
}

runner/internal/shim/api/handlers_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ func TestHealthcheck(t *testing.T) {
1313
request := httptest.NewRequest("GET", "/api/healthcheck", nil)
1414
responseRecorder := httptest.NewRecorder()
1515

16-
server := NewShimServer(context.Background(), ":12345", NewDummyRunner(), nil, "0.0.1.dev2")
16+
server := NewShimServer(context.Background(), ":12345", "0.0.1.dev2", NewDummyRunner(), nil, nil)
1717

1818
f := common.JSONResponseHandler(server.HealthcheckHandler)
1919
f(responseRecorder, request)
@@ -30,7 +30,7 @@ func TestHealthcheck(t *testing.T) {
3030
}
3131

3232
func TestTaskSubmit(t *testing.T) {
33-
server := NewShimServer(context.Background(), ":12340", NewDummyRunner(), nil, "0.0.1.dev2")
33+
server := NewShimServer(context.Background(), ":12340", "0.0.1.dev2", NewDummyRunner(), nil, nil)
3434
requestBody := `{
3535
"id": "dummy-id",
3636
"name": "dummy-name",

runner/internal/shim/api/schemas.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
package api
22

3-
import "github.com/dstackai/dstack/runner/internal/shim"
3+
import (
4+
"github.com/dstackai/dstack/runner/internal/shim"
5+
"github.com/dstackai/dstack/runner/internal/shim/dcgm"
6+
)
47

58
// HealthcheckResponse is a JSON payload carrying a service name and a version
// string, serialized under the "service" and "version" keys.
// NOTE(review): presumably the body returned by HealthcheckHandler — the
// handler body is outside this view, confirm there.
type HealthcheckResponse struct {
69
Service string `json:"service"`
710
Version string `json:"version"`
811
}
912

13+
// InstanceHealthResponse is the JSON payload built by InstanceHealthHandler.
type InstanceHealthResponse struct {
14+
// DCGM holds the DCGM health report when one is available. The tag has no
// omitempty, so a nil pointer is serialized as "dcgm": null rather than
// being left out of the object.
DCGM *dcgm.Health `json:"dcgm"`
15+
}
16+
1017
// TaskListResponse is the JSON payload built by TaskListHandler; the task IDs
// are serialized under the "ids" key.
type TaskListResponse struct {
1118
IDs []string `json:"ids"`
1219
}

runner/internal/shim/api/server.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,15 @@ type ShimServer struct {
2929
runner TaskRunner
3030

3131
dcgmExporter *dcgm.DCGMExporter
32+
dcgmWrapper *dcgm.DCGMWrapper
3233

3334
version string
3435
}
3536

36-
func NewShimServer(ctx context.Context, address string, runner TaskRunner, dcgmExporter *dcgm.DCGMExporter, version string) *ShimServer {
37+
func NewShimServer(
38+
ctx context.Context, address string, version string,
39+
runner TaskRunner, dcgmExporter *dcgm.DCGMExporter, dcgmWrapper *dcgm.DCGMWrapper,
40+
) *ShimServer {
3741
r := api.NewRouter()
3842
s := &ShimServer{
3943
HttpServer: &http.Server{
@@ -45,12 +49,14 @@ func NewShimServer(ctx context.Context, address string, runner TaskRunner, dcgmE
4549
runner: runner,
4650

4751
dcgmExporter: dcgmExporter,
52+
dcgmWrapper: dcgmWrapper,
4853

4954
version: version,
5055
}
5156

5257
// The healthcheck endpoint should stay backward compatible, as it is used for negotiation
5358
r.AddHandler("GET", "/api/healthcheck", s.HealthcheckHandler)
59+
r.AddHandler("GET", "/api/instance/health", s.InstanceHealthHandler)
5460
r.AddHandler("GET", "/api/tasks", s.TaskListHandler)
5561
r.AddHandler("GET", "/api/tasks/{id}", s.TaskInfoHandler)
5662
r.AddHandler("POST", "/api/tasks", s.TaskSubmitHandler)

0 commit comments

Comments
 (0)