Skip to content

Commit 0dcfe57

Browse files
committed
Add DRA request Prometheus metrics
Signed-off-by: Sheng Lin <shelin@nvidia.com>
1 parent 476b478 commit 0dcfe57

17 files changed

Lines changed: 534 additions & 24 deletions

File tree

cmd/compute-domain-controller/computedomain.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"k8s.io/klog/v2"
2929

3030
nvapi "sigs.k8s.io/nvidia-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
31+
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/metrics"
3132
nvinformers "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/nvidia.com/informers/externalversions"
3233
nvlisters "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/nvidia.com/listers/resource/v1beta1"
3334
)
@@ -207,6 +208,8 @@ func (m *ComputeDomainManager) UpdateStatus(ctx context.Context, cd *nvapi.Compu
207208
// Recalculate global status based on current state
208209
cd.Status.Status = m.calculateGlobalStatus(cd)
209210

211+
metrics.ObserveComputeDomainStatus(string(cd.UID), cd.Status.Status)
212+
210213
updatedCD, err := m.config.clientsets.Nvidia.ResourceV1beta1().ComputeDomains(cd.Namespace).UpdateStatus(ctx, cd, metav1.UpdateOptions{})
211214
if err != nil {
212215
return nil, err
@@ -344,6 +347,7 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
344347
return fmt.Errorf("error removing finalizer: %w", err)
345348
}
346349

350+
metrics.ForgetComputeDomain(string(cd.UID))
347351
return nil
348352
}
349353

cmd/compute-domain-controller/controller.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,13 @@ type ManagerConfig struct {
5656
// logVerbosityCDDaemon controls the log verbosity for dynamically launched
5757
// ComputeDomain daemons.
5858
logVerbosityCDDaemon int
59+
60+
// httpEndpoint is the TCP network address where the HTTP server for diagnostics
61+
// (including pprof and metrics) will listen
62+
httpEndpoint string
63+
64+
// metricsPath is the HTTP path for Prometheus metrics
65+
metricsPath string
5966
}
6067

6168
// Controller manages the lifecycle of the DRA driver and its components.
@@ -84,6 +91,8 @@ func (c *Controller) Run(ctx context.Context) error {
8491
clientsets: c.config.clientsets,
8592
workQueue: workQueue,
8693
logVerbosityCDDaemon: c.config.flags.logVerbosityCDDaemon,
94+
httpEndpoint: c.config.flags.httpEndpoint,
95+
metricsPath: c.config.flags.metricsPath,
8796
}
8897

8998
// TODO: log full, nested cliFlags structure.

cmd/compute-domain-controller/main.go

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,12 @@ import (
2828
"syscall"
2929

3030
"github.com/google/uuid"
31-
"github.com/prometheus/client_golang/prometheus"
32-
"github.com/prometheus/client_golang/prometheus/promhttp"
3331
"github.com/urfave/cli/v2"
3432

3533
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3634
"k8s.io/client-go/tools/leaderelection"
3735
"k8s.io/client-go/tools/leaderelection/resourcelock"
3836
"k8s.io/component-base/logs"
39-
"k8s.io/component-base/metrics/legacyregistry"
4037
"k8s.io/klog/v2"
4138

4239
_ "k8s.io/component-base/metrics/prometheus/restclient" // for client metric registration
@@ -47,6 +44,7 @@ import (
4744
"sigs.k8s.io/nvidia-dra-driver-gpu/internal/info"
4845
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/featuregates"
4946
pkgflags "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/flags"
47+
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/metrics"
5048
)
5149

5250
const (
@@ -371,24 +369,9 @@ func runWithLeaderElection(ctx context.Context, config *Config, controller *Cont
371369

372370
func SetupHTTPEndpoint(config *Config) error {
373371
if config.flags.metricsPath != "" {
374-
// To collect metrics data from the metric handler itself, we
375-
// let it register itself and then collect from that registry.
376-
reg := prometheus.NewRegistry()
377-
gatherers := prometheus.Gatherers{
378-
// Include Go runtime and process metrics:
379-
// https://github.com/kubernetes/kubernetes/blob/9780d88cb6a4b5b067256ecb4abf56892093ee87/staging/src/k8s.io/component-base/metrics/legacyregistry/registry.go#L46-L49
380-
legacyregistry.DefaultGatherer,
381-
}
382-
gatherers = append(gatherers, reg)
383-
384372
actualPath := path.Join("/", config.flags.metricsPath)
385373
klog.InfoS("Starting metrics", "path", actualPath)
386-
// This is similar to k8s.io/component-base/metrics HandlerWithReset
387-
// except that we gather from multiple sources.
388-
config.mux.Handle(actualPath,
389-
promhttp.InstrumentMetricHandler(
390-
reg,
391-
promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{})))
374+
config.mux.Handle(path.Join("/", config.flags.metricsPath), metrics.NewLegacyPrometheusHandler())
392375
}
393376

394377
if config.flags.profilePath != "" {

cmd/compute-domain-daemon/controller.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ type DaemonInfoManager interface {
3636

3737
// ManagerConfig holds the configuration for the compute domain manager.
3838
type ManagerConfig struct {
39+
httpEndpoint string
40+
metricsPath string
3941
workQueue *workqueue.WorkQueue
4042
clientsets flags.ClientSets
4143
nodeName string
@@ -52,6 +54,8 @@ type ManagerConfig struct {
5254

5355
// ControllerConfig holds the configuration for the controller.
5456
type ControllerConfig struct {
57+
httpEndpoint string
58+
metricsPath string
5559
clientsets flags.ClientSets
5660
nodeName string
5761
computeDomainUUID string
@@ -77,6 +81,8 @@ func NewController(config *ControllerConfig) (*Controller, error) {
7781

7882
mc := &ManagerConfig{
7983
workQueue: workQueue,
84+
httpEndpoint: config.httpEndpoint,
85+
metricsPath: config.metricsPath,
8086
clientsets: config.clientsets,
8187
nodeName: config.nodeName,
8288
computeDomainUUID: config.computeDomainUUID,

cmd/compute-domain-daemon/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ type Flags struct {
6161
podName string
6262
podNamespace string
6363
maxNodesPerIMEXDomain int
64+
httpEndpoint string
65+
metricsPath string
6466
klogVerbosity int
6567
}
6668

@@ -244,6 +246,8 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
244246
}
245247

246248
config := &ControllerConfig{
249+
httpEndpoint: flags.httpEndpoint,
250+
metricsPath: flags.metricsPath,
247251
clientsets: clientsets,
248252
cliqueID: flags.cliqueID,
249253
computeDomainUUID: flags.computeDomainUUID,

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
configapi "sigs.k8s.io/nvidia-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
3535
"sigs.k8s.io/nvidia-dra-driver-gpu/internal/common"
3636
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/featuregates"
37+
drametrics "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/metrics"
3738
)
3839

3940
type OpaqueDeviceConfig struct {
@@ -132,6 +133,11 @@ func NewDeviceState(ctx context.Context, config *Config) (*DeviceState, error) {
132133
for _, c := range checkpoints {
133134
if c == DriverPluginCheckpointFileBasename {
134135
klog.Infof("Found previous checkpoint: %s", c)
136+
cp, err := state.getCheckpoint()
137+
if err != nil {
138+
return nil, fmt.Errorf("unable to get checkpoint: %w", err)
139+
}
140+
syncPreparedDevicesGaugeFromCheckpoint(config.flags.nodeName, cp)
135141
return state, nil
136142
}
137143
}
@@ -288,7 +294,11 @@ func (s *DeviceState) Unprepare(ctx context.Context, claimRef kubeletplugin.Name
288294
}
289295

290296
func (s *DeviceState) createCheckpoint(cp *Checkpoint) error {
291-
return s.checkpointManager.CreateCheckpoint(DriverPluginCheckpointFileBasename, cp)
297+
if err := s.checkpointManager.CreateCheckpoint(DriverPluginCheckpointFileBasename, cp); err != nil {
298+
return err
299+
}
300+
syncPreparedDevicesGaugeFromCheckpoint(s.config.flags.nodeName, cp)
301+
return nil
292302
}
293303

294304
func (s *DeviceState) getCheckpoint() (*Checkpoint, error) {
@@ -317,6 +327,7 @@ func (s *DeviceState) updateCheckpoint(mutate func(*Checkpoint)) error {
317327
return fmt.Errorf("unable to create checkpoint: %w", err)
318328
}
319329

330+
syncPreparedDevicesGaugeFromCheckpoint(s.config.flags.nodeName, checkpoint)
320331
return nil
321332
}
322333

@@ -825,3 +836,34 @@ func (s *DeviceState) validateNoOverlappingPreparedDevices(checkpoint *Checkpoin
825836
}
826837
return nil
827838
}
839+
840+
func syncPreparedDevicesGaugeFromCheckpoint(nodeName string, cp *Checkpoint) {
841+
counts := make(map[string]int)
842+
if cp == nil {
843+
return
844+
}
845+
lv := cp.ToLatestVersion()
846+
if lv != nil && lv.V2 != nil {
847+
for _, pc := range lv.V2.PreparedClaims {
848+
if pc.CheckpointState != ClaimCheckpointStatePrepareCompleted {
849+
continue
850+
}
851+
for _, g := range pc.PreparedDevices {
852+
for _, dev := range g.Devices {
853+
if _, ok := counts[dev.Type()]; !ok {
854+
counts[dev.Type()] = 0
855+
}
856+
counts[dev.Type()]++
857+
}
858+
}
859+
}
860+
}
861+
862+
for _, dt := range []string{ComputeDomainChannelType, ComputeDomainDaemonType, UnknownDeviceType} {
863+
if count, ok := counts[dt]; !ok {
864+
drametrics.SetPreparedDevicesCounts(nodeName, DriverName, dt, 0)
865+
} else {
866+
drametrics.SetPreparedDevicesCounts(nodeName, DriverName, dt, count)
867+
}
868+
}
869+
}

cmd/compute-domain-kubelet-plugin/driver.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"k8s.io/klog/v2"
3434

3535
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/flock"
36+
drametrics "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/metrics"
3637
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/workqueue"
3738
)
3839

@@ -241,14 +242,19 @@ func (d *driver) HandleError(ctx context.Context, err error, msg string) {
241242
// also reflect an error. Set the boolean to `true` for any result wrapping a
242243
// non-retryable error.
243244
func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.ResourceClaim) (bool, kubeletplugin.PrepareResult) {
245+
t0 := time.Now()
246+
244247
release, err := d.pulock.Acquire(ctx, flock.WithTimeout(10*time.Second))
245248
if err != nil {
249+
drametrics.IncNodePrepareError(DriverName, "lock_acquire")
246250
res := kubeletplugin.PrepareResult{
247251
Err: fmt.Errorf("error acquiring prep/unprep lock: %w", err),
248252
}
249253
return false, res
250254
}
251255
defer release()
256+
doneInFlight := drametrics.TrackInFlight(DriverName, "prepare")
257+
defer doneInFlight()
252258

253259
if claim.Status.Allocation == nil {
254260
res := kubeletplugin.PrepareResult{
@@ -259,6 +265,7 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.Res
259265

260266
devs, err := d.state.Prepare(ctx, claim)
261267
if err != nil {
268+
drametrics.IncNodePrepareError(DriverName, "prepare_devices")
262269
res := kubeletplugin.PrepareResult{
263270
Err: fmt.Errorf("error preparing devices for claim '%s': %w", ResourceClaimToString(claim), err),
264271
}
@@ -270,24 +277,32 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.Res
270277
}
271278

272279
klog.V(1).Infof("Prepared devices for claim '%s': %v", ResourceClaimToString(claim), devs)
280+
drametrics.ObserveRequest(DriverName, "prepare", time.Since(t0))
273281

274282
return true, kubeletplugin.PrepareResult{Devices: devs}
275283
}
276284

277285
// Return 2-tuple: the first value is a boolean indicating to the retry logic
278286
// whether the work is 'done'.
279287
func (d *driver) nodeUnprepareResource(ctx context.Context, claimRef kubeletplugin.NamespacedObject) (bool, error) {
288+
tstart := time.Now()
289+
280290
release, err := d.pulock.Acquire(ctx, flock.WithTimeout(10*time.Second))
281291
if err != nil {
292+
drametrics.IncNodeUnprepareError(DriverName, "lock_acquire")
282293
return false, fmt.Errorf("error acquiring prep/unprep lock: %w", err)
283294
}
284295
defer release()
296+
doneInFlight := drametrics.TrackInFlight(DriverName, "unprepare")
297+
defer doneInFlight()
285298

286299
if err := d.state.Unprepare(ctx, claimRef); err != nil {
300+
drametrics.IncNodeUnprepareError(DriverName, "unprepare_devices")
287301
return isPermanentError(err), fmt.Errorf("error unpreparing devices for claim '%v': %w", claimRef.String(), err)
288302
}
289303

290304
klog.V(1).Infof("Unprepared devices for claim '%v'", claimRef.String())
305+
drametrics.ObserveRequest(DriverName, "unprepare", time.Since(tstart))
291306
return true, nil
292307
}
293308

cmd/compute-domain-kubelet-plugin/main.go

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ import (
2727

2828
"github.com/urfave/cli/v2"
2929

30+
"k8s.io/component-base/logs"
3031
"k8s.io/dynamic-resource-allocation/kubeletplugin"
3132
"k8s.io/klog/v2"
3233

33-
"k8s.io/component-base/logs"
34-
3534
"sigs.k8s.io/nvidia-dra-driver-gpu/internal/common"
3635
"sigs.k8s.io/nvidia-dra-driver-gpu/internal/info"
3736
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/featuregates"
3837
pkgflags "sigs.k8s.io/nvidia-dra-driver-gpu/pkg/flags"
38+
"sigs.k8s.io/nvidia-dra-driver-gpu/pkg/metrics"
3939
)
4040

4141
const (
@@ -48,6 +48,8 @@ type Flags struct {
4848
kubeClientConfig pkgflags.KubeClientConfig
4949

5050
nodeName string
51+
httpEndpoint string
52+
metricsPath string
5153
namespace string
5254
cdiRoot string
5355
containerDriverRoot string
@@ -95,6 +97,19 @@ func newApp() *cli.App {
9597
Destination: &flags.namespace,
9698
EnvVars: []string{"NAMESPACE"},
9799
},
100+
&cli.StringFlag{
101+
Name: "http-endpoint",
102+
Usage: "The TCP network `address` where the metrics HTTP server will listen (example: `:8080`). The default is the empty string, which means the server is disabled.",
103+
Destination: &flags.httpEndpoint,
104+
EnvVars: []string{"HTTP_ENDPOINT"},
105+
},
106+
&cli.StringFlag{
107+
Name: "metrics-path",
108+
Usage: "The HTTP `path` where Prometheus metrics are exposed, disabled if empty.",
109+
Value: "/metrics",
110+
Destination: &flags.metricsPath,
111+
EnvVars: []string{"METRICS_PATH"},
112+
},
98113
&cli.StringFlag{
99114
Name: "cdi-root",
100115
Usage: "Absolute path to the directory where CDI files will be generated.",
@@ -239,6 +254,12 @@ func RunPlugin(ctx context.Context, config *Config) error {
239254
ctx, cancel := signal.NotifyContext(ctx, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
240255
defer cancel()
241256

257+
if config.flags.httpEndpoint != "" {
258+
if err := metrics.RunPrometheusMetricsServer(ctx, config.flags.httpEndpoint, config.flags.metricsPath); err != nil {
259+
return fmt.Errorf("setup metrics endpoint: %w", err)
260+
}
261+
}
262+
242263
// Create and start the driver
243264
driver, err := NewDriver(ctx, config)
244265
if err != nil {

0 commit comments

Comments
 (0)