Skip to content

Commit 1258b6b

Browse files
committed
Enrich per-process GPU metrics with container/pod labels
Attach Kubernetes namespace/pod/container labels to per-process GPU metrics so they share identity with parca-agent's profiles. This enables queries such as GPU utilization by pod, and joins between GPU metrics and flame graphs on the same pod/container labels. This is the per-process attribution that dcgm-exporter cannot do.

- gpumetrics.LabelResolver: optional interface for resolving extra attributes from a host PID; threaded into per-process collection and applied once per PID across its utilization samples.
- gpumetrics.ContainerLabelResolver: backs the interface with parca-agent's container metadata provider, returning a curated, low-cardinality allow-list (namespace, pod, pod_container_name, pod_container_id, pod_uid, pod_controller_kind/name).
- main.go: builds the resolver from the node name and passes it to the producer; a resolver construction failure falls back to pid/comm only rather than disabling GPU metrics.

NVML reports host PIDs and the metadata provider reads host /proc, so this assumes the agent runs in the host PID namespace (the standard deployment).
1 parent e3764f1 commit 1258b6b

3 files changed

Lines changed: 128 additions & 4 deletions

File tree

gpumetrics/enrich.go

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Copyright 2026 The Parca Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
14+
package gpumetrics
15+
16+
import (
17+
"context"
18+
19+
"github.com/prometheus/prometheus/model/labels"
20+
"go.opentelemetry.io/ebpf-profiler/libpf"
21+
22+
"github.com/parca-dev/parca-agent/reporter/metadata"
23+
)
24+
25+
// defaultEnrichmentLabels is the allow-list of container/pod label names that
// may be copied onto per-process GPU metric data points.
//
// The set is deliberately narrow. Every distinct label combination produces a
// separate time series, so only identifiers useful for grouping by namespace,
// pod and container are kept; verbose or fast-changing labels such as
// pod_container_image or pod_ip are left out to keep cardinality bounded. The
// names are the ones parca-agent already attaches to profiles, which lets GPU
// metrics and profiles be joined on the same pod/container identity.
var defaultEnrichmentLabels = map[string]struct{}{
	// Pod identity.
	"namespace": {},
	"pod":       {},
	"pod_uid":   {},

	// Controller that owns the pod.
	"pod_controller_kind": {},
	"pod_controller_name": {},

	// Container within the pod.
	"pod_container_name": {},
	"pod_container_id":   {},
}
41+
42+
// ContainerLabelResolver enriches per-process GPU metrics with Kubernetes
43+
// container/pod labels, using parca-agent's container metadata provider. It
44+
// implements LabelResolver.
45+
type ContainerLabelResolver struct {
46+
ctx context.Context
47+
provider metadata.MetadataProvider
48+
allow map[string]struct{}
49+
}
50+
51+
// NewContainerLabelResolver builds a resolver backed by the container metadata
52+
// provider for the given Kubernetes node. The provider maintains its own caches,
53+
// so per-PID lookups on the hot path are cheap after the first resolution.
54+
func NewContainerLabelResolver(ctx context.Context, nodeName string) (*ContainerLabelResolver, error) {
55+
provider, err := metadata.NewContainerMetadataProvider(ctx, nodeName)
56+
if err != nil {
57+
return nil, err
58+
}
59+
return &ContainerLabelResolver{
60+
ctx: ctx,
61+
provider: provider,
62+
allow: defaultEnrichmentLabels,
63+
}, nil
64+
}
65+
66+
// LabelsForPID returns the curated container/pod labels for a host PID. PIDs
67+
// that don't belong to a container (or that can't be resolved) yield an empty
68+
// map, leaving the data point with only its pid/comm attributes.
69+
func (r *ContainerLabelResolver) LabelsForPID(pid uint32) map[string]string {
70+
lb := labels.NewBuilder(labels.EmptyLabels())
71+
r.provider.AddMetadata(r.ctx, libpf.PID(pid), lb)
72+
73+
out := make(map[string]string, len(r.allow))
74+
lb.Range(func(l labels.Label) {
75+
if _, ok := r.allow[l.Name]; ok && l.Value != "" {
76+
out[l.Name] = l.Value
77+
}
78+
})
79+
return out
80+
}

gpumetrics/nvidia.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,29 @@ const (
6565
metricNameGPUUtilizationPercent = "gpu_utilization_percent"
6666
)
6767

68+
// LabelResolver supplies extra attributes for a process identified by its host
// PID, for example the Kubernetes namespace, pod and container it runs in.
//
// The labels it returns are added to that process's per-process GPU metric
// data points so the metrics carry the same identity as parca-agent's
// profiles. NVML reports host PIDs, so an implementation has to resolve PIDs
// in the host PID namespace as well (the standard parca-agent deployment).
type LabelResolver interface {
	// LabelsForPID returns the labels to attach for pid. A nil or empty map
	// adds nothing to the data point.
	LabelsForPID(pid uint32) map[string]string
}
76+
6877
// Producer collects NVIDIA GPU metrics and implements metricexport.Producer.
6978
type Producer struct {
70-
devices []*perDeviceState
79+
devices []*perDeviceState
80+
resolver LabelResolver
7181
}
7282

7383
// NewProducer initializes NVML and enumerates the available NVIDIA devices.
7484
// It returns an error if NVML is unavailable (e.g. no driver / no GPU), which
7585
// the caller should treat as "GPU metrics disabled on this node" rather than
7686
// fatal.
77-
func NewProducer() (*Producer, error) {
87+
//
88+
// resolver may be nil, in which case per-process metrics carry only pid and
89+
// comm; otherwise its labels are attached to per-process data points.
90+
func NewProducer(resolver LabelResolver) (*Producer, error) {
7891
ret := nvml.Init()
7992
if !errors.Is(ret, nvml.SUCCESS) {
8093
return nil, fmt.Errorf("failed to initialize NVML library: %s", nvml.ErrorString(ret))
@@ -108,6 +121,7 @@ func NewProducer() (*Producer, error) {
108121
uuid: uuid,
109122
index: i,
110123
powerLimit: powerLimit,
124+
resolver: resolver,
111125

112126
mu: &sync.RWMutex{},
113127
lastTimestamp: map[string]uint64{
@@ -119,7 +133,7 @@ func NewProducer() (*Producer, error) {
119133
gauges: map[string]pmetric.Gauge{},
120134
}
121135
}
122-
return &Producer{devices: devices}, nil
136+
return &Producer{devices: devices, resolver: resolver}, nil
123137
}
124138

125139
// DeviceCount reports how many NVIDIA devices were enumerated.
@@ -251,6 +265,7 @@ type perDeviceState struct {
251265
uuid string
252266
index int
253267
powerLimit uint32
268+
resolver LabelResolver
254269

255270
mu *sync.RWMutex
256271
lastTimestamp map[string]uint64
@@ -413,6 +428,13 @@ func (ds *perDeviceState) collectMemoryUtilization() error {
413428
return nil
414429
}
415430

431+
// putExtraLabels copies resolver-provided labels onto a data point's attributes.
432+
func putExtraLabels(attrs pcommon.Map, extra map[string]string) {
433+
for k, v := range extra {
434+
attrs.PutStr(k, v)
435+
}
436+
}
437+
416438
func (ds *perDeviceState) collectProcessUtilization() error {
417439
util := pmetric.NewGauge()
418440
utilMem := pmetric.NewGauge()
@@ -452,12 +474,20 @@ func (ds *perDeviceState) collectProcessUtilization() error {
452474
return fmt.Errorf("failed to get process name for %d - pid: %d - %s", ds.index, process.Pid, nvml.ErrorString(ret))
453475
}
454476

477+
// Resolve container/pod labels once per process; they're identical
478+
// across this process's utilization samples.
479+
var extraLabels map[string]string
480+
if ds.resolver != nil {
481+
extraLabels = ds.resolver.LabelsForPID(process.Pid)
482+
}
483+
455484
for _, sample := range utilization {
456485
dpUtil := util.DataPoints().AppendEmpty()
457486
dpUtil.Attributes().PutStr(attributeUUID, ds.uuid)
458487
dpUtil.Attributes().PutInt(attributeIndex, int64(ds.index))
459488
dpUtil.Attributes().PutInt(attributePID, int64(process.Pid))
460489
dpUtil.Attributes().PutStr(attributeComm, processName)
490+
putExtraLabels(dpUtil.Attributes(), extraLabels)
461491
dpUtil.SetTimestamp(pcommon.Timestamp(ts.UnixNano()))
462492
dpUtil.SetIntValue(int64(sample.SmUtil))
463493

@@ -466,6 +496,7 @@ func (ds *perDeviceState) collectProcessUtilization() error {
466496
dpMem.Attributes().PutInt(attributeIndex, int64(ds.index))
467497
dpMem.Attributes().PutInt(attributePID, int64(process.Pid))
468498
dpMem.Attributes().PutStr(attributeComm, processName)
499+
putExtraLabels(dpMem.Attributes(), extraLabels)
469500
dpMem.SetTimestamp(pcommon.Timestamp(ts.UnixNano()))
470501
dpMem.SetIntValue(int64(sample.MemUtil))
471502
}

main.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -567,7 +567,7 @@ func mainWithExitCode() flags.ExitCode {
567567
if f.GpuMetrics.Enable {
568568
if grpcConn == nil {
569569
log.Warn("--gpu-metrics-enable is set but no remote-store is configured; GPU metrics disabled")
570-
} else if producer, err := gpumetrics.NewProducer(); err != nil {
570+
} else if producer, err := gpumetrics.NewProducer(newGPULabelResolver(mainCtx, f.Node)); err != nil {
571571
log.Warnf("GPU metrics disabled (NVML unavailable): %v", err)
572572
} else {
573573
log.Infof("GPU metrics enabled: collecting from %d NVIDIA device(s)", producer.DeviceCount())
@@ -704,6 +704,19 @@ func getTracePipe() (*os.File, error) {
704704
return nil, os.ErrNotExist
705705
}
706706

707+
// newGPULabelResolver builds a container/pod label resolver for GPU per-process
708+
// metrics. On failure it logs and returns a nil resolver, so per-process metrics
709+
// fall back to pid/comm only rather than disabling GPU metrics entirely. The
710+
// return type is the interface, so a nil here is a true nil interface.
711+
func newGPULabelResolver(ctx context.Context, nodeName string) gpumetrics.LabelResolver {
712+
resolver, err := gpumetrics.NewContainerLabelResolver(ctx, nodeName)
713+
if err != nil {
714+
log.Warnf("GPU metrics: container label enrichment disabled: %v", err)
715+
return nil
716+
}
717+
return resolver
718+
}
719+
707720
func readTracePipe(ctx context.Context) {
708721
tp, err := getTracePipe()
709722
if err != nil {

0 commit comments

Comments
 (0)