parca-dev
diff --git a/‎flags/flags.go‎
Lines changed: 11 additions & 0 deletions b/‎flags/flags.go‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎go.mod‎
Lines changed: 1 addition & 0 deletions b/‎go.mod‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎go.sum‎
Lines changed: 2 additions & 0 deletions b/‎go.sum‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎gpumetrics/enrich.go‎
Lines changed: 80 additions & 0 deletions b/‎gpumetrics/enrich.go‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎gpumetrics/gpumetrics.go‎
Lines changed: 35 additions & 0 deletions b/‎gpumetrics/gpumetrics.go‎
Lines changed: 35 additions & 0 deletions
@@ -163,9 +163,20 @@ type Flags struct {
 
 	MergeGpuProfiles bool `default:"false" help:"Report GPU kernel timing and GPU PC sampling under a single gpu_time/nanoseconds sample_type, differentiated by a gpu_view label (pc_sample|kernel_time). When false (the default), they are reported as separate sample_types (gpu_kernel_time/nanoseconds and gpu_pcsample/count) with no per-sample labels."`
 
+	GpuMetrics FlagsGpuMetrics `embed:"" prefix:"gpu-metrics-"`
+
 	OTLPLogging bool `default:"false" help:"Forward parca-agent's own logrus output to the remote-store as OTLP log records (in addition to local stderr). Requires a remote-store; ignored in offline mode."`
 }
 
+// FlagsGpuMetrics holds the "gpu-metrics-"-prefixed flags for NVML-based GPU
+// metrics collection and OTLP egress (utilization, power, temperature, clocks,
+// PCIe throughput, per-process GPU utilization). Requires a remote-store and an
+// NVIDIA driver; if NVML fails to initialize, only this producer is disabled.
+type FlagsGpuMetrics struct {
+	Enable         bool          `default:"false" help:"Enable NVML-based GPU metrics collection and OTLP export to the remote-store."`
+	ExportInterval time.Duration `default:"10s"   help:"How frequently collected GPU metrics are batched and exported over OTLP."`
+}
+
 type ExitCode int
 
 const (
 
@@ -7,6 +7,7 @@ require (
 	buf.build/gen/go/parca-dev/parca/protocolbuffers/go v1.36.11-20260225102827-5fda07223114.1
 	buf.build/gen/go/prometheus/prometheus/protocolbuffers/go v1.36.6-20250320161912-af2aab87b1b3.1
 	github.com/KimMachineGun/automemlimit v0.7.3
+	github.com/NVIDIA/go-nvml v0.12.4-1
 	github.com/alecthomas/kong v1.12.1
 	github.com/alecthomas/kong-yaml v0.2.0
 	github.com/apache/arrow-go/v18 v18.5.2
 
@@ -24,6 +24,8 @@ github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERo
 github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
 github.com/Microsoft/hcsshim v0.12.9 h1:2zJy5KA+l0loz1HzEGqyNnjd3fyZA31ZBCGKacp6lLg=
 github.com/Microsoft/hcsshim v0.12.9/go.mod h1:fJ0gkFAna6ukt0bLdKB8djt4XIJhF/vEPuoIWYVvZ8Y=
+github.com/NVIDIA/go-nvml v0.12.4-1 h1:WKUvqshhWSNTfm47ETRhv0A0zJyr1ncCuHiXwoTrBEc=
+github.com/NVIDIA/go-nvml v0.12.4-1/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
 github.com/alecthomas/assert/v2 v2.11.0 h1:2Q9r3ki8+JYXvGsDyBXwH3LcJ+WK5D0gc5E8vS6K3D0=
 github.com/alecthomas/assert/v2 v2.11.0/go.mod h1:Bze95FyfUr7x34QZrjL+XP+0qgp/zg8yS+TtBj1WA3k=
 github.com/alecthomas/kong v1.12.1 h1:iq6aMJDcFYP9uFrLdsiZQ2ZMmcshduyGv4Pek0MQPW0=
 
@@ -0,0 +1,80 @@
+// Copyright 2026 The Parca Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package gpumetrics
+
+import (
+	"context"
+
+	"github.com/prometheus/prometheus/model/labels"
+	"go.opentelemetry.io/ebpf-profiler/libpf"
+
+	"github.com/parca-dev/parca-agent/reporter/metadata"
+)
+
+// defaultEnrichmentLabels is the curated set of container/pod label names that
+// LabelsForPID copies onto per-process GPU metrics. It is intentionally small:
+// each label combination is a distinct time series, so only stable identifiers
+// useful for grouping are kept, and high-churn or verbose labels
+// (pod_container_image, pod_ip, ...) are excluded to limit cardinality. The names
+// match labels parca-agent already attaches to profiles, so both join on the
+// same pod/container identity. Resolvers share this map; treat it as read-only.
+var defaultEnrichmentLabels = map[string]struct{}{
+	"namespace":           {},
+	"pod":                 {},
+	"pod_container_name":  {},
+	"pod_container_id":    {},
+	"pod_uid":             {},
+	"pod_controller_kind": {},
+	"pod_controller_name": {},
+}
+
+// ContainerLabelResolver implements LabelResolver: it enriches per-process GPU
+// metrics with Kubernetes container/pod labels obtained from parca-agent's
+// container metadata provider, filtered through an allow-list of label names.
+type ContainerLabelResolver struct {
+	ctx      context.Context           // reused for every lookup; LabelsForPID takes no context
+	provider metadata.MetadataProvider // container metadata source queried per PID
+	allow    map[string]struct{}       // label names allowed into LabelsForPID results
+}
+
+// NewContainerLabelResolver builds a resolver backed by the container metadata
+// provider for the given Kubernetes node. The provider maintains its own caches,
+// so per-PID lookups on the hot path are cheap after the first resolution.
+func NewContainerLabelResolver(ctx context.Context, nodeName string) (*ContainerLabelResolver, error) {
+	provider, err := metadata.NewContainerMetadataProvider(ctx, nodeName)
+	if err != nil {
+		return nil, err // surfaced unwrapped; no resolver is returned
+	}
+	return &ContainerLabelResolver{
+		ctx:      ctx, // retained: LabelsForPID takes no context of its own
+		provider: provider,
+		allow:    defaultEnrichmentLabels, // shared package-level set, not a copy
+	}, nil
+}
+
+// LabelsForPID returns the curated container/pod labels for a host PID as a new,
+// never-nil map. PIDs that don't belong to a container (or can't be resolved)
+// yield an empty map, leaving the data point with only its pid/comm attributes.
+func (r *ContainerLabelResolver) LabelsForPID(pid uint32) map[string]string {
+	lb := labels.NewBuilder(labels.EmptyLabels())
+	r.provider.AddMetadata(r.ctx, libpf.PID(pid), lb) // provider writes this PID's labels into lb
+
+	out := make(map[string]string, len(r.allow)) // capacity: at most one entry per allowed name
+	lb.Range(func(l labels.Label) {
+		if _, ok := r.allow[l.Name]; ok && l.Value != "" { // skip non-allowed or empty-valued labels
+			out[l.Name] = l.Value
+		}
+	})
+	return out
+}
@@ -0,0 +1,35 @@
+// Copyright 2026 The Parca Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package gpumetrics collects NVIDIA GPU metrics via NVML and renders them as
+// OTLP metrics for the metricexport egress path. It is a port of Polar Signals'
+// standalone gpu-metrics-agent NVML producer, adapted to run inside parca-agent
+// as a metricexport.Producer.
+//
+// The NVML producer (nvidia.go) requires cgo and a dynamically linked binary
+// so go-nvml can dlopen libnvidia-ml at runtime; it is excluded by the "nonvml"
+// build tag, under which nvidia_stub.go provides a no-op stand-in for the
+// fully-static build. This file holds the declarations shared by both builds.
+package gpumetrics
+
+// ScopeName is the name of the OTLP instrumentation scope that GPU metrics are reported under.
+const ScopeName = "parca.nvidia_gpu_metrics"
+
+// LabelResolver resolves additional attributes (e.g. Kubernetes namespace,
+// pod, container) for a process, identified by its host PID. The returned
+// labels are attached to per-process GPU metric data points so they share
+// identity with parca-agent's profiles. NVML reports host PIDs, so a resolver
+// must also operate in the host PID namespace (the standard parca-agent deployment).
+type LabelResolver interface {
+	LabelsForPID(pid uint32) map[string]string // attribute name -> value for pid
+}