Skip to content

Commit 60d213f

Browse files
committed
Support new activity_batch parcagpucupti probe
To reduce BPF overhead, pass up to 128 kernel launch timing activities to the USDT probe at a time. The old single-shot kernel_executed probe is still supported. Inline correlation and kernel_exec into cuda_probe; tail-call only activity_batch. The unwinder is sensitive to tail calls, so minimize them: inline cuda_correlation and cuda_kernel_exec directly into cuda_probe's switch statement, using bpf_usdt_arg() to read the USDT arguments.
1 parent ede3a25 commit 60d213f

12 files changed

Lines changed: 432 additions & 121 deletions

File tree

interpreter/gpu/cuda.go

Lines changed: 88 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,30 @@ import (
2222
const (
2323
// eBPF program names for USDT probes
2424
// These correspond to the function names in cuda.ebpf.c, not the SEC() paths
25-
USDTProgCudaCorrelation = "cuda_correlation"
26-
USDTProgCudaKernel = "cuda_kernel_exec"
27-
USDTProgCudaProbe = "cuda_probe"
25+
USDTProgCudaCorrelation = "cuda_correlation"
26+
USDTProgCudaKernel = "cuda_kernel_exec"
27+
USDTProgCudaActivityBatch = "cuda_activity_batch"
28+
USDTProgCudaProbe = "cuda_probe"
29+
30+
// BPF attach cookie values — must match CUDA_PROG_* in cuda.ebpf.c.
31+
// Used in the low 32 bits of the BPF attach cookie so cuda_probe can
32+
// distinguish probes. The cuda_progs prog array uses a fixed key (0)
33+
// for the single tail-call target (activity_batch).
34+
CudaProgCorrelation = 0
35+
CudaProgKernelExec = 1
36+
CudaProgActivityBatch = 2
2837
)
2938

39+
const cudaProgsMap = "cuda_progs"
40+
3041
var (
3142
// gpuFixers maps PID to gpuTraceFixer
3243
gpuFixers sync.Map
44+
45+
// cudaTailCallOnce ensures the cuda_progs prog array is populated exactly
46+
// once. The tail-call targets must be in place before cuda_probe fires.
47+
cudaTailCallOnce sync.Once
48+
cudaTailCallFailed bool
3349
)
3450

3551
// SymbolizedCudaTrace holds a symbolized trace awaiting GPU timing information.
@@ -66,9 +82,10 @@ type gpuTraceFixer struct {
6682
}
6783

6884
type data struct {
69-
path string
70-
link interpreter.LinkCloser
71-
probes []pfelf.USDTProbe
85+
path string
86+
link interpreter.LinkCloser
87+
probes []pfelf.USDTProbe
88+
kernelFallback *pfelf.USDTProbe // kernel_executed probe, kept as fallback if activity_batch fails
7289
}
7390

7491
// Instance is the CUDA interpreter instance
@@ -110,15 +127,36 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
110127
return nil, nil
111128
}
112129

113-
// Filter to only the probes we need
114-
var requiredProbes []pfelf.USDTProbe
115-
for _, probe := range parcagpuProbes {
116-
if probe.Name == "cuda_correlation" || probe.Name == "kernel_executed" {
117-
requiredProbes = append(requiredProbes, probe)
130+
// Filter to only the probes we need.
131+
// Always require cuda_correlation. Prefer activity_batch over kernel_executed.
132+
var correlationProbe *pfelf.USDTProbe
133+
var kernelProbe *pfelf.USDTProbe
134+
var batchProbe *pfelf.USDTProbe
135+
for i := range parcagpuProbes {
136+
switch parcagpuProbes[i].Name {
137+
case "cuda_correlation":
138+
correlationProbe = &parcagpuProbes[i]
139+
case "kernel_executed":
140+
kernelProbe = &parcagpuProbes[i]
141+
case "activity_batch":
142+
batchProbe = &parcagpuProbes[i]
118143
}
119144
}
120-
if len(requiredProbes) != 2 {
121-
log.Warnf("parcagpu USDT probes in %s missing required probes (need cuda_correlation and kernel_executed): %v", info.FileName(), parcagpuProbes)
145+
if correlationProbe == nil {
146+
log.Warnf("parcagpu USDT probes in %s missing cuda_correlation: %v", info.FileName(), parcagpuProbes)
147+
return nil, nil
148+
}
149+
150+
var requiredProbes []pfelf.USDTProbe
151+
requiredProbes = append(requiredProbes, *correlationProbe)
152+
if batchProbe != nil {
153+
requiredProbes = append(requiredProbes, *batchProbe)
154+
log.Debugf("parcagpu: using activity_batch mode for %s", info.FileName())
155+
} else if kernelProbe != nil {
156+
requiredProbes = append(requiredProbes, *kernelProbe)
157+
log.Debugf("parcagpu: using kernel_executed mode for %s", info.FileName())
158+
} else {
159+
log.Warnf("parcagpu USDT probes in %s missing kernel probe (need activity_batch or kernel_executed): %v", info.FileName(), parcagpuProbes)
122160
return nil, nil
123161
}
124162
parcagpuProbes = requiredProbes
@@ -129,6 +167,11 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
129167
path: info.FileName(),
130168
probes: parcagpuProbes,
131169
}
170+
// If using activity_batch, keep kernel_executed as fallback in case
171+
// the tail-call prog array setup fails (e.g. verifier rejection).
172+
if batchProbe != nil && kernelProbe != nil {
173+
d.kernelFallback = kernelProbe
174+
}
132175

133176
return d, nil
134177
}
@@ -137,20 +180,46 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
137180

138181
func (d *data) Attach(ebpf interpreter.EbpfHandler, pid libpf.PID, _ libpf.Address,
139182
_ remotememory.RemoteMemory) (interpreter.Instance, error) {
140-
// Maps usdt probe name to ebpf program name.
141-
// Use the first character of the probe name as a cookie.
142-
// 'c' -> cuda_correlation
143-
// 'k' -> cuda_kernel_exec
183+
// If using activity_batch, ensure the tail-call prog array is populated.
184+
// On failure (e.g. verifier rejection), fall back to kernel_executed.
185+
for i, probe := range d.probes {
186+
if probe.Name != "activity_batch" {
187+
continue
188+
}
189+
cudaTailCallOnce.Do(func() {
190+
if err := ebpf.UpdateProgArray(cudaProgsMap, 0,
191+
USDTProgCudaActivityBatch); err != nil {
192+
log.Errorf("[cuda] activity_batch tail call failed: %v", err)
193+
cudaTailCallFailed = true
194+
}
195+
})
196+
if cudaTailCallFailed {
197+
if d.kernelFallback != nil {
198+
d.probes[i] = *d.kernelFallback
199+
log.Warnf("[cuda] falling back to kernel_executed mode")
200+
} else {
201+
log.Errorf("[cuda] activity_batch failed and no kernel_executed fallback")
202+
d.probes = append(d.probes[:i], d.probes[i+1:]...)
203+
}
204+
}
205+
break
206+
}
207+
208+
// Map USDT probe names to eBPF program names and attach cookies.
209+
// The cookie lets cuda_probe tell the probes apart; it is not the cuda_progs key (the prog array uses fixed key 0).
144210
cookies := make([]uint64, len(d.probes))
145211
progNames := make([]string, len(d.probes))
146212
for i, probe := range d.probes {
147-
cookies[i] = uint64(probe.Name[0])
148-
// Map probe names to specific program names for single-shot mode
149213
switch probe.Name {
150214
case "cuda_correlation":
215+
cookies[i] = CudaProgCorrelation
151216
progNames[i] = USDTProgCudaCorrelation
152217
case "kernel_executed":
218+
cookies[i] = CudaProgKernelExec
153219
progNames[i] = USDTProgCudaKernel
220+
case "activity_batch":
221+
cookies[i] = CudaProgActivityBatch
222+
progNames[i] = USDTProgCudaActivityBatch
154223
default:
155224
log.Debugf("unknown parcagpu USDT probe name: %s", probe.Name)
156225
}

interpreter/gpu/cuda_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ func TestProgramNamesExist(t *testing.T) {
3232
progNames := []string{
3333
gpu.USDTProgCudaCorrelation,
3434
gpu.USDTProgCudaKernel,
35+
gpu.USDTProgCudaActivityBatch,
3536
}
3637

3738
for _, progName := range progNames {

interpreter/instancestubs.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ func (mockup *EbpfHandlerStubs) AttachUSDTProbes(libpf.PID, string, string, []pf
7777
return nil, nil
7878
}
7979

80+
func (mockup *EbpfHandlerStubs) UpdateProgArray(string, uint32, string) error {
81+
return nil
82+
}
83+
8084
func (mockup *EbpfHandlerStubs) AttachUprobe(
8185
libpf.PID, string, uint64, string) (LinkCloser, error) {
8286
return nil, nil

interpreter/types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,11 @@ type EbpfHandler interface {
136136

137137
// AttachUprobe attaches an eBPF uprobe to a function at a specific offset in a binary
138138
AttachUprobe(pid libpf.PID, path string, offset uint64, progName string) (LinkCloser, error)
139+
140+
// UpdateProgArray loads an eBPF program by name and inserts it into the
141+
// named BPF_MAP_TYPE_PROG_ARRAY at the given key. The program is loaded
142+
// once and cached; subsequent calls with the same progName reuse it.
143+
UpdateProgArray(mapName string, key uint32, progName string) error
139144
}
140145

141146
type LinkCloser interface {

0 commit comments

Comments
 (0)