@@ -22,14 +22,30 @@ import (
2222const (
2323 // eBPF program names for USDT probes
2424 // These correspond to the function names in cuda.ebpf.c, not the SEC() paths
25- USDTProgCudaCorrelation = "cuda_correlation"
26- USDTProgCudaKernel = "cuda_kernel_exec"
27- USDTProgCudaProbe = "cuda_probe"
25+ USDTProgCudaCorrelation = "cuda_correlation"
26+ USDTProgCudaKernel = "cuda_kernel_exec"
27+ USDTProgCudaActivityBatch = "cuda_activity_batch"
28+ USDTProgCudaProbe = "cuda_probe"
29+
30+ // BPF attach cookie values — must match CUDA_PROG_* in cuda.ebpf.c.
31+ // Used in the low 32 bits of the BPF attach cookie so cuda_probe can
32+ // distinguish probes. The cuda_progs prog array uses a fixed key (0)
33+ // for the single tail-call target (activity_batch).
34+ CudaProgCorrelation = 0
35+ CudaProgKernelExec = 1
36+ CudaProgActivityBatch = 2
2837)
2938
39+ const cudaProgsMap = "cuda_progs"
40+
3041var (
3142 // gpuFixers maps PID to gpuTraceFixer
3243 gpuFixers sync.Map
44+
45+ // cudaTailCallOnce ensures the cuda_progs prog array update is attempted exactly once; cudaTailCallFailed records whether that attempt failed.
46+ // The tail-call target must be in place before cuda_probe fires.
47+ cudaTailCallOnce sync.Once
48+ cudaTailCallFailed bool
3349)
3450
3551// SymbolizedCudaTrace holds a symbolized trace awaiting GPU timing information.
@@ -66,9 +82,10 @@ type gpuTraceFixer struct {
6682}
6783
6884type data struct {
69- path string
70- link interpreter.LinkCloser
71- probes []pfelf.USDTProbe
85+ path string
86+ link interpreter.LinkCloser
87+ probes []pfelf.USDTProbe
88+ kernelFallback * pfelf.USDTProbe // kernel_executed probe, kept as fallback if activity_batch fails
7289}
7390
7491// Instance is the CUDA interpreter instance
@@ -110,15 +127,36 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
110127 return nil , nil
111128 }
112129
113- // Filter to only the probes we need
114- var requiredProbes []pfelf.USDTProbe
115- for _ , probe := range parcagpuProbes {
116- if probe .Name == "cuda_correlation" || probe .Name == "kernel_executed" {
117- requiredProbes = append (requiredProbes , probe )
130+ // Filter to only the probes we need.
131+ // Always require cuda_correlation. Prefer activity_batch over kernel_executed.
132+ var correlationProbe * pfelf.USDTProbe
133+ var kernelProbe * pfelf.USDTProbe
134+ var batchProbe * pfelf.USDTProbe
135+ for i := range parcagpuProbes {
136+ switch parcagpuProbes [i ].Name {
137+ case "cuda_correlation" :
138+ correlationProbe = & parcagpuProbes [i ]
139+ case "kernel_executed" :
140+ kernelProbe = & parcagpuProbes [i ]
141+ case "activity_batch" :
142+ batchProbe = & parcagpuProbes [i ]
118143 }
119144 }
120- if len (requiredProbes ) != 2 {
121- log .Warnf ("parcagpu USDT probes in %s missing required probes (need cuda_correlation and kernel_executed): %v" , info .FileName (), parcagpuProbes )
145+ if correlationProbe == nil {
146+ log .Warnf ("parcagpu USDT probes in %s missing cuda_correlation: %v" , info .FileName (), parcagpuProbes )
147+ return nil , nil
148+ }
149+
150+ var requiredProbes []pfelf.USDTProbe
151+ requiredProbes = append (requiredProbes , * correlationProbe )
152+ if batchProbe != nil {
153+ requiredProbes = append (requiredProbes , * batchProbe )
154+ log .Debugf ("parcagpu: using activity_batch mode for %s" , info .FileName ())
155+ } else if kernelProbe != nil {
156+ requiredProbes = append (requiredProbes , * kernelProbe )
157+ log .Debugf ("parcagpu: using kernel_executed mode for %s" , info .FileName ())
158+ } else {
159+ log .Warnf ("parcagpu USDT probes in %s missing kernel probe (need activity_batch or kernel_executed): %v" , info .FileName (), parcagpuProbes )
122160 return nil , nil
123161 }
124162 parcagpuProbes = requiredProbes
@@ -129,6 +167,11 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
129167 path : info .FileName (),
130168 probes : parcagpuProbes ,
131169 }
170+ // If using activity_batch, keep kernel_executed as fallback in case
171+ // the tail-call prog array setup fails (e.g. verifier rejection).
172+ if batchProbe != nil && kernelProbe != nil {
173+ d .kernelFallback = kernelProbe
174+ }
132175
133176 return d , nil
134177 }
@@ -137,20 +180,46 @@ func Loader(ebpf interpreter.EbpfHandler, info *interpreter.LoaderInfo) (interpr
137180
138181func (d * data ) Attach (ebpf interpreter.EbpfHandler , pid libpf.PID , _ libpf.Address ,
139182 _ remotememory.RemoteMemory ) (interpreter.Instance , error ) {
140- // Maps usdt probe name to ebpf program name.
141- // Use the first character of the probe name as a cookie.
142- // 'c' -> cuda_correlation
143- // 'k' -> cuda_kernel_exec
183+ // If using activity_batch, ensure the tail-call prog array is populated.
184+ // On failure (e.g. verifier rejection), fall back to kernel_executed.
185+ for i , probe := range d .probes {
186+ if probe .Name != "activity_batch" {
187+ continue
188+ }
189+ cudaTailCallOnce .Do (func () {
190+ if err := ebpf .UpdateProgArray (cudaProgsMap , 0 ,
191+ USDTProgCudaActivityBatch ); err != nil {
192+ log .Errorf ("[cuda] activity_batch tail call failed: %v" , err )
193+ cudaTailCallFailed = true
194+ }
195+ })
196+ if cudaTailCallFailed {
197+ if d .kernelFallback != nil {
198+ d .probes [i ] = * d .kernelFallback
199+ log .Warnf ("[cuda] falling back to kernel_executed mode" )
200+ } else {
201+ log .Errorf ("[cuda] activity_batch failed and no kernel_executed fallback" )
202+ d .probes = append (d .probes [:i ], d .probes [i + 1 :]... )
203+ }
204+ }
205+ break
206+ }
207+
208+ // Map USDT probe names to eBPF program names and BPF attach cookies (CudaProg* values).
209+ // The cookie only identifies the probe; it is not the cuda_progs prog array key, which is fixed at 0.
144210 cookies := make ([]uint64 , len (d .probes ))
145211 progNames := make ([]string , len (d .probes ))
146212 for i , probe := range d .probes {
147- cookies [i ] = uint64 (probe .Name [0 ])
148- // Map probe names to specific program names for single-shot mode
149213 switch probe .Name {
150214 case "cuda_correlation" :
215+ cookies [i ] = CudaProgCorrelation
151216 progNames [i ] = USDTProgCudaCorrelation
152217 case "kernel_executed" :
218+ cookies [i ] = CudaProgKernelExec
153219 progNames [i ] = USDTProgCudaKernel
220+ case "activity_batch" :
221+ cookies [i ] = CudaProgActivityBatch
222+ progNames [i ] = USDTProgCudaActivityBatch
154223 default :
155224 log .Debugf ("unknown parcagpu USDT probe name: %s" , probe .Name )
156225 }
0 commit comments