@@ -5,9 +5,17 @@ package gpu_test
55
66import (
77 "testing"
8+ "unique"
9+ "unsafe"
810
11+ "github.com/stretchr/testify/assert"
912 "github.com/stretchr/testify/require"
13+ "github.com/zeebo/xxh3"
14+
15+ "go.opentelemetry.io/ebpf-profiler/host"
1016 "go.opentelemetry.io/ebpf-profiler/interpreter/gpu"
17+ "go.opentelemetry.io/ebpf-profiler/libpf"
18+ "go.opentelemetry.io/ebpf-profiler/reporter/samples"
1119 "go.opentelemetry.io/ebpf-profiler/support"
1220)
1321
@@ -42,3 +50,344 @@ func TestProgramNamesExist(t *testing.T) {
4250 t .Logf ("Found program %q" , gpu .USDTProgCudaProbe )
4351 })
4452}
53+
54+ // computeTraceHash replicates the hash logic from tracer.loadBpfTrace:
55+ // zero per-sample fields, then hash the raw bytes.
56+ func computeTraceHash (tr * support.Trace ) host.TraceHash {
57+ // Work on a copy so we don't mutate the caller's data.
58+ clone := * tr
59+ clone .ZeroPerSampleFields ()
60+ raw := unsafe .Slice ((* byte )(unsafe .Pointer (& clone )), unsafe .Sizeof (clone ))
61+ return host .TraceHash (xxh3 .Hash128 (raw ).Lo )
62+ }
63+
64+ // makeCUDATrace builds a support.Trace with one CUDA kernel frame (at position 0)
65+ // followed by nativFrames native frames. The CUDA frame encodes the given
66+ // correlationID and cbid.
67+ func makeCUDATrace (pid uint32 , correlationID uint32 , cbid int32 ,
68+ nativeFrames []support.Frame ) support.Trace {
69+ tr := support.Trace {
70+ Pid : pid ,
71+ Tid : pid ,
72+ Origin : support .TraceOriginCuda ,
73+ Kernel_stack_id : - 1 , // no kernel stack
74+ }
75+
76+ // CUDA kernel frame first (matches BPF collect_trace ordering).
77+ cudaID := uint64 (correlationID ) | (uint64 (uint32 (cbid )) << 32 )
78+ tr .Frames [0 ] = support.Frame {
79+ Kind : support .FrameMarkerCUDAKernel ,
80+ Addr_or_line : cudaID ,
81+ }
82+ tr .Stack_len = 1
83+
84+ for i , f := range nativeFrames {
85+ tr .Frames [1 + i ] = f
86+ tr .Stack_len ++
87+ }
88+
89+ return tr
90+ }
91+
92+ func TestCUDATraceHashStability (t * testing.T ) {
93+ // Two launches from the same call site (identical native frames)
94+ // with different correlation IDs must produce the same hash.
95+ nativeFrames := []support.Frame {
96+ {File_id : 0xaaaa , Addr_or_line : 0x1000 , Kind : 8 }, // native
97+ {File_id : 0xaaaa , Addr_or_line : 0x2000 , Kind : 8 }, // native
98+ {File_id : 0xbbbb , Addr_or_line : 0x3000 , Kind : 8 }, // native
99+ }
100+
101+ tr1 := makeCUDATrace (100 , 42 , 1 , nativeFrames )
102+ tr2 := makeCUDATrace (100 , 999 , 1 , nativeFrames )
103+
104+ hash1 := computeTraceHash (& tr1 )
105+ hash2 := computeTraceHash (& tr2 )
106+ assert .Equal (t , hash1 , hash2 ,
107+ "same call site with different correlation IDs should produce identical hashes" )
108+
109+ // Different CBID (different API call type) with same native stack should
110+ // also produce the same hash since cbid is part of addr_or_line.
111+ tr3 := makeCUDATrace (100 , 42 , 7 , nativeFrames )
112+ hash3 := computeTraceHash (& tr3 )
113+ assert .Equal (t , hash1 , hash3 ,
114+ "same call site with different CBIDs should produce identical hashes" )
115+ }
116+
117+ func TestCUDATraceHashDiffers (t * testing.T ) {
118+ framesA := []support.Frame {
119+ {File_id : 0xaaaa , Addr_or_line : 0x1000 , Kind : 8 },
120+ }
121+ framesB := []support.Frame {
122+ {File_id : 0xaaaa , Addr_or_line : 0x2000 , Kind : 8 }, // different addr
123+ }
124+
125+ trA := makeCUDATrace (100 , 42 , 1 , framesA )
126+ trB := makeCUDATrace (100 , 42 , 1 , framesB )
127+
128+ hashA := computeTraceHash (& trA )
129+ hashB := computeTraceHash (& trB )
130+ assert .NotEqual (t , hashA , hashB ,
131+ "different native stacks should produce different hashes" )
132+ }
133+
134+ func TestCUDATraceHashExcludesPerSampleFields (t * testing.T ) {
135+ frames := []support.Frame {
136+ {File_id : 0xaaaa , Addr_or_line : 0x1000 , Kind : 8 },
137+ }
138+
139+ tr1 := makeCUDATrace (100 , 42 , 1 , frames )
140+ tr2 := makeCUDATrace (100 , 42 , 1 , frames )
141+
142+ // Vary all the per-sample fields that should be excluded.
143+ tr2 .Ktime = 99999
144+ tr2 .Origin = support .TraceOriginOffCPU
145+ tr2 .Offtime = 12345
146+ tr2 .Comm = [16 ]byte {'d' , 'i' , 'f' , 'f' , 'e' , 'r' , 'e' , 'n' , 't' }
147+
148+ hash1 := computeTraceHash (& tr1 )
149+ hash2 := computeTraceHash (& tr2 )
150+ assert .Equal (t , hash1 , hash2 ,
151+ "per-sample fields (ktime, origin, offtime, comm) must not affect hash" )
152+ }
153+
154+ func TestNonCUDATraceHashIncludesAddrOrLine (t * testing.T ) {
155+ // For non-CUDA frames, addr_or_line MUST be included in the hash.
156+ makeNative := func (addr uint64 ) support.Trace {
157+ tr := support.Trace {
158+ Pid : 100 ,
159+ Tid : 100 ,
160+ Origin : support .TraceOriginSampling ,
161+ Stack_len : 1 ,
162+ Kernel_stack_id : - 1 ,
163+ }
164+ tr .Frames [0 ] = support.Frame {
165+ File_id : 0xaaaa ,
166+ Addr_or_line : addr ,
167+ Kind : 8 , // native
168+ }
169+ return tr
170+ }
171+
172+ tr1 := makeNative (0x1000 )
173+ tr2 := makeNative (0x2000 )
174+
175+ hash1 := computeTraceHash (& tr1 )
176+ hash2 := computeTraceHash (& tr2 )
177+ assert .NotEqual (t , hash1 , hash2 ,
178+ "non-CUDA traces with different addresses must have different hashes" )
179+ }
180+
181+ // makeSymbolizedTrace builds a libpf.Trace that looks like what ConvertTrace
182+ // produces for a CUDA trace: frames before cudaFrameIdx are native, then the
183+ // CUDAKernelFrame, then more native frames.
184+ func makeSymbolizedTrace (cudaFrameIdx int , nativeFrameCount int ) * libpf.Trace {
185+ trace := & libpf.Trace {}
186+ for i := range cudaFrameIdx + 1 + nativeFrameCount {
187+ if i == cudaFrameIdx {
188+ trace .Frames = append (trace .Frames , unique .Make (libpf.Frame {
189+ Type : libpf .CUDAKernelFrame ,
190+ }))
191+ } else {
192+ trace .Frames = append (trace .Frames , unique .Make (libpf.Frame {
193+ Type : libpf .NativeFrame ,
194+ AddressOrLineno : libpf .AddressOrLineno (0x1000 * (i + 1 )),
195+ FileID : libpf .NewFileID (uint64 (i + 1 ), 0 ),
196+ }))
197+ }
198+ }
199+ return trace
200+ }
201+
202+ func TestAddTraceAndTimes (t * testing.T ) {
203+ const pid = libpf .PID (500 )
204+ gpu .RegisterTestFixer (pid )
205+ t .Cleanup (func () { gpu .UnregisterTestFixer (pid ) })
206+
207+ // Simulate: trace arrives first, timing arrives second.
208+ trace := makeSymbolizedTrace (0 , 2 ) // CUDA frame at index 0
209+ meta := & samples.TraceEventMeta {PID : pid }
210+
211+ st := & gpu.SymbolizedCudaTrace {
212+ Trace : trace ,
213+ Meta : meta ,
214+ CUDAFrameIdx : 0 ,
215+ CorrelationID : 100 ,
216+ CBID : 1 ,
217+ }
218+
219+ // AddTrace with no pending timing -> stored, no outputs.
220+ outputs := gpu .AddTrace (st )
221+ assert .Empty (t , outputs , "no timing yet, should produce no outputs" )
222+
223+ // Now timing arrives.
224+ kernelName := [256 ]byte {}
225+ copy (kernelName [:], "_Z9myKernelPfS_i" )
226+
227+ events := []gpu.CuptiTimingEvent {{
228+ Pid : uint32 (pid ),
229+ Id : 100 ,
230+ Start : 1000 ,
231+ End : 2000 ,
232+ Dev : 0 ,
233+ Stream : 7 ,
234+ KernelName : kernelName ,
235+ }}
236+
237+ outputs = gpu .AddTimes (events )
238+ require .Len (t , outputs , 1 , "timing matched, should produce one output" )
239+
240+ out := outputs [0 ]
241+ assert .Equal (t , int64 (1000 ), out .Meta .OffTime , "OffTime should be End-Start" )
242+ assert .Equal (t , "0" , out .Trace .CustomLabels ["cuda_device" ])
243+ assert .Equal (t , "7" , out .Trace .CustomLabels ["cuda_stream" ])
244+
245+ // Verify the CUDA frame got the demangled kernel name and zeroed AddressOrLineno.
246+ cudaFrame := out .Trace .Frames [0 ].Value ()
247+ assert .Equal (t , libpf .CUDAKernelFrame , cudaFrame .Type )
248+ assert .Equal (t , libpf .AddressOrLineno (0 ), cudaFrame .AddressOrLineno ,
249+ "correlation ID should be zeroed in output" )
250+ assert .Contains (t , cudaFrame .FunctionName .String (), "myKernel" ,
251+ "kernel name should be demangled" )
252+ }
253+
254+ func TestAddTimeThenTrace (t * testing.T ) {
255+ const pid = libpf .PID (501 )
256+ gpu .RegisterTestFixer (pid )
257+ t .Cleanup (func () { gpu .UnregisterTestFixer (pid ) })
258+
259+ // Simulate: timing arrives first, trace arrives second.
260+ kernelName := [256 ]byte {}
261+ copy (kernelName [:], "_Z6squarePfS_" )
262+
263+ events := []gpu.CuptiTimingEvent {{
264+ Pid : uint32 (pid ),
265+ Id : 200 ,
266+ Start : 5000 ,
267+ End : 8000 ,
268+ Dev : 1 ,
269+ KernelName : kernelName ,
270+ }}
271+
272+ // Timing arrives first -> stored, no outputs.
273+ outputs := gpu .AddTimes (events )
274+ assert .Empty (t , outputs )
275+
276+ // Now trace arrives and matches.
277+ trace := makeSymbolizedTrace (0 , 1 )
278+ meta := & samples.TraceEventMeta {PID : pid }
279+
280+ st := & gpu.SymbolizedCudaTrace {
281+ Trace : trace ,
282+ Meta : meta ,
283+ CUDAFrameIdx : 0 ,
284+ CorrelationID : 200 ,
285+ CBID : 1 ,
286+ }
287+
288+ outputs = gpu .AddTrace (st )
289+ require .Len (t , outputs , 1 )
290+
291+ out := outputs [0 ]
292+ assert .Equal (t , int64 (3000 ), out .Meta .OffTime )
293+ assert .Equal (t , "1" , out .Trace .CustomLabels ["cuda_device" ])
294+
295+ cudaFrame := out .Trace .Frames [0 ].Value ()
296+ assert .Contains (t , cudaFrame .FunctionName .String (), "square" )
297+ assert .Equal (t , libpf .AddressOrLineno (0 ), cudaFrame .AddressOrLineno )
298+ }
299+
300+ func TestCachedTemplateWithDifferentCorrelationIDs (t * testing.T ) {
301+ const pid = libpf .PID (502 )
302+ gpu .RegisterTestFixer (pid )
303+ t .Cleanup (func () { gpu .UnregisterTestFixer (pid ) })
304+
305+ // Simulate two launches from the same call site (same template) with
306+ // different correlation IDs and different kernel names from timing.
307+ // This is the scenario where the cache provides the template.
308+ template := makeSymbolizedTrace (0 , 2 )
309+
310+ for _ , tc := range []struct {
311+ corrID uint32
312+ kernelName string
313+ offTime int64
314+ }{
315+ {corrID : 300 , kernelName : "_Z7kernelAv" , offTime : 100 },
316+ {corrID : 301 , kernelName : "_Z7kernelBv" , offTime : 200 },
317+ } {
318+ meta := & samples.TraceEventMeta {PID : pid }
319+
320+ st := & gpu.SymbolizedCudaTrace {
321+ Trace : template ,
322+ Meta : meta ,
323+ CUDAFrameIdx : 0 ,
324+ CorrelationID : tc .corrID ,
325+ CBID : 1 ,
326+ }
327+ gpu .AddTrace (st )
328+
329+ kn := [256 ]byte {}
330+ copy (kn [:], tc .kernelName )
331+ outputs := gpu .AddTimes ([]gpu.CuptiTimingEvent {{
332+ Pid : uint32 (pid ),
333+ Id : tc .corrID ,
334+ Start : 0 ,
335+ End : uint64 (tc .offTime ),
336+ KernelName : kn ,
337+ }})
338+ require .Len (t , outputs , 1 , "corrID %d should produce one output" , tc .corrID )
339+
340+ out := outputs [0 ]
341+ assert .Equal (t , tc .offTime , out .Meta .OffTime )
342+
343+ cudaFrame := out .Trace .Frames [0 ].Value ()
344+ assert .Equal (t , libpf .AddressOrLineno (0 ), cudaFrame .AddressOrLineno ,
345+ "correlation ID must not leak into output" )
346+ // Verify each launch got its own kernel name.
347+ assert .Contains (t , cudaFrame .FunctionName .String (), tc .kernelName [3 :10 ],
348+ "each launch should get its own kernel name" )
349+ }
350+ }
351+
352+ func TestCUDAFrameIdxNonZero (t * testing.T ) {
353+ const pid = libpf .PID (503 )
354+ gpu .RegisterTestFixer (pid )
355+ t .Cleanup (func () { gpu .UnregisterTestFixer (pid ) })
356+
357+ // CUDA frame at index 2 (after two kernel/native frames).
358+ trace := makeSymbolizedTrace (2 , 2 ) // [native, native, CUDA, native, native]
359+ meta := & samples.TraceEventMeta {PID : pid }
360+
361+ kernelName := [256 ]byte {}
362+ copy (kernelName [:], "_Z4testv" )
363+
364+ st := & gpu.SymbolizedCudaTrace {
365+ Trace : trace ,
366+ Meta : meta ,
367+ CUDAFrameIdx : 2 ,
368+ CorrelationID : 400 ,
369+ CBID : 1 ,
370+ }
371+ gpu .AddTrace (st )
372+
373+ outputs := gpu .AddTimes ([]gpu.CuptiTimingEvent {{
374+ Pid : uint32 (pid ),
375+ Id : 400 ,
376+ Start : 0 ,
377+ End : 500 ,
378+ KernelName : kernelName ,
379+ }})
380+ require .Len (t , outputs , 1 )
381+
382+ // The CUDA frame at index 2 should have the kernel name.
383+ cudaFrame := outputs [0 ].Trace .Frames [2 ].Value ()
384+ assert .Equal (t , libpf .CUDAKernelFrame , cudaFrame .Type )
385+ assert .Contains (t , cudaFrame .FunctionName .String (), "test" )
386+
387+ // The non-CUDA frames should be untouched.
388+ for _ , idx := range []int {0 , 1 , 3 , 4 } {
389+ f := outputs [0 ].Trace .Frames [idx ].Value ()
390+ assert .Equal (t , libpf .NativeFrame , f .Type ,
391+ "frame %d should remain native" , idx )
392+ }
393+ }
0 commit comments