33
44#pragma once
55
6+ #include < iomanip>
7+ #include < iostream>
8+
69#include " ck/utility/data_type.hpp"
710#include " ck/utility/type_convert.hpp"
811#include " ck/utility/type.hpp"
1316namespace ck {
1417namespace profiler {
1518
// Host-side summary of one GPU verification run.
//
// Converts implicitly to bool (true == no element exceeded tolerance) so call sites written
// against the previous bool-returning API, e.g. `if(gpu_verify(...)) { ... }`, keep working.
struct GpuVerifyResult
{
    unsigned long long error_count; // Number of elements that exceeded tolerance
    float max_error;                // Maximum error value observed
    std::size_t total;              // Total number of elements compared
    bool all_zero;                  // True if device result is all zeros (likely kernel issue)

    // Deliberately non-explicit: preserves `bool ok = gpu_verify(...)` style usage.
    operator bool() const { return error_count == 0; }

    // Share of compared elements that exceeded tolerance, in percent.
    // Returns 0 when nothing was compared, avoiding a division by zero.
    float error_percentage() const
    {
        if(total == 0)
        {
            return 0.0f;
        }
        return static_cast<float>(error_count) / static_cast<float>(total) * 100.0f;
    }

    // Writes a one-line error summary to std::cerr; prints nothing when there are no errors.
    // When all_zero is set, a warning line is emitted first.
    //
    // The percentage is printed with fixed notation and two decimals. The stream's previous
    // format flags and precision are restored afterwards so that this call does not change
    // how the caller's later std::cerr output is formatted.
    void print_error_summary() const
    {
        if(error_count == 0)
        {
            return;
        }

        if(all_zero)
        {
            std::cerr << " WARNING: Device result is all zeros - kernel may not have executed "
                         " properly!"
                      << std::endl;
        }

        std::cerr << " max err: " << max_error;
        std::cerr << " , number of errors: " << error_count;

        // Snapshot formatting state before applying the sticky manipulators below.
        const std::ios_base::fmtflags saved_flags = std::cerr.flags();
        const std::streamsize saved_precision     = std::cerr.precision();

        std::cerr << " , " << std::setprecision(2) << std::fixed << error_percentage()
                  << " % wrong values" << std::endl;

        std::cerr.flags(saved_flags);
        std::cerr.precision(saved_precision);
    }
};
58+
1659// Compute relative tolerance for GPU verification
1760// Matches the logic of ck::utils::get_relative_threshold but handles all types
1861template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
@@ -63,16 +106,45 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1)
63106 }
64107}
65108
// Verification outcome as stored in device memory.
// All fields live in one struct so a single allocation and a single copy in each
// direction are enough to initialise it and to read it back.
struct GpuVerifyDeviceResult
{
    // Count of elements found outside tolerance
    unsigned long long error_count;

    // Largest error value recorded
    float max_error;

    // Flag: 1 means the device result is all zeros, 0 means a non-zero value exists
    int all_zero;
};
117+
66118// GPU verification kernel - compares device result against reference using relative and absolute
67- // tolerance Returns 1 in passed if all elements match within tolerance, 0 otherwise
119+ // tolerance. Tracks all errors (no early exit) to provide detailed error reporting.
120+ //
121+ // Uses LDS (shared memory) for block-level reduction to minimize atomic contention.
122+ // This reduces atomic operations from O(errors) to O(blocks), providing massive speedup
123+ // when there are many errors.
124+ //
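125+ // Precondition: must be launched with blockDim.x == 256; the shared arrays and the reduction below are sized for exactly that block size.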
68126template <typename T>
69127__global__ void gpu_verify_kernel (const T* __restrict__ device_result,
70128 const T* __restrict__ reference_result,
71129 float rtol,
72130 float atol,
73131 long long size,
74- int * passed )
132+ GpuVerifyDeviceResult* result )
75133{
134+ constexpr int block_size = 256 ;
135+
136+ // Shared memory for block-level reduction
137+ __shared__ unsigned long long shared_error_count[block_size];
138+ __shared__ float shared_max_error[block_size];
139+ __shared__ int shared_has_error[block_size];
140+ __shared__ int shared_has_nonzero[block_size];
141+
142+ // Thread-local accumulators (in registers)
143+ unsigned long long local_error_count = 0 ;
144+ float local_max_error = 0 .0f ;
145+ int local_has_error = 0 ;
146+ int local_has_nonzero = 0 ;
147+
76148 // Grid-stride loop to handle any tensor size
77149 long long idx = blockIdx.x * blockDim.x + threadIdx.x ;
78150 long long stride = blockDim.x * gridDim.x ;
@@ -83,35 +155,95 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result,
83155 float dev_val = type_convert<float >(device_result[i]);
84156 float ref_val = type_convert<float >(reference_result[i]);
85157
158+ // Check if device value is non-zero
159+ if (dev_val != 0 .0f )
160+ {
161+ local_has_nonzero = 1 ;
162+ }
163+
86164 // Compute absolute difference
87165 float abs_diff = fabsf (dev_val - ref_val);
88166
89167 // Check tolerance (matches CPU check_err logic: err > atol + rtol * abs(ref))
90168 if (abs_diff > atol + rtol * fabsf (ref_val))
91169 {
92- atomicMin (passed, 0 ); // Mark as failed
93- return ; // Early exit on first failure
170+ local_has_error = 1 ;
171+ local_error_count++;
172+ local_max_error = fmaxf (local_max_error, abs_diff);
173+ }
174+ }
175+
176+ // Store thread-local results to shared memory
177+ shared_error_count[threadIdx.x ] = local_error_count;
178+ shared_max_error[threadIdx.x ] = local_max_error;
179+ shared_has_error[threadIdx.x ] = local_has_error;
180+ shared_has_nonzero[threadIdx.x ] = local_has_nonzero;
181+ __syncthreads ();
182+
183+ // Block-level reduction: 256 -> 128 -> 64 -> 32
184+ for (unsigned int s = block_size / 2 ; s >= 32 ; s >>= 1 )
185+ {
186+ if (threadIdx.x < s)
187+ {
188+ shared_error_count[threadIdx.x ] += shared_error_count[threadIdx.x + s];
189+ shared_max_error[threadIdx.x ] =
190+ fmaxf (shared_max_error[threadIdx.x ], shared_max_error[threadIdx.x + s]);
191+ shared_has_error[threadIdx.x ] |= shared_has_error[threadIdx.x + s];
192+ shared_has_nonzero[threadIdx.x ] |= shared_has_nonzero[threadIdx.x + s];
193+ }
194+ __syncthreads ();
195+ }
196+
197+ // Final reduction of remaining 32 elements in thread 0
198+ if (threadIdx.x == 0 )
199+ {
200+ for (int i = 1 ; i < 32 ; ++i)
201+ {
202+ shared_error_count[0 ] += shared_error_count[i];
203+ shared_max_error[0 ] = fmaxf (shared_max_error[0 ], shared_max_error[i]);
204+ shared_has_error[0 ] |= shared_has_error[i];
205+ shared_has_nonzero[0 ] |= shared_has_nonzero[i];
206+ }
207+
208+ // Single atomic update per block (reduces contention from O(errors) to O(blocks))
209+ if (shared_has_error[0 ])
210+ {
211+ atomicAdd (&result->error_count , shared_error_count[0 ]);
212+ atomicMax (&result->max_error , shared_max_error[0 ]);
213+ }
214+ // Update all_zero flag: if no nonzero values found, mark as all zero
215+ if (!shared_has_nonzero[0 ])
216+ {
217+ atomicMin (&result->all_zero , 1 );
218+ }
219+ else
220+ {
221+ atomicMin (&result->all_zero , 0 );
94222 }
95223 }
96224}
97225
98226// Host-side wrapper for GPU verification with explicit tolerances
99- // Returns true if verification passed, false otherwise
227+ // Returns GpuVerifyResult with detailed error information
100228template <typename T>
101- bool gpu_verify (const void * device_result,
102- const void * reference_result,
103- float rtol,
104- float atol,
105- std::size_t size,
106- hipStream_t stream = nullptr )
229+ GpuVerifyResult gpu_verify (const void * device_result,
230+ const void * reference_result,
231+ float rtol,
232+ float atol,
233+ std::size_t size,
234+ hipStream_t stream = nullptr )
107235{
108236 // Allocate result buffer on device
109- int * passed_dev ;
110- hip_check_error (hipMalloc (&passed_dev , sizeof (int )));
237+ GpuVerifyDeviceResult* result_dev ;
238+ hip_check_error (hipMalloc (&result_dev , sizeof (GpuVerifyDeviceResult )));
111239
112- // Initialize to passed (1)
113- int passed_host = 1 ;
114- hip_check_error (hipMemcpy (passed_dev, &passed_host, sizeof (int ), hipMemcpyHostToDevice));
240+ // Initialize result struct
241+ GpuVerifyDeviceResult result_host;
242+ result_host.error_count = 0 ; // No errors yet
243+ result_host.max_error = 0 .0f ; // No error observed
244+ result_host.all_zero = 1 ; // Start assuming all zeros (will be cleared if nonzero found)
245+ hip_check_error (
246+ hipMemcpy (result_dev, &result_host, sizeof (GpuVerifyDeviceResult), hipMemcpyHostToDevice));
115247
116248 // Launch kernel with grid-stride loop
117249 // Use 65535 as max grid size (hardware limit for grid dimension in x)
@@ -125,20 +257,28 @@ bool gpu_verify(const void* device_result,
125257 rtol,
126258 atol,
127259 static_cast <long long >(size),
128- passed_dev );
260+ result_dev );
129261
130262 hip_check_error (hipGetLastError ());
131263
132264 // Synchronize the stream to ensure kernel completion before reading results
133265 hip_check_error (hipStreamSynchronize (stream));
134266
135267 // Get result
136- hip_check_error (hipMemcpy (&passed_host, passed_dev, sizeof (int ), hipMemcpyDeviceToHost));
268+ hip_check_error (
269+ hipMemcpy (&result_host, result_dev, sizeof (GpuVerifyDeviceResult), hipMemcpyDeviceToHost));
137270
138271 // Free device memory
139- hip_check_error (hipFree (passed_dev));
272+ hip_check_error (hipFree (result_dev));
273+
274+ // Build and return result struct
275+ GpuVerifyResult result;
276+ result.error_count = result_host.error_count ;
277+ result.max_error = result_host.max_error ;
278+ result.total = size;
279+ result.all_zero = (result_host.all_zero == 1 );
140280
141- return passed_host == 1 ;
281+ return result ;
142282}
143283
144284// Forward declaration of gpu_reduce_max
@@ -147,15 +287,15 @@ float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t st
147287
148288// Host-side wrapper for GPU verification with automatic tolerance computation
149289// Computes max value on GPU, then computes tolerances and verifies
150- // Returns true if verification passed, false otherwise
290+ // Returns GpuVerifyResult with detailed error information
151291template <typename OutDataType,
152292 typename ComputeDataType = OutDataType,
153293 typename AccDataType = ComputeDataType>
154- bool gpu_verify (const void * device_result,
155- const void * reference_result,
156- int number_of_accumulations,
157- std::size_t size,
158- hipStream_t stream = nullptr )
294+ GpuVerifyResult gpu_verify (const void * device_result,
295+ const void * reference_result,
296+ int number_of_accumulations,
297+ std::size_t size,
298+ hipStream_t stream = nullptr )
159299{
160300 // Compute max absolute value on GPU (only 4 bytes transferred!)
161301 double max_abs_value =
@@ -187,24 +327,6 @@ bool gpu_verify(const void* device_result,
187327 return gpu_verify<OutDataType>(device_result, reference_result, rtol, atol, size, stream);
188328}
189329
190- //
191- // Helper function for atomic float max (using compare-and-swap)
192- __device__ __forceinline__ float atomicMaxFloat (float * address, float val)
193- {
194- int * address_as_int = reinterpret_cast <int *>(address);
195- int old = *address_as_int;
196- int assumed;
197-
198- do
199- {
200- assumed = old;
201- old =
202- atomicCAS (address_as_int, assumed, __float_as_int (fmaxf (val, __int_as_float (assumed))));
203- } while (assumed != old);
204-
205- return __int_as_float (old);
206- }
207-
208330// GPU reduction kernel for computing max(abs(data))
209331// This is an internal kernel called only by gpu_reduce_max() wrapper.
210332//
@@ -231,7 +353,7 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr
231353 __syncthreads ();
232354
233355 // Block-level reduction: 256 -> 128 -> 64 -> 32
234- for (unsigned int s = block_size / 2 ; s > 32 ; s >>= 1 )
356+ for (unsigned int s = block_size / 2 ; s >= 32 ; s >>= 1 )
235357 {
236358 if (threadIdx.x < s)
237359 {
@@ -240,26 +362,16 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr
240362 __syncthreads ();
241363 }
242364
243- // Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1
244- // No sync needed within a warp
245- if (threadIdx.x < 32 )
246- {
247- volatile float * smem = shared_max;
248- smem[threadIdx.x ] = fmaxf (smem[threadIdx.x ], smem[threadIdx.x + 32 ]);
249- smem[threadIdx.x ] = fmaxf (smem[threadIdx.x ], smem[threadIdx.x + 16 ]);
250- smem[threadIdx.x ] = fmaxf (smem[threadIdx.x ], smem[threadIdx.x + 8 ]);
251- smem[threadIdx.x ] = fmaxf (smem[threadIdx.x ], smem[threadIdx.x + 4 ]);
252- smem[threadIdx.x ] = fmaxf (smem[threadIdx.x ], smem[threadIdx.x + 2 ]);
253- smem[threadIdx.x ] = fmaxf (smem[threadIdx.x ], smem[threadIdx.x + 1 ]);
254- }
255-
256- // Two-phase reduction pattern minimizes atomic contention:
257- // 1. Each block reduces to shared memory (above)
258- // 2. Single thread per block updates global max (below)
259- // This limits atomic operations to O(grid_size) rather than O(total_threads)
365+ // Final reduction of remaining 32 elements in thread 0
260366 if (threadIdx.x == 0 )
261367 {
262- atomicMaxFloat (max_val, shared_max[0 ]);
368+ for (int i = 1 ; i < 32 ; ++i)
369+ {
370+ shared_max[0 ] = fmaxf (shared_max[0 ], shared_max[i]);
371+ }
372+
373+ // Single atomic update per block
374+ atomicMax (max_val, shared_max[0 ]);
263375 }
264376}
265377
0 commit comments