Skip to content

Commit f173642

Browse files
[CK] Refactor GPU verification kernel to gather error stats on GPU (#3551)
* Refactor GPU verification kernel to gather error stats on GPU * Check if result is all zero * non-negative error count doesn't need custom Atomics * Remove unnecessary AtomicMaxFloat function * Simpler warp reduction, remove passed flag * Move verification header to include * Fix header path in test * Fix block reduction loop
1 parent 3ccb15e commit f173642

5 files changed

Lines changed: 203 additions & 158 deletions

File tree

profiler/include/profiler/gpu_verification.hpp renamed to include/ck/library/utility/gpu_verification.hpp

Lines changed: 175 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33

44
#pragma once
55

6+
#include <iomanip>
7+
#include <iostream>
8+
69
#include "ck/utility/data_type.hpp"
710
#include "ck/utility/type_convert.hpp"
811
#include "ck/utility/type.hpp"
@@ -13,6 +16,46 @@
1316
namespace ck {
1417
namespace profiler {
1518

19+
// Result struct for GPU verification with detailed error reporting.
// Provides backward compatibility via operator bool(): a result converts to
// true exactly when no element exceeded the tolerance.
struct GpuVerifyResult
{
    unsigned long long error_count; // Number of elements that exceeded tolerance
    float max_error;                // Maximum error value observed
    std::size_t total;              // Total number of elements compared
    bool all_zero;                  // True if device result is all zeros (likely kernel issue)

    // Implicit conversion to bool for backward compatibility.
    // Allows: if (gpu_verify(...)) { ... }
    // Deliberately not `explicit`, so `bool ok = gpu_verify(...)` keeps compiling.
    operator bool() const { return error_count == 0; }

    // Percentage (0..100) of compared elements that exceeded tolerance.
    // Returns 0 when nothing was compared, avoiding a division by zero.
    float error_percentage() const
    {
        if(total == 0)
            return 0.0f;
        return static_cast<float>(error_count) / static_cast<float>(total) * 100.0f;
    }

    // Print error summary to stderr (matches check_err format).
    // Prints nothing when there are no errors.
    void print_error_summary() const
    {
        if(error_count == 0)
            return;

        if(all_zero)
        {
            std::cerr << "WARNING: Device result is all zeros - kernel may not have executed "
                         "properly!"
                      << std::endl;
        }

        // std::fixed and std::setprecision are sticky: remember the caller's
        // formatting state so it can be restored once the summary is written.
        // Without this, every later write to std::cerr (including "max err" on
        // the next call) would silently be printed with two fixed decimals.
        const std::ios_base::fmtflags saved_flags = std::cerr.flags();
        const std::streamsize saved_precision     = std::cerr.precision();

        std::cerr << "max err: " << max_error;
        std::cerr << ", number of errors: " << error_count;
        std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage()
                  << "% wrong values" << std::endl;

        std::cerr.flags(saved_flags);
        std::cerr.precision(saved_precision);
    }
};
58+
1659
// Compute relative tolerance for GPU verification
1760
// Matches the logic of ck::utils::get_relative_threshold but handles all types
1861
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
@@ -63,16 +106,45 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1)
63106
}
64107
}
65108

109+
// Device-side result structure for kernel output.
// Packed into a single struct to minimize device memory allocations: the host
// copies one instance to the device before the launch and reads it back after.
// Plain data members only, so it can be transferred with a raw memory copy.
struct GpuVerifyDeviceResult
{
    unsigned long long error_count; // Number of errors found
    float max_error;                // Maximum error value
    int all_zero;                   // 1 = device result is all zeros, 0 = has non-zero values
};
117+
66118
// GPU verification kernel - compares device result against reference using relative and absolute
67-
// tolerance Returns 1 in passed if all elements match within tolerance, 0 otherwise
119+
// tolerance. Tracks all errors (no early exit) to provide detailed error reporting.
120+
//
121+
// Uses LDS (shared memory) for block-level reduction to minimize atomic contention.
122+
// This reduces atomic operations from O(errors) to O(blocks), providing massive speedup
123+
// when there are many errors.
124+
//
125+
// Assumption: Block size is 256
68126
template <typename T>
69127
__global__ void gpu_verify_kernel(const T* __restrict__ device_result,
70128
const T* __restrict__ reference_result,
71129
float rtol,
72130
float atol,
73131
long long size,
74-
int* passed)
132+
GpuVerifyDeviceResult* result)
75133
{
134+
constexpr int block_size = 256;
135+
136+
// Shared memory for block-level reduction
137+
__shared__ unsigned long long shared_error_count[block_size];
138+
__shared__ float shared_max_error[block_size];
139+
__shared__ int shared_has_error[block_size];
140+
__shared__ int shared_has_nonzero[block_size];
141+
142+
// Thread-local accumulators (in registers)
143+
unsigned long long local_error_count = 0;
144+
float local_max_error = 0.0f;
145+
int local_has_error = 0;
146+
int local_has_nonzero = 0;
147+
76148
// Grid-stride loop to handle any tensor size
77149
long long idx = blockIdx.x * blockDim.x + threadIdx.x;
78150
long long stride = blockDim.x * gridDim.x;
@@ -83,35 +155,95 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result,
83155
float dev_val = type_convert<float>(device_result[i]);
84156
float ref_val = type_convert<float>(reference_result[i]);
85157

158+
// Check if device value is non-zero
159+
if(dev_val != 0.0f)
160+
{
161+
local_has_nonzero = 1;
162+
}
163+
86164
// Compute absolute difference
87165
float abs_diff = fabsf(dev_val - ref_val);
88166

89167
// Check tolerance (matches CPU check_err logic: err > atol + rtol * abs(ref))
90168
if(abs_diff > atol + rtol * fabsf(ref_val))
91169
{
92-
atomicMin(passed, 0); // Mark as failed
93-
return; // Early exit on first failure
170+
local_has_error = 1;
171+
local_error_count++;
172+
local_max_error = fmaxf(local_max_error, abs_diff);
173+
}
174+
}
175+
176+
// Store thread-local results to shared memory
177+
shared_error_count[threadIdx.x] = local_error_count;
178+
shared_max_error[threadIdx.x] = local_max_error;
179+
shared_has_error[threadIdx.x] = local_has_error;
180+
shared_has_nonzero[threadIdx.x] = local_has_nonzero;
181+
__syncthreads();
182+
183+
// Block-level reduction: 256 -> 128 -> 64 -> 32
184+
for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
185+
{
186+
if(threadIdx.x < s)
187+
{
188+
shared_error_count[threadIdx.x] += shared_error_count[threadIdx.x + s];
189+
shared_max_error[threadIdx.x] =
190+
fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]);
191+
shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s];
192+
shared_has_nonzero[threadIdx.x] |= shared_has_nonzero[threadIdx.x + s];
193+
}
194+
__syncthreads();
195+
}
196+
197+
// Final reduction of remaining 32 elements in thread 0
198+
if(threadIdx.x == 0)
199+
{
200+
for(int i = 1; i < 32; ++i)
201+
{
202+
shared_error_count[0] += shared_error_count[i];
203+
shared_max_error[0] = fmaxf(shared_max_error[0], shared_max_error[i]);
204+
shared_has_error[0] |= shared_has_error[i];
205+
shared_has_nonzero[0] |= shared_has_nonzero[i];
206+
}
207+
208+
// Single atomic update per block (reduces contention from O(errors) to O(blocks))
209+
if(shared_has_error[0])
210+
{
211+
atomicAdd(&result->error_count, shared_error_count[0]);
212+
atomicMax(&result->max_error, shared_max_error[0]);
213+
}
214+
// Update all_zero flag: if no nonzero values found, mark as all zero
215+
if(!shared_has_nonzero[0])
216+
{
217+
atomicMin(&result->all_zero, 1);
218+
}
219+
else
220+
{
221+
atomicMin(&result->all_zero, 0);
94222
}
95223
}
96224
}
97225

98226
// Host-side wrapper for GPU verification with explicit tolerances
99-
// Returns true if verification passed, false otherwise
227+
// Returns GpuVerifyResult with detailed error information
100228
template <typename T>
101-
bool gpu_verify(const void* device_result,
102-
const void* reference_result,
103-
float rtol,
104-
float atol,
105-
std::size_t size,
106-
hipStream_t stream = nullptr)
229+
GpuVerifyResult gpu_verify(const void* device_result,
230+
const void* reference_result,
231+
float rtol,
232+
float atol,
233+
std::size_t size,
234+
hipStream_t stream = nullptr)
107235
{
108236
// Allocate result buffer on device
109-
int* passed_dev;
110-
hip_check_error(hipMalloc(&passed_dev, sizeof(int)));
237+
GpuVerifyDeviceResult* result_dev;
238+
hip_check_error(hipMalloc(&result_dev, sizeof(GpuVerifyDeviceResult)));
111239

112-
// Initialize to passed (1)
113-
int passed_host = 1;
114-
hip_check_error(hipMemcpy(passed_dev, &passed_host, sizeof(int), hipMemcpyHostToDevice));
240+
// Initialize result struct
241+
GpuVerifyDeviceResult result_host;
242+
result_host.error_count = 0; // No errors yet
243+
result_host.max_error = 0.0f; // No error observed
244+
result_host.all_zero = 1; // Start assuming all zeros (will be cleared if nonzero found)
245+
hip_check_error(
246+
hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice));
115247

116248
// Launch kernel with grid-stride loop
117249
// Use 65535 as max grid size (hardware limit for grid dimension in x)
@@ -125,20 +257,28 @@ bool gpu_verify(const void* device_result,
125257
rtol,
126258
atol,
127259
static_cast<long long>(size),
128-
passed_dev);
260+
result_dev);
129261

130262
hip_check_error(hipGetLastError());
131263

132264
// Synchronize the stream to ensure kernel completion before reading results
133265
hip_check_error(hipStreamSynchronize(stream));
134266

135267
// Get result
136-
hip_check_error(hipMemcpy(&passed_host, passed_dev, sizeof(int), hipMemcpyDeviceToHost));
268+
hip_check_error(
269+
hipMemcpy(&result_host, result_dev, sizeof(GpuVerifyDeviceResult), hipMemcpyDeviceToHost));
137270

138271
// Free device memory
139-
hip_check_error(hipFree(passed_dev));
272+
hip_check_error(hipFree(result_dev));
273+
274+
// Build and return result struct
275+
GpuVerifyResult result;
276+
result.error_count = result_host.error_count;
277+
result.max_error = result_host.max_error;
278+
result.total = size;
279+
result.all_zero = (result_host.all_zero == 1);
140280

141-
return passed_host == 1;
281+
return result;
142282
}
143283

144284
// Forward declaration of gpu_reduce_max
@@ -147,15 +287,15 @@ float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t st
147287

148288
// Host-side wrapper for GPU verification with automatic tolerance computation
149289
// Computes max value on GPU, then computes tolerances and verifies
150-
// Returns true if verification passed, false otherwise
290+
// Returns GpuVerifyResult with detailed error information
151291
template <typename OutDataType,
152292
typename ComputeDataType = OutDataType,
153293
typename AccDataType = ComputeDataType>
154-
bool gpu_verify(const void* device_result,
155-
const void* reference_result,
156-
int number_of_accumulations,
157-
std::size_t size,
158-
hipStream_t stream = nullptr)
294+
GpuVerifyResult gpu_verify(const void* device_result,
295+
const void* reference_result,
296+
int number_of_accumulations,
297+
std::size_t size,
298+
hipStream_t stream = nullptr)
159299
{
160300
// Compute max absolute value on GPU (only 4 bytes transferred!)
161301
double max_abs_value =
@@ -187,24 +327,6 @@ bool gpu_verify(const void* device_result,
187327
return gpu_verify<OutDataType>(device_result, reference_result, rtol, atol, size, stream);
188328
}
189329

190-
//
191-
// Helper function for atomic float max (using compare-and-swap)
192-
__device__ __forceinline__ float atomicMaxFloat(float* address, float val)
193-
{
194-
int* address_as_int = reinterpret_cast<int*>(address);
195-
int old = *address_as_int;
196-
int assumed;
197-
198-
do
199-
{
200-
assumed = old;
201-
old =
202-
atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed))));
203-
} while(assumed != old);
204-
205-
return __int_as_float(old);
206-
}
207-
208330
// GPU reduction kernel for computing max(abs(data))
209331
// This is an internal kernel called only by gpu_reduce_max() wrapper.
210332
//
@@ -231,7 +353,7 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr
231353
__syncthreads();
232354

233355
// Block-level reduction: 256 -> 128 -> 64 -> 32
234-
for(unsigned int s = block_size / 2; s > 32; s >>= 1)
356+
for(unsigned int s = block_size / 2; s >= 32; s >>= 1)
235357
{
236358
if(threadIdx.x < s)
237359
{
@@ -240,26 +362,16 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr
240362
__syncthreads();
241363
}
242364

243-
// Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1
244-
// No sync needed within a warp
245-
if(threadIdx.x < 32)
246-
{
247-
volatile float* smem = shared_max;
248-
smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 32]);
249-
smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 16]);
250-
smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 8]);
251-
smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 4]);
252-
smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 2]);
253-
smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 1]);
254-
}
255-
256-
// Two-phase reduction pattern minimizes atomic contention:
257-
// 1. Each block reduces to shared memory (above)
258-
// 2. Single thread per block updates global max (below)
259-
// This limits atomic operations to O(grid_size) rather than O(total_threads)
365+
// Final reduction of remaining 32 elements in thread 0
260366
if(threadIdx.x == 0)
261367
{
262-
atomicMaxFloat(max_val, shared_max[0]);
368+
for(int i = 1; i < 32; ++i)
369+
{
370+
shared_max[0] = fmaxf(shared_max[0], shared_max[i]);
371+
}
372+
373+
// Single atomic update per block
374+
atomicMax(max_val, shared_max[0]);
263375
}
264376
}
265377

0 commit comments

Comments
 (0)