Skip to content

Commit ef7cef0

Browse files
LessUpqwencoder
andcommitted
style: format all CUDA source files with clang-format-17
Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
1 parent ce85eeb commit ef7cef0

8 files changed

Lines changed: 154 additions & 223 deletions

File tree

src/kernels/bank_conflict_free_sgemm.cuh

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,8 @@
4040
*/
4141
template <int TILE_SIZE>
4242
__global__ void bank_conflict_free_sgemm_kernel(const float *__restrict__ A,
43-
const float *__restrict__ B,
44-
float *__restrict__ C, int M,
45-
int K, int N) {
43+
const float *__restrict__ B, float *__restrict__ C,
44+
int M, int K, int N) {
4645
// Shared memory with padding to avoid bank conflicts
4746
// Adding 1 to the second dimension shifts each row by 1 bank
4847
// This ensures column accesses hit different banks
@@ -99,15 +98,12 @@ __global__ void bank_conflict_free_sgemm_kernel(const float *__restrict__ A,
9998
* Launch wrapper for bank conflict free SGEMM kernel
10099
*/
101100
template <int TILE_SIZE = 32>
102-
void launch_bank_conflict_free_sgemm(const float *A, const float *B, float *C,
103-
int M, int K, int N,
101+
void launch_bank_conflict_free_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
104102
cudaStream_t stream = 0) {
105103
dim3 blockDim(TILE_SIZE, TILE_SIZE);
106-
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
107-
(M + TILE_SIZE - 1) / TILE_SIZE);
104+
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
108105

109-
bank_conflict_free_sgemm_kernel<TILE_SIZE>
110-
<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
106+
bank_conflict_free_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
111107

112108
CUDA_CHECK(cudaGetLastError());
113109
}

src/kernels/double_buffer_sgemm.cuh

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,8 @@
4343
* C: M x N (row-major)
4444
*/
4545
template <int TILE_SIZE>
46-
__global__ void double_buffer_sgemm_kernel(const float *__restrict__ A,
47-
const float *__restrict__ B,
48-
float *__restrict__ C, int M, int K,
49-
int N) {
46+
__global__ void double_buffer_sgemm_kernel(const float *__restrict__ A, const float *__restrict__ B,
47+
float *__restrict__ C, int M, int K, int N) {
5048
// Double buffers with padding to avoid bank conflicts
5149
__shared__ float As[2][TILE_SIZE][TILE_SIZE + 1];
5250
__shared__ float Bs[2][TILE_SIZE][TILE_SIZE + 1];
@@ -132,14 +130,12 @@ __global__ void double_buffer_sgemm_kernel(const float *__restrict__ A,
132130
* Launch wrapper for double buffer SGEMM kernel
133131
*/
134132
template <int TILE_SIZE = 32>
135-
void launch_double_buffer_sgemm(const float *A, const float *B, float *C, int M,
136-
int K, int N, cudaStream_t stream = 0) {
133+
void launch_double_buffer_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
134+
cudaStream_t stream = 0) {
137135
dim3 blockDim(TILE_SIZE, TILE_SIZE);
138-
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
139-
(M + TILE_SIZE - 1) / TILE_SIZE);
136+
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
140137

141-
double_buffer_sgemm_kernel<TILE_SIZE>
142-
<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
138+
double_buffer_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
143139

144140
CUDA_CHECK(cudaGetLastError());
145141
}

src/kernels/naive_sgemm.cuh

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@
2626
* B: K x N (row-major)
2727
* C: M x N (row-major)
2828
*/
29-
__global__ void naive_sgemm_kernel(const float *__restrict__ A,
30-
const float *__restrict__ B,
29+
__global__ void naive_sgemm_kernel(const float *__restrict__ A, const float *__restrict__ B,
3130
float *__restrict__ C, int M, int K, int N) {
3231
// Calculate global row and column indices
3332
int row = blockIdx.y * blockDim.y + threadIdx.y;
@@ -62,13 +61,12 @@ __global__ void naive_sgemm_kernel(const float *__restrict__ A,
6261
* @param stream CUDA stream (default: 0)
6362
*/
6463
template <int BLOCK_SIZE = 32>
65-
void launch_naive_sgemm(const float *A, const float *B, float *C, int M, int K,
66-
int N, cudaStream_t stream = 0) {
64+
void launch_naive_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
65+
cudaStream_t stream = 0) {
6766
// Configure grid and block dimensions
6867
// Each thread computes one element of C
6968
dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
70-
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
71-
(M + BLOCK_SIZE - 1) / BLOCK_SIZE);
69+
dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
7270

7371
// Launch kernel
7472
naive_sgemm_kernel<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);

src/kernels/tiled_sgemm.cuh

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,7 @@
2525
* C: M x N (row-major)
2626
*/
2727
template <int TILE_SIZE>
28-
__global__ void tiled_sgemm_kernel(const float *__restrict__ A,
29-
const float *__restrict__ B,
28+
__global__ void tiled_sgemm_kernel(const float *__restrict__ A, const float *__restrict__ B,
3029
float *__restrict__ C, int M, int K, int N) {
3130
// Shared memory for tiles of A and B
3231
__shared__ float As[TILE_SIZE][TILE_SIZE];
@@ -102,18 +101,16 @@ __global__ void tiled_sgemm_kernel(const float *__restrict__ A,
102101
* @param stream CUDA stream (default: 0)
103102
*/
104103
template <int TILE_SIZE = 32>
105-
void launch_tiled_sgemm(const float *A, const float *B, float *C, int M, int K,
106-
int N, cudaStream_t stream = 0) {
104+
void launch_tiled_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
105+
cudaStream_t stream = 0) {
107106
// Block size matches tile size
108107
dim3 blockDim(TILE_SIZE, TILE_SIZE);
109108

110109
// Grid covers the output matrix
111-
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
112-
(M + TILE_SIZE - 1) / TILE_SIZE);
110+
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
113111

114112
// Launch kernel
115-
tiled_sgemm_kernel<TILE_SIZE>
116-
<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
113+
tiled_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
117114

118115
CUDA_CHECK(cudaGetLastError());
119116
}

src/utils/benchmark.cuh

Lines changed: 34 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@ struct BenchmarkResult {
3030
void print() const {
3131
printf(" %-30s | %4d x %4d x %4d | %8.3f ms | %8.2f GFLOPS | %s | err: "
3232
"%.2e\n",
33-
kernel_name.c_str(), M, K, N, time_ms, gflops,
34-
correct ? "PASS" : "FAIL", max_error);
33+
kernel_name.c_str(), M, K, N, time_ms, gflops, correct ? "PASS" : "FAIL", max_error);
3534
}
3635
};
3736

@@ -54,9 +53,8 @@ public:
5453
}
5554

5655
template <typename KernelFunc>
57-
BenchmarkResult run(const std::string &name, KernelFunc kernel_func, int M,
58-
int K, int N, int warmup_runs = 5,
59-
int benchmark_runs = 20,
56+
BenchmarkResult run(const std::string &name, KernelFunc kernel_func, int M, int K, int N,
57+
int warmup_runs = 5, int benchmark_runs = 20,
6058
VerifyTolerance tolerance = kStandardVerifyTolerance) {
6159
BenchmarkResult result;
6260
result.kernel_name = name;
@@ -77,9 +75,8 @@ public:
7775
d_B.copyFromHost(h_B.data(), K * N);
7876

7977
float alpha = 1.0f, beta = 0.0f;
80-
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
81-
&alpha, d_B.get(), N, d_A.get(), K, &beta,
82-
d_C_ref.get(), N));
78+
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
79+
N, d_A.get(), K, &beta, d_C_ref.get(), N));
8380

8481
for (int i = 0; i < warmup_runs; ++i) {
8582
d_C.zero();
@@ -101,17 +98,15 @@ public:
10198
d_C.copyToHost(h_C.data(), M * N);
10299
d_C_ref.copyToHost(h_C_ref.data(), M * N);
103100

104-
VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M,
105-
N, tolerance);
101+
VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M, N, tolerance);
106102
result.correct = verify_result.passed;
107103
result.max_error = verify_result.max_rel_error;
108104

109105
results_.push_back(result);
110106
return result;
111107
}
112108

113-
BenchmarkResult runCublas(int M, int K, int N, int warmup_runs = 5,
114-
int benchmark_runs = 20) {
109+
BenchmarkResult runCublas(int M, int K, int N, int warmup_runs = 5, int benchmark_runs = 20) {
115110
BenchmarkResult result;
116111
result.kernel_name = "cuBLAS";
117112
result.M = M;
@@ -132,17 +127,15 @@ public:
132127
float alpha = 1.0f, beta = 0.0f;
133128

134129
for (int i = 0; i < warmup_runs; ++i) {
135-
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M,
136-
K, &alpha, d_B.get(), N, d_A.get(), K, &beta,
137-
d_C.get(), N));
130+
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
131+
N, d_A.get(), K, &beta, d_C.get(), N));
138132
}
139133
CUDA_CHECK(cudaDeviceSynchronize());
140134

141135
CUDA_CHECK(cudaEventRecord(start_));
142136
for (int i = 0; i < benchmark_runs; ++i) {
143-
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M,
144-
K, &alpha, d_B.get(), N, d_A.get(), K, &beta,
145-
d_C.get(), N));
137+
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
138+
N, d_A.get(), K, &beta, d_C.get(), N));
146139
}
147140
CUDA_CHECK(cudaEventRecord(stop_));
148141
CUDA_CHECK(cudaEventSynchronize(stop_));
@@ -157,9 +150,9 @@ public:
157150
return result;
158151
}
159152

160-
BenchmarkResult runTensorCoreComputeOnly(
161-
int M, int K, int N, int warmup_runs = 5, int benchmark_runs = 20,
162-
VerifyTolerance tolerance = kTensorCoreVerifyTolerance) {
153+
BenchmarkResult runTensorCoreComputeOnly(int M, int K, int N, int warmup_runs = 5,
154+
int benchmark_runs = 20,
155+
VerifyTolerance tolerance = kTensorCoreVerifyTolerance) {
163156
if (!tensorCoresAvailable() || !tensorCoreDimensionsSupported(M, K, N)) {
164157
throw CudaError("Tensor Core compute-only benchmark requires sm_70+ and "
165158
"dimensions aligned to 16");
@@ -186,32 +179,27 @@ public:
186179
d_B.copyFromHost(h_B.data(), K * N);
187180

188181
float alpha = 1.0f, beta = 0.0f;
189-
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
190-
&alpha, d_B.get(), N, d_A.get(), K, &beta,
191-
d_C_ref.get(), N));
182+
CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
183+
N, d_A.get(), K, &beta, d_C_ref.get(), N));
192184

193185
int blockSize = 256;
194186
int gridSizeA = (M * K + blockSize - 1) / blockSize;
195187
int gridSizeB = (K * N + blockSize - 1) / blockSize;
196188

197-
float_to_half_kernel<<<gridSizeA, blockSize>>>(d_A.get(), d_A_fp16.get(),
198-
M * K);
199-
float_to_half_kernel<<<gridSizeB, blockSize>>>(d_B.get(), d_B_fp16.get(),
200-
K * N);
189+
float_to_half_kernel<<<gridSizeA, blockSize>>>(d_A.get(), d_A_fp16.get(), M * K);
190+
float_to_half_kernel<<<gridSizeB, blockSize>>>(d_B.get(), d_B_fp16.get(), K * N);
201191
CUDA_CHECK(cudaGetLastError());
202192
CUDA_CHECK(cudaDeviceSynchronize());
203193

204194
for (int i = 0; i < warmup_runs; ++i) {
205195
d_C.zero();
206-
launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M,
207-
K, N);
196+
launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N);
208197
}
209198
CUDA_CHECK(cudaDeviceSynchronize());
210199

211200
CUDA_CHECK(cudaEventRecord(start_));
212201
for (int i = 0; i < benchmark_runs; ++i) {
213-
launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M,
214-
K, N);
202+
launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N);
215203
}
216204
CUDA_CHECK(cudaEventRecord(stop_));
217205
CUDA_CHECK(cudaEventSynchronize(stop_));
@@ -223,8 +211,7 @@ public:
223211
d_C.copyToHost(h_C.data(), M * N);
224212
d_C_ref.copyToHost(h_C_ref.data(), M * N);
225213

226-
VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M,
227-
N, tolerance);
214+
VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M, N, tolerance);
228215
result.correct = verify_result.passed;
229216
result.max_error = verify_result.max_rel_error;
230217

@@ -239,8 +226,8 @@ public:
239226
printf(" SGEMM Benchmark Results\n");
240227
printf("===================================================================="
241228
"============\n");
242-
printf(" %-30s | %-17s | %10s | %14s | %4s | %s\n", "Kernel",
243-
"Dimensions", "Time", "Performance", "Pass", "Max Error");
229+
printf(" %-30s | %-17s | %10s | %14s | %4s | %s\n", "Kernel", "Dimensions", "Time",
230+
"Performance", "Pass", "Max Error");
244231
printf("--------------------------------------------------------------------"
245232
"------------\n");
246233

@@ -264,13 +251,12 @@ public:
264251
for (const auto &result : results_) {
265252
double flops = 2.0 * result.M * result.N * result.K;
266253
double bytes =
267-
(result.M * result.K + result.K * result.N + result.M * result.N) *
268-
sizeof(float);
254+
(result.M * result.K + result.K * result.N + result.M * result.N) * sizeof(float);
269255
double ai = flops / bytes;
270256

271-
file << result.kernel_name << "," << result.M << "," << result.K << ","
272-
<< result.N << "," << result.time_ms << "," << result.gflops << ","
273-
<< result.bandwidth_gb_s << "," << ai << "\n";
257+
file << result.kernel_name << "," << result.M << "," << result.K << "," << result.N << ","
258+
<< result.time_ms << "," << result.gflops << "," << result.bandwidth_gb_s << "," << ai
259+
<< "\n";
274260
}
275261

276262
file.close();
@@ -289,9 +275,8 @@ private:
289275
double flops = 2.0 * result.M * result.N * result.K;
290276
result.gflops = (flops / (result.time_ms * 1e-3)) / 1e9;
291277

292-
double bytes = (result.M * result.K + result.K * result.N +
293-
result.M * result.N) *
294-
sizeof(float);
278+
double bytes =
279+
(result.M * result.K + result.K * result.N + result.M * result.N) * sizeof(float);
295280
result.bandwidth_gb_s = (bytes / (result.time_ms * 1e-3)) / 1e9;
296281
}
297282

@@ -304,9 +289,8 @@ private:
304289
// Utility Functions
305290
// ============================================================================
306291

307-
inline void
308-
printPerformanceComparison(const std::vector<BenchmarkResult> &results,
309-
float cublas_gflops) {
292+
inline void printPerformanceComparison(const std::vector<BenchmarkResult> &results,
293+
float cublas_gflops) {
310294
printf("\n");
311295
printf("Performance Comparison (vs cuBLAS):\n");
312296
printf("---------------------------------------------------------------------"
@@ -317,8 +301,8 @@ printPerformanceComparison(const std::vector<BenchmarkResult> &results,
317301

318302
for (const auto &result : results) {
319303
float percentage = (result.gflops / cublas_gflops) * 100.0f;
320-
printf(" %-30s | %10.2f | %8.1f%%\n", result.kernel_name.c_str(),
321-
result.gflops, percentage);
304+
printf(" %-30s | %10.2f | %8.1f%%\n", result.kernel_name.c_str(), result.gflops,
305+
percentage);
322306
}
323307
printf("---------------------------------------------------------------------"
324308
"-----------\n");
@@ -348,8 +332,7 @@ inline float getTheoreticalPeakGflops() {
348332
float clockGHz = static_cast<float>(prop.clockRate) / 1e6f;
349333

350334
// Peak GFLOPS = SMs * cores/SM * 2 (FMA) * clock (GHz) * 1000 (MHz factor)
351-
float peakGflops =
352-
prop.multiProcessorCount * coresPerSM * 2 * clockGHz * 1000;
335+
float peakGflops = prop.multiProcessorCount * coresPerSM * 2 * clockGHz * 1000;
353336

354337
return peakGflops;
355338
}

0 commit comments

Comments
 (0)