Skip to content

Commit 8ab1429

Browse files
committed
fix: resolve CI workflow failures
1 parent 1d9d86e commit 8ab1429

11 files changed

Lines changed: 1910 additions & 1996 deletions

File tree

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ jobs:
4444
steps:
4545
- uses: actions/checkout@v4
4646
- name: Install clang-format
47-
run: sudo apt-get update && sudo apt-get install -y clang-format-14
47+
run: sudo apt-get update && sudo apt-get install -y clang-format
4848
- name: Check formatting
4949
run: |
5050
find . -type f \( -name '*.h' -o -name '*.hpp' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) \
51-
-not -path './build/*' -not -path './third_party/*' -print0 | xargs -0 -r clang-format-14 --dry-run --Werror
51+
-not -path './build/*' -not -path './third_party/*' -print0 | xargs -0 -r clang-format --dry-run --Werror
Lines changed: 135 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -1,204 +1,185 @@
11
#pragma once
22

3-
#include <cuda_runtime.h>
43
#include "../utils/cuda_utils.cuh"
4+
#include <cuda_runtime.h>
55

66
/**
77
* Bank Conflict Free SGEMM Kernel
8-
*
8+
*
99
* This implementation eliminates shared memory bank conflicts by adding padding
1010
* to the shared memory arrays.
11-
*
11+
*
1212
* ============================================================================
1313
* Bank Conflict Explanation:
1414
* ============================================================================
15-
*
15+
*
1616
* Shared memory is divided into 32 banks (on modern GPUs).
1717
* Each bank is 4 bytes wide (one float).
1818
* Consecutive 4-byte words map to consecutive banks.
19-
*
19+
*
2020
* Bank assignment: bank_id = (address / 4) % 32
21-
*
21+
*
2222
* For a 32x32 array stored row-major:
2323
* - Element [i][j] is at address: (i * 32 + j) * 4
2424
* - Bank: (i * 32 + j) % 32 = j (since 32 % 32 = 0)
25-
*
25+
*
2626
* Problem: When threads in a warp access column j of the array,
2727
* they all access the same bank j, causing a 32-way bank conflict!
28-
*
28+
*
2929
* Solution: Add 1 element of padding per row.
3030
* - Element [i][j] is now at address: (i * 33 + j) * 4
3131
* - Bank: (i * 33 + j) % 32 = (i + j) % 32
3232
* - Now threads accessing column j get different banks!
33-
*
33+
*
3434
* ============================================================================
35-
*
35+
*
3636
* C = A * B
3737
* A: M x K (row-major)
3838
* B: K x N (row-major)
3939
* C: M x N (row-major)
4040
*/
41-
template<int TILE_SIZE>
42-
__global__ void bank_conflict_free_sgemm_kernel(
43-
const float* __restrict__ A,
44-
const float* __restrict__ B,
45-
float* __restrict__ C,
46-
int M, int K, int N
47-
) {
48-
// Shared memory with padding to avoid bank conflicts
49-
// Adding 1 to the second dimension shifts each row by 1 bank
50-
// This ensures column accesses hit different banks
51-
__shared__ float As[TILE_SIZE][TILE_SIZE + 1]; // +1 padding
52-
__shared__ float Bs[TILE_SIZE][TILE_SIZE + 1]; // +1 padding
53-
54-
int bx = blockIdx.x;
55-
int by = blockIdx.y;
56-
int tx = threadIdx.x;
57-
int ty = threadIdx.y;
58-
59-
int row = by * TILE_SIZE + ty;
60-
int col = bx * TILE_SIZE + tx;
61-
62-
float sum = 0.0f;
63-
int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
64-
65-
for (int t = 0; t < numTiles; ++t) {
66-
// Load tile of A into shared memory (coalesced access)
67-
int aCol = t * TILE_SIZE + tx;
68-
if (row < M && aCol < K) {
69-
As[ty][tx] = A[row * K + aCol];
70-
} else {
71-
As[ty][tx] = 0.0f;
72-
}
73-
74-
// Load tile of B into shared memory (coalesced access)
75-
int bRow = t * TILE_SIZE + ty;
76-
if (bRow < K && col < N) {
77-
Bs[ty][tx] = B[bRow * N + col];
78-
} else {
79-
Bs[ty][tx] = 0.0f;
80-
}
81-
82-
__syncthreads();
83-
84-
// Compute partial dot product
85-
// Access pattern: As[ty][k] - row access (no conflict)
86-
// Bs[k][tx] - column access (no conflict due to padding!)
87-
#pragma unroll
88-
for (int k = 0; k < TILE_SIZE; ++k) {
89-
sum += As[ty][k] * Bs[k][tx];
90-
}
91-
92-
__syncthreads();
41+
template <int TILE_SIZE>
42+
__global__ void bank_conflict_free_sgemm_kernel(const float *__restrict__ A,
43+
const float *__restrict__ B,
44+
float *__restrict__ C, int M,
45+
int K, int N) {
46+
// Shared memory with padding to avoid bank conflicts
47+
// Adding 1 to the second dimension shifts each row by 1 bank
48+
// This ensures column accesses hit different banks
49+
__shared__ float As[TILE_SIZE][TILE_SIZE + 1]; // +1 padding
50+
__shared__ float Bs[TILE_SIZE][TILE_SIZE + 1]; // +1 padding
51+
52+
int bx = blockIdx.x;
53+
int by = blockIdx.y;
54+
int tx = threadIdx.x;
55+
int ty = threadIdx.y;
56+
57+
int row = by * TILE_SIZE + ty;
58+
int col = bx * TILE_SIZE + tx;
59+
60+
float sum = 0.0f;
61+
int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
62+
63+
for (int t = 0; t < numTiles; ++t) {
64+
// Load tile of A into shared memory (coalesced access)
65+
int aCol = t * TILE_SIZE + tx;
66+
if (row < M && aCol < K) {
67+
As[ty][tx] = A[row * K + aCol];
68+
} else {
69+
As[ty][tx] = 0.0f;
9370
}
94-
95-
if (row < M && col < N) {
96-
C[row * N + col] = sum;
71+
72+
// Load tile of B into shared memory (coalesced access)
73+
int bRow = t * TILE_SIZE + ty;
74+
if (bRow < K && col < N) {
75+
Bs[ty][tx] = B[bRow * N + col];
76+
} else {
77+
Bs[ty][tx] = 0.0f;
9778
}
79+
80+
__syncthreads();
81+
82+
// Compute partial dot product
83+
// Access pattern: As[ty][k] - row access (no conflict)
84+
// Bs[k][tx] - column access (no conflict due to padding!)
85+
#pragma unroll
86+
for (int k = 0; k < TILE_SIZE; ++k) {
87+
sum += As[ty][k] * Bs[k][tx];
88+
}
89+
90+
__syncthreads();
91+
}
92+
93+
if (row < M && col < N) {
94+
C[row * N + col] = sum;
95+
}
9896
}
9997

10098
/**
10199
* Launch wrapper for bank conflict free SGEMM kernel
102100
*/
103-
template<int TILE_SIZE = 32>
104-
void launch_bank_conflict_free_sgemm(
105-
const float* A,
106-
const float* B,
107-
float* C,
108-
int M, int K, int N,
109-
cudaStream_t stream = 0
110-
) {
111-
dim3 blockDim(TILE_SIZE, TILE_SIZE);
112-
dim3 gridDim(
113-
(N + TILE_SIZE - 1) / TILE_SIZE,
114-
(M + TILE_SIZE - 1) / TILE_SIZE
115-
);
116-
117-
bank_conflict_free_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(
118-
A, B, C, M, K, N
119-
);
120-
121-
CUDA_CHECK(cudaGetLastError());
101+
template <int TILE_SIZE = 32>
102+
void launch_bank_conflict_free_sgemm(const float *A, const float *B, float *C,
103+
int M, int K, int N,
104+
cudaStream_t stream = 0) {
105+
dim3 blockDim(TILE_SIZE, TILE_SIZE);
106+
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
107+
(M + TILE_SIZE - 1) / TILE_SIZE);
108+
109+
bank_conflict_free_sgemm_kernel<TILE_SIZE>
110+
<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
111+
112+
CUDA_CHECK(cudaGetLastError());
122113
}
123114

124115
/**
125116
* Alternative: Transposed B storage to avoid bank conflicts
126-
*
117+
*
127118
* Instead of padding, we can store B transposed in shared memory.
128119
* This changes the access pattern from column to row access.
129120
*/
130-
template<int TILE_SIZE>
121+
template <int TILE_SIZE>
131122
__global__ void bank_conflict_free_transposed_sgemm_kernel(
132-
const float* __restrict__ A,
133-
const float* __restrict__ B,
134-
float* __restrict__ C,
135-
int M, int K, int N
136-
) {
137-
// No padding needed if we transpose B
138-
__shared__ float As[TILE_SIZE][TILE_SIZE + 1];
139-
__shared__ float BsT[TILE_SIZE][TILE_SIZE + 1]; // B transposed
140-
141-
int bx = blockIdx.x;
142-
int by = blockIdx.y;
143-
int tx = threadIdx.x;
144-
int ty = threadIdx.y;
145-
146-
int row = by * TILE_SIZE + ty;
147-
int col = bx * TILE_SIZE + tx;
148-
149-
float sum = 0.0f;
150-
int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
151-
152-
for (int t = 0; t < numTiles; ++t) {
153-
// Load A normally
154-
int aCol = t * TILE_SIZE + tx;
155-
if (row < M && aCol < K) {
156-
As[ty][tx] = A[row * K + aCol];
157-
} else {
158-
As[ty][tx] = 0.0f;
159-
}
160-
161-
// Load B transposed: BsT[tx][ty] instead of Bs[ty][tx]
162-
int bRow = t * TILE_SIZE + ty;
163-
if (bRow < K && col < N) {
164-
BsT[tx][ty] = B[bRow * N + col]; // Note: indices swapped
165-
} else {
166-
BsT[tx][ty] = 0.0f;
167-
}
168-
169-
__syncthreads();
170-
171-
// Now both accesses are row-wise (no bank conflicts)
172-
#pragma unroll
173-
for (int k = 0; k < TILE_SIZE; ++k) {
174-
sum += As[ty][k] * BsT[tx][k]; // Both row accesses
175-
}
176-
177-
__syncthreads();
123+
const float *__restrict__ A, const float *__restrict__ B,
124+
float *__restrict__ C, int M, int K, int N) {
125+
// No padding needed if we transpose B
126+
__shared__ float As[TILE_SIZE][TILE_SIZE + 1];
127+
__shared__ float BsT[TILE_SIZE][TILE_SIZE + 1]; // B transposed
128+
129+
int bx = blockIdx.x;
130+
int by = blockIdx.y;
131+
int tx = threadIdx.x;
132+
int ty = threadIdx.y;
133+
134+
int row = by * TILE_SIZE + ty;
135+
int col = bx * TILE_SIZE + tx;
136+
137+
float sum = 0.0f;
138+
int numTiles = (K + TILE_SIZE - 1) / TILE_SIZE;
139+
140+
for (int t = 0; t < numTiles; ++t) {
141+
// Load A normally
142+
int aCol = t * TILE_SIZE + tx;
143+
if (row < M && aCol < K) {
144+
As[ty][tx] = A[row * K + aCol];
145+
} else {
146+
As[ty][tx] = 0.0f;
178147
}
179-
180-
if (row < M && col < N) {
181-
C[row * N + col] = sum;
148+
149+
// Load B transposed: BsT[tx][ty] instead of Bs[ty][tx]
150+
int bRow = t * TILE_SIZE + ty;
151+
if (bRow < K && col < N) {
152+
BsT[tx][ty] = B[bRow * N + col]; // Note: indices swapped
153+
} else {
154+
BsT[tx][ty] = 0.0f;
182155
}
156+
157+
__syncthreads();
158+
159+
// Now both accesses are row-wise (no bank conflicts)
160+
#pragma unroll
161+
for (int k = 0; k < TILE_SIZE; ++k) {
162+
sum += As[ty][k] * BsT[tx][k]; // Both row accesses
163+
}
164+
165+
__syncthreads();
166+
}
167+
168+
if (row < M && col < N) {
169+
C[row * N + col] = sum;
170+
}
183171
}
184172

185-
template<int TILE_SIZE = 32>
186-
void launch_bank_conflict_free_transposed_sgemm(
187-
const float* A,
188-
const float* B,
189-
float* C,
190-
int M, int K, int N,
191-
cudaStream_t stream = 0
192-
) {
193-
dim3 blockDim(TILE_SIZE, TILE_SIZE);
194-
dim3 gridDim(
195-
(N + TILE_SIZE - 1) / TILE_SIZE,
196-
(M + TILE_SIZE - 1) / TILE_SIZE
197-
);
198-
199-
bank_conflict_free_transposed_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(
200-
A, B, C, M, K, N
201-
);
202-
203-
CUDA_CHECK(cudaGetLastError());
173+
template <int TILE_SIZE = 32>
174+
void launch_bank_conflict_free_transposed_sgemm(const float *A, const float *B,
175+
float *C, int M, int K, int N,
176+
cudaStream_t stream = 0) {
177+
dim3 blockDim(TILE_SIZE, TILE_SIZE);
178+
dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
179+
(M + TILE_SIZE - 1) / TILE_SIZE);
180+
181+
bank_conflict_free_transposed_sgemm_kernel<TILE_SIZE>
182+
<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
183+
184+
CUDA_CHECK(cudaGetLastError());
204185
}

0 commit comments

Comments
 (0)