Skip to content

Commit cfa7bf8

Browse files
committed
fix(ci): resolve CUDA build and format check failures
- Specify explicit CUDA architectures (70-90) in CI to enable WMMA compilation in containers without a GPU (native detection fails)
- Format source files with clang-format to pass the format check job
1 parent 0279e41 commit cfa7bf8

3 files changed

Lines changed: 47 additions & 53 deletions

File tree

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ jobs:
4646
uses: actions/checkout@v4
4747

4848
- name: Configure
49-
run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=OFF
49+
run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=OFF -DCMAKE_CUDA_ARCHITECTURES="70;75;80;86;89;90"
5050

5151
- name: Build
5252
run: cmake --build build --target sgemm_benchmark -j2

src/main.cu

Lines changed: 15 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ const std::vector<std::tuple<int, int, int>> DEFAULT_CASES = {
3232
{256, 384, 640},
3333
{511, 513, 1025},
3434
};
35-
}
35+
} // namespace
3636

3737
void naive_kernel(const float *A, const float *B, float *C, int M, int K,
3838
int N) {
@@ -98,8 +98,7 @@ void runBenchmarks(int M, int K, int N) {
9898

9999
if (tensorCoreDimensionsSupported(M, K, N)) {
100100
printf("Running Tensor Core SGEMM (compute-only WMMA path)...\n");
101-
benchmark.runTensorCoreComputeOnly(M, K, N, warmup_runs,
102-
benchmark_runs,
101+
benchmark.runTensorCoreComputeOnly(M, K, N, warmup_runs, benchmark_runs,
103102
kTensorCoreVerifyTolerance);
104103
} else {
105104
printf("Skipping Tensor Core compute-only benchmark (requires positive "
@@ -110,8 +109,9 @@ void runBenchmarks(int M, int K, int N) {
110109
CUDA_CHECK(cudaGetDevice(&device));
111110
cudaDeviceProp prop;
112111
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
113-
printf("Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n",
114-
prop.major, prop.minor);
112+
printf(
113+
"Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n",
114+
prop.major, prop.minor);
115115
}
116116

117117
benchmark.printSummary();
@@ -125,7 +125,8 @@ void runBenchmarks(int M, int K, int N) {
125125
void printUsage(const char *program) {
126126
printf("Usage: %s [options]\n", program);
127127
printf("\nOptions:\n");
128-
printf(" -s, --size SIZE Benchmark one square SIZE x SIZE x SIZE case\n");
128+
printf(" -s, --size SIZE Benchmark one square SIZE x SIZE x SIZE "
129+
"case\n");
129130
printf(" --dims M K N Benchmark one explicit M x K x N case\n");
130131
printf(" -a, --all Run the default benchmark set\n");
131132
printf(" --warmup N Number of warmup runs (default: 5)\n");
@@ -134,7 +135,8 @@ void printUsage(const char *program) {
134135
printf("\nDefault benchmark set includes:\n");
135136
printf(" - aligned square cases (512, 1024)\n");
136137
printf(" - one aligned non-square case (256 x 384 x 640)\n");
137-
printf(" - one unaligned edge case (511 x 513 x 1025) to exercise safe Tensor Core fallback\n");
138+
printf(" - one unaligned edge case (511 x 513 x 1025) to exercise safe "
139+
"Tensor Core fallback\n");
138140
printf("\nExamples:\n");
139141
printf(" %s -s 1024\n", program);
140142
printf(" %s --dims 256 384 640\n", program);
@@ -260,9 +262,12 @@ int main(int argc, char **argv) {
260262
printf("\n");
261263
printf("Notes:\n");
262264
printf(" - Standard kernels are verified with shared FP32 tolerances.\n");
263-
printf(" - Tensor Core verification uses relaxed mixed-precision tolerances.\n");
264-
printf(" - The end-to-end Tensor Core result includes FP32->FP16 conversion and safe fallback behavior.\n");
265-
printf(" - The compute-only Tensor Core result is only shown for WMMA-compatible dimensions.\n");
265+
printf(" - Tensor Core verification uses relaxed mixed-precision "
266+
"tolerances.\n");
267+
printf(" - The end-to-end Tensor Core result includes FP32->FP16 conversion "
268+
"and safe fallback behavior.\n");
269+
printf(" - The compute-only Tensor Core result is only shown for "
270+
"WMMA-compatible dimensions.\n");
266271
printf("\n");
267272

268273
return 0;

tests/test_sgemm.cu

Lines changed: 31 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -25,29 +25,16 @@ constexpr int PBT_ITERATIONS = 100;
2525

2626
std::vector<std::tuple<int, int, int>> getStandardDimensions() {
2727
return {
28-
{1, 1, 1},
29-
{16, 16, 16},
30-
{32, 32, 32},
31-
{64, 64, 64},
32-
{128, 128, 128},
33-
{256, 256, 256},
34-
{512, 512, 512},
35-
{64, 128, 256},
36-
{256, 64, 128},
37-
{128, 256, 64},
38-
{511, 513, 1025},
28+
{1, 1, 1}, {16, 16, 16}, {32, 32, 32}, {64, 64, 64},
29+
{128, 128, 128}, {256, 256, 256}, {512, 512, 512}, {64, 128, 256},
30+
{256, 64, 128}, {128, 256, 64}, {511, 513, 1025},
3931
};
4032
}
4133

4234
std::vector<std::tuple<int, int, int>> getTensorCoreFastPathDimensions() {
4335
return {
44-
{16, 16, 16},
45-
{32, 32, 32},
46-
{64, 64, 64},
47-
{128, 128, 128},
48-
{256, 256, 256},
49-
{64, 128, 256},
50-
{256, 64, 128},
36+
{16, 16, 16}, {32, 32, 32}, {64, 64, 64}, {128, 128, 128},
37+
{256, 256, 256}, {64, 128, 256}, {256, 64, 128},
5138
};
5239
}
5340

@@ -96,9 +83,8 @@ TEST_F(ErrorDetectionTest, StandardKernelErrorDetection) {
9683
h_test_[i] = h_ref_[i] + error_magnitude * (dist(gen) > 0 ? 1 : -1);
9784
}
9885

99-
VerifyResult result =
100-
compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
101-
kStandardVerifyTolerance);
86+
VerifyResult result = compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
87+
kStandardVerifyTolerance);
10288

10389
EXPECT_TRUE(SGEMMVerifier::shouldFlagAsIncorrect(result))
10490
<< "Iteration " << iter << ": error above tolerance should be flagged";
@@ -119,9 +105,8 @@ TEST_F(ErrorDetectionTest, StandardKernelPassesWithinTolerance) {
119105
h_test_[i] = h_ref_[i] + error_magnitude * dist(gen);
120106
}
121107

122-
VerifyResult result =
123-
compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
124-
kStandardVerifyTolerance);
108+
VerifyResult result = compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
109+
kStandardVerifyTolerance);
125110

126111
EXPECT_TRUE(result.passed)
127112
<< "Iteration " << iter << ": error within tolerance should pass";
@@ -142,9 +127,8 @@ TEST_F(ErrorDetectionTest, TensorCoreErrorDetection) {
142127
h_test_[i] = h_ref_[i] + error_magnitude * (dist(gen) > 0 ? 1 : -1);
143128
}
144129

145-
VerifyResult result =
146-
compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
147-
kTensorCoreVerifyTolerance);
130+
VerifyResult result = compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
131+
kTensorCoreVerifyTolerance);
148132

149133
EXPECT_TRUE(SGEMMVerifier::shouldFlagAsIncorrect(result))
150134
<< "Iteration " << iter
@@ -193,9 +177,9 @@ protected:
193177
}
194178

195179
template <typename LaunchFn>
196-
VerifyResult runKernelAndCompare(LaunchFn launch_fn,
197-
VerifyTolerance tolerance =
198-
kStandardVerifyTolerance) {
180+
VerifyResult
181+
runKernelAndCompare(LaunchFn launch_fn,
182+
VerifyTolerance tolerance = kStandardVerifyTolerance) {
199183
CUDA_CHECK(cudaMemset(d_C_, 0, M_ * N_ * sizeof(float)));
200184
launch_fn();
201185
CUDA_CHECK(cudaDeviceSynchronize());
@@ -244,8 +228,9 @@ INSTANTIATE_TEST_SUITE_P(StandardDimensions, TiledSGEMMTest,
244228
class BankConflictFreeSGEMMTest : public SGEMMKernelTest {};
245229

246230
TEST_P(BankConflictFreeSGEMMTest, CorrectnessProperty) {
247-
VerifyResult result = runKernelAndCompare(
248-
[&] { launch_bank_conflict_free_sgemm<32>(d_A_, d_B_, d_C_, M_, K_, N_); });
231+
VerifyResult result = runKernelAndCompare([&] {
232+
launch_bank_conflict_free_sgemm<32>(d_A_, d_B_, d_C_, M_, K_, N_);
233+
});
249234

250235
EXPECT_TRUE(result.passed)
251236
<< "BankConflictFree SGEMM failed for dimensions " << M_ << "x" << K_
@@ -287,8 +272,9 @@ TEST_P(TensorCoreSGEMMTest, FastPathCorrectnessProperty) {
287272
<< "x" << N_ << " (max_rel_error: " << result.max_rel_error << ")";
288273
}
289274

290-
INSTANTIATE_TEST_SUITE_P(TensorCoreFastPathDimensions, TensorCoreSGEMMTest,
291-
::testing::ValuesIn(getTensorCoreFastPathDimensions()));
275+
INSTANTIATE_TEST_SUITE_P(
276+
TensorCoreFastPathDimensions, TensorCoreSGEMMTest,
277+
::testing::ValuesIn(getTensorCoreFastPathDimensions()));
292278

293279
class TensorCoreFallbackTest : public SGEMMKernelTest {};
294280

@@ -302,13 +288,17 @@ TEST_P(TensorCoreFallbackTest, NonAlignedInputsFallbackSafely) {
302288
<< N_ << " (max_rel_error: " << result.max_rel_error << ")";
303289
}
304290

305-
INSTANTIATE_TEST_SUITE_P(TensorCoreFallbackDimensions, TensorCoreFallbackTest,
306-
::testing::ValuesIn(getTensorCoreFallbackDimensions()));
291+
INSTANTIATE_TEST_SUITE_P(
292+
TensorCoreFallbackDimensions, TensorCoreFallbackTest,
293+
::testing::ValuesIn(getTensorCoreFallbackDimensions()));
307294

308295
TEST(TensorCoreWrapperTest, ZeroSizeInputsReturnSafely) {
309-
EXPECT_NO_THROW(launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 0, 16, 16));
310-
EXPECT_NO_THROW(launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 0, 16));
311-
EXPECT_NO_THROW(launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 16, 0));
296+
EXPECT_NO_THROW(
297+
launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 0, 16, 16));
298+
EXPECT_NO_THROW(
299+
launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 0, 16));
300+
EXPECT_NO_THROW(
301+
launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 16, 0));
312302
}
313303

314304
class DimensionInvarianceTest : public ::testing::Test {
@@ -358,9 +348,8 @@ TEST_F(DimensionInvarianceTest, AllStandardKernelsWorkWithVariousDimensions) {
358348
CUDA_CHECK(cudaMemcpy(h_C.data(), d_C, M * N * sizeof(float),
359349
cudaMemcpyDeviceToHost));
360350

361-
VerifyResult result =
362-
compareMatrices(h_C.data(), h_ref.data(), M, N,
363-
kStandardVerifyTolerance);
351+
VerifyResult result = compareMatrices(h_C.data(), h_ref.data(), M, N,
352+
kStandardVerifyTolerance);
364353
EXPECT_TRUE(result.passed)
365354
<< name << " failed at iteration " << iter << " with dimensions " << M
366355
<< "x" << K << "x" << N;

0 commit comments

Comments
 (0)