fix(ci): resolve CUDA build and format check failures

LessUp · LessUp · commit cfa7bf81700d · 2026-04-17T01:04:44.000+08:00
- Specify explicit CUDA architectures (sm_70 through sm_90) in CI so
  that WMMA code compiles in containers without a GPU, where native
  architecture detection fails
- Format source files with clang-format to pass the format check job
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -46,7 +46,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Configure
-        run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=OFF
+        run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=OFF -DCMAKE_CUDA_ARCHITECTURES="70;75;80;86;89;90"
 
       - name: Build
         run: cmake --build build --target sgemm_benchmark -j2
diff --git a/src/main.cu b/src/main.cu
@@ -32,7 +32,7 @@ const std::vector<std::tuple<int, int, int>> DEFAULT_CASES = {
     {256, 384, 640},
     {511, 513, 1025},
 };
-}
+} // namespace
 
 void naive_kernel(const float *A, const float *B, float *C, int M, int K,
                   int N) {
@@ -98,8 +98,7 @@ void runBenchmarks(int M, int K, int N) {
 
     if (tensorCoreDimensionsSupported(M, K, N)) {
       printf("Running Tensor Core SGEMM (compute-only WMMA path)...\n");
-      benchmark.runTensorCoreComputeOnly(M, K, N, warmup_runs,
-                                         benchmark_runs,
+      benchmark.runTensorCoreComputeOnly(M, K, N, warmup_runs, benchmark_runs,
                                          kTensorCoreVerifyTolerance);
     } else {
       printf("Skipping Tensor Core compute-only benchmark (requires positive "
@@ -110,8 +109,9 @@ void runBenchmarks(int M, int K, int N) {
     CUDA_CHECK(cudaGetDevice(&device));
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-    printf("Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n",
-           prop.major, prop.minor);
+    printf(
+        "Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n",
+        prop.major, prop.minor);
   }
 
   benchmark.printSummary();
@@ -125,7 +125,8 @@ void runBenchmarks(int M, int K, int N) {
 void printUsage(const char *program) {
   printf("Usage: %s [options]\n", program);
   printf("\nOptions:\n");
-  printf("  -s, --size SIZE          Benchmark one square SIZE x SIZE x SIZE case\n");
+  printf("  -s, --size SIZE          Benchmark one square SIZE x SIZE x SIZE "
+         "case\n");
   printf("  --dims M K N            Benchmark one explicit M x K x N case\n");
   printf("  -a, --all               Run the default benchmark set\n");
   printf("  --warmup N              Number of warmup runs (default: 5)\n");
@@ -134,7 +135,8 @@ void printUsage(const char *program) {
   printf("\nDefault benchmark set includes:\n");
   printf("  - aligned square cases (512, 1024)\n");
   printf("  - one aligned non-square case (256 x 384 x 640)\n");
-  printf("  - one unaligned edge case (511 x 513 x 1025) to exercise safe Tensor Core fallback\n");
+  printf("  - one unaligned edge case (511 x 513 x 1025) to exercise safe "
+         "Tensor Core fallback\n");
   printf("\nExamples:\n");
   printf("  %s -s 1024\n", program);
   printf("  %s --dims 256 384 640\n", program);
@@ -260,9 +262,12 @@ int main(int argc, char **argv) {
   printf("\n");
   printf("Notes:\n");
   printf("  - Standard kernels are verified with shared FP32 tolerances.\n");
-  printf("  - Tensor Core verification uses relaxed mixed-precision tolerances.\n");
-  printf("  - The end-to-end Tensor Core result includes FP32->FP16 conversion and safe fallback behavior.\n");
-  printf("  - The compute-only Tensor Core result is only shown for WMMA-compatible dimensions.\n");
+  printf("  - Tensor Core verification uses relaxed mixed-precision "
+         "tolerances.\n");
+  printf("  - The end-to-end Tensor Core result includes FP32->FP16 conversion "
+         "and safe fallback behavior.\n");
+  printf("  - The compute-only Tensor Core result is only shown for "
+         "WMMA-compatible dimensions.\n");
   printf("\n");
 
   return 0;
diff --git a/tests/test_sgemm.cu b/tests/test_sgemm.cu
@@ -25,29 +25,16 @@ constexpr int PBT_ITERATIONS = 100;
 
 std::vector<std::tuple<int, int, int>> getStandardDimensions() {
   return {
-      {1, 1, 1},
-      {16, 16, 16},
-      {32, 32, 32},
-      {64, 64, 64},
-      {128, 128, 128},
-      {256, 256, 256},
-      {512, 512, 512},
-      {64, 128, 256},
-      {256, 64, 128},
-      {128, 256, 64},
-      {511, 513, 1025},
+      {1, 1, 1},       {16, 16, 16},    {32, 32, 32},     {64, 64, 64},
+      {128, 128, 128}, {256, 256, 256}, {512, 512, 512},  {64, 128, 256},
+      {256, 64, 128},  {128, 256, 64},  {511, 513, 1025},
   };
 }
 
 std::vector<std::tuple<int, int, int>> getTensorCoreFastPathDimensions() {
   return {
-      {16, 16, 16},
-      {32, 32, 32},
-      {64, 64, 64},
-      {128, 128, 128},
-      {256, 256, 256},
-      {64, 128, 256},
-      {256, 64, 128},
+      {16, 16, 16},    {32, 32, 32},   {64, 64, 64},   {128, 128, 128},
+      {256, 256, 256}, {64, 128, 256}, {256, 64, 128},
   };
 }
 
@@ -96,9 +83,8 @@ TEST_F(ErrorDetectionTest, StandardKernelErrorDetection) {
       h_test_[i] = h_ref_[i] + error_magnitude * (dist(gen) > 0 ? 1 : -1);
     }
 
-    VerifyResult result =
-        compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
-                        kStandardVerifyTolerance);
+    VerifyResult result = compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
+                                          kStandardVerifyTolerance);
 
     EXPECT_TRUE(SGEMMVerifier::shouldFlagAsIncorrect(result))
         << "Iteration " << iter << ": error above tolerance should be flagged";
@@ -119,9 +105,8 @@ TEST_F(ErrorDetectionTest, StandardKernelPassesWithinTolerance) {
       h_test_[i] = h_ref_[i] + error_magnitude * dist(gen);
     }
 
-    VerifyResult result =
-        compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
-                        kStandardVerifyTolerance);
+    VerifyResult result = compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
+                                          kStandardVerifyTolerance);
 
     EXPECT_TRUE(result.passed)
         << "Iteration " << iter << ": error within tolerance should pass";
@@ -142,9 +127,8 @@ TEST_F(ErrorDetectionTest, TensorCoreErrorDetection) {
       h_test_[i] = h_ref_[i] + error_magnitude * (dist(gen) > 0 ? 1 : -1);
     }
 
-    VerifyResult result =
-        compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
-                        kTensorCoreVerifyTolerance);
+    VerifyResult result = compareMatrices(h_test_.data(), h_ref_.data(), 64, 64,
+                                          kTensorCoreVerifyTolerance);
 
     EXPECT_TRUE(SGEMMVerifier::shouldFlagAsIncorrect(result))
         << "Iteration " << iter
@@ -193,9 +177,9 @@ protected:
   }
 
   template <typename LaunchFn>
-  VerifyResult runKernelAndCompare(LaunchFn launch_fn,
-                                   VerifyTolerance tolerance =
-                                       kStandardVerifyTolerance) {
+  VerifyResult
+  runKernelAndCompare(LaunchFn launch_fn,
+                      VerifyTolerance tolerance = kStandardVerifyTolerance) {
     CUDA_CHECK(cudaMemset(d_C_, 0, M_ * N_ * sizeof(float)));
     launch_fn();
     CUDA_CHECK(cudaDeviceSynchronize());
@@ -244,8 +228,9 @@ INSTANTIATE_TEST_SUITE_P(StandardDimensions, TiledSGEMMTest,
 class BankConflictFreeSGEMMTest : public SGEMMKernelTest {};
 
 TEST_P(BankConflictFreeSGEMMTest, CorrectnessProperty) {
-  VerifyResult result = runKernelAndCompare(
-      [&] { launch_bank_conflict_free_sgemm<32>(d_A_, d_B_, d_C_, M_, K_, N_); });
+  VerifyResult result = runKernelAndCompare([&] {
+    launch_bank_conflict_free_sgemm<32>(d_A_, d_B_, d_C_, M_, K_, N_);
+  });
 
   EXPECT_TRUE(result.passed)
       << "BankConflictFree SGEMM failed for dimensions " << M_ << "x" << K_
@@ -287,8 +272,9 @@ TEST_P(TensorCoreSGEMMTest, FastPathCorrectnessProperty) {
       << "x" << N_ << " (max_rel_error: " << result.max_rel_error << ")";
 }
 
-INSTANTIATE_TEST_SUITE_P(TensorCoreFastPathDimensions, TensorCoreSGEMMTest,
-                         ::testing::ValuesIn(getTensorCoreFastPathDimensions()));
+INSTANTIATE_TEST_SUITE_P(
+    TensorCoreFastPathDimensions, TensorCoreSGEMMTest,
+    ::testing::ValuesIn(getTensorCoreFastPathDimensions()));
 
 class TensorCoreFallbackTest : public SGEMMKernelTest {};
 
@@ -302,13 +288,17 @@ TEST_P(TensorCoreFallbackTest, NonAlignedInputsFallbackSafely) {
       << N_ << " (max_rel_error: " << result.max_rel_error << ")";
 }
 
-INSTANTIATE_TEST_SUITE_P(TensorCoreFallbackDimensions, TensorCoreFallbackTest,
-                         ::testing::ValuesIn(getTensorCoreFallbackDimensions()));
+INSTANTIATE_TEST_SUITE_P(
+    TensorCoreFallbackDimensions, TensorCoreFallbackTest,
+    ::testing::ValuesIn(getTensorCoreFallbackDimensions()));
 
 TEST(TensorCoreWrapperTest, ZeroSizeInputsReturnSafely) {
-  EXPECT_NO_THROW(launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 0, 16, 16));
-  EXPECT_NO_THROW(launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 0, 16));
-  EXPECT_NO_THROW(launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 16, 0));
+  EXPECT_NO_THROW(
+      launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 0, 16, 16));
+  EXPECT_NO_THROW(
+      launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 0, 16));
+  EXPECT_NO_THROW(
+      launch_tensor_core_sgemm(nullptr, nullptr, nullptr, 16, 16, 0));
 }
 
 class DimensionInvarianceTest : public ::testing::Test {
@@ -358,9 +348,8 @@ TEST_F(DimensionInvarianceTest, AllStandardKernelsWorkWithVariousDimensions) {
       CUDA_CHECK(cudaMemcpy(h_C.data(), d_C, M * N * sizeof(float),
                             cudaMemcpyDeviceToHost));
 
-      VerifyResult result =
-          compareMatrices(h_C.data(), h_ref.data(), M, N,
-                          kStandardVerifyTolerance);
+      VerifyResult result = compareMatrices(h_C.data(), h_ref.data(), M, N,
+                                            kStandardVerifyTolerance);
       EXPECT_TRUE(result.passed)
           << name << " failed at iteration " << iter << " with dimensions " << M
           << "x" << K << "x" << N;