fix(ci): resolve format check and CUDA build failures

LessUp · qwencoder · LessUp · commit ce85eebb0c70 · 2026-04-17T01:28:47.000+08:00
- Fix clang-format violations in src/main.cu (line length, function signatures)
- Add __CUDA_ARCH__ guards around WMMA code in tensor_core_sgemm.cuh
- Include <mma.h> conditionally to prevent compilation errors on older architectures
- Add --verbose flag to CI build for better debugging

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
diff --git a/.clang-format b/.clang-format
@@ -0,0 +1,4 @@
+---
+BasedOnStyle: LLVM
+IndentWidth: 2
+ColumnLimit: 100
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -49,7 +49,7 @@ jobs:
         run: cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTS=OFF -DCMAKE_CUDA_ARCHITECTURES="70;75;80;86;89;90"
 
       - name: Build
-        run: cmake --build build --target sgemm_benchmark -j2
+        run: cmake --build build --target sgemm_benchmark -j2 --verbose
 
       - name: Info
         run: |
diff --git a/src/kernels/tensor_core_sgemm.cuh b/src/kernels/tensor_core_sgemm.cuh
@@ -1,11 +1,17 @@
 #pragma once
 
+#include "../utils/cuda_utils.cuh"
 #include "bank_conflict_free_sgemm.cuh"
 #include "tiled_sgemm.cuh"
-#include "../utils/cuda_utils.cuh"
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+
+// WMMA is only available on sm_70+
+// When compiling for host (__CUDA_ARCH__ not defined), always include WMMA
+// When compiling for device, only include for sm_70+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
 #include <mma.h>
+#endif
 
 namespace tensor_core {
 inline constexpr int WMMA_M = 16;
@@ -18,8 +24,7 @@ using tensor_core::WMMA_M;
 using tensor_core::WMMA_N;
 
 inline bool tensorCoreDimensionsSupported(int M, int K, int N) {
-  return M > 0 && K > 0 && N > 0 && M % WMMA_M == 0 && K % WMMA_K == 0 &&
-         N % WMMA_N == 0;
+  return M > 0 && K > 0 && N > 0 && M % WMMA_M == 0 && K % WMMA_K == 0 && N % WMMA_N == 0;
 }
 
 inline bool tensorCoresAvailable() {
@@ -34,14 +39,16 @@ inline bool tensorCoresAvailable() {
 /**
  * Kernel to convert FP32 to FP16
  */
-__global__ void float_to_half_kernel(const float *__restrict__ input,
-                                     half *__restrict__ output, int size) {
+__global__ void float_to_half_kernel(const float *__restrict__ input, half *__restrict__ output,
+                                     int size) {
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < size) {
     output[idx] = __float2half(input[idx]);
   }
 }
 
+// WMMA kernel is only available on sm_70+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
 /**
  * Basic Tensor Core SGEMM Kernel
  *
@@ -52,9 +59,8 @@ __global__ void float_to_half_kernel(const float *__restrict__ input,
  * Callers must validate dimensions before launching it.
  */
 __global__ void tensor_core_sgemm_kernel_fp16(const half *__restrict__ A,
-                                              const half *__restrict__ B,
-                                              float *__restrict__ C, int M,
-                                              int K, int N) {
+                                              const half *__restrict__ B, float *__restrict__ C,
+                                              int M, int K, int N) {
   int warpM = blockIdx.y;
   int warpN = blockIdx.x;
 
@@ -71,9 +77,7 @@ __global__ void tensor_core_sgemm_kernel_fp16(const half *__restrict__ A,
   nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half,
                          nvcuda::wmma::row_major>
       b_frag;
-  nvcuda::wmma::fragment<nvcuda::wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,
-                         float>
-      c_frag;
+  nvcuda::wmma::fragment<nvcuda::wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
 
   nvcuda::wmma::fill_fragment(c_frag, 0.0f);
 
@@ -83,30 +87,32 @@ __global__ void tensor_core_sgemm_kernel_fp16(const half *__restrict__ A,
     nvcuda::wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
   }
 
-  nvcuda::wmma::store_matrix_sync(C + aRow * N + bCol, c_frag, N,
-                                  nvcuda::wmma::mem_row_major);
+  nvcuda::wmma::store_matrix_sync(C + aRow * N + bCol, c_frag, N, nvcuda::wmma::mem_row_major);
 }
 
-inline void launch_tensor_core_sgemm_fp16_fast_path(const half *A, const half *B,
-                                                    float *C, int M, int K,
-                                                    int N,
-                                                    cudaStream_t stream = 0) {
+inline void launch_tensor_core_sgemm_fp16_fast_path(const half *A, const half *B, float *C, int M,
+                                                    int K, int N, cudaStream_t stream = 0) {
   dim3 blockDim(32, 1);
   dim3 gridDim((N + WMMA_N - 1) / WMMA_N, (M + WMMA_M - 1) / WMMA_M);
 
-  tensor_core_sgemm_kernel_fp16<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K,
-                                                                  N);
+  tensor_core_sgemm_kernel_fp16<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
 
   CUDA_CHECK(cudaGetLastError());
 }
+#else
+// Stub implementations for older architectures (will not be called)
+inline void launch_tensor_core_sgemm_fp16_fast_path(const half *, const half *, float *, int, int,
+                                                    int, cudaStream_t) {
+  // This function should never be called on pre-sm_70 GPUs
+}
+#endif
 
 /**
  * Launch wrapper for Tensor Core SGEMM
  * Handles FP32 to FP16 conversion internally and safely falls back when WMMA
  * constraints are not met.
  */
-inline void launch_tensor_core_sgemm(const float *A, const float *B, float *C,
-                                     int M, int K, int N,
+inline void launch_tensor_core_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
                                      cudaStream_t stream = 0) {
   if (M <= 0 || K <= 0 || N <= 0) {
     return;
@@ -124,31 +130,26 @@ inline void launch_tensor_core_sgemm(const float *A, const float *B, float *C,
   int gridSizeA = (M * K + blockSize - 1) / blockSize;
   int gridSizeB = (K * N + blockSize - 1) / blockSize;
 
-  float_to_half_kernel<<<gridSizeA, blockSize, 0, stream>>>(A, d_A_fp16.get(),
-                                                            M * K);
-  float_to_half_kernel<<<gridSizeB, blockSize, 0, stream>>>(B, d_B_fp16.get(),
-                                                            K * N);
+  float_to_half_kernel<<<gridSizeA, blockSize, 0, stream>>>(A, d_A_fp16.get(), M * K);
+  float_to_half_kernel<<<gridSizeB, blockSize, 0, stream>>>(B, d_B_fp16.get(), K * N);
   CUDA_CHECK(cudaGetLastError());
 
-  launch_tensor_core_sgemm_fp16_fast_path(d_A_fp16.get(), d_B_fp16.get(), C, M,
-                                          K, N, stream);
+  launch_tensor_core_sgemm_fp16_fast_path(d_A_fp16.get(), d_B_fp16.get(), C, M, K, N, stream);
 }
 
 /**
  * Tensor Core SGEMM with pre-converted FP16 inputs.
  * Falls back to a safe FP32 kernel when the WMMA fast path is not applicable.
  */
-inline void launch_tensor_core_sgemm_fp16(const half *A, const half *B, float *C,
-                                          int M, int K, int N,
-                                          cudaStream_t stream = 0) {
+inline void launch_tensor_core_sgemm_fp16(const half *A, const half *B, float *C, int M, int K,
+                                          int N, cudaStream_t stream = 0) {
   if (M <= 0 || K <= 0 || N <= 0) {
     return;
   }
 
   if (!tensorCoresAvailable() || !tensorCoreDimensionsSupported(M, K, N)) {
-    throw CudaError(
-        "launch_tensor_core_sgemm_fp16 requires sm_70+ and dimensions aligned "
-        "to 16");
+    throw CudaError("launch_tensor_core_sgemm_fp16 requires sm_70+ and dimensions aligned "
+                    "to 16");
   }
 
   launch_tensor_core_sgemm_fp16_fast_path(A, B, C, M, K, N, stream);
diff --git a/src/main.cu b/src/main.cu
@@ -34,28 +34,23 @@ const std::vector<std::tuple<int, int, int>> DEFAULT_CASES = {
 };
 } // namespace
 
-void naive_kernel(const float *A, const float *B, float *C, int M, int K,
-                  int N) {
+void naive_kernel(const float *A, const float *B, float *C, int M, int K, int N) {
   launch_naive_sgemm<32>(A, B, C, M, K, N);
 }
 
-void tiled_kernel(const float *A, const float *B, float *C, int M, int K,
-                  int N) {
+void tiled_kernel(const float *A, const float *B, float *C, int M, int K, int N) {
   launch_tiled_sgemm<32>(A, B, C, M, K, N);
 }
 
-void bank_conflict_free_kernel(const float *A, const float *B, float *C, int M,
-                               int K, int N) {
+void bank_conflict_free_kernel(const float *A, const float *B, float *C, int M, int K, int N) {
   launch_bank_conflict_free_sgemm<32>(A, B, C, M, K, N);
 }
 
-void double_buffer_kernel(const float *A, const float *B, float *C, int M,
-                          int K, int N) {
+void double_buffer_kernel(const float *A, const float *B, float *C, int M, int K, int N) {
   launch_double_buffer_sgemm<32>(A, B, C, M, K, N);
 }
 
-void tensor_core_kernel(const float *A, const float *B, float *C, int M, int K,
-                        int N) {
+void tensor_core_kernel(const float *A, const float *B, float *C, int M, int K, int N) {
   launch_tensor_core_sgemm(A, B, C, M, K, N);
 }
 
@@ -70,31 +65,30 @@ void runBenchmarks(int M, int K, int N) {
   SGEMMBenchmark benchmark;
 
   printf("\nRunning cuBLAS (reference)...\n");
-  BenchmarkResult cublas_result =
-      benchmark.runCublas(M, K, N, warmup_runs, benchmark_runs);
+  BenchmarkResult cublas_result = benchmark.runCublas(M, K, N, warmup_runs, benchmark_runs);
   float cublas_gflops = cublas_result.gflops;
 
   printf("Running Naive SGEMM...\n");
   benchmark.run("Naive", naive_kernel, M, K, N, warmup_runs, benchmark_runs,
                 kStandardVerifyTolerance);
 
   printf("Running Tiled SGEMM...\n");
-  benchmark.run("Tiled (32x32)", tiled_kernel, M, K, N, warmup_runs,
-                benchmark_runs, kStandardVerifyTolerance);
+  benchmark.run("Tiled (32x32)", tiled_kernel, M, K, N, warmup_runs, benchmark_runs,
+                kStandardVerifyTolerance);
 
   printf("Running Bank Conflict Free SGEMM...\n");
-  benchmark.run("Bank Conflict Free", bank_conflict_free_kernel, M, K, N,
-                warmup_runs, benchmark_runs, kStandardVerifyTolerance);
+  benchmark.run("Bank Conflict Free", bank_conflict_free_kernel, M, K, N, warmup_runs,
+                benchmark_runs, kStandardVerifyTolerance);
 
   printf("Running Double Buffer SGEMM...\n");
-  benchmark.run("Double Buffer", double_buffer_kernel, M, K, N, warmup_runs,
-                benchmark_runs, kStandardVerifyTolerance);
+  benchmark.run("Double Buffer", double_buffer_kernel, M, K, N, warmup_runs, benchmark_runs,
+                kStandardVerifyTolerance);
 
   if (tensorCoresAvailable()) {
     printf("Running Tensor Core SGEMM (end-to-end, includes FP32->FP16 "
            "conversion/fallback)...\n");
-    benchmark.run("Tensor Core (WMMA end-to-end)", tensor_core_kernel, M, K, N,
-                  warmup_runs, benchmark_runs, kTensorCoreVerifyTolerance);
+    benchmark.run("Tensor Core (WMMA end-to-end)", tensor_core_kernel, M, K, N, warmup_runs,
+                  benchmark_runs, kTensorCoreVerifyTolerance);
 
     if (tensorCoreDimensionsSupported(M, K, N)) {
       printf("Running Tensor Core SGEMM (compute-only WMMA path)...\n");
@@ -109,9 +103,8 @@ void runBenchmarks(int M, int K, int N) {
     CUDA_CHECK(cudaGetDevice(&device));
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-    printf(
-        "Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n",
-        prop.major, prop.minor);
+    printf("Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n", prop.major,
+           prop.minor);
   }
 
   benchmark.printSummary();