LessUp
diff --git a/src/kernels/bank_conflict_free_sgemm.cuh b/src/kernels/bank_conflict_free_sgemm.cuh
Lines changed: 5 additions & 9 deletions
diff --git a/src/kernels/double_buffer_sgemm.cuh b/src/kernels/double_buffer_sgemm.cuh
Lines changed: 6 additions & 10 deletions
diff --git a/src/kernels/naive_sgemm.cuh b/src/kernels/naive_sgemm.cuh
Lines changed: 4 additions & 6 deletions
diff --git a/src/kernels/tiled_sgemm.cuh b/src/kernels/tiled_sgemm.cuh
Lines changed: 5 additions & 8 deletions
diff --git a/src/utils/benchmark.cuh b/src/utils/benchmark.cuh
Lines changed: 34 additions & 51 deletions
@@ -40,9 +40,8 @@
  */
 template <int TILE_SIZE>
 __global__ void bank_conflict_free_sgemm_kernel(const float *__restrict__ A,
-                                                const float *__restrict__ B,
-                                                float *__restrict__ C, int M,
-                                                int K, int N) {
+                                                const float *__restrict__ B, float *__restrict__ C,
+                                                int M, int K, int N) {
   // Shared memory with padding to avoid bank conflicts
   // Adding 1 to the second dimension shifts each row by 1 bank
   // This ensures column accesses hit different banks
@@ -99,15 +98,12 @@ __global__ void bank_conflict_free_sgemm_kernel(const float *__restrict__ A,
  * Launch wrapper for bank conflict free SGEMM kernel
  */
 template <int TILE_SIZE = 32>
-void launch_bank_conflict_free_sgemm(const float *A, const float *B, float *C,
-                                     int M, int K, int N,
+void launch_bank_conflict_free_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
                                      cudaStream_t stream = 0) {
   dim3 blockDim(TILE_SIZE, TILE_SIZE);
-  dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
-               (M + TILE_SIZE - 1) / TILE_SIZE);
+  dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
 
-  bank_conflict_free_sgemm_kernel<TILE_SIZE>
-      <<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
+  bank_conflict_free_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
 
   CUDA_CHECK(cudaGetLastError());
 }
@@ -43,10 +43,8 @@
  * C: M x N (row-major)
  */
 template <int TILE_SIZE>
-__global__ void double_buffer_sgemm_kernel(const float *__restrict__ A,
-                                           const float *__restrict__ B,
-                                           float *__restrict__ C, int M, int K,
-                                           int N) {
+__global__ void double_buffer_sgemm_kernel(const float *__restrict__ A, const float *__restrict__ B,
+                                           float *__restrict__ C, int M, int K, int N) {
   // Double buffers with padding to avoid bank conflicts
   __shared__ float As[2][TILE_SIZE][TILE_SIZE + 1];
   __shared__ float Bs[2][TILE_SIZE][TILE_SIZE + 1];
@@ -132,14 +130,12 @@ __global__ void double_buffer_sgemm_kernel(const float *__restrict__ A,
  * Launch wrapper for double buffer SGEMM kernel
  */
 template <int TILE_SIZE = 32>
-void launch_double_buffer_sgemm(const float *A, const float *B, float *C, int M,
-                                int K, int N, cudaStream_t stream = 0) {
+void launch_double_buffer_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
+                                cudaStream_t stream = 0) {
   dim3 blockDim(TILE_SIZE, TILE_SIZE);
-  dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
-               (M + TILE_SIZE - 1) / TILE_SIZE);
+  dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
 
-  double_buffer_sgemm_kernel<TILE_SIZE>
-      <<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
+  double_buffer_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
 
   CUDA_CHECK(cudaGetLastError());
 }
@@ -26,8 +26,7 @@
  * B: K x N (row-major)
  * C: M x N (row-major)
  */
-__global__ void naive_sgemm_kernel(const float *__restrict__ A,
-                                   const float *__restrict__ B,
+__global__ void naive_sgemm_kernel(const float *__restrict__ A, const float *__restrict__ B,
                                    float *__restrict__ C, int M, int K, int N) {
   // Calculate global row and column indices
   int row = blockIdx.y * blockDim.y + threadIdx.y;
@@ -62,13 +61,12 @@ __global__ void naive_sgemm_kernel(const float *__restrict__ A,
  * @param stream CUDA stream (default: 0)
  */
 template <int BLOCK_SIZE = 32>
-void launch_naive_sgemm(const float *A, const float *B, float *C, int M, int K,
-                        int N, cudaStream_t stream = 0) {
+void launch_naive_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
+                        cudaStream_t stream = 0) {
   // Configure grid and block dimensions
   // Each thread computes one element of C
   dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
-  dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE,
-               (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
+  dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
 
   // Launch kernel
   naive_sgemm_kernel<<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
 
@@ -25,8 +25,7 @@
  * C: M x N (row-major)
  */
 template <int TILE_SIZE>
-__global__ void tiled_sgemm_kernel(const float *__restrict__ A,
-                                   const float *__restrict__ B,
+__global__ void tiled_sgemm_kernel(const float *__restrict__ A, const float *__restrict__ B,
                                    float *__restrict__ C, int M, int K, int N) {
   // Shared memory for tiles of A and B
   __shared__ float As[TILE_SIZE][TILE_SIZE];
@@ -102,18 +101,16 @@ __global__ void tiled_sgemm_kernel(const float *__restrict__ A,
  * @param stream CUDA stream (default: 0)
  */
 template <int TILE_SIZE = 32>
-void launch_tiled_sgemm(const float *A, const float *B, float *C, int M, int K,
-                        int N, cudaStream_t stream = 0) {
+void launch_tiled_sgemm(const float *A, const float *B, float *C, int M, int K, int N,
+                        cudaStream_t stream = 0) {
   // Block size matches tile size
   dim3 blockDim(TILE_SIZE, TILE_SIZE);
 
   // Grid covers the output matrix
-  dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE,
-               (M + TILE_SIZE - 1) / TILE_SIZE);
+  dim3 gridDim((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
 
   // Launch kernel
-  tiled_sgemm_kernel<TILE_SIZE>
-      <<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
+  tiled_sgemm_kernel<TILE_SIZE><<<gridDim, blockDim, 0, stream>>>(A, B, C, M, K, N);
 
   CUDA_CHECK(cudaGetLastError());
 }
@@ -30,8 +30,7 @@ struct BenchmarkResult {
   void print() const {
     printf("  %-30s | %4d x %4d x %4d | %8.3f ms | %8.2f GFLOPS | %s | err: "
            "%.2e\n",
-           kernel_name.c_str(), M, K, N, time_ms, gflops,
-           correct ? "PASS" : "FAIL", max_error);
+           kernel_name.c_str(), M, K, N, time_ms, gflops, correct ? "PASS" : "FAIL", max_error);
   }
 };
 
@@ -54,9 +53,8 @@ public:
   }
 
   template <typename KernelFunc>
-  BenchmarkResult run(const std::string &name, KernelFunc kernel_func, int M,
-                      int K, int N, int warmup_runs = 5,
-                      int benchmark_runs = 20,
+  BenchmarkResult run(const std::string &name, KernelFunc kernel_func, int M, int K, int N,
+                      int warmup_runs = 5, int benchmark_runs = 20,
                       VerifyTolerance tolerance = kStandardVerifyTolerance) {
     BenchmarkResult result;
     result.kernel_name = name;
@@ -77,9 +75,8 @@ public:
     d_B.copyFromHost(h_B.data(), K * N);
 
     float alpha = 1.0f, beta = 0.0f;
-    CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
-                             &alpha, d_B.get(), N, d_A.get(), K, &beta,
-                             d_C_ref.get(), N));
+    CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
+                             N, d_A.get(), K, &beta, d_C_ref.get(), N));
 
     for (int i = 0; i < warmup_runs; ++i) {
       d_C.zero();
@@ -101,17 +98,15 @@ public:
     d_C.copyToHost(h_C.data(), M * N);
     d_C_ref.copyToHost(h_C_ref.data(), M * N);
 
-    VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M,
-                                                 N, tolerance);
+    VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M, N, tolerance);
     result.correct = verify_result.passed;
     result.max_error = verify_result.max_rel_error;
 
     results_.push_back(result);
     return result;
   }
 
-  BenchmarkResult runCublas(int M, int K, int N, int warmup_runs = 5,
-                            int benchmark_runs = 20) {
+  BenchmarkResult runCublas(int M, int K, int N, int warmup_runs = 5, int benchmark_runs = 20) {
     BenchmarkResult result;
     result.kernel_name = "cuBLAS";
     result.M = M;
@@ -132,17 +127,15 @@ public:
     float alpha = 1.0f, beta = 0.0f;
 
     for (int i = 0; i < warmup_runs; ++i) {
-      CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M,
-                               K, &alpha, d_B.get(), N, d_A.get(), K, &beta,
-                               d_C.get(), N));
+      CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
+                               N, d_A.get(), K, &beta, d_C.get(), N));
     }
     CUDA_CHECK(cudaDeviceSynchronize());
 
     CUDA_CHECK(cudaEventRecord(start_));
     for (int i = 0; i < benchmark_runs; ++i) {
-      CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M,
-                               K, &alpha, d_B.get(), N, d_A.get(), K, &beta,
-                               d_C.get(), N));
+      CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
+                               N, d_A.get(), K, &beta, d_C.get(), N));
     }
     CUDA_CHECK(cudaEventRecord(stop_));
     CUDA_CHECK(cudaEventSynchronize(stop_));
@@ -157,9 +150,9 @@ public:
     return result;
   }
 
-  BenchmarkResult runTensorCoreComputeOnly(
-      int M, int K, int N, int warmup_runs = 5, int benchmark_runs = 20,
-      VerifyTolerance tolerance = kTensorCoreVerifyTolerance) {
+  BenchmarkResult runTensorCoreComputeOnly(int M, int K, int N, int warmup_runs = 5,
+                                           int benchmark_runs = 20,
+                                           VerifyTolerance tolerance = kTensorCoreVerifyTolerance) {
     if (!tensorCoresAvailable() || !tensorCoreDimensionsSupported(M, K, N)) {
       throw CudaError("Tensor Core compute-only benchmark requires sm_70+ and "
                       "dimensions aligned to 16");
@@ -186,32 +179,27 @@ public:
     d_B.copyFromHost(h_B.data(), K * N);
 
     float alpha = 1.0f, beta = 0.0f;
-    CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K,
-                             &alpha, d_B.get(), N, d_A.get(), K, &beta,
-                             d_C_ref.get(), N));
+    CUBLAS_CHECK(cublasSgemm(cublas_handle_, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(),
+                             N, d_A.get(), K, &beta, d_C_ref.get(), N));
 
     int blockSize = 256;
     int gridSizeA = (M * K + blockSize - 1) / blockSize;
     int gridSizeB = (K * N + blockSize - 1) / blockSize;
 
-    float_to_half_kernel<<<gridSizeA, blockSize>>>(d_A.get(), d_A_fp16.get(),
-                                                   M * K);
-    float_to_half_kernel<<<gridSizeB, blockSize>>>(d_B.get(), d_B_fp16.get(),
-                                                   K * N);
+    float_to_half_kernel<<<gridSizeA, blockSize>>>(d_A.get(), d_A_fp16.get(), M * K);
+    float_to_half_kernel<<<gridSizeB, blockSize>>>(d_B.get(), d_B_fp16.get(), K * N);
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 
     for (int i = 0; i < warmup_runs; ++i) {
       d_C.zero();
-      launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M,
-                                    K, N);
+      launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N);
     }
     CUDA_CHECK(cudaDeviceSynchronize());
 
     CUDA_CHECK(cudaEventRecord(start_));
     for (int i = 0; i < benchmark_runs; ++i) {
-      launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M,
-                                    K, N);
+      launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N);
     }
     CUDA_CHECK(cudaEventRecord(stop_));
     CUDA_CHECK(cudaEventSynchronize(stop_));
@@ -223,8 +211,7 @@ public:
     d_C.copyToHost(h_C.data(), M * N);
     d_C_ref.copyToHost(h_C_ref.data(), M * N);
 
-    VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M,
-                                                 N, tolerance);
+    VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M, N, tolerance);
     result.correct = verify_result.passed;
     result.max_error = verify_result.max_rel_error;
 
@@ -239,8 +226,8 @@ public:
     printf("                           SGEMM Benchmark Results\n");
     printf("===================================================================="
            "============\n");
-    printf("  %-30s | %-17s | %10s | %14s | %4s | %s\n", "Kernel",
-           "Dimensions", "Time", "Performance", "Pass", "Max Error");
+    printf("  %-30s | %-17s | %10s | %14s | %4s | %s\n", "Kernel", "Dimensions", "Time",
+           "Performance", "Pass", "Max Error");
     printf("--------------------------------------------------------------------"
            "------------\n");
 
@@ -264,13 +251,12 @@ public:
     for (const auto &result : results_) {
       double flops = 2.0 * result.M * result.N * result.K;
       double bytes =
-          (result.M * result.K + result.K * result.N + result.M * result.N) *
-          sizeof(float);
+          (result.M * result.K + result.K * result.N + result.M * result.N) * sizeof(float);
       double ai = flops / bytes;
 
-      file << result.kernel_name << "," << result.M << "," << result.K << ","
-           << result.N << "," << result.time_ms << "," << result.gflops << ","
-           << result.bandwidth_gb_s << "," << ai << "\n";
+      file << result.kernel_name << "," << result.M << "," << result.K << "," << result.N << ","
+           << result.time_ms << "," << result.gflops << "," << result.bandwidth_gb_s << "," << ai
+           << "\n";
     }
 
     file.close();
@@ -289,9 +275,8 @@ private:
     double flops = 2.0 * result.M * result.N * result.K;
     result.gflops = (flops / (result.time_ms * 1e-3)) / 1e9;
 
-    double bytes = (result.M * result.K + result.K * result.N +
-                    result.M * result.N) *
-                   sizeof(float);
+    double bytes =
+        (result.M * result.K + result.K * result.N + result.M * result.N) * sizeof(float);
     result.bandwidth_gb_s = (bytes / (result.time_ms * 1e-3)) / 1e9;
   }
 
@@ -304,9 +289,8 @@ private:
 // Utility Functions
 // ============================================================================
 
-inline void
-printPerformanceComparison(const std::vector<BenchmarkResult> &results,
-                           float cublas_gflops) {
+inline void printPerformanceComparison(const std::vector<BenchmarkResult> &results,
+                                       float cublas_gflops) {
   printf("\n");
   printf("Performance Comparison (vs cuBLAS):\n");
   printf("---------------------------------------------------------------------"
@@ -317,8 +301,8 @@ printPerformanceComparison(const std::vector<BenchmarkResult> &results,
 
   for (const auto &result : results) {
     float percentage = (result.gflops / cublas_gflops) * 100.0f;
-    printf("  %-30s | %10.2f     | %8.1f%%\n", result.kernel_name.c_str(),
-           result.gflops, percentage);
+    printf("  %-30s | %10.2f     | %8.1f%%\n", result.kernel_name.c_str(), result.gflops,
+           percentage);
   }
   printf("---------------------------------------------------------------------"
          "-----------\n");
@@ -348,8 +332,7 @@ inline float getTheoreticalPeakGflops() {
   float clockGHz = static_cast<float>(prop.clockRate) / 1e6f;
 
   // Peak GFLOPS = SMs * cores/SM * 2 (FMA) * clock (GHz) * 1000 (MHz factor)
-  float peakGflops =
-      prop.multiProcessorCount * coresPerSM * 2 * clockGHz * 1000;
+  float peakGflops = prop.multiProcessorCount * coresPerSM * 2 * clockGHz * 1000;
 
   return peakGflops;
 }