fix: harden CUDA kernel validation and error handling

LessUp · LessUp · commit 1ac5630b4570 · 2026-03-24T09:22:37.000+08:00
Propagate CUDA failures as exceptions and reject unsupported or invalid kernel configurations up front so incorrect launches fail fast instead of silently producing bad results.
diff --git a/src/03_gemm/gemm.cu b/src/03_gemm/gemm.cu
@@ -1,8 +1,22 @@
 #include "gemm.cuh"
 #include "../common/cuda_check.cuh"
+#include <stdexcept>
 
 namespace hpc::gemm {
 
+namespace {
+
+// Rejects null operand pointers and non-positive extents before any launch.
+void validate_gemm_args(const void* A, const void* B, const void* C, int M, int N, int K) {
+    const bool any_null = (A == nullptr) || (B == nullptr) || (C == nullptr);
+    if (any_null) throw std::invalid_argument("gemm expects non-null A, B, and C pointers");
+    // Throws std::invalid_argument in both cases; the pointer check runs first.
+    const bool all_positive = (M > 0) && (N > 0) && (K > 0);
+    if (!all_positive) throw std::invalid_argument("gemm expects positive M, N, and K");
+}
+
+} // namespace
+
 constexpr int TILE_SIZE = 32;
 
 // Naive GEMM: each thread computes one element
@@ -75,6 +89,7 @@ template <>
 void gemm<float, GemmOpt::Naive>(const float* A, const float* B, float* C,
                                   int M, int N, int K,
                                   float alpha, float beta, cudaStream_t stream) {
+    validate_gemm_args(A, B, C, M, N, K);
     dim3 block(16, 16);
     dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);
     gemm_naive_kernel<float><<<grid, block, 0, stream>>>(A, B, C, M, N, K, alpha, beta);
@@ -85,6 +100,7 @@ template <>
 void gemm<float, GemmOpt::SharedMemTiling>(const float* A, const float* B, float* C,
                                             int M, int N, int K,
                                             float alpha, float beta, cudaStream_t stream) {
+    validate_gemm_args(A, B, C, M, N, K);
     dim3 block(TILE_SIZE, TILE_SIZE);
     dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
     gemm_shared_kernel<float><<<grid, block, 0, stream>>>(A, B, C, M, N, K, alpha, beta);
@@ -177,6 +193,7 @@ template <>
 void gemm<float, GemmOpt::DoubleBuffer>(const float* A, const float* B, float* C,
                                          int M, int N, int K,
                                          float alpha, float beta, cudaStream_t stream) {
+    validate_gemm_args(A, B, C, M, N, K);
     dim3 block(TILE_SIZE, TILE_SIZE);
     dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
     gemm_double_buffer_kernel<float><<<grid, block, 0, stream>>>(A, B, C, M, N, K, alpha, beta);
@@ -301,6 +318,7 @@ template <>
 void gemm<float, GemmOpt::RegisterTiling>(const float* A, const float* B, float* C,
                                            int M, int N, int K,
                                            float alpha, float beta, cudaStream_t stream) {
+    validate_gemm_args(A, B, C, M, N, K);
     constexpr int THREADS_PER_BLOCK = (BLK_M / REG_TILE_M) * (BLK_N / REG_TILE_N);
     dim3 block(THREADS_PER_BLOCK);
     dim3 grid((N + BLK_N - 1) / BLK_N, (M + BLK_M - 1) / BLK_M);
@@ -435,6 +453,10 @@ template <>
 void gemm<__half, GemmOpt::TensorCoreWMMA>(const __half* A, const __half* B, __half* C,
                                             int M, int N, int K,
                                             float alpha, float beta, cudaStream_t stream) {
+    validate_gemm_args(A, B, C, M, N, K);
+    if ((M % 16) != 0 || (N % 16) != 0 || (K % 16) != 0) {
+        throw std::invalid_argument("TensorCoreWMMA requires M, N, and K to be multiples of 16");
+    }
     // Each block has multiple warps
     constexpr int WARPS_PER_BLOCK = (WMMA_BLK_M / WMMA_M) * (WMMA_BLK_N / WMMA_N);
     constexpr int THREADS_PER_BLOCK = WARPS_PER_BLOCK * 32;
@@ -662,6 +684,7 @@ template <>
 void gemm<float, GemmOpt::SoftwarePipeline>(const float* A, const float* B, float* C,
                                              int M, int N, int K,
                                              float alpha, float beta, cudaStream_t stream) {
+    validate_gemm_args(A, B, C, M, N, K);
     constexpr int THREADS_PER_BLOCK = 256;
     dim3 block(THREADS_PER_BLOCK);
     dim3 grid((N + PIPE_TILE_N - 1) / PIPE_TILE_N, (M + PIPE_TILE_M - 1) / PIPE_TILE_M);
@@ -733,6 +756,7 @@ template <>
 void gemm<int8_t, GemmOpt::SharedMemTiling>(const int8_t* A, const int8_t* B, int8_t* C,
                                              int M, int N, int K,
                                              float alpha, float beta, cudaStream_t stream) {
+    validate_gemm_args(A, B, C, M, N, K);
     dim3 block(TILE_SIZE, TILE_SIZE);
     dim3 grid((N + TILE_SIZE - 1) / TILE_SIZE, (M + TILE_SIZE - 1) / TILE_SIZE);
     gemm_shared_kernel<int8_t><<<grid, block, 0, stream>>>(A, B, C, M, N, K, alpha, beta);
diff --git a/src/04_convolution/conv_winograd.cu b/src/04_convolution/conv_winograd.cu
@@ -1,18 +1,23 @@
 #include "conv_winograd.cuh"
 #include "conv_implicit_gemm.cuh"
 #include "../common/cuda_check.cuh"
+#include <stdexcept>
 
 namespace hpc::convolution {
 
-// Winograd F(2x2, 3x3) transformation matrices
-// TODO: Implement full Winograd convolution
-
+// Experimental wrapper: until Winograd transforms are implemented, this path
+// intentionally falls back to the validated implicit GEMM implementation.
 template <>
 void conv2d_winograd<float>(const float* input, const float* weight, float* output,
                             int batch, int in_channels, int out_channels,
                             int height, int width, cudaStream_t stream) {
-    // Placeholder - full implementation requires Winograd transforms
-    // Fall back to implicit GEMM with 3x3 kernel, stride=1, pad=1
+    if (input == nullptr || weight == nullptr || output == nullptr) {
+        throw std::invalid_argument("conv2d_winograd expects non-null input, weight, and output pointers");
+    }
+    if (batch <= 0 || in_channels <= 0 || out_channels <= 0 || height <= 0 || width <= 0) {
+        throw std::invalid_argument("conv2d_winograd expects positive batch/channel/spatial dimensions");
+    }
+
     ConvParams params{batch, in_channels, out_channels, height, width,
                       3, 3, 1, 1, 1, 1, 1, 1};
     conv2d_implicit_gemm<float>(input, weight, output, params, stream);
diff --git a/src/05_attention/flash_attention.cu b/src/05_attention/flash_attention.cu
@@ -1,6 +1,8 @@
 #include "flash_attention.cuh"
 #include "../common/cuda_check.cuh"
 #include <cfloat>
+#include <cmath>
+#include <stdexcept>
 
 namespace hpc::attention {
 
@@ -19,7 +21,6 @@ __global__ void flash_attention_kernel(const T* __restrict__ Q,
     float* q_tile = smem;
     float* k_tile = q_tile + BLOCK_SIZE * HEAD_DIM;
     float* v_tile = k_tile + BLOCK_SIZE * HEAD_DIM;
-    float* scores = v_tile + BLOCK_SIZE * HEAD_DIM;
 
     int batch_head = blockIdx.x;
     int b = batch_head / num_heads;
@@ -118,20 +119,27 @@ void flash_attention_forward<float>(const float* Q, const float* K, const float*
                                     cudaStream_t stream) {
     constexpr int BLOCK_SIZE = 64;
     constexpr int HEAD_DIM = 64;
-    
+
+    if (Q == nullptr || K == nullptr || V == nullptr || O == nullptr) {
+        throw std::invalid_argument("flash_attention_forward expects non-null Q, K, V, and O pointers");
+    }
+    if (config.batch_size <= 0 || config.num_heads <= 0 || config.seq_len <= 0 ||
+        config.head_dim <= 0) {
+        throw std::invalid_argument("flash_attention_forward expects positive batch_size, num_heads, seq_len, and head_dim");
+    }
+    if (!std::isfinite(config.scale) || config.scale <= 0.0f) {
+        throw std::invalid_argument("flash_attention_forward expects a finite positive scale");
+    }
+    if (config.head_dim != HEAD_DIM) {
+        throw std::invalid_argument("flash_attention_forward currently supports head_dim == 64 only");
+    }
+
     dim3 grid(config.batch_size * config.num_heads,
               (config.seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE);
     dim3 block(BLOCK_SIZE);
-    
-    size_t smem_size = 3 * BLOCK_SIZE * HEAD_DIM * sizeof(float) + 
+
+    size_t smem_size = 3 * BLOCK_SIZE * HEAD_DIM * sizeof(float) +
                        BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
-    
-    // HEAD_DIM is a compile-time constant; assert config matches
-    if (config.head_dim != HEAD_DIM) {
-        fprintf(stderr, "flash_attention: config.head_dim=%d but compiled HEAD_DIM=%d\n",
-                config.head_dim, HEAD_DIM);
-        return;
-    }
 
     flash_attention_kernel<float, BLOCK_SIZE, HEAD_DIM><<<grid, block, smem_size, stream>>>(
         Q, K, V, O,
diff --git a/src/06_quantization/int8_quant.cu b/src/06_quantization/int8_quant.cu
@@ -2,6 +2,7 @@
 #include "../common/cuda_check.cuh"
 #include "../common/reduce.cuh"
 #include <cfloat>
+#include <stdexcept>
 
 namespace hpc::quantization {
 
@@ -22,7 +23,7 @@ __global__ void compute_scale_kernel(const float* __restrict__ input,
     max_abs = hpc::block_reduce_max(max_abs);
     
     if (threadIdx.x == 0) {
-        scale[row] = max_abs / 127.0f;
+        scale[row] = max_abs > 0.0f ? (max_abs / 127.0f) : 1.0f;
     }
 }
 
@@ -35,7 +36,12 @@ __global__ void quantize_kernel(const float* __restrict__ input,
     
     for (; idx < total; idx += blockDim.x * gridDim.x) {
         int row = idx / cols;
-        float inv_scale = 1.0f / scale[row];
+        float row_scale = scale[row];
+        if (row_scale == 0.0f) {
+            output[idx] = 0;
+            continue;
+        }
+        float inv_scale = 1.0f / row_scale;
         float val = input[idx] * inv_scale;
         val = fminf(fmaxf(val, -127.0f), 127.0f);
         output[idx] = static_cast<int8_t>(roundf(val));
@@ -44,8 +50,16 @@ __global__ void quantize_kernel(const float* __restrict__ input,
 
 void quantize_int8(const float* input, int8_t* output, float* scale,
                    int rows, int cols, cudaStream_t stream) {
+    if (input == nullptr || output == nullptr || scale == nullptr) {
+        throw std::invalid_argument("quantize_int8 expects non-null input, output, and scale pointers");
+    }
+    if (rows <= 0 || cols <= 0) {
+        throw std::invalid_argument("quantize_int8 expects rows and cols to be positive");
+    }
+
     compute_scale_kernel<<<rows, 256, 0, stream>>>(input, scale, rows, cols);
-    
+    CUDA_CHECK_LAST();
+
     int total = rows * cols;
     int block_size = 256;
     int grid_size = (total + block_size - 1) / block_size;
@@ -68,10 +82,17 @@ __global__ void dequantize_int8_kernel(const int8_t* __restrict__ input,
 
 void dequantize_int8(const int8_t* input, const float* scale,
                      float* output, int rows, int cols, cudaStream_t stream) {
+    if (input == nullptr || output == nullptr || scale == nullptr) {
+        throw std::invalid_argument("dequantize_int8 expects non-null input, output, and scale pointers");
+    }
+    if (rows <= 0 || cols <= 0) {
+        throw std::invalid_argument("dequantize_int8 expects rows and cols to be positive");
+    }
+
     int total = rows * cols;
     int block_size = 256;
     int grid_size = (total + block_size - 1) / block_size;
-    
+
     dequantize_int8_kernel<<<grid_size, block_size, 0, stream>>>(
         input, scale, output, rows, cols);
     CUDA_CHECK_LAST();
diff --git a/src/07_cuda13_features/CMakeLists.txt b/src/07_cuda13_features/CMakeLists.txt
@@ -1,4 +1,6 @@
-# CUDA 13 features module
+# Experimental newer-CUDA feature demos and compatibility fallbacks.
+# These targets are built as part of the lab, but building them does not imply
+# production-grade Hopper/Blackwell feature coverage.
 hpc_add_cuda_library(hpc_cuda13
     tma.cu
     cluster.cu
diff --git a/src/07_cuda13_features/cluster.cu b/src/07_cuda13_features/cluster.cu
@@ -1,10 +1,11 @@
 #include "cluster.cuh"
 #include "../common/cuda_check.cuh"
+#include <stdexcept>
 
 namespace hpc::cuda13 {
 
-// Thread Block Clusters placeholder
-// Requires Hopper architecture (SM90+) and CUDA 12+
+// Experimental fallback for a future thread-block-cluster implementation.
+// Today this uses a portable block reduction and does not rely on SM90-only features.
 
 template <typename T>
 __global__ void cluster_reduce_kernel(const T* __restrict__ input,
@@ -35,13 +36,22 @@ __global__ void cluster_reduce_kernel(const T* __restrict__ input,
 template <>
 void cluster_reduce<float>(const float* input, float* output, size_t n,
                            const ClusterConfig& config, cudaStream_t stream) {
+    if (input == nullptr || output == nullptr) {
+        throw std::invalid_argument("cluster_reduce expects non-null input and output pointers");
+    }
+    if (n == 0) {
+        throw std::invalid_argument("cluster_reduce expects n > 0");
+    }
+    if (config.block_dims.x == 0) {
+        throw std::invalid_argument("cluster_reduce expects config.block_dims.x > 0");
+    }
+
     int block_size = config.block_dims.x;
     int grid_size = (n + block_size - 1) / block_size;
     size_t smem_size = block_size * sizeof(float);
-    
-    // Initialize output to zero
-    cudaMemsetAsync(output, 0, sizeof(float), stream);
-    
+
+    CUDA_CHECK(cudaMemsetAsync(output, 0, sizeof(float), stream));
+
     cluster_reduce_kernel<float><<<grid_size, block_size, smem_size, stream>>>(
         input, output, n);
     CUDA_CHECK_LAST();
diff --git a/src/07_cuda13_features/fp8_gemm.cu b/src/07_cuda13_features/fp8_gemm.cu
@@ -3,9 +3,8 @@
 
 namespace hpc::cuda13 {
 
-// FP8 GEMM placeholder
-// Requires Hopper architecture (SM90+) and CUDA 12+
-// Uses e4m3 and e5m2 data types
+// Experimental FP8-like demo path.
+// This currently scales float inputs in a standard kernel; it is not a true Hopper FP8 implementation.
 
 template <typename T>
 __global__ void fp8_gemm_kernel(const T* __restrict__ A,
diff --git a/src/07_cuda13_features/tma.cu b/src/07_cuda13_features/tma.cu
@@ -1,10 +1,11 @@
 #include "tma.cuh"
 #include "../common/cuda_check.cuh"
+#include <stdexcept>
 
 namespace hpc::cuda13 {
 
-// TMA (Tensor Memory Accelerator) placeholder
-// Requires Hopper architecture (SM90+) and CUDA 12+
+// Experimental fallback for the future TMA path.
+// This currently performs a regular kernel copy so behavior is portable and testable.
 
 template <typename T>
 __global__ void async_copy_kernel(const T* __restrict__ src,
@@ -22,6 +23,13 @@ __global__ void async_copy_kernel(const T* __restrict__ src,
 template <>
 void tma_copy_2d<float>(const float* src, float* dst,
                         int rows, int cols, cudaStream_t stream) {
+    if (src == nullptr || dst == nullptr) {
+        throw std::invalid_argument("tma_copy_2d expects non-null src and dst pointers");
+    }
+    if (rows <= 0 || cols <= 0) {
+        throw std::invalid_argument("tma_copy_2d expects positive rows and cols");
+    }
+
     dim3 block(256);
     dim3 grid((cols + block.x - 1) / block.x, rows);
     async_copy_kernel<float><<<grid, block, 0, stream>>>(src, dst, rows, cols);
diff --git a/src/common/cuda_check.cuh b/src/common/cuda_check.cuh
@@ -1,18 +1,28 @@
 #pragma once
 
 #include <cuda_runtime.h>
-#include <cstdio>
-#include <cstdlib>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace hpc::detail {
+
+// Builds "CUDA error at <file>:<line>: <reason>" and throws it as std::runtime_error.
+[[noreturn]] inline void throw_cuda_error(cudaError_t err, const char* file, int line) {
+    std::ostringstream text;
+    text << "CUDA error at " << file << ':' << line << ": " << cudaGetErrorString(err);
+    throw std::runtime_error(text.str());
+}
+
+} // namespace hpc::detail
 
 // Macros are not scoped by namespaces; define them at file scope.
-#define CUDA_CHECK(call)                                                       \
-    do {                                                                       \
-        cudaError_t err = call;                                                \
-        if (err != cudaSuccess) {                                              \
-            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
-                    cudaGetErrorString(err));                                  \
-            exit(EXIT_FAILURE);                                                \
-        }                                                                      \
+#define CUDA_CHECK(call) /* evaluates `call` once; throws on any CUDA error */ \
+    do { /* local is uniquely named so CUDA_CHECK(err) cannot self-shadow */   \
+        const cudaError_t hpc_cuda_check_status_ = (call);                     \
+        if (hpc_cuda_check_status_ != cudaSuccess) {                           \
+            ::hpc::detail::throw_cuda_error(hpc_cuda_check_status_, __FILE__, __LINE__); \
+        }                                                                      \
     } while (0)
 
 #define CUDA_CHECK_LAST() CUDA_CHECK(cudaGetLastError())