test: strengthen kernel correctness coverage

shijiashuai · shijiashuai · commit e157199187c9 · 2026-03-24T09:26:17.000+08:00
Replace smoke-style checks with reference-based and boundary-focused tests so quantization, attention, convolution, GEMM, and CUDA13 fallback paths are validated against their actual supported behavior.
diff --git a/tests/attention/test_flash_attention.cpp b/tests/attention/test_flash_attention.cpp
@@ -1,35 +1,148 @@
 #include <gtest/gtest.h>
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
 #include "05_attention/flash_attention.cuh"
 #include "common/tensor.cuh"
 #include "../test_utils.hpp"
 
-TEST(FlashAttentionTest, BasicTest) {
-    int batch = 1, heads = 1, seq = 64, dim = 64;
-    int total = batch * heads * seq * dim;
-    
-    auto Q = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
-    auto K = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
-    auto V = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
-    
-    hpc::Tensor<float> d_Q(total);
-    hpc::Tensor<float> d_K(total);
-    hpc::Tensor<float> d_V(total);
-    hpc::Tensor<float> d_O(total);
-    
-    d_Q.copy_from_host(Q);
-    d_K.copy_from_host(K);
-    d_V.copy_from_host(V);
-    
-    hpc::attention::FlashAttnConfig config{
+namespace {
+
+// CPU reference for scaled dot-product attention: out = softmax(scale * Q K^T) V,
+// evaluated independently per (batch, head). q/k/v are indexed as contiguous
+// [batch][head][seq][head_dim]; with config.causal, query i only attends to keys 0..i.
+// Returns a vector the same size as q.
+std::vector<float> cpu_flash_attention(const std::vector<float>& q,
+                                       const std::vector<float>& k,
+                                       const std::vector<float>& v,
+                                       const hpc::attention::FlashAttnConfig& config) {
+    const int head_dim = config.head_dim;
+    const int seq_len = config.seq_len;
+    const int head_stride = seq_len * head_dim;
+    const int batch_head_stride = config.num_heads * head_stride;
+
+    std::vector<float> out(q.size(), 0.0f);
+
+    for (int batch = 0; batch < config.batch_size; ++batch) {
+        for (int head = 0; head < config.num_heads; ++head) {
+            const int base = batch * batch_head_stride + head * head_stride;
+            for (int q_idx = 0; q_idx < seq_len; ++q_idx) {
+                // Causal masking is a bound on the key range instead of a sentinel score.
+                const int kv_count = config.causal ? q_idx + 1 : seq_len;
+                std::vector<float> weights(kv_count);
+                float max_score = -std::numeric_limits<float>::infinity();
+
+                for (int kv_idx = 0; kv_idx < kv_count; ++kv_idx) {
+                    float score = 0.0f;
+                    for (int d = 0; d < head_dim; ++d) {
+                        score += q[base + q_idx * head_dim + d] *
+                                 k[base + kv_idx * head_dim + d];
+                    }
+                    score *= config.scale;
+                    weights[kv_idx] = score;
+                    max_score = std::max(max_score, score);
+                }
+
+                // Subtract the row maximum before exponentiating for numerical stability.
+                // Each exp is evaluated once here and reused for every output dimension.
+                float denom = 0.0f;
+                for (float& w : weights) {
+                    w = std::exp(w - max_score);
+                    denom += w;
+                }
+
+                // Output row = normalized weights times the visible value rows.
+                for (int d = 0; d < head_dim; ++d) {
+                    float acc = 0.0f;
+                    for (int kv_idx = 0; kv_idx < kv_count; ++kv_idx) {
+                        acc += (weights[kv_idx] / denom) * v[base + kv_idx * head_dim + d];
+                    }
+                    out[base + q_idx * head_dim + d] = acc;
+                }
+            }
+        }
+    }
+
+    return out;
+}
+
+// Runs flash_attention_forward<float> on random Q/K/V (1 batch, 2 heads, 16 tokens,
+// head_dim 64) and compares every output element with cpu_flash_attention.
+// `causal` is forwarded to the config so both masked and unmasked paths are covered.
+void expect_attention_matches_reference(bool causal) {
+    constexpr int batch = 1, heads = 2, seq = 16, dim = 64;
+    const int total = batch * heads * seq * dim;
+
+    const auto q = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
+    const auto k = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
+    const auto v = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
+
+    hpc::Tensor<float> d_q(total);
+    hpc::Tensor<float> d_k(total);
+    hpc::Tensor<float> d_v(total);
+    hpc::Tensor<float> d_o(total);
+
+    d_q.copy_from_host(q);
+    d_k.copy_from_host(k);
+    d_v.copy_from_host(v);
+
+    const hpc::attention::FlashAttnConfig config{
         batch, heads, seq, dim,
         1.0f / std::sqrt(static_cast<float>(dim)),
-        false
+        causal,
     };
-    
+
     hpc::attention::flash_attention_forward<float>(
-        d_Q.data(), d_K.data(), d_V.data(), d_O.data(), config);
-    cudaDeviceSynchronize();
-    
-    auto O = d_O.to_host();
-    EXPECT_EQ(O.size(), total);
+        d_q.data(), d_k.data(), d_v.data(), d_o.data(), config);
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);  // stop here if the runtime reports an error
+
+    const auto expected = cpu_flash_attention(q, k, v, config);
+    const auto actual = d_o.to_host();
+
+    ASSERT_EQ(actual.size(), expected.size());
+    for (size_t i = 0; i < actual.size(); ++i) {
+        ASSERT_TRUE(std::isfinite(actual[i])) << "non-finite output at index " << i;
+        EXPECT_NEAR(actual[i], expected[i], 5e-4f) << "mismatch at index " << i;
+    }
+}
+
+} // namespace
+
+TEST(FlashAttentionTest, MatchesReferenceWithoutCausalMask) {
+    expect_attention_matches_reference(false);  // causal = false in the config
+}
+
+TEST(FlashAttentionTest, MatchesReferenceWithCausalMask) {
+    expect_attention_matches_reference(true);  // causal = true in the config
+}
+
+TEST(FlashAttentionTest, RejectsUnsupportedHeadDim) {
+    constexpr int batch = 1;
+    constexpr int heads = 1;
+    constexpr int seq = 8;
+    constexpr int dim = 32;  // head_dim this test expects the forward call to reject
+    const int total = batch * heads * seq * dim;
+
+    hpc::Tensor<float> d_q(total);
+    hpc::Tensor<float> d_k(total);
+    hpc::Tensor<float> d_v(total);
+    hpc::Tensor<float> d_o(total);
+    d_q.zero();  // buffers are zeroed; their contents are never compared in this test
+    d_k.zero();
+    d_v.zero();
+    d_o.zero();
+
+    const hpc::attention::FlashAttnConfig config{
+        batch, heads, seq, dim,
+        1.0f / std::sqrt(static_cast<float>(dim)),
+        false,
+    };
+
+    EXPECT_THROW(
+        hpc::attention::flash_attention_forward<float>(
+            d_q.data(), d_k.data(), d_v.data(), d_o.data(), config),
+        std::invalid_argument);  // the only assertion: the call must throw this type
 }
diff --git a/tests/convolution/test_conv.cpp b/tests/convolution/test_conv.cpp
@@ -1,36 +1,123 @@
 #include <gtest/gtest.h>
+#include <vector>
+
 #include "04_convolution/conv_implicit_gemm.cuh"
+#include "04_convolution/conv_winograd.cuh"
 #include "common/tensor.cuh"
 #include "../test_utils.hpp"
 
-TEST(ConvolutionTest, BasicConv2d) {
-    int batch = 1, in_c = 3, out_c = 16;
-    int in_h = 32, in_w = 32;
-    int k_h = 3, k_w = 3;
-    int stride = 1, pad = 1;
-    
-    int out_h = (in_h + 2 * pad - k_h) / stride + 1;
-    int out_w = (in_w + 2 * pad - k_w) / stride + 1;
-    
-    auto input = hpc::test::random_vector<float>(batch * in_c * in_h * in_w, -1.0f, 1.0f);
-    auto weight = hpc::test::random_vector<float>(out_c * in_c * k_h * k_w, -1.0f, 1.0f);
-    
-    hpc::Tensor<float> d_input(batch * in_c * in_h * in_w);
-    hpc::Tensor<float> d_weight(out_c * in_c * k_h * k_w);
-    hpc::Tensor<float> d_output(batch * out_c * out_h * out_w);
-    
+namespace {
+
+// CPU reference for direct 2D convolution honoring stride, padding and dilation.
+// Indexing treats input as [batch][in_c][h][w], weight as [out_c][in_c][kh][kw] and the
+// result as [batch][out_c][out_h][out_w]; taps that fall outside the input are skipped.
+std::vector<float> cpu_conv2d(const std::vector<float>& input,
+                              const std::vector<float>& weight,
+                              const hpc::convolution::ConvParams& p) {
+    const int out_h = (p.in_height + 2 * p.pad_h - p.dilation_h * (p.kernel_h - 1) - 1) / p.stride_h + 1;
+    const int out_w = (p.in_width + 2 * p.pad_w - p.dilation_w * (p.kernel_w - 1) - 1) / p.stride_w + 1;
+    std::vector<float> output(p.batch * p.out_channels * out_h * out_w, 0.0f);
+
+    for (int b = 0; b < p.batch; ++b) {
+        for (int oc = 0; oc < p.out_channels; ++oc) {
+            for (int oh = 0; oh < out_h; ++oh) {
+                for (int ow = 0; ow < out_w; ++ow) {
+                    float sum = 0.0f;
+                    for (int ic = 0; ic < p.in_channels; ++ic) {
+                        const int in_base = (b * p.in_channels + ic) * p.in_height * p.in_width;
+                        const int w_base = (oc * p.in_channels + ic) * p.kernel_h * p.kernel_w;
+                        for (int kh = 0; kh < p.kernel_h; ++kh) {
+                            const int ih = oh * p.stride_h - p.pad_h + kh * p.dilation_h;
+                            if (ih < 0 || ih >= p.in_height) {
+                                continue;
+                            }
+                            for (int kw = 0; kw < p.kernel_w; ++kw) {
+                                const int iw = ow * p.stride_w - p.pad_w + kw * p.dilation_w;
+                                if (iw < 0 || iw >= p.in_width) {
+                                    continue;
+                                }
+                                sum += input[in_base + ih * p.in_width + iw] * weight[w_base + kh * p.kernel_w + kw];
+                            }
+                        }
+                    }
+                    output[((b * p.out_channels + oc) * out_h + oh) * out_w + ow] = sum;
+                }
+            }
+        }
+    }
+
+    return output;
+}
+
+} // namespace
+
+// Checks conv2d_implicit_gemm<float> against the cpu_conv2d reference on a small
+// random problem, and stops immediately if the CUDA runtime reports an error.
+TEST(ConvolutionTest, ImplicitGemmMatchesReference) {
+    const hpc::convolution::ConvParams params{
+        1, 2, 3, 5, 5,           // batch, in_channels, out_channels, in_height, in_width
+        3, 3, 1, 1, 1, 1, 1, 1,  // kernel h/w, stride h/w, pad h/w, then presumably dilation h/w
+    };
+    const int out_h = (params.in_height + 2 * params.pad_h - params.dilation_h * (params.kernel_h - 1) - 1) / params.stride_h + 1;
+    const int out_w = (params.in_width + 2 * params.pad_w - params.dilation_w * (params.kernel_w - 1) - 1) / params.stride_w + 1;
+
+    const auto input = hpc::test::random_vector<float>(
+        params.batch * params.in_channels * params.in_height * params.in_width, -1.0f, 1.0f);
+    const auto weight = hpc::test::random_vector<float>(
+        params.out_channels * params.in_channels * params.kernel_h * params.kernel_w, -1.0f, 1.0f);
+    const auto expected = cpu_conv2d(input, weight, params);
+
+    hpc::Tensor<float> d_input(input.size());
+    hpc::Tensor<float> d_weight(weight.size());
+    hpc::Tensor<float> d_output(expected.size());
+
     d_input.copy_from_host(input);
     d_weight.copy_from_host(weight);
-    
-    hpc::convolution::ConvParams params{
-        batch, in_c, out_c, in_h, in_w,
-        k_h, k_w, stride, stride, pad, pad, 1, 1
-    };
-    
+    d_output.zero();
+
     hpc::convolution::conv2d_implicit_gemm<float>(
         d_input.data(), d_weight.data(), d_output.data(), params);
-    cudaDeviceSynchronize();
-    
-    auto output = d_output.to_host();
-    EXPECT_EQ(output.size(), batch * out_c * out_h * out_w);
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);  // stop here if the runtime reports an error
+
+    const auto output = d_output.to_host();
+    ASSERT_EQ(output.size(), static_cast<size_t>(params.batch * params.out_channels * out_h * out_w));
+    EXPECT_TRUE(hpc::test::vectors_almost_equal(output, expected, 1e-4f, 1e-4f));
+}
+
+// Cross-checks conv2d_winograd<float> against conv2d_implicit_gemm<float>: both run on
+// the same random input and weights and their outputs are compared with 1e-5 tolerances.
+// The test stops immediately if the CUDA runtime reports an error.
+TEST(ConvolutionTest, WinogradPathMatchesImplicitGemmFallback) {
+    constexpr int batch = 1, in_channels = 2, out_channels = 2;
+    constexpr int height = 6, width = 6, kernel = 3;
+    // Every stride/pad/dilation field below is 1, so a 3x3 kernel keeps height x width.
+    constexpr int output_size = batch * out_channels * height * width;
+
+    const auto input = hpc::test::random_vector<float>(batch * in_channels * height * width, -1.0f, 1.0f);
+    const auto weight = hpc::test::random_vector<float>(out_channels * in_channels * kernel * kernel, -1.0f, 1.0f);
+
+    hpc::Tensor<float> d_input(input.size());
+    hpc::Tensor<float> d_weight(weight.size());
+    hpc::Tensor<float> d_implicit(output_size);
+    hpc::Tensor<float> d_winograd(output_size);
+
+    d_input.copy_from_host(input);
+    d_weight.copy_from_host(weight);
+    d_implicit.zero();
+    d_winograd.zero();
+
+    const hpc::convolution::ConvParams params{
+        batch, in_channels, out_channels, height, width,
+        kernel, kernel, 1, 1, 1, 1, 1, 1,
+    };
+
+    hpc::convolution::conv2d_implicit_gemm<float>(
+        d_input.data(), d_weight.data(), d_implicit.data(), params);
+    hpc::convolution::conv2d_winograd<float>(
+        d_input.data(), d_weight.data(), d_winograd.data(), batch, in_channels, out_channels, height, width);
+    ASSERT_EQ(cudaDeviceSynchronize(), cudaSuccess);  // stop here if the runtime reports an error
+
+    const auto implicit_output = d_implicit.to_host();
+    const auto winograd_output = d_winograd.to_host();
+    EXPECT_TRUE(hpc::test::vectors_almost_equal(winograd_output, implicit_output, 1e-5f, 1e-5f));
 }
diff --git a/tests/cuda13/test_cluster.cpp b/tests/cuda13/test_cluster.cpp
@@ -6,8 +6,8 @@
 #include "../test_utils.hpp"
 #include <numeric>
 
-// Feature: hpc-ai-optimization-lab, Property 14: Cluster Reduce Correctness
-RC_GTEST_PROP(ClusterTest, ReduceCorrectness, ()) {
+// Property 14: Fallback reduction path remains numerically correct.
+RC_GTEST_PROP(ClusterTest, FallbackReduceCorrectness, ()) {
     auto n = *rc::gen::inRange<size_t>(256, 4096);
     auto input = *rc::gen::container<std::vector<float>>(n,
         rc::gen::map(rc::gen::arbitrary<float>(), [](float x) {
@@ -30,7 +30,7 @@ RC_GTEST_PROP(ClusterTest, ReduceCorrectness, ()) {
     RC_ASSERT(hpc::test::almost_equal(result[0], expected, 1e-2f, 1e-3f));
 }
 
-TEST(ClusterTest, BasicReduce) {
+TEST(ClusterTest, FallbackReduceMatchesReference) {
     size_t n = 1024;
     std::vector<float> input(n, 1.0f);
     
diff --git a/tests/cuda13/test_tma.cpp b/tests/cuda13/test_tma.cpp
@@ -5,8 +5,8 @@
 #include "common/tensor.cuh"
 #include "../test_utils.hpp"
 
-// Feature: hpc-ai-optimization-lab, Property 13: TMA Data Integrity
-RC_GTEST_PROP(TMATest, DataIntegrity, ()) {
+// Property 13: Fallback copy path preserves data integrity.
+RC_GTEST_PROP(TMATest, FallbackCopyPreservesDataIntegrity, ()) {
     auto rows = *rc::gen::inRange<int>(1, 128);
     auto cols = *rc::gen::inRange<int>(1, 128);
     auto input = *rc::gen::container<std::vector<float>>(rows * cols, rc::gen::arbitrary<float>());
@@ -25,7 +25,7 @@ RC_GTEST_PROP(TMATest, DataIntegrity, ()) {
     }
 }
 
-TEST(TMATest, BasicCopy) {
+TEST(TMATest, FallbackCopyMatchesInput) {
     int rows = 64, cols = 64;
     auto input = hpc::test::random_vector<float>(rows * cols);
     
diff --git a/tests/gemm/test_gemm.cpp b/tests/gemm/test_gemm.cpp
diff --git a/tests/quantization/test_quantize.cpp b/tests/quantization/test_quantize.cpp