Skip to content

Commit e157199

Browse files
author
shijiashuai
committed
test: strengthen kernel correctness coverage
Replace smoke-style checks with reference-based and boundary-focused tests so quantization, attention, convolution, GEMM, and CUDA13 fallback paths are validated against their actual supported behavior.
1 parent 6a5cb00 commit e157199

File tree

6 files changed

+500
-94
lines changed

6 files changed

+500
-94
lines changed
Lines changed: 137 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,148 @@
11
#include <gtest/gtest.h>
2+
#include <algorithm>
3+
#include <cmath>
4+
#include <limits>
5+
#include <stdexcept>
6+
#include <vector>
7+
28
#include "05_attention/flash_attention.cuh"
39
#include "common/tensor.cuh"
410
#include "../test_utils.hpp"
511

6-
TEST(FlashAttentionTest, BasicTest) {
7-
int batch = 1, heads = 1, seq = 64, dim = 64;
8-
int total = batch * heads * seq * dim;
9-
10-
auto Q = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
11-
auto K = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
12-
auto V = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
13-
14-
hpc::Tensor<float> d_Q(total);
15-
hpc::Tensor<float> d_K(total);
16-
hpc::Tensor<float> d_V(total);
17-
hpc::Tensor<float> d_O(total);
18-
19-
d_Q.copy_from_host(Q);
20-
d_K.copy_from_host(K);
21-
d_V.copy_from_host(V);
22-
23-
hpc::attention::FlashAttnConfig config{
12+
namespace {
13+
14+
std::vector<float> cpu_flash_attention(const std::vector<float>& q,
15+
const std::vector<float>& k,
16+
const std::vector<float>& v,
17+
const hpc::attention::FlashAttnConfig& config) {
18+
const int head_dim = config.head_dim;
19+
const int seq_len = config.seq_len;
20+
const int head_stride = seq_len * head_dim;
21+
const int batch_head_stride = config.num_heads * head_stride;
22+
23+
std::vector<float> out(q.size(), 0.0f);
24+
25+
for (int batch = 0; batch < config.batch_size; ++batch) {
26+
for (int head = 0; head < config.num_heads; ++head) {
27+
const int base = batch * batch_head_stride + head * head_stride;
28+
for (int q_idx = 0; q_idx < seq_len; ++q_idx) {
29+
std::vector<float> scores(seq_len, -std::numeric_limits<float>::infinity());
30+
float max_score = -std::numeric_limits<float>::infinity();
31+
32+
for (int kv_idx = 0; kv_idx < seq_len; ++kv_idx) {
33+
if (config.causal && kv_idx > q_idx) {
34+
continue;
35+
}
36+
float score = 0.0f;
37+
for (int d = 0; d < head_dim; ++d) {
38+
score += q[base + q_idx * head_dim + d] *
39+
k[base + kv_idx * head_dim + d];
40+
}
41+
score *= config.scale;
42+
scores[kv_idx] = score;
43+
max_score = std::max(max_score, score);
44+
}
45+
46+
float denom = 0.0f;
47+
for (int kv_idx = 0; kv_idx < seq_len; ++kv_idx) {
48+
if (scores[kv_idx] == -std::numeric_limits<float>::infinity()) {
49+
continue;
50+
}
51+
denom += std::exp(scores[kv_idx] - max_score);
52+
}
53+
54+
for (int d = 0; d < head_dim; ++d) {
55+
float acc = 0.0f;
56+
for (int kv_idx = 0; kv_idx < seq_len; ++kv_idx) {
57+
if (scores[kv_idx] == -std::numeric_limits<float>::infinity()) {
58+
continue;
59+
}
60+
const float weight = std::exp(scores[kv_idx] - max_score) / denom;
61+
acc += weight * v[base + kv_idx * head_dim + d];
62+
}
63+
out[base + q_idx * head_dim + d] = acc;
64+
}
65+
}
66+
}
67+
}
68+
69+
return out;
70+
}
71+
72+
void expect_attention_matches_reference(bool causal) {
73+
constexpr int batch = 1;
74+
constexpr int heads = 2;
75+
constexpr int seq = 16;
76+
constexpr int dim = 64;
77+
const int total = batch * heads * seq * dim;
78+
79+
const auto q = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
80+
const auto k = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
81+
const auto v = hpc::test::random_vector<float>(total, -1.0f, 1.0f);
82+
83+
hpc::Tensor<float> d_q(total);
84+
hpc::Tensor<float> d_k(total);
85+
hpc::Tensor<float> d_v(total);
86+
hpc::Tensor<float> d_o(total);
87+
88+
d_q.copy_from_host(q);
89+
d_k.copy_from_host(k);
90+
d_v.copy_from_host(v);
91+
92+
const hpc::attention::FlashAttnConfig config{
2493
batch, heads, seq, dim,
2594
1.0f / std::sqrt(static_cast<float>(dim)),
26-
false
95+
causal,
2796
};
28-
97+
2998
hpc::attention::flash_attention_forward<float>(
30-
d_Q.data(), d_K.data(), d_V.data(), d_O.data(), config);
99+
d_q.data(), d_k.data(), d_v.data(), d_o.data(), config);
31100
cudaDeviceSynchronize();
32-
33-
auto O = d_O.to_host();
34-
EXPECT_EQ(O.size(), total);
101+
102+
const auto expected = cpu_flash_attention(q, k, v, config);
103+
const auto actual = d_o.to_host();
104+
105+
ASSERT_EQ(actual.size(), expected.size());
106+
for (size_t i = 0; i < actual.size(); ++i) {
107+
ASSERT_TRUE(std::isfinite(actual[i]));
108+
EXPECT_NEAR(actual[i], expected[i], 5e-4f);
109+
}
110+
}
111+
112+
} // namespace
113+
114+
TEST(FlashAttentionTest, MatchesReferenceWithoutCausalMask) {
115+
expect_attention_matches_reference(false);
116+
}
117+
118+
TEST(FlashAttentionTest, MatchesReferenceWithCausalMask) {
119+
expect_attention_matches_reference(true);
120+
}
121+
122+
TEST(FlashAttentionTest, RejectsUnsupportedHeadDim) {
123+
constexpr int batch = 1;
124+
constexpr int heads = 1;
125+
constexpr int seq = 8;
126+
constexpr int dim = 32;
127+
const int total = batch * heads * seq * dim;
128+
129+
hpc::Tensor<float> d_q(total);
130+
hpc::Tensor<float> d_k(total);
131+
hpc::Tensor<float> d_v(total);
132+
hpc::Tensor<float> d_o(total);
133+
d_q.zero();
134+
d_k.zero();
135+
d_v.zero();
136+
d_o.zero();
137+
138+
const hpc::attention::FlashAttnConfig config{
139+
batch, heads, seq, dim,
140+
1.0f / std::sqrt(static_cast<float>(dim)),
141+
false,
142+
};
143+
144+
EXPECT_THROW(
145+
hpc::attention::flash_attention_forward<float>(
146+
d_q.data(), d_k.data(), d_v.data(), d_o.data(), config),
147+
std::invalid_argument);
35148
}

tests/convolution/test_conv.cpp

Lines changed: 112 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,123 @@
11
#include <gtest/gtest.h>
2+
#include <vector>
3+
24
#include "04_convolution/conv_implicit_gemm.cuh"
5+
#include "04_convolution/conv_winograd.cuh"
36
#include "common/tensor.cuh"
47
#include "../test_utils.hpp"
58

6-
TEST(ConvolutionTest, BasicConv2d) {
7-
int batch = 1, in_c = 3, out_c = 16;
8-
int in_h = 32, in_w = 32;
9-
int k_h = 3, k_w = 3;
10-
int stride = 1, pad = 1;
11-
12-
int out_h = (in_h + 2 * pad - k_h) / stride + 1;
13-
int out_w = (in_w + 2 * pad - k_w) / stride + 1;
14-
15-
auto input = hpc::test::random_vector<float>(batch * in_c * in_h * in_w, -1.0f, 1.0f);
16-
auto weight = hpc::test::random_vector<float>(out_c * in_c * k_h * k_w, -1.0f, 1.0f);
17-
18-
hpc::Tensor<float> d_input(batch * in_c * in_h * in_w);
19-
hpc::Tensor<float> d_weight(out_c * in_c * k_h * k_w);
20-
hpc::Tensor<float> d_output(batch * out_c * out_h * out_w);
21-
9+
namespace {
10+
11+
std::vector<float> cpu_conv2d(const std::vector<float>& input,
12+
const std::vector<float>& weight,
13+
const hpc::convolution::ConvParams& p) {
14+
const int out_h = (p.in_height + 2 * p.pad_h - p.dilation_h * (p.kernel_h - 1) - 1) / p.stride_h + 1;
15+
const int out_w = (p.in_width + 2 * p.pad_w - p.dilation_w * (p.kernel_w - 1) - 1) / p.stride_w + 1;
16+
std::vector<float> output(p.batch * p.out_channels * out_h * out_w, 0.0f);
17+
18+
for (int b = 0; b < p.batch; ++b) {
19+
for (int oc = 0; oc < p.out_channels; ++oc) {
20+
for (int oh = 0; oh < out_h; ++oh) {
21+
for (int ow = 0; ow < out_w; ++ow) {
22+
float sum = 0.0f;
23+
for (int ic = 0; ic < p.in_channels; ++ic) {
24+
for (int kh = 0; kh < p.kernel_h; ++kh) {
25+
for (int kw = 0; kw < p.kernel_w; ++kw) {
26+
const int ih = oh * p.stride_h - p.pad_h + kh * p.dilation_h;
27+
const int iw = ow * p.stride_w - p.pad_w + kw * p.dilation_w;
28+
if (ih < 0 || ih >= p.in_height || iw < 0 || iw >= p.in_width) {
29+
continue;
30+
}
31+
const int input_idx = b * (p.in_channels * p.in_height * p.in_width) +
32+
ic * (p.in_height * p.in_width) +
33+
ih * p.in_width + iw;
34+
const int weight_idx = oc * (p.in_channels * p.kernel_h * p.kernel_w) +
35+
ic * (p.kernel_h * p.kernel_w) +
36+
kh * p.kernel_w + kw;
37+
sum += input[input_idx] * weight[weight_idx];
38+
}
39+
}
40+
}
41+
const int output_idx = b * (p.out_channels * out_h * out_w) +
42+
oc * (out_h * out_w) + oh * out_w + ow;
43+
output[output_idx] = sum;
44+
}
45+
}
46+
}
47+
}
48+
49+
return output;
50+
}
51+
52+
} // namespace
53+
54+
TEST(ConvolutionTest, ImplicitGemmMatchesReference) {
55+
const hpc::convolution::ConvParams params{
56+
1, 2, 3, 5, 5,
57+
3, 3, 1, 1, 1, 1, 1, 1,
58+
};
59+
const int out_h = (params.in_height + 2 * params.pad_h - params.dilation_h * (params.kernel_h - 1) - 1) /
60+
params.stride_h + 1;
61+
const int out_w = (params.in_width + 2 * params.pad_w - params.dilation_w * (params.kernel_w - 1) - 1) /
62+
params.stride_w + 1;
63+
64+
const auto input = hpc::test::random_vector<float>(
65+
params.batch * params.in_channels * params.in_height * params.in_width, -1.0f, 1.0f);
66+
const auto weight = hpc::test::random_vector<float>(
67+
params.out_channels * params.in_channels * params.kernel_h * params.kernel_w, -1.0f, 1.0f);
68+
const auto expected = cpu_conv2d(input, weight, params);
69+
70+
hpc::Tensor<float> d_input(input.size());
71+
hpc::Tensor<float> d_weight(weight.size());
72+
hpc::Tensor<float> d_output(expected.size());
73+
2274
d_input.copy_from_host(input);
2375
d_weight.copy_from_host(weight);
24-
25-
hpc::convolution::ConvParams params{
26-
batch, in_c, out_c, in_h, in_w,
27-
k_h, k_w, stride, stride, pad, pad, 1, 1
28-
};
29-
76+
d_output.zero();
77+
3078
hpc::convolution::conv2d_implicit_gemm<float>(
3179
d_input.data(), d_weight.data(), d_output.data(), params);
3280
cudaDeviceSynchronize();
33-
34-
auto output = d_output.to_host();
35-
EXPECT_EQ(output.size(), batch * out_c * out_h * out_w);
81+
82+
const auto output = d_output.to_host();
83+
ASSERT_EQ(output.size(), static_cast<size_t>(params.batch * params.out_channels * out_h * out_w));
84+
EXPECT_TRUE(hpc::test::vectors_almost_equal(output, expected, 1e-4f, 1e-4f));
85+
}
86+
87+
TEST(ConvolutionTest, WinogradPathMatchesImplicitGemmFallback) {
88+
constexpr int batch = 1;
89+
constexpr int in_channels = 2;
90+
constexpr int out_channels = 2;
91+
constexpr int height = 6;
92+
constexpr int width = 6;
93+
constexpr int kernel = 3;
94+
constexpr int output_size = batch * out_channels * height * width;
95+
96+
const auto input = hpc::test::random_vector<float>(batch * in_channels * height * width, -1.0f, 1.0f);
97+
const auto weight = hpc::test::random_vector<float>(out_channels * in_channels * kernel * kernel, -1.0f, 1.0f);
98+
99+
hpc::Tensor<float> d_input(input.size());
100+
hpc::Tensor<float> d_weight(weight.size());
101+
hpc::Tensor<float> d_implicit(output_size);
102+
hpc::Tensor<float> d_winograd(output_size);
103+
104+
d_input.copy_from_host(input);
105+
d_weight.copy_from_host(weight);
106+
d_implicit.zero();
107+
d_winograd.zero();
108+
109+
const hpc::convolution::ConvParams params{
110+
batch, in_channels, out_channels, height, width,
111+
kernel, kernel, 1, 1, 1, 1, 1, 1,
112+
};
113+
114+
hpc::convolution::conv2d_implicit_gemm<float>(
115+
d_input.data(), d_weight.data(), d_implicit.data(), params);
116+
hpc::convolution::conv2d_winograd<float>(
117+
d_input.data(), d_weight.data(), d_winograd.data(), batch, in_channels, out_channels, height, width);
118+
cudaDeviceSynchronize();
119+
120+
const auto implicit_output = d_implicit.to_host();
121+
const auto winograd_output = d_winograd.to_host();
122+
EXPECT_TRUE(hpc::test::vectors_almost_equal(winograd_output, implicit_output, 1e-5f, 1e-5f));
36123
}

tests/cuda13/test_cluster.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
#include "../test_utils.hpp"
77
#include <numeric>
88

9-
// Feature: hpc-ai-optimization-lab, Property 14: Cluster Reduce Correctness
10-
RC_GTEST_PROP(ClusterTest, ReduceCorrectness, ()) {
9+
// Property 14: Fallback reduction path remains numerically correct.
10+
RC_GTEST_PROP(ClusterTest, FallbackReduceCorrectness, ()) {
1111
auto n = *rc::gen::inRange<size_t>(256, 4096);
1212
auto input = *rc::gen::container<std::vector<float>>(n,
1313
rc::gen::map(rc::gen::arbitrary<float>(), [](float x) {
@@ -30,7 +30,7 @@ RC_GTEST_PROP(ClusterTest, ReduceCorrectness, ()) {
3030
RC_ASSERT(hpc::test::almost_equal(result[0], expected, 1e-2f, 1e-3f));
3131
}
3232

33-
TEST(ClusterTest, BasicReduce) {
33+
TEST(ClusterTest, FallbackReduceMatchesReference) {
3434
size_t n = 1024;
3535
std::vector<float> input(n, 1.0f);
3636

tests/cuda13/test_tma.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
#include "common/tensor.cuh"
66
#include "../test_utils.hpp"
77

8-
// Feature: hpc-ai-optimization-lab, Property 13: TMA Data Integrity
9-
RC_GTEST_PROP(TMATest, DataIntegrity, ()) {
8+
// Property 13: Fallback copy path preserves data integrity.
9+
RC_GTEST_PROP(TMATest, FallbackCopyPreservesDataIntegrity, ()) {
1010
auto rows = *rc::gen::inRange<int>(1, 128);
1111
auto cols = *rc::gen::inRange<int>(1, 128);
1212
auto input = *rc::gen::container<std::vector<float>>(rows * cols, rc::gen::arbitrary<float>());
@@ -25,7 +25,7 @@ RC_GTEST_PROP(TMATest, DataIntegrity, ()) {
2525
}
2626
}
2727

28-
TEST(TMATest, BasicCopy) {
28+
TEST(TMATest, FallbackCopyMatchesInput) {
2929
int rows = 64, cols = 64;
3030
auto input = hpc::test::random_vector<float>(rows * cols);
3131

0 commit comments

Comments
 (0)