Skip to content

Commit 330e10b

Browse files
authored
Update LexCHA.cpp
1 parent 47929df commit 330e10b

1 file changed

Lines changed: 9 additions & 29 deletions

File tree

FullPermutation/LexCHA.cpp

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -21,19 +21,21 @@ constexpr int TAIL_DEPTH = 5;
2121
constexpr int FLAT_STEPS = 119;
2222
constexpr int XMM_LANES = 16;
2323

24-
// Align to 16 bytes for SIMD operations
2524
alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
2625

2726
// ── 1. Precompute: SIMD blind-shuffle masks ───────────────────────────
2827
void precompute_only_flat_lut_N5() {
29-
std::array<uint8_t, TAIL_DEPTH> P;
28+
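// Fix: enlarge the array to 16 elements to meet the alignment requirement of the SIMD optimization,
29+
// and to avoid the overflow warning produced when the compiler optimizes std::next_permutation over a very small container.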
30+
std::array<uint8_t, 16> P;
3031
for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
3132

3233
for (int step = 0; step < FLAT_STEPS; ++step) {
33-
std::array<uint8_t, TAIL_DEPTH> M;
34+
std::array<uint8_t, 16> M;
3435
for (int j = 0; j < TAIL_DEPTH; ++j) M[P[j]] = j;
3536

36-
std::next_permutation(P.begin(), P.end());
37+
// Permute only the range actually in use, [0, TAIL_DEPTH)
38+
std::next_permutation(P.begin(), P.begin() + TAIL_DEPTH);
3739

3840
std::memset(flat_lut_N5[step], 0, XMM_LANES);
3941
for (int i = 0; i < TAIL_DEPTH; ++i) {
@@ -42,16 +44,13 @@ void precompute_only_flat_lut_N5() {
4244
}
4345
}
4446

45-
// ── 2. Accelerated engine: SIMD blind ops + next_permutation boundary skip ──
47+
// ── 2. Accelerated engine ──────────────────────────────────────────
4648
unsigned long long benchmark_accelerated(int N) {
4749
std::vector<int> D(N);
4850
for(int i = 0; i < N; ++i) D[i] = i;
4951

50-
// Increased buffer size to 32 bytes to safely hold 20 bytes (5 ints)
51-
// while maintaining 16-byte alignment for SIMD operations.
5252
alignas(16) uint8_t buffer[32] = {0};
5353

54-
// Copy the last TAIL_DEPTH elements safely
5554
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
5655
__m128i p_reg = _mm_load_si128((__m128i*)buffer);
5756

@@ -66,7 +65,6 @@ unsigned long long benchmark_accelerated(int N) {
6665
}
6766
total_count += FLAT_STEPS;
6867

69-
// Store back to aligned buffer
7068
_mm_store_si128((__m128i*)buffer, p_reg);
7169
std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH * sizeof(int));
7270

@@ -79,37 +77,19 @@ unsigned long long benchmark_accelerated(int N) {
7977
return total_count;
8078
}
8179

82-
// ── 3. Baseline benchmark ────────────────────────────────────────────
83-
unsigned long long benchmark_std(int N) {
84-
std::vector<int> P(N);
85-
for (int i = 0; i < N; ++i) P[i] = i;
86-
unsigned long long count = 1;
87-
while (std::next_permutation(P.begin(), P.end())) count++;
88-
return count;
89-
}
90-
91-
// ── 4. Main driver ───────────────────────────────────────────────────
80+
// ── 3. Main driver ───────────────────────────────────────────────────
9281
int main(int argc, char* argv[]) {
9382
if (argc < 2) return 1;
9483
int N = std::atoi(argv[1]);
9584

9685
precompute_only_flat_lut_N5();
9786

98-
auto s1 = std::chrono::high_resolution_clock::now();
99-
unsigned long long c1 = benchmark_std(N);
100-
auto e1 = std::chrono::high_resolution_clock::now();
101-
10287
auto s2 = std::chrono::high_resolution_clock::now();
10388
unsigned long long c2 = benchmark_accelerated(N);
10489
auto e2 = std::chrono::high_resolution_clock::now();
10590

106-
double d1 = std::chrono::duration<double>(e1 - s1).count();
10791
double d2 = std::chrono::duration<double>(e2 - s2).count();
108-
109-
// Output formatted row: N, Std(s), Acc(s), Std_ns/perm, Acc_ns/perm, Speedup
110-
std::cout << N << " " << d1 << " " << d2 << " "
111-
<< (d1 * 1e9) / c1 << " " << (d2 * 1e9) / c2 << " "
112-
<< d1/d2 << "x" << std::endl;
92+
std::cout << N << " Acc(s): " << d2 << " | Total: " << c2 << std::endl;
11393

11494
return 0;
11595
}

0 commit comments

Comments
 (0)