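Update LexCHA.cpp: widen permutation scratch arrays to 16 bytes and remove the std::next_permutation baseline benchmark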

Yusheng-Hu · web-flow · commit 330e10bd3bfa · 2026-06-10T14:04:24.000+08:00
diff --git a/FullPermutation/LexCHA.cpp b/FullPermutation/LexCHA.cpp
@@ -21,19 +21,21 @@ constexpr int TAIL_DEPTH = 5;
 constexpr int FLAT_STEPS = 119;
 constexpr int XMM_LANES = 16;
 
-// Align to 16 bytes for SIMD operations
 alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
 
 // ── 1. Precompute: SIMD blind-shuffle masks ───────────────────────────
 void precompute_only_flat_lut_N5() {
-    std::array<uint8_t, TAIL_DEPTH> P;
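+    // Fix: enlarge the array to 16 elements to satisfy the alignment requirement of the SIMD optimization,
+    // and to avoid the overflow warning the compiler emits when it optimizes std::next_permutation over a very small container.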
+    std::array<uint8_t, 16> P; 
     for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
 
     for (int step = 0; step < FLAT_STEPS; ++step) {
-        std::array<uint8_t, TAIL_DEPTH> M;
+        std::array<uint8_t, 16> M;
         for (int j = 0; j < TAIL_DEPTH; ++j) M[P[j]] = j;
         
-        std::next_permutation(P.begin(), P.end());
+        // Permute only the range that is actually in use: [0, TAIL_DEPTH)
+        std::next_permutation(P.begin(), P.begin() + TAIL_DEPTH);
         
         std::memset(flat_lut_N5[step], 0, XMM_LANES);
         for (int i = 0; i < TAIL_DEPTH; ++i) {
@@ -42,16 +44,13 @@ void precompute_only_flat_lut_N5() {
     }
 }
 
-// ── 2. Accelerated engine: SIMD blind ops + next_permutation boundary skip ──
+// ── 2. Accelerated engine ──────────────────────────────────────────
 unsigned long long benchmark_accelerated(int N) {
     std::vector<int> D(N);
     for(int i = 0; i < N; ++i) D[i] = i;
     
-    // Increased buffer size to 32 bytes to safely hold 20 bytes (5 ints) 
-    // while maintaining 16-byte alignment for SIMD operations.
     alignas(16) uint8_t buffer[32] = {0}; 
     
-    // Copy the last TAIL_DEPTH elements safely
     std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
     __m128i p_reg = _mm_load_si128((__m128i*)buffer);
 
@@ -66,7 +65,6 @@ unsigned long long benchmark_accelerated(int N) {
         }
         total_count += FLAT_STEPS;
 
-        // Store back to aligned buffer
         _mm_store_si128((__m128i*)buffer, p_reg);
         std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH * sizeof(int));
         
@@ -79,37 +77,19 @@ unsigned long long benchmark_accelerated(int N) {
     return total_count;
 }
 
-// ── 3. Baseline benchmark ────────────────────────────────────────────
-unsigned long long benchmark_std(int N) {
-    std::vector<int> P(N);
-    for (int i = 0; i < N; ++i) P[i] = i;
-    unsigned long long count = 1;
-    while (std::next_permutation(P.begin(), P.end())) count++;
-    return count;
-}
-
-// ── 4. Main driver ───────────────────────────────────────────────────
+// ── 3. Main driver ───────────────────────────────────────────────────
 int main(int argc, char* argv[]) {
     if (argc < 2) return 1;
     int N = std::atoi(argv[1]);
     
     precompute_only_flat_lut_N5();
 
-    auto s1 = std::chrono::high_resolution_clock::now();
-    unsigned long long c1 = benchmark_std(N);
-    auto e1 = std::chrono::high_resolution_clock::now();
-
     auto s2 = std::chrono::high_resolution_clock::now();
     unsigned long long c2 = benchmark_accelerated(N);
     auto e2 = std::chrono::high_resolution_clock::now();
 
-    double d1 = std::chrono::duration<double>(e1 - s1).count();
     double d2 = std::chrono::duration<double>(e2 - s2).count();
-
-    // Output formatted row: N, Std(s), Acc(s), Std_ns/perm, Acc_ns/perm, Speedup
-    std::cout << N << " " << d1 << " " << d2 << " " 
-              << (d1 * 1e9) / c1 << " " << (d2 * 1e9) / c2 << " " 
-              << d1/d2 << "x" << std::endl;
+    std::cout << N << " Acc(s): " << d2 << " | Total: " << c2 << std::endl;
 
     return 0;
 }