Update LexCHA.cpp

Yusheng-Hu · web-flow · commit b745b74e1df2 · 2026-06-10T14:08:00.000+08:00
diff --git a/FullPermutation/LexCHA.cpp b/FullPermutation/LexCHA.cpp
@@ -1,56 +1,66 @@
-
 /**
  * Official Implementation of LexCHA Indexing Algorithms
  * Author: Yusheng Hu
- * Research: A Divide-and-Conquer Engine for Lexicographical Permutations: 
- Accelerating State Evolution via Hybrid Software-Hardware CPU Instructions
+ * Research: A Divide-and-Conquer Engine for Lexicographical Permutations
  * Repository: https://github.com/Yusheng-Hu/Position-Pure-Algorithm
+ * Note: Manual permutation logic is used in precomputation to prevent
+ * GCC -Wstringop-overflow warnings during aggressive optimization.
  */
 
 #include <iostream>
-#include <algorithm>
-#include <chrono>
 #include <vector>
 #include <iomanip>
 #include <immintrin.h>
-#include <array>
-#include <cstring> 
+#include <cstring>
+#include <algorithm>
 
 // ── Architecture Configuration ───────────────────────────────────────
 constexpr int TAIL_DEPTH = 5;
 constexpr int FLAT_STEPS = 119;
 constexpr int XMM_LANES = 16;
 
+// Align LUT to 16-byte boundary for SIMD compatibility
 alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
 
-// ── 1. Precompute: SIMD blind-shuffle masks ───────────────────────────
+// ── 1. Precompute: Manual permutation to avoid iterator-based warnings ──
+void next_perm_manual(uint8_t* p, int n) {  // Rearranges p[0..n) in place into the next lexicographic permutation; after the last one it wraps to non-decreasing order. Returns nothing.
+    int i = n - 1;  // i ends up as the start index of the longest non-increasing suffix
+    while (i > 0 && p[i - 1] >= p[i]) i--;  // walk left while the suffix stays non-increasing
+    if (i <= 0) {  // the whole range is non-increasing: there is no next permutation
+        std::reverse(p, p + n);  // wrap around by reversing into non-decreasing order
+        return;
+    }
+    int j = n - 1;  // search from the right end of the suffix
+    while (p[j] <= p[i - 1]) j--;  // rightmost element strictly greater than the pivot p[i - 1]
+    std::swap(p[i - 1], p[j]);  // raise the pivot to the next larger value
+    std::reverse(p + i, p + n);  // the suffix is still non-increasing; reversing gives its smallest ordering
+}
+
 void precompute_only_flat_lut_N5() {
-    // 修复方案：将数组大小增加到 16，以满足 SIMD 优化的对齐要求，
-    // 同时避免 std::next_permutation 在处理过小容器时被编译器优化导致的溢出警告。
-    std::array<uint8_t, 16> P; 
+    uint8_t P[TAIL_DEPTH];  // current permutation of the values 0..TAIL_DEPTH-1, starting from the identity
     for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
 
     for (int step = 0; step < FLAT_STEPS; ++step) {
-        std::array<uint8_t, 16> M;
+        uint8_t M[TAIL_DEPTH];  // inverse of P before it is advanced: M[value] = index of that value in P
         for (int j = 0; j < TAIL_DEPTH; ++j) M[P[j]] = j;
         
-        // 使用实际有效的区间 [0, TAIL_DEPTH) 进行置换
-        std::next_permutation(P.begin(), P.begin() + TAIL_DEPTH);
+        next_perm_manual(P, TAIL_DEPTH);  // P now holds the next permutation; M still describes the previous one
         
+        // Zero all XMM_LANES bytes first; then lane i receives the previous index of the value the next permutation puts at i (lanes >= TAIL_DEPTH stay 0)
         std::memset(flat_lut_N5[step], 0, XMM_LANES);
         for (int i = 0; i < TAIL_DEPTH; ++i) {
             flat_lut_N5[step][i] = M[P[i]];
         }
     }
 }
 
-// ── 2. Accelerated engine ──────────────────────────────────────────
+// ── 2. Accelerated engine: SIMD blind ops + boundary skip ──
 unsigned long long benchmark_accelerated(int N) {
     std::vector<int> D(N);
     for(int i = 0; i < N; ++i) D[i] = i;
     
+    // Aligned buffer to ensure safe memory access for SIMD instructions
     alignas(16) uint8_t buffer[32] = {0}; 
-    
     std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
     __m128i p_reg = _mm_load_si128((__m128i*)buffer);
 
@@ -59,15 +69,18 @@ unsigned long long benchmark_accelerated(int N) {
     for(int i = 1; i <= N; ++i) max_perms *= i;
 
     while (total_count < max_perms) {
+        // [SIMD path]: Execute 119 rapid state transitions
         for (int step = 0; step < FLAT_STEPS; ++step) {
             __m128i mask = _mm_load_si128((__m128i*)flat_lut_N5[step]);
             p_reg = _mm_shuffle_epi8(p_reg, mask);
         }
         total_count += FLAT_STEPS;
 
+        // [Sync]: Write back to memory
         _mm_store_si128((__m128i*)buffer, p_reg);
         std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH * sizeof(int));
         
+        // Handle block boundary with standard library permutation
         if (std::next_permutation(D.begin(), D.end())) {
             total_count++;
             std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
@@ -89,7 +102,7 @@ int main(int argc, char* argv[]) {
     auto e2 = std::chrono::high_resolution_clock::now();
 
     double d2 = std::chrono::duration<double>(e2 - s2).count();
-    std::cout << N << " Acc(s): " << d2 << " | Total: " << c2 << std::endl;
+    std::cout << "N=" << N << " | Acc(s): " << d2 << " | Count: " << c2 << std::endl;
 
     return 0;
 }