Update LexCHA.cpp

Yusheng-Hu · web-flow · commit 3abc50d3b22a · 2026-06-10T14:28:19.000+08:00
diff --git a/FullPermutation/LexCHA.cpp b/FullPermutation/LexCHA.cpp
@@ -2,9 +2,8 @@
  * Official Implementation of LexCHA Indexing Algorithms
  * Author: Yusheng Hu
  * Research: A Divide-and-Conquer Engine for Lexicographical Permutations
- * * Note: This version is optimized for GCC 13+ under -O3 -march=native.
- * It includes strict memory alignment to prevent SIMD stringop-overflow 
- * warnings and strongly prevents dead-code elimination during standard benchmarks.
+ * Note: Element types are fixed to uint8_t to match the byte-level semantics of
+ * _mm_shuffle_epi8, eliminating permutation count mismatches.
  */
 
 #include <iostream>
@@ -21,12 +20,9 @@ constexpr int TAIL_DEPTH = 5;
 constexpr int FLAT_STEPS = 119;
 constexpr int XMM_LANES = 16;
 
-// Align LUT to 16-byte boundary for SIMD compatibility
 alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
 
 // ── 1. Manual Permutation ────────────────────────────────────────────
-// Uses 16-byte aligned buffers to satisfy GCC's aggressive SIMD safety checks
-// and avoid -Wstringop-overflow warnings during precomputation.
 void next_perm_manual(uint8_t* p, int n) {
     int i = n - 1;
     while (i > 0 && p[i - 1] >= p[i]) i--;
@@ -50,7 +46,6 @@ void next_perm_manual(uint8_t* p, int n) {
 
 // ── Precomputation ───────────────────────────────────────────────────
 void precompute_only_flat_lut_N5() {
-    // Declared as 16 bytes to ensure SIMD boundary safety
     alignas(16) uint8_t P[16]; 
     for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
 
@@ -60,7 +55,8 @@ void precompute_only_flat_lut_N5() {
         
         next_perm_manual(P, TAIL_DEPTH);
         
-        std::memset(flat_lut_N5[step], 0, XMM_LANES);
+        // 0x80 has its high bit set, so _mm_shuffle_epi8 writes zero to the unused lanes
+        std::memset(flat_lut_N5[step], 0x80, XMM_LANES);
         for (int i = 0; i < TAIL_DEPTH; ++i) {
             flat_lut_N5[step][i] = M[P[i]];
         }
@@ -69,11 +65,13 @@ void precompute_only_flat_lut_N5() {
 
 // ── 2. Accelerated Engine ────────────────────────────────────────────
 unsigned long long benchmark_accelerated(int N) {
-    std::vector<int> D(N);
-    for(int i = 0; i < N; ++i) D[i] = i;
+    // FIX: Use uint8_t instead of int for byte-level SIMD compatibility
+    std::vector<uint8_t> D(N);
+    for(int i = 0; i < N; ++i) D[i] = static_cast<uint8_t>(i);
     
-    alignas(16) uint8_t buffer[32] = {0}; 
-    std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
+    alignas(16) uint8_t buffer[16] = {0}; 
+    // sizeof(uint8_t) is 1, so we copy exactly TAIL_DEPTH bytes
+    std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH);
     __m128i p_reg = _mm_load_si128((__m128i*)buffer);
 
     unsigned long long total_count = 1;
@@ -88,11 +86,11 @@ unsigned long long benchmark_accelerated(int N) {
         total_count += FLAT_STEPS;
 
         _mm_store_si128((__m128i*)buffer, p_reg);
-        std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH * sizeof(int));
+        std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH);
         
         if (std::next_permutation(D.begin(), D.end())) {
             total_count++;
-            std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
+            std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH);
             p_reg = _mm_load_si128((__m128i*)buffer);
         }
     }
@@ -106,11 +104,10 @@ int main(int argc, char* argv[]) {
     
     precompute_only_flat_lut_N5();
 
-    // --- Benchmark 1: Standard Method ---
-    // Creating a real permutation loop to prevent GCC from eliminating
-    // the code block via Dead Code Elimination under -O3.
-    std::vector<int> V(N);
-    for(int i = 0; i < N; ++i) V[i] = i;
+    // Benchmark 1: Standard Method
+    // FIX: Using uint8_t to ensure fair comparison
+    std::vector<uint8_t> V(N);
+    for(int i = 0; i < N; ++i) V[i] = static_cast<uint8_t>(i);
     
     auto s1 = std::chrono::high_resolution_clock::now();
     unsigned long long c1 = 0;
@@ -120,26 +117,17 @@ int main(int argc, char* argv[]) {
     auto e1 = std::chrono::high_resolution_clock::now();
     
     double d1 = std::chrono::duration<double>(e1 - s1).count();
-    if (d1 < 1e-9) d1 = 1e-9; // Prevent division by zero
+    if (d1 < 1e-9) d1 = 1e-9; 
 
-    // --- Benchmark 2: Accelerated Method ---
+    // Benchmark 2: Accelerated Method
     auto s2 = std::chrono::high_resolution_clock::now();
     unsigned long long c2 = benchmark_accelerated(N);
     auto e2 = std::chrono::high_resolution_clock::now();
     
     double d2 = std::chrono::duration<double>(e2 - s2).count();
     if (d2 < 1e-9) d2 = 1e-9;
 
-    // --- Sanity Check ---
-    // This check guarantees the compiler MUST evaluate c1 and c2, 
-    // further enforcing that the loops actually run.
-    if (c1 != c2) {
-        std::cerr << "Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
-        return 1;
-    }
-
-    // --- Output formatting for AWK script ---
-    // Target columns: N | Std(s) | Acc(s) | Std_ns/perm | Acc_ns/perm | Speedup
+    // Output formatting for AWK script
     double ns_std = (d1 * 1e9) / c1;
     double ns_acc = (d2 * 1e9) / c2;
     double speedup = d1 / d2;
@@ -151,5 +139,11 @@ int main(int argc, char* argv[]) {
               << ns_acc << " " 
               << speedup << std::endl;
 
+    // Sanity check: both benchmarks must report the same permutation count; a mismatch indicates a bug
+    if (c1 != c2) {
+        std::cerr << "Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
+        return 1;
+    }
+
     return 0;
 }