22 * Official Implementation of LexCHA Indexing Algorithms
33 * Author: Yusheng Hu
44 * Research: A Divide-and-Conquer Engine for Lexicographical Permutations
5- * * Note: This version is optimized for GCC 13+ under -O3 -march=native.
6- * It includes strict memory alignment to prevent SIMD stringop-overflow
7- * warnings and strongly prevents dead-code elimination during standard benchmarks.
5+ * * Note: Data types fixed to uint8_t to perfectly align with _mm_shuffle_epi8
6+ * byte-level operations, eliminating permutation count mismatches.
87 */
98
109#include < iostream>
@@ -21,12 +20,9 @@ constexpr int TAIL_DEPTH = 5;
2120constexpr int FLAT_STEPS = 119 ;
2221constexpr int XMM_LANES = 16 ;
2322
24- // Align LUT to 16-byte boundary for SIMD compatibility
2523alignas (16 ) uint8_t flat_lut_N5[FLAT_STEPS ][XMM_LANES ];
2624
2725// ── 1. Manual Permutation ────────────────────────────────────────────
28- // Uses 16-byte aligned buffers to satisfy GCC's aggressive SIMD safety checks
29- // and avoid -Wstringop-overflow warnings during precomputation.
3026void next_perm_manual (uint8_t * p, int n) {
3127 int i = n - 1 ;
3228 while (i > 0 && p[i - 1 ] >= p[i]) i--;
@@ -50,7 +46,6 @@ void next_perm_manual(uint8_t* p, int n) {
5046
5147// ── Precomputation ───────────────────────────────────────────────────
5248void precompute_only_flat_lut_N5 () {
53- // Declared as 16 bytes to ensure SIMD boundary safety
5449 alignas (16 ) uint8_t P[16 ];
5550 for (int i = 0 ; i < TAIL_DEPTH ; ++i) P[i] = i;
5651
@@ -60,7 +55,8 @@ void precompute_only_flat_lut_N5() {
6055
6156 next_perm_manual (P, TAIL_DEPTH );
6257
63- std::memset (flat_lut_N5[step], 0 , XMM_LANES );
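58+ // 0x80 sets the high bit of every control byte, which makes _mm_shuffle_epi8 write zero to that lane; only the first TAIL_DEPTH lanes are overwritten with real indices below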
59+ std::memset (flat_lut_N5[step], 0x80 , XMM_LANES );
6460 for (int i = 0 ; i < TAIL_DEPTH ; ++i) {
6561 flat_lut_N5[step][i] = M[P[i]];
6662 }
@@ -69,11 +65,13 @@ void precompute_only_flat_lut_N5() {
6965
7066// ── 2. Accelerated Engine ────────────────────────────────────────────
7167unsigned long long benchmark_accelerated (int N) {
72- std::vector<int > D (N);
73- for (int i = 0 ; i < N; ++i) D[i] = i;
68+ // FIX: Use uint8_t instead of int for byte-level SIMD compatibility
69+ std::vector<uint8_t > D (N);
70+ for (int i = 0 ; i < N; ++i) D[i] = static_cast <uint8_t >(i);
7471
75- alignas (16 ) uint8_t buffer[32 ] = {0 };
76- std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof (int ));
72+ alignas (16 ) uint8_t buffer[16 ] = {0 };
73+ // sizeof(uint8_t) is 1, so we copy exactly TAIL_DEPTH bytes
74+ std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH );
7775 __m128i p_reg = _mm_load_si128 ((__m128i*)buffer);
7876
7977 unsigned long long total_count = 1 ;
@@ -88,11 +86,11 @@ unsigned long long benchmark_accelerated(int N) {
8886 total_count += FLAT_STEPS ;
8987
9088 _mm_store_si128 ((__m128i*)buffer, p_reg);
91- std::memcpy (&D[N - TAIL_DEPTH ], buffer, TAIL_DEPTH * sizeof ( int ) );
89+ std::memcpy (&D[N - TAIL_DEPTH ], buffer, TAIL_DEPTH );
9290
9391 if (std::next_permutation (D.begin (), D.end ())) {
9492 total_count++;
95- std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof ( int ) );
93+ std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH );
9694 p_reg = _mm_load_si128 ((__m128i*)buffer);
9795 }
9896 }
@@ -106,11 +104,10 @@ int main(int argc, char* argv[]) {
106104
107105 precompute_only_flat_lut_N5 ();
108106
109- // --- Benchmark 1: Standard Method ---
110- // Creating a real permutation loop to prevent GCC from eliminating
111- // the code block via Dead Code Elimination under -O3.
112- std::vector<int > V (N);
113- for (int i = 0 ; i < N; ++i) V[i] = i;
107+ // Benchmark 1: Standard Method
108+ // FIX: Use the same uint8_t element type as benchmark_accelerated so both methods permute identical data
109+ std::vector<uint8_t > V (N);
110+ for (int i = 0 ; i < N; ++i) V[i] = static_cast <uint8_t >(i);
114111
115112 auto s1 = std::chrono::high_resolution_clock::now ();
116113 unsigned long long c1 = 0 ;
@@ -120,26 +117,17 @@ int main(int argc, char* argv[]) {
120117 auto e1 = std::chrono::high_resolution_clock::now ();
121118
122119 double d1 = std::chrono::duration<double >(e1 - s1).count ();
123- if (d1 < 1e-9 ) d1 = 1e-9 ; // Prevent division by zero
120+ if (d1 < 1e-9 ) d1 = 1e-9 ;
124121
125- // --- Benchmark 2: Accelerated Method ---
122+ // Benchmark 2: Accelerated Method
126123 auto s2 = std::chrono::high_resolution_clock::now ();
127124 unsigned long long c2 = benchmark_accelerated (N);
128125 auto e2 = std::chrono::high_resolution_clock::now ();
129126
130127 double d2 = std::chrono::duration<double >(e2 - s2).count ();
131128 if (d2 < 1e-9 ) d2 = 1e-9 ;
132129
133- // --- Sanity Check ---
134- // This check guarantees the compiler MUST evaluate c1 and c2,
135- // further enforcing that the loops actually run.
136- if (c1 != c2) {
137- std::cerr << " Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
138- return 1 ;
139- }
140-
141- // --- Output formatting for AWK script ---
142- // Target columns: N | Std(s) | Acc(s) | Std_ns/perm | Acc_ns/perm | Speedup
130+ // Output formatting for AWK script
143131 double ns_std = (d1 * 1e9 ) / c1;
144132 double ns_acc = (d2 * 1e9 ) / c2;
145133 double speedup = d1 / d2;
@@ -151,5 +139,11 @@ int main(int argc, char* argv[]) {
151139 << ns_acc << " "
152140 << speedup << std::endl;
153141
142+ // Sanity check: both methods must count the same number of permutations; the timing line above is printed even when they disagree
143+ if (c1 != c2) {
144+ std::cerr << " Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
145+ return 1 ;
146+ }
147+
154148 return 0 ;
155149}
0 commit comments