/*
 * Official Implementation of LexCHA Indexing Algorithms
 * Author: Yusheng Hu
 * Research: A Divide-and-Conquer Engine for Lexicographical Permutations
 * Note: Added <chrono> to resolve time-related compilation errors.
 * Final Fix: Using a 16-byte aligned buffer for manual permutation to
 * satisfy GCC's aggressive optimization safety checks.
 */
76
87#include < iostream>
1110#include < immintrin.h>
1211#include < cstring>
1312#include < algorithm>
14- #include < cstdint> // Required for uint8_t
15- #include < chrono> // Required for std::chrono
13+ #include < cstdint>
14+ #include < chrono>
1615
// ── Architecture Configuration ───────────────────────────────────────

// Compile-time factorial used to size the lookup table below.
constexpr int lut_factorial(int n) {
    return n <= 1 ? 1 : n * lut_factorial(n - 1);
}

// Number of trailing elements handled by the flat lookup table.
constexpr int TAIL_DEPTH = 5;

// Number of successor steps between the TAIL_DEPTH! permutations of the tail
// (5! - 1 = 119). Derived rather than hard-coded so it cannot drift from
// TAIL_DEPTH.
constexpr int FLAT_STEPS = lut_factorial(TAIL_DEPTH) - 1;

// Bytes per LUT row: one 128-bit XMM register holds 16 uint8_t lanes.
constexpr int XMM_LANES = 16;

static_assert(FLAT_STEPS == 119, "flat_lut_N5 is sized for a 5-element tail");
static_assert(TAIL_DEPTH <= XMM_LANES, "tail must fit in one XMM register");

// Lookup table with one 16-byte row per step, aligned to a 16-byte boundary
// so each row can be accessed with aligned 128-bit SIMD loads.
// NOTE(review): the row contents are written by precompute_only_flat_lut_N5();
// confirm their exact meaning against that function.
alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
2421
25- // ── 1. Precompute: Manual permutation ───────────── ───────────────────
22+ // ── 1. Manual permutation with 16-byte safe buffer ───────────────────
// Reverses the bytes p[lo..hi] (both bounds inclusive) in place.
// Does nothing when lo >= hi, so empty and single-element ranges are safe.
static void reverse_span_in_place(uint8_t* p, int lo, int hi) {
    while (lo < hi) {
        std::swap(p[lo], p[hi]);
        ++lo;
        --hi;
    }
}

// Advances p[0..n) to its next lexicographic permutation, in place.
//
// If the sequence is already the last permutation (entirely non-increasing),
// it wraps around to the first one (ascending order). Equal elements are
// handled: the comparisons below skip over ties.
//
// The reversal is done in place with a hand-written swap loop rather than
// through a fixed-size scratch array, so any n is safe (the previous
// 16-byte temporaries were overrun for n > 16). n <= 1 is a no-op.
//
// p: pointer to at least n writable bytes.
// n: number of elements in the permutation.
void next_perm_manual(uint8_t* p, int n) {
    // Find i such that p[i..n) is the longest non-increasing suffix.
    int i = n - 1;
    while (i > 0 && p[i - 1] >= p[i]) i--;

    if (i <= 0) {
        // Whole sequence is non-increasing: wrap to the first permutation.
        reverse_span_in_place(p, 0, n - 1);
        return;
    }

    // Find the rightmost element strictly greater than the pivot p[i - 1].
    // One always exists because p[i] > p[i - 1].
    int j = n - 1;
    while (p[j] <= p[i - 1]) j--;
    std::swap(p[i - 1], p[j]);

    // The suffix is still non-increasing; reversing it yields the smallest
    // arrangement that follows the new pivot.
    reverse_span_in_place(p, i, n - 1);
}
3844
3945void precompute_only_flat_lut_N5 () {
40- uint8_t P[TAIL_DEPTH ];
46+ // Declaring 16 bytes instead of 5 to align with SIMD register sizes
47+ alignas (16 ) uint8_t P[16 ];
4148 for (int i = 0 ; i < TAIL_DEPTH ; ++i) P[i] = i;
4249
4350 for (int step = 0 ; step < FLAT_STEPS ; ++step) {
@@ -58,7 +65,6 @@ unsigned long long benchmark_accelerated(int N) {
5865 std::vector<int > D (N);
5966 for (int i = 0 ; i < N; ++i) D[i] = i;
6067
61- // Aligned buffer to ensure safe memory access for SIMD
6268 alignas (16 ) uint8_t buffer[32 ] = {0 };
6369 std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof (int ));
6470 __m128i p_reg = _mm_load_si128 ((__m128i*)buffer);
@@ -86,19 +92,16 @@ unsigned long long benchmark_accelerated(int N) {
8692 return total_count;
8793}
8894
89- // ── 3. Main driver ───────────────────────────────────────────────────
9095int main (int argc, char * argv[]) {
9196 if (argc < 2 ) return 1 ;
9297 int N = std::atoi (argv[1 ]);
93-
9498 precompute_only_flat_lut_N5 ();
95-
99+
96100 auto s2 = std::chrono::high_resolution_clock::now ();
97101 unsigned long long c2 = benchmark_accelerated (N);
98102 auto e2 = std::chrono::high_resolution_clock::now ();
99103
100104 double d2 = std::chrono::duration<double >(e2 - s2).count ();
101105 std::cout << " N=" << N << " | Acc(s): " << d2 << " | Count: " << c2 << std::endl;
102-
103106 return 0 ;
104107}
0 commit comments