22 * Official Implementation of LexCHA Indexing Algorithms
33 * Author: Yusheng Hu
44 * Research: A Divide-and-Conquer Engine for Lexicographical Permutations
5- * Repository: https://github.com/Yusheng-Hu/Position-Pure-Algorithm
6- * * Note: Manual permutation logic is used in precomputation to prevent
7- * GCC -Wstringop-overflow warnings during aggressive optimization.
5+ * Note: <cstdint> is included explicitly to resolve compilation errors about uint8_t being undeclared.
86 */
97
108#include < iostream>
1311#include < immintrin.h>
1412#include < cstring>
1513#include < algorithm>
14+ #include < cstdint> // Required for uint8_t
1615
1716// ── Architecture Configuration ───────────────────────────────────────
1817constexpr int TAIL_DEPTH = 5 ;
@@ -22,7 +21,7 @@ constexpr int XMM_LANES = 16;
2221// Align the LUT to a 16-byte boundary: each row is read with the aligned load _mm_load_si128
2322alignas (16 ) uint8_t flat_lut_N5[FLAT_STEPS ][XMM_LANES ];
2423
25- // ── 1. Precompute: Manual permutation to avoid iterator-based warnings ──
24+ // ── 1. Precompute: Manual permutation ────────────────────────────── ──
2625void next_perm_manual (uint8_t * p, int n) {
2726 int i = n - 1 ;
2827 while (i > 0 && p[i - 1 ] >= p[i]) i--;
@@ -46,20 +45,18 @@ void precompute_only_flat_lut_N5() {
4645
4746 next_perm_manual (P, TAIL_DEPTH );
4847
49- // Ensure all lanes are initialized to prevent garbage data
5048 std::memset (flat_lut_N5[step], 0 , XMM_LANES );
5149 for (int i = 0 ; i < TAIL_DEPTH ; ++i) {
5250 flat_lut_N5[step][i] = M[P[i]];
5351 }
5452 }
5553}
5654
57- // ── 2. Accelerated engine: SIMD blind ops + boundary skip ──
55+ // ── 2. Accelerated engine ──────────────────────────────────────── ──
5856unsigned long long benchmark_accelerated (int N) {
5957 std::vector<int > D (N);
6058 for (int i = 0 ; i < N; ++i) D[i] = i;
6159
62- // Aligned buffer to ensure safe memory access for SIMD instructions
6360 alignas (16 ) uint8_t buffer[32 ] = {0 };
6461 std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof (int ));
6562 __m128i p_reg = _mm_load_si128 ((__m128i*)buffer);
@@ -69,18 +66,15 @@ unsigned long long benchmark_accelerated(int N) {
6966 for (int i = 1 ; i <= N; ++i) max_perms *= i;
7067
7168 while (total_count < max_perms) {
72- // [SIMD path]: Execute 119 rapid state transitions
7369 for (int step = 0 ; step < FLAT_STEPS ; ++step) {
7470 __m128i mask = _mm_load_si128 ((__m128i*)flat_lut_N5[step]);
7571 p_reg = _mm_shuffle_epi8 (p_reg, mask);
7672 }
7773 total_count += FLAT_STEPS ;
7874
79- // [Sync]: Write back to memory
8075 _mm_store_si128 ((__m128i*)buffer, p_reg);
8176 std::memcpy (&D[N - TAIL_DEPTH ], buffer, TAIL_DEPTH * sizeof (int ));
8277
83- // Handle block boundary with standard library permutation
8478 if (std::next_permutation (D.begin (), D.end ())) {
8579 total_count++;
8680 std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof (int ));
0 commit comments