1-
21/* *
32 * Official Implementation of LexCHA Indexing Algorithms
43 * Author: Yusheng Hu
5- * Research: A Divide-and-Conquer Engine for Lexicographical Permutations:
6- Accelerating State Evolution via Hybrid Software-Hardware CPU Instructions
4+ * Research: A Divide-and-Conquer Engine for Lexicographical Permutations
75 * Repository: https://github.com/Yusheng-Hu/Position-Pure-Algorithm
 *
 * Note: A hand-written permutation routine is used in precomputation to
 * prevent GCC -Wstringop-overflow warnings during aggressive optimization.
88 */
99
1010#include < iostream>
11- #include < algorithm>
12- #include < chrono>
1311#include < vector>
1412#include < iomanip>
1513#include < immintrin.h>
16- #include < array >
17- #include < cstring >
14+ #include < cstring >
15+ #include < algorithm >
1816
1917// ── Architecture Configuration ───────────────────────────────────────
// Number of trailing elements whose permutations are driven by the lookup table.
constexpr int TAIL_DEPTH{5};
// Number of precomputed transitions per block (119 = 5! - 1).
constexpr int FLAT_STEPS{119};
// Bytes per lookup-table row; one row is loaded as a single 128-bit value.
constexpr int XMM_LANES{16};

// Shuffle-mask table, one row per step. Kept on a 16-byte boundary because
// the rows are read with aligned 128-bit loads.
alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
2524
// ── 1. Precompute: SIMD blind-shuffle masks (manual permutation avoids iterator-based warnings) ──
// Rearranges the n bytes starting at p into the next lexicographically
// greater permutation, in place. If the sequence is already the greatest
// (non-increasing) arrangement, it wraps around to the smallest one by
// reversing the whole range.
void next_perm_manual(uint8_t* p, int n) {
    // Walk left from the end to find where the non-increasing suffix begins.
    int suffix = n - 1;
    for (; suffix > 0; --suffix) {
        if (p[suffix - 1] < p[suffix]) break;
    }

    // The whole range is non-increasing (or has fewer than two elements):
    // wrap around to the first permutation.
    if (suffix <= 0) {
        std::reverse(p, p + n);
        return;
    }

    // The pivot sits just left of the suffix; find the rightmost element
    // strictly greater than it. One always exists because p[suffix] qualifies.
    const int pivot = suffix - 1;
    int successor = n - 1;
    while (!(p[successor] > p[pivot])) {
        --successor;
    }

    std::swap(p[pivot], p[successor]);
    // The suffix is still non-increasing; reversing makes it the smallest tail.
    std::reverse(p + suffix, p + n);
}
38+
2739void precompute_only_flat_lut_N5 () {
28- // 修复方案:将数组大小增加到 16,以满足 SIMD 优化的对齐要求,
29- // 同时避免 std::next_permutation 在处理过小容器时被编译器优化导致的溢出警告。
30- std::array<uint8_t , 16 > P;
40+ uint8_t P[TAIL_DEPTH ];
3141 for (int i = 0 ; i < TAIL_DEPTH ; ++i) P[i] = i;
3242
3343 for (int step = 0 ; step < FLAT_STEPS ; ++step) {
34- std::array< uint8_t , 16 > M ;
44+ uint8_t M[ TAIL_DEPTH ] ;
3545 for (int j = 0 ; j < TAIL_DEPTH ; ++j) M[P[j]] = j;
3646
37- // 使用实际有效的区间 [0, TAIL_DEPTH) 进行置换
38- std::next_permutation (P.begin (), P.begin () + TAIL_DEPTH );
47+ next_perm_manual (P, TAIL_DEPTH );
3948
49+ // Ensure all lanes are initialized to prevent garbage data
4050 std::memset (flat_lut_N5[step], 0 , XMM_LANES );
4151 for (int i = 0 ; i < TAIL_DEPTH ; ++i) {
4252 flat_lut_N5[step][i] = M[P[i]];
4353 }
4454 }
4555}
4656
// ── 2. Accelerated engine: SIMD blind ops + boundary skip ──
4858unsigned long long benchmark_accelerated (int N) {
4959 std::vector<int > D (N);
5060 for (int i = 0 ; i < N; ++i) D[i] = i;
5161
62+ // Aligned buffer to ensure safe memory access for SIMD instructions
5263 alignas (16 ) uint8_t buffer[32 ] = {0 };
53-
5464 std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof (int ));
5565 __m128i p_reg = _mm_load_si128 ((__m128i*)buffer);
5666
@@ -59,15 +69,18 @@ unsigned long long benchmark_accelerated(int N) {
5969 for (int i = 1 ; i <= N; ++i) max_perms *= i;
6070
6171 while (total_count < max_perms) {
72+ // [SIMD path]: Execute 119 rapid state transitions
6273 for (int step = 0 ; step < FLAT_STEPS ; ++step) {
6374 __m128i mask = _mm_load_si128 ((__m128i*)flat_lut_N5[step]);
6475 p_reg = _mm_shuffle_epi8 (p_reg, mask);
6576 }
6677 total_count += FLAT_STEPS ;
6778
79+ // [Sync]: Write back to memory
6880 _mm_store_si128 ((__m128i*)buffer, p_reg);
6981 std::memcpy (&D[N - TAIL_DEPTH ], buffer, TAIL_DEPTH * sizeof (int ));
7082
83+ // Handle block boundary with standard library permutation
7184 if (std::next_permutation (D.begin (), D.end ())) {
7285 total_count++;
7386 std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof (int ));
@@ -89,7 +102,7 @@ int main(int argc, char* argv[]) {
89102 auto e2 = std::chrono::high_resolution_clock::now ();
90103
91104 double d2 = std::chrono::duration<double >(e2 - s2).count ();
92- std::cout << N << " Acc(s): " << d2 << " | Total : " << c2 << std::endl;
105+ std::cout << " N= " << N << " | Acc(s): " << d2 << " | Count : " << c2 << std::endl;
93106
94107 return 0 ;
95108}
0 commit comments