Skip to content

Commit b745b74

Browse files
authored
Update LexCHA.cpp
1 parent 330e10b commit b745b74

1 file changed

Lines changed: 30 additions & 17 deletions

File tree

FullPermutation/LexCHA.cpp

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,66 @@
1-
21
/**
32
* Official Implementation of LexCHA Indexing Algorithms
43
* Author: Yusheng Hu
5-
* Research: A Divide-and-Conquer Engine for Lexicographical Permutations:
6-
Accelerating State Evolution via Hybrid Software-Hardware CPU Instructions
4+
* Research: A Divide-and-Conquer Engine for Lexicographical Permutations
75
* Repository: https://github.com/Yusheng-Hu/Position-Pure-Algorithm
6+
* Note: Manual permutation logic is used in precomputation to prevent
7+
* GCC -Wstringop-overflow warnings during aggressive optimization.
88
*/
99

1010
#include <iostream>
11-
#include <algorithm>
12-
#include <chrono>
1311
#include <vector>
1412
#include <iomanip>
1513
#include <immintrin.h>
16-
#include <array>
17-
#include <cstring>
14+
#include <cstring>
15+
#include <algorithm>
1816

1917
// ── Architecture Configuration ───────────────────────────────────────
2018
// Number of trailing permutation elements covered by the precomputed shuffle table.
constexpr int TAIL_DEPTH = 5;
2119
// Number of rows in flat_lut_N5, one per consecutive permutation transition (119 = 5! - 1).
constexpr int FLAT_STEPS = 119;
2220
// Bytes per LUT row; each row is loaded as a single 128-bit __m128i value.
constexpr int XMM_LANES = 16;
2321

22+
// Align LUT to 16-byte boundary for SIMD compatibility
2423
// Row `step` holds the byte-shuffle mask written by precompute_only_flat_lut_N5();
// only the first TAIL_DEPTH bytes of a row carry indices, the rest are zeroed.
alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
2524

26-
// ── 1. Precompute: SIMD blind-shuffle masks ───────────────────────────
25+
// ── 1. Precompute: Manual permutation to avoid iterator-based warnings ──
26+
/**
 * Advance the first `n` bytes at `p` to the next lexicographical permutation,
 * in place. When the sequence is already the last permutation (entirely
 * non-increasing), it is reversed, which wraps it around to the first
 * (ascending) permutation.
 *
 * @param p  Pointer to the sequence to permute.
 * @param n  Number of elements in the sequence.
 */
void next_perm_manual(uint8_t* p, int n) {
    // Walk left from the end to find where the non-increasing suffix begins.
    int suffix = n - 1;
    for (; suffix > 0; --suffix) {
        if (p[suffix - 1] < p[suffix]) break;
    }

    // No ascent anywhere: this was the last permutation, so wrap around.
    if (suffix <= 0) {
        std::reverse(p, p + n);
        return;
    }

    // The pivot is the element just left of the suffix; its successor is the
    // rightmost element strictly greater than it. The suffix head is greater
    // than the pivot, so this scan always stops inside the suffix.
    const int pivot = suffix - 1;
    int succ = n - 1;
    for (; p[succ] <= p[pivot]; --succ) {
    }

    std::swap(p[pivot], p[succ]);
    // The suffix is still non-increasing after the swap; reversing it yields
    // the smallest possible tail.
    std::reverse(p + suffix, p + n);
}
38+
2739
void precompute_only_flat_lut_N5() {
28-
// 修复方案:将数组大小增加到 16,以满足 SIMD 优化的对齐要求,
29-
// 同时避免 std::next_permutation 在处理过小容器时被编译器优化导致的溢出警告。
30-
std::array<uint8_t, 16> P;
40+
uint8_t P[TAIL_DEPTH];
3141
for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
3242

3343
for (int step = 0; step < FLAT_STEPS; ++step) {
34-
std::array<uint8_t, 16> M;
44+
uint8_t M[TAIL_DEPTH];
3545
for (int j = 0; j < TAIL_DEPTH; ++j) M[P[j]] = j;
3646

37-
// 使用实际有效的区间 [0, TAIL_DEPTH) 进行置换
38-
std::next_permutation(P.begin(), P.begin() + TAIL_DEPTH);
47+
next_perm_manual(P, TAIL_DEPTH);
3948

49+
// Ensure all lanes are initialized to prevent garbage data
4050
std::memset(flat_lut_N5[step], 0, XMM_LANES);
4151
for (int i = 0; i < TAIL_DEPTH; ++i) {
4252
flat_lut_N5[step][i] = M[P[i]];
4353
}
4454
}
4555
}
4656

47-
// ── 2. Accelerated engine ──────────────────────────────────────────
57+
// ── 2. Accelerated engine: SIMD blind ops + boundary skip ──
4858
unsigned long long benchmark_accelerated(int N) {
4959
std::vector<int> D(N);
5060
for(int i = 0; i < N; ++i) D[i] = i;
5161

62+
// Aligned buffer to ensure safe memory access for SIMD instructions
5263
alignas(16) uint8_t buffer[32] = {0};
53-
5464
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
5565
__m128i p_reg = _mm_load_si128((__m128i*)buffer);
5666

@@ -59,15 +69,18 @@ unsigned long long benchmark_accelerated(int N) {
5969
for(int i = 1; i <= N; ++i) max_perms *= i;
6070

6171
while (total_count < max_perms) {
72+
// [SIMD path]: Execute 119 rapid state transitions
6273
for (int step = 0; step < FLAT_STEPS; ++step) {
6374
__m128i mask = _mm_load_si128((__m128i*)flat_lut_N5[step]);
6475
p_reg = _mm_shuffle_epi8(p_reg, mask);
6576
}
6677
total_count += FLAT_STEPS;
6778

79+
// [Sync]: Write back to memory
6880
_mm_store_si128((__m128i*)buffer, p_reg);
6981
std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH * sizeof(int));
7082

83+
// Handle block boundary with standard library permutation
7184
if (std::next_permutation(D.begin(), D.end())) {
7285
total_count++;
7386
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
@@ -89,7 +102,7 @@ int main(int argc, char* argv[]) {
89102
auto e2 = std::chrono::high_resolution_clock::now();
90103

91104
double d2 = std::chrono::duration<double>(e2 - s2).count();
92-
std::cout << N << " Acc(s): " << d2 << " | Total: " << c2 << std::endl;
105+
std::cout << "N=" << N << " | Acc(s): " << d2 << " | Count: " << c2 << std::endl;
93106

94107
return 0;
95108
}

0 commit comments

Comments
 (0)