Skip to content

Commit 63600aa

Browse files
authored
Update LexCHA.cpp
1 parent c2bcfd4 commit 63600aa

1 file changed

Lines changed: 19 additions & 16 deletions

File tree

FullPermutation/LexCHA.cpp

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
/**
22
* Official Implementation of LexCHA Indexing Algorithms
3-
* Author: Yusheng Hu
4-
* Research: A Divide-and-Conquer Engine for Lexicographical Permutations
5-
* * Note: Added <chrono> to resolve time-related compilation errors.
3+
* Final Fix: Using 16-byte aligned buffer for manual permutation to
4+
* satisfy GCC's aggressive optimization safety checks.
65
*/
76

87
#include <iostream>
@@ -11,33 +10,41 @@
1110
#include <immintrin.h>
1211
#include <cstring>
1312
#include <algorithm>
14-
#include <cstdint> // Required for uint8_t
15-
#include <chrono> // Required for std::chrono
13+
#include <cstdint>
14+
#include <chrono>
1615

17-
// ── Architecture Configuration ───────────────────────────────────────
1816
// Number of trailing permutation elements handled via the lookup table
// (the benchmark reads the last TAIL_DEPTH entries of its working array).
constexpr int TAIL_DEPTH = 5;
1917
// Number of table rows: 119 == 5! - 1.
// NOTE(review): presumably one row per transition between consecutive
// permutations of TAIL_DEPTH elements — confirm against the precompute loop.
constexpr int FLAT_STEPS = 119;
2018
// Bytes per table row; matches the 16 byte lanes of a 128-bit __m128i value.
constexpr int XMM_LANES = 16;
2119

22-
// Align LUT to 16-byte boundary for SIMD compatibility
2320
// Lookup table with one 16-byte row per step, 16-byte aligned.
// NOTE(review): expected to be filled by precompute_only_flat_lut_N5() before use — confirm.
alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
2421

25-
// ── 1. Precompute: Manual permutation ────────────────────────────────
22+
// ── 1. Manual permutation with 16-byte safe buffer ───────────────────
2623
/**
 * Advances p[0..n) to its next lexicographic permutation, in place.
 *
 * If the sequence is already the last permutation (non-increasing order),
 * it wraps around to the first one by reversing the whole range.
 *
 * Reversals use an in-place two-index swap loop, so no scratch buffer is
 * required and n is not limited by any fixed buffer size.
 *
 * @param p  Pointer to at least n writable bytes.
 * @param n  Number of elements; for n <= 1 the sequence is left unchanged.
 */
void next_perm_manual(uint8_t* p, int n) {
    // Reverses p[lo..hi) in place by swapping from both ends inward.
    auto reverse_range = [p](int lo, int hi) {
        for (--hi; lo < hi; ++lo, --hi) std::swap(p[lo], p[hi]);
    };

    // Find the start of the longest non-increasing suffix.
    int i = n - 1;
    while (i > 0 && p[i - 1] >= p[i]) i--;

    if (i <= 0) {
        // Whole sequence is non-increasing: wrap around to the first permutation.
        reverse_range(0, n);
        return;
    }

    // Find the rightmost element strictly greater than the pivot p[i - 1].
    int j = n - 1;
    while (p[j] <= p[i - 1]) j--;
    std::swap(p[i - 1], p[j]);

    // The suffix is still non-increasing; reversing it yields the smallest tail.
    reverse_range(i, n);
}
3844

3945
void precompute_only_flat_lut_N5() {
40-
uint8_t P[TAIL_DEPTH];
46+
// Use a 16-byte, 16-byte-aligned buffer instead of TAIL_DEPTH (5) bytes; only the first TAIL_DEPTH entries are initialized below.
47+
alignas(16) uint8_t P[16];
4148
for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
4249

4350
for (int step = 0; step < FLAT_STEPS; ++step) {
@@ -58,7 +65,6 @@ unsigned long long benchmark_accelerated(int N) {
5865
std::vector<int> D(N);
5966
for(int i = 0; i < N; ++i) D[i] = i;
6067

61-
// Aligned buffer to ensure safe memory access for SIMD
6268
alignas(16) uint8_t buffer[32] = {0};
6369
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
6470
__m128i p_reg = _mm_load_si128((__m128i*)buffer);
@@ -86,19 +92,16 @@ unsigned long long benchmark_accelerated(int N) {
8692
return total_count;
8793
}
8894

89-
// ── 3. Main driver ───────────────────────────────────────────────────
9095
int main(int argc, char* argv[]) {
9196
if (argc < 2) return 1;
9297
int N = std::atoi(argv[1]);
93-
9498
precompute_only_flat_lut_N5();
95-
99+
96100
auto s2 = std::chrono::high_resolution_clock::now();
97101
unsigned long long c2 = benchmark_accelerated(N);
98102
auto e2 = std::chrono::high_resolution_clock::now();
99103

100104
double d2 = std::chrono::duration<double>(e2 - s2).count();
101105
std::cout << "N=" << N << " | Acc(s): " << d2 << " | Count: " << c2 << std::endl;
102-
103106
return 0;
104107
}

0 commit comments

Comments
 (0)