Skip to content

Commit 3abc50d

Browse files
authored
Update LexCHA.cpp
1 parent 413d204 commit 3abc50d

1 file changed

Lines changed: 25 additions & 31 deletions

File tree

FullPermutation/LexCHA.cpp

Lines changed: 25 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22
* Official Implementation of LexCHA Indexing Algorithms
33
* Author: Yusheng Hu
44
* Research: A Divide-and-Conquer Engine for Lexicographical Permutations
5-
* * Note: This version is optimized for GCC 13+ under -O3 -march=native.
6-
* It includes strict memory alignment to prevent SIMD stringop-overflow
7-
* warnings and strongly prevents dead-code elimination during standard benchmarks.
5+
* * Note: Element types are fixed to uint8_t (one byte per element) to match _mm_shuffle_epi8
6+
* byte-level operations, eliminating permutation count mismatches.
87
*/
98

109
#include <iostream>
@@ -21,12 +20,9 @@ constexpr int TAIL_DEPTH = 5;
2120
constexpr int FLAT_STEPS = 119;
2221
constexpr int XMM_LANES = 16;
2322

24-
// Align LUT to 16-byte boundary for SIMD compatibility
2523
alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
2624

2725
// ── 1. Manual Permutation ────────────────────────────────────────────
28-
// Uses 16-byte aligned buffers to satisfy GCC's aggressive SIMD safety checks
29-
// and avoid -Wstringop-overflow warnings during precomputation.
3026
void next_perm_manual(uint8_t* p, int n) {
3127
int i = n - 1;
3228
while (i > 0 && p[i - 1] >= p[i]) i--;
@@ -50,7 +46,6 @@ void next_perm_manual(uint8_t* p, int n) {
5046

5147
// ── Precomputation ───────────────────────────────────────────────────
5248
void precompute_only_flat_lut_N5() {
53-
// Declared as 16 bytes to ensure SIMD boundary safety
5449
alignas(16) uint8_t P[16];
5550
for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
5651

@@ -60,7 +55,8 @@ void precompute_only_flat_lut_N5() {
6055

6156
next_perm_manual(P, TAIL_DEPTH);
6257

63-
std::memset(flat_lut_N5[step], 0, XMM_LANES);
58+
// A control byte with its high bit set (0x80) makes _mm_shuffle_epi8 write zero to that lane, so the unused lanes stay zero
59+
std::memset(flat_lut_N5[step], 0x80, XMM_LANES);
6460
for (int i = 0; i < TAIL_DEPTH; ++i) {
6561
flat_lut_N5[step][i] = M[P[i]];
6662
}
@@ -69,11 +65,13 @@ void precompute_only_flat_lut_N5() {
6965

7066
// ── 2. Accelerated Engine ────────────────────────────────────────────
7167
unsigned long long benchmark_accelerated(int N) {
72-
std::vector<int> D(N);
73-
for(int i = 0; i < N; ++i) D[i] = i;
68+
// FIX: Use uint8_t instead of int for byte-level SIMD compatibility
69+
std::vector<uint8_t> D(N);
70+
for(int i = 0; i < N; ++i) D[i] = static_cast<uint8_t>(i);
7471

75-
alignas(16) uint8_t buffer[32] = {0};
76-
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
72+
alignas(16) uint8_t buffer[16] = {0};
73+
// sizeof(uint8_t) is 1, so we copy exactly TAIL_DEPTH bytes
74+
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH);
7775
__m128i p_reg = _mm_load_si128((__m128i*)buffer);
7876

7977
unsigned long long total_count = 1;
@@ -88,11 +86,11 @@ unsigned long long benchmark_accelerated(int N) {
8886
total_count += FLAT_STEPS;
8987

9088
_mm_store_si128((__m128i*)buffer, p_reg);
91-
std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH * sizeof(int));
89+
std::memcpy(&D[N - TAIL_DEPTH], buffer, TAIL_DEPTH);
9290

9391
if (std::next_permutation(D.begin(), D.end())) {
9492
total_count++;
95-
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH * sizeof(int));
93+
std::memcpy(buffer, &D[N - TAIL_DEPTH], TAIL_DEPTH);
9694
p_reg = _mm_load_si128((__m128i*)buffer);
9795
}
9896
}
@@ -106,11 +104,10 @@ int main(int argc, char* argv[]) {
106104

107105
precompute_only_flat_lut_N5();
108106

109-
// --- Benchmark 1: Standard Method ---
110-
// Creating a real permutation loop to prevent GCC from eliminating
111-
// the code block via Dead Code Elimination under -O3.
112-
std::vector<int> V(N);
113-
for(int i = 0; i < N; ++i) V[i] = i;
107+
// Benchmark 1: Standard Method
108+
// FIX: Use uint8_t elements here too, so both benchmarks permute the same element type
109+
std::vector<uint8_t> V(N);
110+
for(int i = 0; i < N; ++i) V[i] = static_cast<uint8_t>(i);
114111

115112
auto s1 = std::chrono::high_resolution_clock::now();
116113
unsigned long long c1 = 0;
@@ -120,26 +117,17 @@ int main(int argc, char* argv[]) {
120117
auto e1 = std::chrono::high_resolution_clock::now();
121118

122119
double d1 = std::chrono::duration<double>(e1 - s1).count();
123-
if (d1 < 1e-9) d1 = 1e-9; // Prevent division by zero
120+
if (d1 < 1e-9) d1 = 1e-9;
124121

125-
// --- Benchmark 2: Accelerated Method ---
122+
// Benchmark 2: Accelerated Method
126123
auto s2 = std::chrono::high_resolution_clock::now();
127124
unsigned long long c2 = benchmark_accelerated(N);
128125
auto e2 = std::chrono::high_resolution_clock::now();
129126

130127
double d2 = std::chrono::duration<double>(e2 - s2).count();
131128
if (d2 < 1e-9) d2 = 1e-9;
132129

133-
// --- Sanity Check ---
134-
// This check guarantees the compiler MUST evaluate c1 and c2,
135-
// further enforcing that the loops actually run.
136-
if (c1 != c2) {
137-
std::cerr << "Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
138-
return 1;
139-
}
140-
141-
// --- Output formatting for AWK script ---
142-
// Target columns: N | Std(s) | Acc(s) | Std_ns/perm | Acc_ns/perm | Speedup
130+
// Output formatting for AWK script
143131
double ns_std = (d1 * 1e9) / c1;
144132
double ns_acc = (d2 * 1e9) / c2;
145133
double speedup = d1 / d2;
@@ -151,5 +139,11 @@ int main(int argc, char* argv[]) {
151139
<< ns_acc << " "
152140
<< speedup << std::endl;
153141

142+
// Sanity check: the standard and accelerated methods must report the same permutation count; a mismatch means the results above are not comparable
143+
if (c1 != c2) {
144+
std::cerr << "Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
145+
return 1;
146+
}
147+
154148
return 0;
155149
}

0 commit comments

Comments
 (0)