Skip to content

Commit 413d204

Browse files
authored
Update LexCHA.cpp
1 parent 05d957f commit 413d204

1 file changed

Lines changed: 50 additions & 18 deletions

File tree

FullPermutation/LexCHA.cpp

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
/**
22
* Official Implementation of LexCHA Indexing Algorithms
33
* Author: Yusheng Hu
4-
* Final Version: Optimized for GitHub Actions / GCC 13+
4+
* Research: A Divide-and-Conquer Engine for Lexicographical Permutations
5+
* Note: This version is optimized for GCC 13+ under -O3 -march=native.
6+
* It includes strict memory alignment to prevent SIMD stringop-overflow
7+
* warnings and strongly prevents dead-code elimination during standard benchmarks.
58
*/
69

710
#include <iostream>
@@ -18,29 +21,36 @@ constexpr int TAIL_DEPTH = 5;
1821
// Number of rows in the lookup table. With TAIL_DEPTH == 5 this equals
// 5! - 1 = 119, i.e. presumably one row per successor step between the 120
// permutations of the tail -- TODO confirm against the precompute routine.
constexpr int FLAT_STEPS = 119;
1922
// Bytes per lookup-table row. NOTE(review): the name suggests one row is meant
// to fill a 128-bit XMM register (16 x uint8_t) -- confirm against the engine.
constexpr int XMM_LANES = 16;
2023

24+
// Align LUT to 16-byte boundary for SIMD compatibility
// (FLAT_STEPS rows of XMM_LANES bytes each, zero-initialized at file scope).
2125
alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
2226

23-
// ── 1. Manual permutation: Uses aligned buffer to satisfy GCC safety checks ──
27+
// ── 1. Manual Permutation ────────────────────────────────────────────
// Advances p[0..n-1] to its next lexicographical permutation, in place.
// If p is already the last (non-increasing) permutation, it wraps around to
// the first (non-decreasing) one. Equal elements are handled the same way
// std::next_permutation handles them.
//
// All reversals are done in place with pairwise swaps, so no fixed-size
// scratch buffer is involved and n is not limited to 16 elements.
//
// @param p  Pointer to the first of n bytes to permute (may be null if n <= 0).
// @param n  Number of elements; n <= 1 leaves the sequence unchanged.
void next_perm_manual(uint8_t* p, int n) {
    // Reverses the closed range p[lo..hi]; a no-op when lo >= hi.
    auto reverse_range = [p](int lo, int hi) {
        while (lo < hi) std::swap(p[lo++], p[hi--]);
    };

    // Find the start of the longest non-increasing suffix: p[i..n-1].
    int i = n - 1;
    while (i > 0 && p[i - 1] >= p[i]) i--;

    if (i <= 0) {
        // Whole sequence is non-increasing: wrap around to the first permutation.
        reverse_range(0, n - 1);
        return;
    }

    // Pivot is p[i-1]; find the rightmost element strictly greater than it.
    // The loop terminates because p[i] > p[i-1] by construction of i.
    int j = n - 1;
    while (p[j] <= p[i - 1]) j--;
    std::swap(p[i - 1], p[j]);

    // The suffix is still non-increasing; reversing it yields the smallest tail.
    reverse_range(i, n - 1);
}
4250

51+
// ── Precomputation ───────────────────────────────────────────────────
4352
void precompute_only_flat_lut_N5() {
53+
// Declared as 16 bytes to ensure SIMD boundary safety
4454
alignas(16) uint8_t P[16];
4555
for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
4656

@@ -57,7 +67,7 @@ void precompute_only_flat_lut_N5() {
5767
}
5868
}
5969

60-
// ── 2. Accelerated engine ──────────────────────────────────────────
70+
// ── 2. Accelerated Engine ────────────────────────────────────────────
6171
unsigned long long benchmark_accelerated(int N) {
6272
std::vector<int> D(N);
6373
for(int i = 0; i < N; ++i) D[i] = i;
@@ -89,35 +99,57 @@ unsigned long long benchmark_accelerated(int N) {
8999
return total_count;
90100
}
91101

92-
// ── 3. Main driver: Output formatted for AWK ─────────────────────────
102+
// ── 3. Main Driver ───────────────────────────────────────────────────
93103
int main(int argc, char* argv[]) {
94104
if (argc < 2) return 1;
95105
int N = std::atoi(argv[1]);
96106

97107
precompute_only_flat_lut_N5();
98108

99-
// Standard baseline
109+
// --- Benchmark 1: Standard Method ---
110+
// Creating a real permutation loop to prevent GCC from eliminating
111+
// the code block via Dead Code Elimination under -O3.
112+
std::vector<int> V(N);
113+
for(int i = 0; i < N; ++i) V[i] = i;
114+
100115
auto s1 = std::chrono::high_resolution_clock::now();
101-
unsigned long long c1 = 1; for(int i=1; i<=N; ++i) c1 *= i;
116+
unsigned long long c1 = 0;
117+
do {
118+
c1++;
119+
} while (std::next_permutation(V.begin(), V.end()));
102120
auto e1 = std::chrono::high_resolution_clock::now();
121+
103122
double d1 = std::chrono::duration<double>(e1 - s1).count();
104-
if(d1 < 1e-9) d1 = 1e-9;
123+
if (d1 < 1e-9) d1 = 1e-9; // Prevent division by zero
105124

106-
// Accelerated run
125+
// --- Benchmark 2: Accelerated Method ---
107126
auto s2 = std::chrono::high_resolution_clock::now();
108127
unsigned long long c2 = benchmark_accelerated(N);
109128
auto e2 = std::chrono::high_resolution_clock::now();
129+
110130
double d2 = std::chrono::duration<double>(e2 - s2).count();
111-
if(d2 < 1e-9) d2 = 1e-9;
131+
if (d2 < 1e-9) d2 = 1e-9;
132+
133+
// --- Sanity Check ---
134+
// This check guarantees the compiler MUST evaluate c1 and c2,
135+
// further enforcing that the loops actually run.
136+
if (c1 != c2) {
137+
std::cerr << "Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
138+
return 1;
139+
}
140+
141+
// --- Output formatting for AWK script ---
142+
// Target columns: N | Std(s) | Acc(s) | Std_ns/perm | Acc_ns/perm | Speedup
143+
double ns_std = (d1 * 1e9) / c1;
144+
double ns_acc = (d2 * 1e9) / c2;
145+
double speedup = d1 / d2;
112146

113-
// Use space-separated values for perfect AWK parsing
114-
// Format: N Std(s) Acc(s) Std_ns/perm Acc_ns/perm Speedup
115147
std::cout << N << " "
116148
<< std::fixed << std::setprecision(6) << d1 << " "
117149
<< d2 << " "
118-
<< (d1 * 1e9) / c1 << " "
119-
<< (d2 * 1e9) / c2 << " "
120-
<< d1/d2 << std::endl;
150+
<< ns_std << " "
151+
<< ns_acc << " "
152+
<< speedup << std::endl;
121153

122154
return 0;
123155
}

0 commit comments

Comments
 (0)