11/* *
22 * Official Implementation of LexCHA Indexing Algorithms
33 * Author: Yusheng Hu
4- * Final Version: Optimized for GitHub Actions / GCC 13+
4+ * Research: A Divide-and-Conquer Engine for Lexicographical Permutations
5+ * Note: This version is optimized for GCC 13+ under -O3 -march=native.
6+ * It includes strict memory alignment to prevent SIMD stringop-overflow
7+ * warnings and strongly prevents dead-code elimination during standard benchmarks.
58 */
69
710#include < iostream>
@@ -18,29 +21,36 @@ constexpr int TAIL_DEPTH = 5;
// First dimension (row count) of flat_lut_N5. 119 equals 5! - 1; presumably
// one row per successor step among the orderings of a TAIL_DEPTH (= 5) element
// tail -- TODO confirm against the table-filling loop.
1821constexpr int FLAT_STEPS = 119 ;
// Second dimension of flat_lut_N5: each row holds 16 uint8_t entries, i.e. the
// 16 byte lanes of one 128-bit XMM register.
1922constexpr int XMM_LANES = 16 ;
2023
24+ // Align LUT to 16-byte boundary for SIMD compatibility
// Row stride is XMM_LANES (16) bytes, so with the 16-byte base alignment every
// row also starts on a 16-byte boundary.
2125alignas (16 ) uint8_t flat_lut_N5[FLAT_STEPS ][XMM_LANES ];
2226
// ── 1. Manual Permutation ────────────────────────────────────────────
// Advances p[0..n) in place to its lexicographic successor.
//
//   p  Pointer to the first of n bytes to permute.
//   n  Number of elements; values <= 1 leave the range untouched.
//
// Repeated values are handled. When the range is already the last
// (non-increasing) ordering it wraps around to the first (non-decreasing) one.
//
// Both reversals are performed in place by swapping from the ends inward, so
// no fixed-size scratch buffer is involved and n is not limited to 16.
void next_perm_manual(uint8_t* p, int n) {
    // Walk left from the end to the start of the longest non-increasing suffix.
    int i = n - 1;
    while (i > 0 && p[i - 1] >= p[i]) i--;

    // First index of the range to reverse: the whole array when no pivot
    // exists (wrap-around), otherwise just the suffix p[i..n).
    int lo = 0;

    if (i > 0) {
        // p[i - 1] is the pivot. Swap it with the rightmost element that is
        // strictly greater; the suffix stays non-increasing afterwards.
        int j = n - 1;
        while (p[j] <= p[i - 1]) j--;
        std::swap(p[i - 1], p[j]);
        lo = i;
    }

    // Reversing a non-increasing range yields its smallest ordering.
    for (int hi = n - 1; lo < hi; ++lo, --hi) {
        std::swap(p[lo], p[hi]);
    }
}
4250
51+ // ── Precomputation ───────────────────────────────────────────────────
4352void precompute_only_flat_lut_N5 () {
53+ // Declared as 16 bytes to ensure SIMD boundary safety
4454 alignas (16 ) uint8_t P[16 ];
4555 for (int i = 0 ; i < TAIL_DEPTH ; ++i) P[i] = i;
4656
@@ -57,7 +67,7 @@ void precompute_only_flat_lut_N5() {
5767 }
5868}
5969
60- // ── 2. Accelerated engine ──────────────────────────────────────────
70+ // ── 2. Accelerated Engine ────────────────────────────────────────────
6171unsigned long long benchmark_accelerated (int N) {
6272 std::vector<int > D (N);
6373 for (int i = 0 ; i < N; ++i) D[i] = i;
@@ -89,35 +99,57 @@ unsigned long long benchmark_accelerated(int N) {
8999 return total_count;
90100}
91101
92- // ── 3. Main driver: Output formatted for AWK ─────────────────────────
102+ // ── 3. Main Driver ────────────────────────── ─────────────────────────
93103int main (int argc, char * argv[]) {
94104 if (argc < 2 ) return 1 ;
95105 int N = std::atoi (argv[1 ]);
96106
97107 precompute_only_flat_lut_N5 ();
98108
99- // Standard baseline
109+ // --- Benchmark 1: Standard Method ---
110+ // Creating a real permutation loop to prevent GCC from eliminating
111+ // the code block via Dead Code Elimination under -O3.
112+ std::vector<int > V (N);
113+ for (int i = 0 ; i < N; ++i) V[i] = i;
114+
100115 auto s1 = std::chrono::high_resolution_clock::now ();
101- unsigned long long c1 = 1 ; for (int i=1 ; i<=N; ++i) c1 *= i;
116+ unsigned long long c1 = 0 ;
117+ do {
118+ c1++;
119+ } while (std::next_permutation (V.begin (), V.end ()));
102120 auto e1 = std::chrono::high_resolution_clock::now ();
121+
103122 double d1 = std::chrono::duration<double >(e1 - s1).count ();
104- if (d1 < 1e-9 ) d1 = 1e-9 ;
123+ if (d1 < 1e-9 ) d1 = 1e-9 ; // Prevent division by zero
105124
106- // Accelerated run
125+ // --- Benchmark 2: Accelerated Method ---
107126 auto s2 = std::chrono::high_resolution_clock::now ();
108127 unsigned long long c2 = benchmark_accelerated (N);
109128 auto e2 = std::chrono::high_resolution_clock::now ();
129+
110130 double d2 = std::chrono::duration<double >(e2 - s2).count ();
111- if (d2 < 1e-9 ) d2 = 1e-9 ;
131+ if (d2 < 1e-9 ) d2 = 1e-9 ;
132+
133+ // --- Sanity Check ---
134+ // This check guarantees the compiler MUST evaluate c1 and c2,
135+ // further enforcing that the loops actually run.
136+ if (c1 != c2) {
137+ std::cerr << " Error: Count mismatch! Std: " << c1 << " Acc: " << c2 << std::endl;
138+ return 1 ;
139+ }
140+
141+ // --- Output formatting for AWK script ---
142+ // Target columns: N | Std(s) | Acc(s) | Std_ns/perm | Acc_ns/perm | Speedup
143+ double ns_std = (d1 * 1e9 ) / c1;
144+ double ns_acc = (d2 * 1e9 ) / c2;
145+ double speedup = d1 / d2;
112146
113- // Use space-separated values for perfect AWK parsing
114- // Format: N Std(s) Acc(s) Std_ns/perm Acc_ns/perm Speedup
115147 std::cout << N << " "
116148 << std::fixed << std::setprecision (6 ) << d1 << " "
117149 << d2 << " "
118- << (d1 * 1e9 ) / c1 << " "
119- << (d2 * 1e9 ) / c2 << " "
120- << d1/d2 << std::endl;
150+ << ns_std << " "
151+ << ns_acc << " "
152+ << speedup << std::endl;
121153
122154 return 0 ;
123155}
0 commit comments