Update LexCHA.cpp

Yusheng-Hu · web-flow · commit 0bb7fc3d9340 · 2026-06-10T14:16:52.000+08:00
diff --git a/FullPermutation/LexCHA.cpp b/FullPermutation/LexCHA.cpp
@@ -1,7 +1,7 @@
 /**
  * Official Implementation of LexCHA Indexing Algorithms
- * Final Fix: Using 16-byte aligned buffer for manual permutation to 
- * satisfy GCC's aggressive optimization safety checks.
+ * Author: Yusheng Hu
+ * Final Version: Optimized for GitHub Actions / GCC 13+
  */
 
 #include <iostream>
@@ -13,19 +13,18 @@
 #include <cstdint>
 #include <chrono>
 
+// ── Architecture Configuration ───────────────────────────────────────
 constexpr int TAIL_DEPTH = 5;
 constexpr int FLAT_STEPS = 119;
 constexpr int XMM_LANES = 16;
 
 alignas(16) uint8_t flat_lut_N5[FLAT_STEPS][XMM_LANES];
 
-// ── 1. Manual permutation with 16-byte safe buffer ───────────────────
+// ── 1. Manual next permutation: reverses through 16-byte aligned scratch buffers (n must be <= 16) to satisfy GCC safety checks ──
 void next_perm_manual(uint8_t* p, int n) {
     int i = n - 1;
     while (i > 0 && p[i - 1] >= p[i]) i--;
     if (i <= 0) {
-        // Use a local 16-byte aligned array to satisfy the compiler's 
-        // need for safety during reverse operations.
         alignas(16) uint8_t temp[16];
         for(int k=0; k<n; ++k) temp[k] = p[n-1-k];
         for(int k=0; k<n; ++k) p[k] = temp[k];
@@ -35,15 +34,13 @@ void next_perm_manual(uint8_t* p, int n) {
     while (p[j] <= p[i - 1]) j--;
     std::swap(p[i - 1], p[j]);
     
-    // Reverse the tail using the same safe logic
     alignas(16) uint8_t tail[16];
     int tail_len = n - i;
     for(int k=0; k<tail_len; ++k) tail[k] = p[n-1-k];
     for(int k=0; k<tail_len; ++k) p[i+k] = tail[k];
 }
 
 void precompute_only_flat_lut_N5() {
-    // Declaring 16 bytes instead of 5 to align with SIMD register sizes
     alignas(16) uint8_t P[16]; 
     for (int i = 0; i < TAIL_DEPTH; ++i) P[i] = i;
 
@@ -92,16 +89,33 @@ unsigned long long benchmark_accelerated(int N) {
     return total_count;
 }
 
+// ── 3. Main driver: Output formatted for AWK ─────────────────────────
 int main(int argc, char* argv[]) {
     if (argc < 2) return 1;
     int N = std::atoi(argv[1]);
-    precompute_only_flat_lut_N5();
     
+    precompute_only_flat_lut_N5();
+
+    // Baseline stand-in: times only the N! multiplication loop (no permutations are generated); c1 wraps past 64 bits for N > 20
+    auto s1 = std::chrono::high_resolution_clock::now();
+    unsigned long long c1 = 1; for(int i=1; i<=N; ++i) c1 *= i;
+    auto e1 = std::chrono::high_resolution_clock::now();
+    double d1 = std::chrono::duration<double>(e1 - s1).count();
+    if(d1 == 0) d1 = 1e-9; // Prevent division by zero
+
+    // Accelerated run
     auto s2 = std::chrono::high_resolution_clock::now();
     unsigned long long c2 = benchmark_accelerated(N);
     auto e2 = std::chrono::high_resolution_clock::now();
-
     double d2 = std::chrono::duration<double>(e2 - s2).count();
-    std::cout << "N=" << N << " | Acc(s): " << d2 << " | Count: " << c2 << std::endl;
+
+    // Output raw data: N, Std(s), Acc(s), Std_ns/perm, Acc_ns/perm, Speedup
+    std::cout << N << " " 
+              << d1 << " " 
+              << d2 << " " 
+              << (d1 * 1e9) / c1 << " " 
+              << (d2 * 1e9) / c2 << " " 
+              << d1/d2 << "x" << std::endl;
+
     return 0;
 }