@@ -21,19 +21,21 @@ constexpr int TAIL_DEPTH = 5;
2121constexpr int FLAT_STEPS = 119 ;
2222constexpr int XMM_LANES = 16 ;
2323
24- // Align to 16 bytes for SIMD operations
2524alignas (16 ) uint8_t flat_lut_N5[FLAT_STEPS ][XMM_LANES ];
2625
2726// ── 1. Precompute: SIMD blind-shuffle masks ───────────────────────────
2827void precompute_only_flat_lut_N5 () {
29- std::array<uint8_t , TAIL_DEPTH > P;
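28+ // Fix: grow the array to 16 elements to satisfy the alignment requirement of the SIMD optimization,
29+ // and to avoid the overflow warning caused by the compiler optimizing std::next_permutation on a too-small container.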
30+ std::array<uint8_t , 16 > P;
3031 for (int i = 0 ; i < TAIL_DEPTH ; ++i) P[i] = i;
3132
3233 for (int step = 0 ; step < FLAT_STEPS ; ++step) {
33- std::array<uint8_t , TAIL_DEPTH > M;
34+ std::array<uint8_t , 16 > M;
3435 for (int j = 0 ; j < TAIL_DEPTH ; ++j) M[P[j]] = j;
3536
36- std::next_permutation (P.begin (), P.end ());
37+ // Permute only the range that is actually in use, [0, TAIL_DEPTH).
38+ std::next_permutation (P.begin (), P.begin () + TAIL_DEPTH );
3739
3840 std::memset (flat_lut_N5[step], 0 , XMM_LANES );
3941 for (int i = 0 ; i < TAIL_DEPTH ; ++i) {
@@ -42,16 +44,13 @@ void precompute_only_flat_lut_N5() {
4244 }
4345}
4446
45- // ── 2. Accelerated engine: SIMD blind ops + next_permutation boundary skip ──
47+ // ── 2. Accelerated engine ──────────────────────────────────────── ──
4648unsigned long long benchmark_accelerated (int N) {
4749 std::vector<int > D (N);
4850 for (int i = 0 ; i < N; ++i) D[i] = i;
4951
50- // Increased buffer size to 32 bytes to safely hold 20 bytes (5 ints)
51- // while maintaining 16-byte alignment for SIMD operations.
5252 alignas (16 ) uint8_t buffer[32 ] = {0 };
5353
54- // Copy the last TAIL_DEPTH elements safely
5554 std::memcpy (buffer, &D[N - TAIL_DEPTH ], TAIL_DEPTH * sizeof (int ));
5655 __m128i p_reg = _mm_load_si128 ((__m128i*)buffer);
5756
@@ -66,7 +65,6 @@ unsigned long long benchmark_accelerated(int N) {
6665 }
6766 total_count += FLAT_STEPS ;
6867
69- // Store back to aligned buffer
7068 _mm_store_si128 ((__m128i*)buffer, p_reg);
7169 std::memcpy (&D[N - TAIL_DEPTH ], buffer, TAIL_DEPTH * sizeof (int ));
7270
@@ -79,37 +77,19 @@ unsigned long long benchmark_accelerated(int N) {
7977 return total_count;
8078}
8179
82- // ── 3. Baseline benchmark ────────────────────────────────────────────
83- unsigned long long benchmark_std (int N) {
84- std::vector<int > P (N);
85- for (int i = 0 ; i < N; ++i) P[i] = i;
86- unsigned long long count = 1 ;
87- while (std::next_permutation (P.begin (), P.end ())) count++;
88- return count;
89- }
90-
91- // ── 4. Main driver ───────────────────────────────────────────────────
80+ // ── 3. Main driver ───────────────────────────────────────────────────
9281int main (int argc, char * argv[]) {
9382 if (argc < 2 ) return 1 ;
9483 int N = std::atoi (argv[1 ]);
9584
9685 precompute_only_flat_lut_N5 ();
9786
98- auto s1 = std::chrono::high_resolution_clock::now ();
99- unsigned long long c1 = benchmark_std (N);
100- auto e1 = std::chrono::high_resolution_clock::now ();
101-
10287 auto s2 = std::chrono::high_resolution_clock::now ();
10388 unsigned long long c2 = benchmark_accelerated (N);
10489 auto e2 = std::chrono::high_resolution_clock::now ();
10590
106- double d1 = std::chrono::duration<double >(e1 - s1).count ();
10791 double d2 = std::chrono::duration<double >(e2 - s2).count ();
108-
109- // Output formatted row: N, Std(s), Acc(s), Std_ns/perm, Acc_ns/perm, Speedup
110- std::cout << N << " " << d1 << " " << d2 << " "
111- << (d1 * 1e9 ) / c1 << " " << (d2 * 1e9 ) / c2 << " "
112- << d1/d2 << " x" << std::endl;
92+ std::cout << N << " Acc(s): " << d2 << " | Total: " << c2 << std::endl;
11393
11494 return 0 ;
11595}
0 commit comments