Replace the q1_0_g128 AVX2 dot-product kernel with zcattacz's implementation

pl752 · pl752 · commit 167652c21070 · 2026-04-06T19:33:46.000+05:00
diff --git a/ggml/src/ggml-cpu/arch/x86/quants.c b/ggml/src/ggml-cpu/arch/x86/quants.c
@@ -708,44 +708,45 @@ void ggml_vec_dot_q1_0_g128_q8_0(int n, float * GGML_RESTRICT s, size_t bs, cons
     const block_q8_0 * GGML_RESTRICT y = vy;
 
 #if defined(__AVX2__)
-    // AVX2: expand each 32-bit sign stream to a byte mask, sign-flip qy
-    // directly in the byte domain, then reduce two Q8_0 sub-blocks in
-    // parallel before folding the pair into the outer block sum.
     const __m256i ones_8 = _mm256_set1_epi8(1);
     const __m256i ones_16 = _mm256_set1_epi16(1);
+    const __m256i byte_shuf = _mm256_setr_epi8(
+            0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+            2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3);
+    const __m256i bit_masks = _mm256_setr_epi8(
+            1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128,
+            1, 2, 4, 8, 16, 32, 64, (char) -128, 1, 2, 4, 8, 16, 32, 64, (char) -128);
     const __m256i zero = _mm256_setzero_si256();
     __m256 acc = _mm256_setzero_ps();
 
     for (int ib = 0; ib < nb; ++ib) {
         const float d0 = GGML_CPU_FP16_TO_FP32(x[ib].d);
-        __m256 acc_block_0 = _mm256_setzero_ps();
-        __m256 acc_block_1 = _mm256_setzero_ps();
-
-        for (int k = 0; k < 4; k += 2) {
-            const block_q8_0 * GGML_RESTRICT yb_0 = &y[ib * 4 + k + 0];
-            const block_q8_0 * GGML_RESTRICT yb_1 = &y[ib * 4 + k + 1];
-            const __m256i bit_mask_0 = bytes_from_bits_32(&x[ib].qs[(k + 0) * 4]);
-            const __m256i bit_mask_1 = bytes_from_bits_32(&x[ib].qs[(k + 1) * 4]);
-            const __m256i qy_0 = _mm256_loadu_si256((const __m256i *) yb_0->qs);
-            const __m256i qy_1 = _mm256_loadu_si256((const __m256i *) yb_1->qs);
-            const __m256i sign_mask_0 = _mm256_cmpeq_epi8(bit_mask_0, zero);
-            const __m256i sign_mask_1 = _mm256_cmpeq_epi8(bit_mask_1, zero);
-            const __m256i sy_0 = _mm256_sub_epi8(_mm256_xor_si256(qy_0, sign_mask_0), sign_mask_0);
-            const __m256i sy_1 = _mm256_sub_epi8(_mm256_xor_si256(qy_1, sign_mask_1), sign_mask_1);
-            const __m256i sum16_0 = _mm256_maddubs_epi16(ones_8, sy_0);
-            const __m256i sum16_1 = _mm256_maddubs_epi16(ones_8, sy_1);
-            const __m256i sum32_0 = _mm256_madd_epi16(sum16_0, ones_16);
-            const __m256i sum32_1 = _mm256_madd_epi16(sum16_1, ones_16);
-            const __m256 q_0 = _mm256_cvtepi32_ps(sum32_0);
-            const __m256 q_1 = _mm256_cvtepi32_ps(sum32_1);
-            const __m256 d1_0 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_0->d));
-            const __m256 d1_1 = _mm256_set1_ps(GGML_CPU_FP16_TO_FP32(yb_1->d));
-
-            acc_block_0 = _mm256_fmadd_ps(d1_0, q_0, acc_block_0);
-            acc_block_1 = _mm256_fmadd_ps(d1_1, q_1, acc_block_1);
-        }
+        const uint32_t * GGML_RESTRICT qs32 = (const uint32_t *) x[ib].qs;
+        const block_q8_0 * GGML_RESTRICT y_ptr = &y[ib * 4];
 
-        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), _mm256_add_ps(acc_block_0, acc_block_1), acc);
+        __m256 acc_block;
+        {
+            const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[0].qs);
+            const __m256i sm = _mm256_cmpeq_epi8(
+                    _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[0]), byte_shuf), bit_masks), zero);
+            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm);
+            const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16);
+            acc_block = _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[0].d)), _mm256_cvtepi32_ps(s32));
+        }
+#define Q1_AVX2_BLOCK(K) \
+        { \
+            const __m256i qy = _mm256_loadu_si256((const __m256i *) y_ptr[K].qs); \
+            const __m256i sm = _mm256_cmpeq_epi8( \
+                    _mm256_and_si256(_mm256_shuffle_epi8(_mm256_set1_epi32((int) qs32[K]), byte_shuf), bit_masks), zero); \
+            const __m256i sy = _mm256_sub_epi8(_mm256_xor_si256(qy, sm), sm); \
+            const __m256i s32 = _mm256_madd_epi16(_mm256_maddubs_epi16(ones_8, sy), ones_16); \
+            acc_block = _mm256_fmadd_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[K].d)), _mm256_cvtepi32_ps(s32), acc_block); \
+        }
+        Q1_AVX2_BLOCK(1)
+        Q1_AVX2_BLOCK(2)
+        Q1_AVX2_BLOCK(3)
+#undef Q1_AVX2_BLOCK
+        acc = _mm256_fmadd_ps(_mm256_set1_ps(d0), acc_block, acc);
     }
 
     *s = hsum_float_8(acc);