@@ -637,22 +637,22 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
637637 const __m256 q = _mm256_cvtepi32_ps(MM256_SET_M128I(sum32_1, sum32_0)); \
638638 acc_block = _mm256_add_ps(acc_block, _mm256_mul_ps(_mm256_set1_ps(GGML_CPU_FP16_TO_FP32(y_ptr[(K)].d)), q)); \
639639 }
640- { \
641- const __m256i bit_mask = bytes_from_bits_32 (& x [ib ].qs [0 ]); \
642- const __m128i bit_mask_0 = _mm256_castsi256_si128 (bit_mask ); \
643- const __m128i bit_mask_1 = _mm256_extractf128_si256 (bit_mask , 1 ); \
644- const __m128i qy_0 = _mm_loadu_si128 ((const __m128i * ) & y_ptr [0 ].qs [0 ]); \
645- const __m128i qy_1 = _mm_loadu_si128 ((const __m128i * ) & y_ptr [0 ].qs [16 ]); \
646- const __m128i sign_mask_0 = _mm_cmpeq_epi8 (bit_mask_0 , zero ); \
647- const __m128i sign_mask_1 = _mm_cmpeq_epi8 (bit_mask_1 , zero ); \
648- const __m128i sy_0 = _mm_sub_epi8 (_mm_xor_si128 (qy_0 , sign_mask_0 ), sign_mask_0 ); \
649- const __m128i sy_1 = _mm_sub_epi8 (_mm_xor_si128 (qy_1 , sign_mask_1 ), sign_mask_1 ); \
650- const __m128i sum16_0 = _mm_maddubs_epi16 (ones_8 , sy_0 ); \
651- const __m128i sum16_1 = _mm_maddubs_epi16 (ones_8 , sy_1 ); \
652- const __m128i sum32_0 = _mm_madd_epi16 (sum16_0 , ones_16 ); \
653- const __m128i sum32_1 = _mm_madd_epi16 (sum16_1 , ones_16 ); \
654- const __m256 q = _mm256_cvtepi32_ps (MM256_SET_M128I (sum32_1 , sum32_0 )); \
655- acc_block = _mm256_mul_ps (_mm256_set1_ps (GGML_CPU_FP16_TO_FP32 (y_ptr [0 ].d )), q ); \
640+ {
641+ const __m256i bit_mask = bytes_from_bits_32 (& x [ib ].qs [0 ]);
642+ const __m128i bit_mask_0 = _mm256_castsi256_si128 (bit_mask );
643+ const __m128i bit_mask_1 = _mm256_extractf128_si256 (bit_mask , 1 );
644+ const __m128i qy_0 = _mm_loadu_si128 ((const __m128i * ) & y_ptr [0 ].qs [0 ]);
645+ const __m128i qy_1 = _mm_loadu_si128 ((const __m128i * ) & y_ptr [0 ].qs [16 ]);
646+ const __m128i sign_mask_0 = _mm_cmpeq_epi8 (bit_mask_0 , zero );
647+ const __m128i sign_mask_1 = _mm_cmpeq_epi8 (bit_mask_1 , zero );
648+ const __m128i sy_0 = _mm_sub_epi8 (_mm_xor_si128 (qy_0 , sign_mask_0 ), sign_mask_0 );
649+ const __m128i sy_1 = _mm_sub_epi8 (_mm_xor_si128 (qy_1 , sign_mask_1 ), sign_mask_1 );
650+ const __m128i sum16_0 = _mm_maddubs_epi16 (ones_8 , sy_0 );
651+ const __m128i sum16_1 = _mm_maddubs_epi16 (ones_8 , sy_1 );
652+ const __m128i sum32_0 = _mm_madd_epi16 (sum16_0 , ones_16 );
653+ const __m128i sum32_1 = _mm_madd_epi16 (sum16_1 , ones_16 );
654+ const __m256 q = _mm256_cvtepi32_ps (MM256_SET_M128I (sum32_1 , sum32_0 ));
655+ acc_block = _mm256_mul_ps (_mm256_set1_ps (GGML_CPU_FP16_TO_FP32 (y_ptr [0 ].d )), q );
656656 }
657657 Q1_AVX_BLOCK (1 )
658658 Q1_AVX_BLOCK (2 )
0 commit comments