@@ -1435,3 +1435,74 @@ void ggml_vec_dot_rq6_1_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
14351435 }
14361436 * s = sumf ;
14371437}
1438+
// ============================ TurboQuant WHT wrappers (turbo3_0, turbo2_0, turbo4_0)
1440+
1441+ void quantize_row_turbo3_0 (const float * GGML_RESTRICT x , void * GGML_RESTRICT y , int64_t k ) {
1442+ quantize_row_turbo3_0_ref (x , y , k );
1443+ }
1444+
1445+ void quantize_row_turbo2_0 (const float * GGML_RESTRICT x , void * GGML_RESTRICT y , int64_t k ) {
1446+ quantize_row_turbo2_0_ref (x , y , k );
1447+ }
1448+
1449+ void ggml_vec_dot_turbo3_0_q8_0 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
1450+ GGML_UNUSED (bs ); GGML_UNUSED (bx ); GGML_UNUSED (by ); GGML_UNUSED (nrc );
1451+ assert (nrc == 1 );
1452+ const int nb = n / QK_TURBO3_0 ;
1453+ float sumf = 0.0f ;
1454+ float tmp_x [QK_TURBO3_0 ];
1455+ float tmp_y [QK8_0 ];
1456+ const block_turbo3_0 * GGML_RESTRICT x = (const block_turbo3_0 * )vx ;
1457+ const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 * )vy ;
1458+ for (int i = 0 ; i < nb ; i ++ ) {
1459+ dequantize_row_turbo3_0 (& x [i ], tmp_x , QK_TURBO3_0 );
1460+ dequantize_row_q8_0 (& y [i ], tmp_y , QK8_0 );
1461+ for (int k = 0 ; k < QK8_0 ; k ++ ) {
1462+ sumf += tmp_x [k ] * tmp_y [k ];
1463+ }
1464+ }
1465+ * s = sumf ;
1466+ }
1467+
1468+ void ggml_vec_dot_turbo2_0_q8_0 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
1469+ GGML_UNUSED (bs ); GGML_UNUSED (bx ); GGML_UNUSED (by ); GGML_UNUSED (nrc );
1470+ assert (nrc == 1 );
1471+ const int nb = n / QK_TURBO2_0 ;
1472+ float sumf = 0.0f ;
1473+ float tmp_x [QK_TURBO2_0 ];
1474+ float tmp_y [QK8_0 ];
1475+ const block_turbo2_0 * GGML_RESTRICT x = (const block_turbo2_0 * )vx ;
1476+ const block_q8_0 * GGML_RESTRICT y = (const block_q8_0 * )vy ;
1477+ for (int i = 0 ; i < nb ; i ++ ) {
1478+ dequantize_row_turbo2_0 (& x [i ], tmp_x , QK_TURBO2_0 );
1479+ dequantize_row_q8_0 (& y [i ], tmp_y , QK8_0 );
1480+ for (int k = 0 ; k < QK8_0 ; k ++ ) {
1481+ sumf += tmp_x [k ] * tmp_y [k ];
1482+ }
1483+ }
1484+ * s = sumf ;
1485+ }
1486+
1487+ void quantize_row_turbo4_0 (const float * GGML_RESTRICT x , void * GGML_RESTRICT y , int64_t k ) {
1488+ quantize_row_turbo4_0_ref (x , y , k );
1489+ }
1490+
1491+ void ggml_vec_dot_turbo4_0_q8_0 (int n , float * GGML_RESTRICT s , size_t bs , const void * GGML_RESTRICT vx , size_t bx , const void * GGML_RESTRICT vy , size_t by , int nrc ) {
1492+ GGML_UNUSED (bs ); GGML_UNUSED (bx ); GGML_UNUSED (by ); GGML_UNUSED (nrc );
1493+ assert (nrc == 1 );
1494+ const int nb = n / QK_TURBO4_0 ;
1495+ float sumf = 0.0f ;
1496+ float tmp_x [QK_TURBO4_0 ];
1497+ float tmp_y [QK8_0 ];
1498+ const block_turbo4_0 * GGML_RESTRICT x_b = (const block_turbo4_0 * )vx ;
1499+ const block_q8_0 * GGML_RESTRICT y_b = (const block_q8_0 * )vy ;
1500+ for (int i = 0 ; i < nb ; i ++ ) {
1501+ dequantize_row_turbo4_0 (& x_b [i ], tmp_x , QK_TURBO4_0 );
1502+ dequantize_row_q8_0 (& y_b [i ], tmp_y , QK8_0 );
1503+ for (int k = 0 ; k < QK8_0 ; k ++ ) {
1504+ sumf += tmp_x [k ] * tmp_y [k ];
1505+ }
1506+ }
1507+ * s = sumf ;
1508+ }
0 commit comments