
Commit 45f8a06

Merge: ci: fix turbo build + test failures (#66)
ci: fix turbo build + test failures on feature/turboquant-kv-cache
2 parents: 840f3d9 + 7bf419e

3 files changed: 40 additions & 20 deletions

ggml/src/ggml-cpu/ops.cpp

Lines changed: 3 additions & 0 deletions
@@ -5594,6 +5594,9 @@ void ggml_compute_forward_clamp(
         case GGML_TYPE_TQ2_0:
         case GGML_TYPE_TQ3_1S:
         case GGML_TYPE_TQ4_1S:
+        case GGML_TYPE_TURBO2_0:
+        case GGML_TYPE_TURBO3_0:
+        case GGML_TYPE_TURBO4_0:
         case GGML_TYPE_IQ2_XXS:
         case GGML_TYPE_IQ2_XS:
         case GGML_TYPE_IQ3_XXS:
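
Why a clamp kernel needs to know about new quantization types: ggml dispatches each CPU op through a switch over ggml_type, and, assuming this function keeps upstream ggml's shape, the switch is exhaustive with no default arm, so any enum value missing from the list trips the compiler's switch-exhaustiveness warning, which CI builds promote to an error. A minimal sketch of that pattern, under that assumption rather than copied from this repo:

    // Sketch (assumed shape, based on upstream ggml_compute_forward_clamp).
    // clamp is only implemented for float types; quantized types share an
    // abort arm. There is no default case, so -Wswitch flags any unlisted
    // ggml_type value -- which is why adding the TURBO*_0 enum values to
    // ggml_type broke the build until this hunk listed them.
    switch (src0->type) {
        case GGML_TYPE_F32: {
            ggml_compute_forward_clamp_f32(params, dst);
        } break;
        case GGML_TYPE_TURBO2_0:
        case GGML_TYPE_TURBO3_0:
        case GGML_TYPE_TURBO4_0: // ... and every other quantized type ...
        {
            GGML_ABORT("fatal error");
        }
    }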

ggml/src/ggml-cuda/turbo-quant.cuh

Lines changed: 15 additions & 15 deletions
@@ -176,10 +176,10 @@ static void turbo_innerq_init(void) {
     // Zero accumulators and set calibrating flag on device
     float zeros[INNERQ_MAX_CHANNELS] = {0};
     int zero = 0, one = 1;
-    cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros));
-    cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int)));
 
     GGML_LOG_INFO("%s: InnerQ calibration started (target=%d tokens, strength=%.2f)\n",
         __func__, innerq_target_tokens, innerq_strength);
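
The change here is mechanical but worth spelling out: cudaMemcpyToSymbol returns a cudaError_t that was previously discarded, and wrapping each call in CUDA_CHECK makes failures fail fast instead of surfacing later as corrupted calibration state. ggml defines its own CUDA_CHECK in ggml-cuda/common.cuh; the following is only a generic sketch of that macro style, not its exact body (the name is suffixed to make that clear):

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Fail-fast wrapper for CUDA runtime calls (illustrative; ggml's real
    // macro routes the error through its own handler instead of abort()).
    #define CUDA_CHECK_SKETCH(call)                                        \
        do {                                                                \
            const cudaError_t err_ = (call);                                \
            if (err_ != cudaSuccess) {                                      \
                fprintf(stderr, "CUDA error %s at %s:%d: %s\n",             \
                        cudaGetErrorName(err_), __FILE__, __LINE__,         \
                        cudaGetErrorString(err_));                          \
                abort();                                                    \
            }                                                               \
        } while (0)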
@@ -190,14 +190,14 @@ static void turbo_innerq_finalize(int group_size) {
     // Read accumulators from device
     float sq_accum[INNERQ_MAX_CHANNELS];
     int count = 0;
-    cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float));
-    cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
+    CUDA_CHECK(cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
 
     if (count <= 0) {
         GGML_LOG_WARN("%s: InnerQ calibration got 0 tokens, disabling\n", __func__);
         innerq_enabled = 0;
         int zero = 0;
-        cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         return;
     }
 
@@ -231,17 +231,17 @@ static void turbo_innerq_finalize(int group_size) {
             __func__, max_ratio);
         innerq_enabled = 0;
         int zero = 0;
-        cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         return;
     }
 
     // Stop calibrating, upload scales, activate
     int zero = 0, one = 1;
-    cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float));
-    cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float));
-    cudaDeviceSynchronize(); // ensure scales are visible before activating
-    cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float)));
+    CUDA_CHECK(cudaDeviceSynchronize()); // ensure scales are visible before activating
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int)));
 
     innerq_enabled = 2; // active
 
@@ -272,15 +272,15 @@ static void turbo_innerq_check_finalize(int group_size, int64_t ne00) {
                 __func__, (long long)ne00, group_size);
             innerq_enabled = 0;
             int zero = 0;
-            cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+            CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         }
         return;
     }
 
     // Check if calibration is complete
     if (innerq_enabled == 1) {
         int count = 0;
-        cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
+        CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
         if (count >= innerq_target_tokens) {
             turbo_innerq_finalize(group_size);
         }
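
Read together, the four hunks above trace InnerQ's calibration lifecycle: innerq_enabled is 1 while the device accumulates per-channel squared sums, becomes 2 once the scales are uploaded and d_innerq_active is set, and drops to 0 whenever calibration aborts (zero tokens, an excessive max_ratio, or a group-size mismatch). A summary sketch with invented enum names; only the numeric values appear in the diff:

    // Host-side calibration states implied by the diff (names hypothetical):
    enum innerq_state {
        INNERQ_DISABLED    = 0, // aborted; d_innerq_calibrating cleared on device
        INNERQ_CALIBRATING = 1, // device accumulates sq sums until the token target
        INNERQ_ACTIVE      = 2, // scales uploaded; d_innerq_active set after a sync
    };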

tests/test-quantize-fns.cpp

Lines changed: 22 additions & 5 deletions
@@ -48,7 +48,10 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 
 // Total quantization error on test data
 static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(2*test_size);
+    // Buffer must be large enough for the row's byte size. For types whose
+    // vec_dot_type is GGML_TYPE_F32 (e.g. turbo quants), from_float writes
+    // test_size*sizeof(float) bytes, which exceeds the legacy 2*test_size sizing.
+    std::vector<uint8_t> tmp_q(std::max<size_t>(2*test_size, test_size * sizeof(float)));
     std::vector<float> tmp_out(test_size);
 
     qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
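
The arithmetic the new comment alludes to, with hypothetical numbers for illustration:

    // For test_size = 1024 elements:
    //   legacy bound:        2 * 1024             = 2048 bytes
    //   F32 "quantized" row: 1024 * sizeof(float) = 4096 bytes  -> 2 KiB overflow
    // std::max of the two bounds covers both regimes; a fully type-exact
    // alternative would be ggml_row_size(type, test_size), which returns the
    // precise byte count per row for any ggml type.
    std::vector<uint8_t> tmp_q(std::max<size_t>(2*test_size, test_size * sizeof(float)));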
@@ -58,7 +61,7 @@ static float total_quantization_error(const ggml_type_traits * qfns, const ggml_
 
 // Total quantization error on test data
 static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<uint8_t> tmp_q(std::max<size_t>(2*test_size, test_size * sizeof(float)));
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
@@ -84,8 +87,10 @@ static float dot_product(const float * a1, const float * a2, size_t test_size) {
 static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data1, const float * test_data2) {
     GGML_UNUSED(qfns);
 
-    std::vector<uint8_t> tmp_q1(2*test_size);
-    std::vector<uint8_t> tmp_q2(2*test_size);
+    // For turbo quants vec_dot_type is GGML_TYPE_F32, so vdot->from_float writes
+    // test_size*sizeof(float) bytes. Size buffers accordingly.
+    std::vector<uint8_t> tmp_q1(std::max<size_t>(2*test_size, test_size * sizeof(float)));
+    std::vector<uint8_t> tmp_q2(std::max<size_t>(2*test_size, test_size * sizeof(float)));
 
     const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
 
@@ -137,6 +142,16 @@ int main(int argc, char * argv[]) {
             continue;
         }
 
+        // TurboQuant KV-cache types (TURBO2_0/TURBO3_0/TURBO4_0) intentionally keep
+        // their dequantized output in the WHT-rotated domain; the inverse WHT is
+        // applied separately via GGML_OP_TURBO_WHT in the attention graph. They do
+        // not round-trip through float space, so the total/reference/dot-product
+        // error tests in this harness are not applicable.
+        if (type == GGML_TYPE_TURBO2_0 || type == GGML_TYPE_TURBO3_0 || type == GGML_TYPE_TURBO4_0) {
+            printf("Testing %s (skipped: rotated-domain KV quant)\n", ggml_type_name(type));
+            continue;
+        }
+
         const ggml_type ei = (ggml_type)i;
 
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
@@ -152,6 +167,7 @@ int main(int argc, char * argv[]) {
                 type == GGML_TYPE_Q3_K    ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
                 type == GGML_TYPE_IQ3_S   ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
                 type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS :
+                type == GGML_TYPE_TQ3_1S  ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
                 type == GGML_TYPE_NVFP4   ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR;
             failed = !(total_error < max_quantization_error);
             num_failed += failed;
@@ -168,7 +184,8 @@ int main(int argc, char * argv[]) {
 
             const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data());
             const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
-                                            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
+                                            type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S ||
+                                            type == GGML_TYPE_TQ3_1S
                                                 ? MAX_DOT_PRODUCT_ERROR_LOWBIT
                                                 : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0
                                                     ? MAX_DOT_PRODUCT_ERROR_TERNARY
