fix: HIP/ROCm compatibility — check cudaMemcpyToSymbol errors, guard D>=576 MMA

terrysimons · terrysimons · commit d17f1dc93a43 · 2026-03-31T21:50:49.000-07:00
Add CUDA_CHECK() to all cudaMemcpyToSymbol/cudaMemcpyFromSymbol calls
(and to the cudaDeviceSynchronize() that precedes activation) in the
InnerQ calibration path. On HIP, unchecked errors from these calls are
sticky and poison the runtime, causing subsequent API calls to fail
with 'no ROCm-capable device is detected'.

Also guard the D>=576 MMA flash attention dispatch and kernel selection
with #ifndef GGML_USE_HIP, matching the existing D>=576 tile exclusion
(these kernels exceed HIP's shared/local memory limits).

Tested on: ROCm 6.4.4, gfx1151 (AMD Ryzen AI Max+ 395 / Strix Halo)
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
@@ -135,6 +135,8 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
             GGML_ASSERT(V->ne[0] == 256);
             ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
             break;
+#ifndef GGML_USE_HIP
+        // D>=576 MMA kernels may exceed HIP shared memory limits
         case 576: {
             // For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
             GGML_ASSERT(V->ne[0] == 512);
@@ -202,6 +204,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
                 ggml_cuda_flash_attn_ext_mma_f16_case<640, 512, 2, 16>(ctx, dst);
             }
         } break;
+#endif // GGML_USE_HIP
         default:
             GGML_ABORT("fatal error");
             break;
@@ -384,6 +387,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                 return BEST_FATTN_KERNEL_NONE;
             }
             break;
+#ifndef GGML_USE_HIP
+        // D>=576 kernels exceed HIP shared memory / local memory limits
         case 576:
         case 640:
             if (V->ne[0] != 512) {
@@ -393,6 +398,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
                 return BEST_FATTN_KERNEL_NONE;
             }
             break;
+#endif
         default:
             return BEST_FATTN_KERNEL_NONE;
     }
diff --git a/ggml/src/ggml-cuda/turbo-quant.cuh b/ggml/src/ggml-cuda/turbo-quant.cuh
@@ -176,10 +176,10 @@ static void turbo_innerq_init(void) {
     // Zero accumulators and set calibrating flag on device
     float zeros[INNERQ_MAX_CHANNELS] = {0};
     int zero = 0, one = 1;
-    cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros));
-    cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int)));
 
     GGML_LOG_INFO("%s: InnerQ calibration started (target=%d tokens, strength=%.2f)\n",
                    __func__, innerq_target_tokens, innerq_strength);
@@ -190,14 +190,14 @@ static void turbo_innerq_finalize(int group_size) {
     // Read accumulators from device
     float sq_accum[INNERQ_MAX_CHANNELS];
     int count = 0;
-    cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float));
-    cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
+    CUDA_CHECK(cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
 
     if (count <= 0) {
         GGML_LOG_WARN("%s: InnerQ calibration got 0 tokens, disabling\n", __func__);
         innerq_enabled = 0;
         int zero = 0;
-        cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         return;
     }
 
@@ -231,17 +231,17 @@ static void turbo_innerq_finalize(int group_size) {
                        __func__, max_ratio);
         innerq_enabled = 0;
         int zero = 0;
-        cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+        CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         return;
     }
 
     // Stop calibrating, upload scales, activate
     int zero = 0, one = 1;
-    cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
-    cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float));
-    cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float));
-    cudaDeviceSynchronize();  // ensure scales are visible before activating
-    cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float)));
+    CUDA_CHECK(cudaDeviceSynchronize());  // ensure scales are visible before activating
+    CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int)));
 
     innerq_enabled = 2;  // active
 
@@ -272,15 +272,15 @@ static void turbo_innerq_check_finalize(int group_size, int64_t ne00) {
                            __func__, (long long)ne00, group_size);
             innerq_enabled = 0;
             int zero = 0;
-            cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
+            CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
         }
         return;
     }
 
     // Check if calibration is complete
     if (innerq_enabled == 1) {
         int count = 0;
-        cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
+        CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
         if (count >= innerq_target_tokens) {
             turbo_innerq_finalize(group_size);
         }