Skip to content

Commit d17f1dc

Browse files
committed
fix: HIP/ROCm compatibility — check cudaMemcpyToSymbol errors, guard D>=576 MMA
Add CUDA_CHECK() to all cudaMemcpyToSymbol/cudaMemcpyFromSymbol calls in the InnerQ calibration path. On HIP, unchecked errors from these calls are sticky and poison the runtime, causing subsequent API calls to fail with 'no ROCm-capable device is detected'. Also guard the D>=576 MMA flash attention dispatch and kernel selection with #ifndef GGML_USE_HIP, matching the existing D>=576 tile exclusion (these kernels exceed HIP's shared/local memory limits). Tested on: ROCm 6.4.4, gfx1151 (AMD Ryzen AI Max+ 395 / Strix Halo)
1 parent 8ad0f00 commit d17f1dc

2 files changed

Lines changed: 21 additions & 15 deletions

File tree

ggml/src/ggml-cuda/fattn.cu

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
135135
GGML_ASSERT(V->ne[0] == 256);
136136
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
137137
break;
138+
#ifndef GGML_USE_HIP
139+
// D>=576 MMA kernels may exceed HIP shared memory limits
138140
case 576: {
139141
// For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
140142
GGML_ASSERT(V->ne[0] == 512);
@@ -202,6 +204,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
202204
ggml_cuda_flash_attn_ext_mma_f16_case<640, 512, 2, 16>(ctx, dst);
203205
}
204206
} break;
207+
#endif // GGML_USE_HIP
205208
default:
206209
GGML_ABORT("fatal error");
207210
break;
@@ -384,6 +387,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
384387
return BEST_FATTN_KERNEL_NONE;
385388
}
386389
break;
390+
#ifndef GGML_USE_HIP
391+
// D>=576 kernels exceed HIP shared memory / local memory limits
387392
case 576:
388393
case 640:
389394
if (V->ne[0] != 512) {
@@ -393,6 +398,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
393398
return BEST_FATTN_KERNEL_NONE;
394399
}
395400
break;
401+
#endif
396402
default:
397403
return BEST_FATTN_KERNEL_NONE;
398404
}

ggml/src/ggml-cuda/turbo-quant.cuh

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -176,10 +176,10 @@ static void turbo_innerq_init(void) {
176176
// Zero accumulators and set calibrating flag on device
177177
float zeros[INNERQ_MAX_CHANNELS] = {0};
178178
int zero = 0, one = 1;
179-
cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros));
180-
cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int));
181-
cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int));
182-
cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int));
179+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_sq_accum, zeros, sizeof(zeros)));
180+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_count, &zero, sizeof(int)));
181+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &zero, sizeof(int)));
182+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &one, sizeof(int)));
183183

184184
GGML_LOG_INFO("%s: InnerQ calibration started (target=%d tokens, strength=%.2f)\n",
185185
__func__, innerq_target_tokens, innerq_strength);
@@ -190,14 +190,14 @@ static void turbo_innerq_finalize(int group_size) {
190190
// Read accumulators from device
191191
float sq_accum[INNERQ_MAX_CHANNELS];
192192
int count = 0;
193-
cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float));
194-
cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
193+
CUDA_CHECK(cudaMemcpyFromSymbol(sq_accum, d_innerq_sq_accum, group_size * sizeof(float)));
194+
CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
195195

196196
if (count <= 0) {
197197
GGML_LOG_WARN("%s: InnerQ calibration got 0 tokens, disabling\n", __func__);
198198
innerq_enabled = 0;
199199
int zero = 0;
200-
cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
200+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
201201
return;
202202
}
203203

@@ -231,17 +231,17 @@ static void turbo_innerq_finalize(int group_size) {
231231
__func__, max_ratio);
232232
innerq_enabled = 0;
233233
int zero = 0;
234-
cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
234+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
235235
return;
236236
}
237237

238238
// Stop calibrating, upload scales, activate
239239
int zero = 0, one = 1;
240-
cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
241-
cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float));
242-
cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float));
243-
cudaDeviceSynchronize(); // ensure scales are visible before activating
244-
cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int));
240+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
241+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale, scale, group_size * sizeof(float)));
242+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_scale_inv, scale_inv, group_size * sizeof(float)));
243+
CUDA_CHECK(cudaDeviceSynchronize()); // ensure scales are visible before activating
244+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_active, &one, sizeof(int)));
245245

246246
innerq_enabled = 2; // active
247247

@@ -272,15 +272,15 @@ static void turbo_innerq_check_finalize(int group_size, int64_t ne00) {
272272
__func__, (long long)ne00, group_size);
273273
innerq_enabled = 0;
274274
int zero = 0;
275-
cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int));
275+
CUDA_CHECK(cudaMemcpyToSymbol(d_innerq_calibrating, &zero, sizeof(int)));
276276
}
277277
return;
278278
}
279279

280280
// Check if calibration is complete
281281
if (innerq_enabled == 1) {
282282
int count = 0;
283-
cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int));
283+
CUDA_CHECK(cudaMemcpyFromSymbol(&count, d_innerq_count, sizeof(int)));
284284
if (count >= innerq_target_tokens) {
285285
turbo_innerq_finalize(group_size);
286286
}

0 commit comments

Comments
 (0)