Commit 05a3019

Revert "try fix fattn again, porting some older code. the cc detection is not working well, so its hacky"
This reverts commit 7b04191.
1 parent d20a184

1 file changed: ggml/src/ggml-cuda/fattn.cu (2 additions, 8 deletions)

@@ -417,12 +417,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     const int warp_size = ggml_cuda_info().devices[device].warp_size;
     const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
 
-#if defined(GGML_HIP_ROCWMMA_FATTN)
-    if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) { //kcpp: fix for rocwmma
-        return BEST_FATTN_KERNEL_WMMA_F16;
-    }
-#endif // defined(GGML_HIP_ROCWMMA_FATTN)
-
     switch (K->ne[0]) {
         case 64:
         case 128:
@@ -539,8 +533,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         return BEST_FATTN_KERNEL_WMMA_F16;
     }
 
-    //kcpp: always force WMMA for Turing and Volta if above check fails, fix "FlashAttention without tensor cores only supports head sizes 64 and 128."
-    if (cc == GGML_CUDA_CC_TURING || cc == GGML_CUDA_CC_VOLTA) {
+    //kcpp: always force WMMA for older gpus, fix issues like "FlashAttention without tensor cores only supports head sizes 64 and 128."
+    if (ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING || cc == GGML_CUDA_CC_TURING) {
         return BEST_FATTN_KERNEL_WMMA_F16;
     }
 
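For context on the restored condition: it forces the WMMA F16 kernel either when the device itself is Turing, or when the binary was not compiled for any architecture newer than Turing, rather than matching only the exact Turing/Volta values as the reverted change did. Below is a minimal standalone sketch of that selection logic. It is not the real implementation: the numeric compute-capability encodings and the highest_compiled_arch helper are simplified, assumed stand-ins for the actual definitions in ggml-cuda's common headers.

// Hypothetical sketch only; constants and helper mimic ggml-cuda's, values assumed.
#include <algorithm>
#include <cstdio>
#include <vector>

static const int CC_VOLTA  = 700; // assumed encoding: major*100 + minor*10
static const int CC_TURING = 750;
static const int CC_AMPERE = 800;

// Stand-in for ggml_cuda_highest_compiled_arch: the highest architecture in the
// compiled set that does not exceed the device's compute capability cc.
static int highest_compiled_arch(const std::vector<int> & compiled, int cc) {
    int best = 0;
    for (int arch : compiled) {
        if (arch <= cc) {
            best = std::max(best, arch);
        }
    }
    return best;
}

// The restored check: fall back to the WMMA F16 kernel when nothing newer than
// Turing was compiled for this device, or when the device is exactly Turing.
static bool force_wmma_f16(const std::vector<int> & compiled, int cc) {
    return highest_compiled_arch(compiled, cc) <= CC_TURING || cc == CC_TURING;
}

int main() {
    const std::vector<int> compiled = {CC_VOLTA, CC_TURING, CC_AMPERE};
    printf("Volta  (700) -> %d\n", force_wmma_f16(compiled, CC_VOLTA));  // 1: WMMA fallback
    printf("Turing (750) -> %d\n", force_wmma_f16(compiled, CC_TURING)); // 1: WMMA fallback
    printf("Ampere (800) -> %d\n", force_wmma_f16(compiled, CC_AMPERE)); // 0: newer kernels usable
    return 0;
}

Compared with the reverted exact-match test (cc == GGML_CUDA_CC_TURING || cc == GGML_CUDA_CC_VOLTA), the restored condition also covers any device whose highest compiled architecture sits at or below Turing, which is what the "older gpus" comment in the diff refers to.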