@@ -417,12 +417,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
     const int warp_size = ggml_cuda_info().devices[device].warp_size;
     const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
 
-#if defined(GGML_HIP_ROCWMMA_FATTN)
-    if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) { // kcpp: fix for rocwmma
-        return BEST_FATTN_KERNEL_WMMA_F16;
-    }
-#endif // defined(GGML_HIP_ROCWMMA_FATTN)
-
     switch (K->ne[0]) {
         case 64:
         case 128:
@@ -539,8 +533,8 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
         return BEST_FATTN_KERNEL_WMMA_F16;
     }
 
-    // kcpp: always force WMMA for Turing and Volta if above check fails, fix "FlashAttention without tensor cores only supports head sizes 64 and 128."
-    if (cc == GGML_CUDA_CC_TURING || cc == GGML_CUDA_CC_VOLTA) {
+    // kcpp: always force WMMA for older GPUs, fix issues like "FlashAttention without tensor cores only supports head sizes 64 and 128."
+    if (ggml_cuda_highest_compiled_arch(cc) <= GGML_CUDA_CC_TURING || cc == GGML_CUDA_CC_TURING) {
         return BEST_FATTN_KERNEL_WMMA_F16;
     }