Reduce compile times by dividing mmf into f16, bf16 and f32 variants

am17an · am17an · commit 2bb1567a5bbd · 2025-09-08T22:34:46.000+08:00
diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -44,6 +44,8 @@ if (CUDAToolkit_FOUND)
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
     file(GLOB   SRCS "template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
+    file(GLOB   SRCS "template-instances/mmf*.cu") # collect the per-type mmf instantiation sources (mmf-instance-*.cu)
+    list(APPEND GGML_SOURCES_CUDA ${SRCS}) # add them to the CUDA source list, same as the mmq instances above
 
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB   SRCS "template-instances/fattn-vec*.cu")
diff --git a/ggml/src/ggml-cuda/mmf.cuh b/ggml/src/ggml-cuda/mmf.cuh
@@ -3,3 +3,16 @@
 void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
 
 bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols);
+
+template <ggml_type type> // type: the src0 tensor type this instantiation is declared for
+void mul_mat_f_case(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == type); // src0 must have exactly the type named by the template argument
+    ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst); // forward all arguments unchanged to the generic entry point
+}
+
+#define DECL_MMF_CASE(type)                                                        \
+    template void mul_mat_f_case<type>(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst)
+
+extern DECL_MMF_CASE(GGML_TYPE_F32); // explicit instantiation declaration; defined in template-instances/mmf-instance-f32.cu
+extern DECL_MMF_CASE(GGML_TYPE_F16); // explicit instantiation declaration; defined in template-instances/mmf-instance-f16.cu
+extern DECL_MMF_CASE(GGML_TYPE_BF16); // explicit instantiation declaration; defined in template-instances/mmf-instance-bf16.cu
diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -34,6 +34,17 @@
 DECL_MMQ_CASE({type});
 """
 
+TYPES_MMF = [  # src0 types for which an mmf instance file is generated (one file per entry)
+    "GGML_TYPE_F32", "GGML_TYPE_F16", "GGML_TYPE_BF16"
+]
+
+SOURCE_MMF = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmf.cuh"
+
+DECL_MMF_CASE({type});
+"""  # template for each generated mmf file; {type} is filled in via str.format with an entry of TYPES_MMF
+
 
 def get_short_name(long_quant_name):
     return long_quant_name.replace("GGML_TYPE_", "").lower()
@@ -76,3 +87,7 @@ def get_head_sizes(type_k, type_v):
 for type in TYPES_MMQ:
     with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
         f.write(SOURCE_MMQ.format(type=type))
+
+for type in TYPES_MMF:  # writes mmf-instance-f32.cu, mmf-instance-f16.cu and mmf-instance-bf16.cu
+    with open(f"mmf-instance-{get_short_name(type)}.cu", "w") as f:  # relative path: created in the current working directory
+        f.write(SOURCE_MMF.format(type=type))  # substitute the full GGML_TYPE_* name into the template
diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-bf16.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-bf16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmf.cuh"
+
+DECL_MMF_CASE(GGML_TYPE_BF16);
diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-f16.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-f16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmf.cuh"
+
+DECL_MMF_CASE(GGML_TYPE_F16);
diff --git a/ggml/src/ggml-cuda/template-instances/mmf-instance-f32.cu b/ggml/src/ggml-cuda/template-instances/mmf-instance-f32.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../mmf.cuh"
+
+DECL_MMF_CASE(GGML_TYPE_F32);