CUDA: PoC for repacking mxfp4

am17an · am17an · commit 5ab5778ba335 · 2026-04-19T15:20:01.000+08:00
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -27,6 +27,9 @@
 #include "ggml-cuda/im2col.cuh"
 #include "ggml-cuda/mmf.cuh"
 #include "ggml-cuda/mmq.cuh"
+#ifdef GGML_CUDA_MXFP4_REPACK
+#include "ggml-cuda/mxfp4-repack.cuh"
+#endif
 #include "ggml-cuda/mmvf.cuh"
 #include "ggml-cuda/mmvq.cuh"
 #include "ggml-cuda/norm.cuh"
@@ -655,11 +658,51 @@ static void ggml_backend_cuda_buffer_memset_tensor(ggml_backend_buffer_t buffer,
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
+#ifdef GGML_CUDA_MXFP4_REPACK
+// In-place repack of a fully-uploaded MXFP4 tensor into the per-row SoA
+// layout expected by load_tiles_mxfp4_fp4_soa. Copies the current AoS bytes
+// into a stream-ordered scratch buffer, launches the repack kernel writing
+// back over tensor->data, and queues the free. Does not synchronize: every
+// step is ordered on `stream`, so callers sync only if they need completion.
+static void ggml_cuda_mxfp4_repack_tensor_inplace(ggml_tensor * tensor, cudaStream_t stream) {
+    const int64_t ne0   = tensor->ne[0];
+    const int64_t nrow  = ggml_nrows(tensor);
+    const int     B_src = (int) (ne0 / QK_MXFP4);
+    constexpr int blocks_per_iter = MMQ_ITER_K_MXFP4_FP4 / QK_MXFP4;  // 16
+    const int     B_dst = (B_src + blocks_per_iter - 1) / blocks_per_iter * blocks_per_iter;
+    GGML_ASSERT(ne0 % QK_MXFP4 == 0 && nrow == (int64_t) (int) nrow);  // launcher takes int counts
+    const size_t src_bytes = sizeof(block_mxfp4) * (size_t) B_src * (size_t) nrow;
+    if (src_bytes == 0) { return; }  // empty tensor: nothing to stage or repack
+    void * staging = nullptr;
+    CUDA_CHECK(cudaMallocAsync(&staging, src_bytes, stream));
+    CUDA_CHECK(cudaMemcpyAsync(staging, tensor->data, src_bytes,
+                               cudaMemcpyDeviceToDevice, stream));
+    ggml_cuda_mxfp4_repack_soa_launch(tensor->data, staging,
+                                      (int) nrow, B_src, B_dst, stream);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaFreeAsync(staging, stream));
+}
+
+static inline bool ggml_cuda_mxfp4_should_repack(const ggml_tensor * tensor, size_t offset, size_t size) {
+    // True for any write that ends exactly at the end of an MXFP4 tensor with
+    // at least two dims; callers take that to mean the upload is complete.
+    if (tensor->type != GGML_TYPE_MXFP4 || ggml_n_dims(tensor) < 2) {
+        return false;
+    }
+    return offset + size == ggml_nbytes(tensor);
+}
+#endif // GGML_CUDA_MXFP4_REPACK
+
 static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *) buffer->context;
 
     ggml_cuda_set_device(ctx->device);
     CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread));
+#ifdef GGML_CUDA_MXFP4_REPACK
+    if (ggml_cuda_mxfp4_should_repack(tensor, offset, size)) {  // this write reaches the end of an MXFP4 tensor
+        ggml_cuda_mxfp4_repack_tensor_inplace(tensor, cudaStreamPerThread);  // same stream as the copy; the sync below waits for both
+    }
+#endif
     CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread));
 }
 
@@ -782,7 +825,17 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             GGML_ASSERT(tensor->nb[0] == ggml_element_size(tensor));
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+            const size_t pad_bytes_per_row = ggml_row_size(
+                tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+
+            // MXFP4 weights get repacked per-row into an SoA layout with each
+            // row padded to MATRIX_ROW_PADDING elements, so we need padding
+            // space for every row rather than only the tensor tail.
+            if (tensor->type == GGML_TYPE_MXFP4 && ggml_n_dims(tensor) >= 2) {
+                size += pad_bytes_per_row * ggml_nrows(tensor);
+            } else {
+                size += pad_bytes_per_row;
+            }
         }
     }
 
@@ -2959,6 +3012,11 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
 
     CUDA_CHECK(cudaMemcpyAsync((char *) tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream()));
+#ifdef GGML_CUDA_MXFP4_REPACK
+    if (ggml_cuda_mxfp4_should_repack(tensor, offset, size)) {
+        ggml_cuda_mxfp4_repack_tensor_inplace(tensor, cuda_ctx->stream());
+    }
+#endif
 }
 
 static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
@@ -934,6 +934,73 @@ static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restr
     }
 }
 
+// SoA variant of load_tiles_mxfp4_fp4. Source tensor must already be repacked
+// per-row into [qs_0..qs_{B_dst-1} | e_0..e_{B_dst-1}] with
+// B_dst = GGML_PAD(B_src, iter_k/QK_MXFP4). With 16 blocks per iter_k a row
+// (17*B_dst bytes) is a multiple of 16B, so the uint4 qs load is aligned if x is.
+//
+// kbx0 is the flat AoS block index at which this tile starts (it is expected to
+// include sample*stride_sample_x + channel*stride_channel_x + tile_row*stride).
+// stride is the source-side block count per row (B_src). We decompose kbx0
+// into (flat_row_base, kb0_in_row) and advance by local tile row i.
+template <int mmq_y, bool need_check>
+static __device__ __forceinline__ void load_tiles_mxfp4_fp4_soa(const char * __restrict__ x,
+                                                                int * __restrict__ x_tile,
+                                                                const int kbx0,
+                                                                const int i_max,
+                                                                const int stride) {
+    constexpr int nwarps = mmq_get_nwarps_device();
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    int *      x_qs = (int *) x_tile;  // quant ints sit at the start of the tile
+    uint32_t * x_sc = (uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);  // scales start 2*MMQ_TILE_NE_K ints in
+
+    const int txi = threadIdx.x;
+
+    constexpr int iter_k = get_iter_k(GGML_TYPE_MXFP4);
+
+    constexpr int threads_per_row = iter_k / QK_MXFP4;  // 16 on Blackwell
+    constexpr int rows_per_warp   = warp_size / threads_per_row;
+    const int     kbx             = txi % threads_per_row;  // block handled by this thread within the iter_k chunk
+    const int     row_in_warp     = txi / threads_per_row;  // which of the warp's rows this thread serves
+
+    // Derive padded blocks-per-row. threads_per_row equals blocks_per_iter for
+    // MXFP4_FP4, so rounding stride to this multiple matches how the repack
+    // kernel pads.
+    constexpr int blocks_per_iter = threads_per_row;
+    const int B_src     = stride;
+    const int B_dst     = (B_src + blocks_per_iter - 1) / blocks_per_iter * blocks_per_iter;
+    const int row_bytes = 17 * B_dst;  // 16 qs bytes + 1 scale byte per block
+
+    const int flat_row_base = kbx0 / B_src;  // row of the tile start; assumes all row/channel/sample offsets in kbx0 are multiples of B_src
+    const int kb0_in_row    = kbx0 - flat_row_base * B_src;  // remainder: first block of this chunk within the row
+    const int kbx_in_row    = kb0_in_row + kbx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
+        int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
+
+        if constexpr (need_check) {
+            i = min(i, i_max);  // clamp: rows past i_max reload row i_max
+        }
+
+        const uint8_t * row_base = reinterpret_cast<const uint8_t *>(x)
+                                   + (size_t) (flat_row_base + i) * row_bytes;
+        const uint8_t * qs_base  = row_base;
+        const uint8_t * sc_base  = row_base + 16 * B_dst;  // scale bytes follow the B_dst 16-byte qs entries
+
+        const int k0 = kbx * 4;  // 4 ints (16 bytes) of tile per block
+        const uint4 q = reinterpret_cast<const uint4 *>(qs_base)[kbx_in_row];
+        memcpy(x_qs + i * MMQ_MMA_TILE_X_K_FP4 + k0, &q, 16);
+
+        if (kbx % 2 == 0) {  // even threads pack their own scale and the next block's
+            uint32_t e = sc_base[kbx_in_row];
+            e |= ((uint32_t) sc_base[kbx_in_row + 1]) << 8;  // in the padded scale region if kb0_in_row is a multiple of blocks_per_iter (assumed)
+            x_sc[i * MMQ_MMA_TILE_X_K_FP4 + kbx / 2] = e;
+        }
+    }
+}
+
 
 template <int mmq_y, bool need_check>
 static __device__ __forceinline__ void load_tiles_nvfp4(const char * __restrict__ x,
@@ -3427,7 +3494,11 @@ template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
     static constexpr int              vdr          = VDR_MXFP4_Q8_1_MMQ;
 #ifdef BLACKWELL_MMA_AVAILABLE
+#ifdef GGML_CUDA_MXFP4_REPACK
+    static constexpr load_tiles_mmq_t load_tiles  = load_tiles_mxfp4_fp4_soa<mmq_y, need_check>;
+#else
     static constexpr load_tiles_mmq_t load_tiles  = load_tiles_mxfp4_fp4<mmq_y, need_check>;
+#endif // GGML_CUDA_MXFP4_REPACK
     static constexpr vec_dot_mmq_t    vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
 #else
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_mxfp4<mmq_y, need_check>;
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
@@ -5,6 +5,20 @@
 
 #include <cstdint>
 
+#ifdef GGML_CUDA_MXFP4_REPACK
+// Device-side mirror of init_fastdiv_values: returns (mp, L, d) where L is the
+// smallest value in [0, 32] with 2^L >= d. d is used as a divisor and must be
+// non-zero. Callers in this file evaluate it before entering their block loops.
+static __device__ __forceinline__ uint3 init_fastdiv_values_device(uint32_t d) {
+    uint32_t shift = 0;
+    for (; shift < 32 && (uint32_t{ 1 } << shift) < d; ++shift) {
+    }
+    const uint64_t pow2  = uint64_t{ 1 } << shift;
+    const uint64_t magic = (uint64_t{ 1 } << 32) * (pow2 - d) / d + 1;
+    return make_uint3((uint32_t) magic, shift, d);
+}
+#endif
+
 typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs);
 
 static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) {
@@ -413,6 +427,14 @@ static __global__ void mul_mat_vec_q(
     const     int blocks_per_row_x = ncols_x / qk;
     constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
 
+#ifdef GGML_CUDA_MXFP4_REPACK
+    // MXFP4 SoA: fastdiv values for (kbx / B_src, kbx % B_src) computed
+    // once per thread (uniform across the block) and held in registers.
+    const uint3 mxfp4_bsrc_fd = (type == GGML_TYPE_MXFP4)
+        ? init_fastdiv_values_device((uint32_t) blocks_per_row_x)
+        : make_uint3(0, 0, 0);
+#endif
+
     const uint32_t channel_dst = blockIdx.y;
 
     uint32_t channel_x;
@@ -490,12 +512,27 @@ static __global__ void mul_mat_vec_q(
         for (int j = 0; j < ncols_dst; ++j) {
 #pragma unroll
             for (int i = 0; i < rows_per_cuda_block; ++i) {
-                tmp[j][i] += vec_dot_q_cuda(
-                    vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
-                if constexpr (has_fusion) {
-                    if (use_gate) {
-                        tmp_gate[j][i] += vec_dot_q_cuda(
-                            vgate, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
+                const int kbx_arg = kbx_offset + i*stride_row_x + kbx;
+#ifdef GGML_CUDA_MXFP4_REPACK
+                if constexpr (type == GGML_TYPE_MXFP4) {
+                    tmp[j][i] += vec_dot_mxfp4_q8_1_soa(
+                        vx, &y[j*stride_col_y + kby], kbx_arg, kqs, mxfp4_bsrc_fd);
+                    if constexpr (has_fusion) {
+                        if (use_gate) {
+                            tmp_gate[j][i] += vec_dot_mxfp4_q8_1_soa(
+                                vgate, &y[j*stride_col_y + kby], kbx_arg, kqs, mxfp4_bsrc_fd);
+                        }
+                    }
+                } else
+#endif
+                {
+                    tmp[j][i] += vec_dot_q_cuda(
+                        vx, &y[j*stride_col_y + kby], kbx_arg, kqs);
+                    if constexpr (has_fusion) {
+                        if (use_gate) {
+                            tmp_gate[j][i] += vec_dot_q_cuda(
+                                vgate, &y[j*stride_col_y + kby], kbx_arg, kqs);
+                        }
                     }
                 }
             }
@@ -631,13 +668,27 @@ static __global__ void mul_mat_vec_q_moe(
     // partial sum for each thread
     float tmp[c_rows_per_block] = {0.0f};
 
+#ifdef GGML_CUDA_MXFP4_REPACK
+    const uint3 mxfp4_bsrc_fd = (type == GGML_TYPE_MXFP4)
+        ? init_fastdiv_values_device((uint32_t) blocks_per_row_x)
+        : make_uint3(0, 0, 0);
+#endif
+
     for (int kbx = threadIdx.x / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) {
         const int kby = kbx * (qk/QK8_1);
         const int kqs = vdr * (threadIdx.x % (qi/vdr));
 
 #pragma unroll
         for (int i = 0; i < c_rows_per_block; ++i) {
-            tmp[i] += vec_dot_q_cuda(vx, &y[kby], kbx_offset + i*stride_row_x + kbx, kqs);
+            const int kbx_arg = kbx_offset + i*stride_row_x + kbx;
+#ifdef GGML_CUDA_MXFP4_REPACK
+            if constexpr (type == GGML_TYPE_MXFP4) {
+                tmp[i] += vec_dot_mxfp4_q8_1_soa(vx, &y[kby], kbx_arg, kqs, mxfp4_bsrc_fd);
+            } else
+#endif
+            {
+                tmp[i] += vec_dot_q_cuda(vx, &y[kby], kbx_arg, kqs);
+            }
         }
     }
 
@@ -924,12 +975,12 @@ static void mul_mat_vec_q_switch_type(
                  nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                  nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
             break;
-        case GGML_TYPE_MXFP4:
+        case GGML_TYPE_MXFP4: {
             mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_MXFP4>
                 (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
                  nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
                  nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, ids_stride, stream);
-            break;
+            } break;
         case GGML_TYPE_NVFP4:
             mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_NVFP4>
                 (vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh
@@ -308,23 +308,64 @@ static __device__ __forceinline__ float vec_dot_mxfp4_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {
 
     const block_mxfp4 * bq4 = (const block_mxfp4 *) vbq + kbx;
+    const uint8_t * qs_base = bq4->qs;
+    const uint8_t   e_byte  = bq4->e;
 
     const int * q8 = (const int *) bq8_1->qs + iqs;
 
     int sumi = 0;
 #pragma unroll
     for (int l = 0; l < VDR_MXFP4_Q8_1_MMVQ; ++l) {
-        const int aux_q4 = get_int_b1(bq4->qs, iqs + l);
+        const int aux_q4 = get_int_b1(qs_base, iqs + l);
         const int2 v = get_int_from_table_16(aux_q4, kvalues_mxfp4);
 
         sumi = ggml_cuda_dp4a(v.x, q8[l + 0], sumi);
         sumi = ggml_cuda_dp4a(v.y, q8[l + 4], sumi);
     }
 
-    const float d = ggml_cuda_e8m0_to_fp32(bq4->e) * 0.5f * __low2float(bq8_1->ds);
+    const float d = ggml_cuda_e8m0_to_fp32(e_byte) * 0.5f * __low2float(bq8_1->ds);
     return d * sumi;
 }
 
+#ifdef GGML_CUDA_MXFP4_REPACK
+// SoA variant of vec_dot_mxfp4_q8_1 for tensors repacked per row into
+//   [qs_0..qs_{B_dst-1} | e_0..e_{B_dst-1}]
+// with B_dst = B_src rounded up to a multiple of 16 blocks. kbx is the flat
+// AoS block index; bsrc_fd holds the fastdiv values for B_src (B_src itself
+// in .z) and is used to split kbx into (row, block-in-row) without a
+// hardware divide. The caller builds bsrc_fd and passes it in.
+static __device__ __forceinline__ float vec_dot_mxfp4_q8_1_soa(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
+    const int kbx, const int iqs, const uint3 bsrc_fd) {
+
+    constexpr int qs_bytes  = QK_MXFP4 / 2;  // packed 4-bit values of one block
+    constexpr int blk_bytes = qs_bytes + 1;  // plus the block's single scale byte
+    const int     n_blk_pad = ((int) bsrc_fd.z + 15) & ~15;
+    const uint2   split     = fast_div_modulo((uint32_t) kbx, bsrc_fd);
+    const int     row       = (int) split.x;
+    const int     blk       = (int) split.y;
+
+    const uint8_t * row_ptr = (const uint8_t *) vbq + (size_t) row * blk_bytes * n_blk_pad;
+    const uint8_t * quants  = row_ptr + (size_t) blk * qs_bytes;
+    const uint8_t   scale   = row_ptr[qs_bytes * n_blk_pad + blk];  // scales follow the padded qs region
+
+    const int * q8 = (const int *) bq8_1->qs + iqs;
+
+    int acc = 0;
+#pragma unroll
+    for (int l = 0; l < VDR_MXFP4_Q8_1_MMVQ; ++l) {
+        const int  packed = get_int_b1(quants, iqs + l);
+        const int2 v      = get_int_from_table_16(packed, kvalues_mxfp4);
+
+        acc = ggml_cuda_dp4a(v.x, q8[l + 0], acc);
+        acc = ggml_cuda_dp4a(v.y, q8[l + 4], acc);
+    }
+
+    const float d = ggml_cuda_e8m0_to_fp32(scale) * 0.5f * __low2float(bq8_1->ds);
+    return d * acc;
+}
+#endif // GGML_CUDA_MXFP4_REPACK
+
 #define VDR_NVFP4_Q8_1_MMVQ 4
 #define VDR_NVFP4_Q8_1_MMQ  8