ORippler
diff --git a/ggml/src/ggml-cuda/mmf.cu b/ggml/src/ggml-cuda/mmf.cu
Lines changed: 1 addition & 1 deletion
diff --git a/ggml/src/ggml-cuda/mmid.cu b/ggml/src/ggml-cuda/mmid.cu
Lines changed: 14 additions & 8 deletions
diff --git a/ggml/src/ggml-cuda/mmid.cuh b/ggml/src/ggml-cuda/mmid.cuh
Lines changed: 1 addition & 0 deletions
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
Lines changed: 24 additions & 5 deletions
@@ -84,7 +84,7 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
 
         GGML_ASSERT(sis1 > 0);
 
-        ggml_cuda_launch_mm_ids_helper(ids_d, ids_src_compact_dev.get(), ids_dst_compact_dev.get(), expert_bounds_dev.get(),
+        ggml_cuda_launch_mm_ids_helper(ids_d, ids_src_compact_dev.get(), ids_dst_compact_dev.get(), expert_bounds_dev.get(), nullptr, nullptr,
             static_cast<int>(n_experts), static_cast<int>(n_tokens), static_cast<int>(n_expert_used), static_cast<int>(ne11), si1, sis1, ctx.stream());
         CUDA_CHECK(cudaGetLastError());
 
 
@@ -27,6 +27,7 @@ template <int n_expert_used_template>
 __launch_bounds__(ggml_cuda_get_physical_warp_size(), 1)
 static __global__ void mm_ids_helper(
         const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds,
+        const float * __restrict__ scales, float * __restrict__ scales_src1,
         const int n_tokens, const int n_expert_used_var, const int nchannels_y, const int si1, const int sis1) {
     constexpr int warp_size = ggml_cuda_get_physical_warp_size();
     const int n_expert_used = n_expert_used_template == 0 ? n_expert_used_var : n_expert_used_template;
@@ -100,6 +101,9 @@ static __global__ void mm_ids_helper(
         const int iex_used = store_it.iex_used();
         ids_src1[nex_prev + itc] = it*sis1          + iex_used % nchannels_y;
         ids_dst [nex_prev + itc] = it*n_expert_used + iex_used;
+        if (scales_src1) {
+            scales_src1[nex_prev + itc] = scales[expert];
+        }
     }
 
     if (threadIdx.x != 0) {
@@ -118,6 +122,7 @@ static __global__ void mm_ids_helper(
 template <int n_expert_used_template>
 static void launch_mm_ids_helper(
         const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds,
+        const float * __restrict__ scales, float * __restrict__ scales_src1,
         const int n_experts, const int n_tokens, const int n_expert_used_var, const int nchannels_y, const int si1, const int sis1, cudaStream_t stream) {
     GGML_ASSERT(n_tokens          < (1 << 22) && "too few bits in mm_ids_helper_store");
     GGML_ASSERT(n_expert_used_var < (1 << 10) && "too few bits in mm_ids_helper_store");
@@ -132,33 +137,34 @@ static void launch_mm_ids_helper(
     const size_t nbytes_shared = n_tokens*sizeof(mm_ids_helper_store);
     GGML_ASSERT(nbytes_shared <= smpbo);
     mm_ids_helper<n_expert_used_template><<<num_blocks, block_size, nbytes_shared, stream>>>
-        (ids, ids_src1, ids_dst, expert_bounds, n_tokens, n_expert_used_var, nchannels_y, si1, sis1);
+        (ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_tokens, n_expert_used_var, nchannels_y, si1, sis1);
 }
 
 void ggml_cuda_launch_mm_ids_helper(
         const int32_t * __restrict__ ids, int32_t * __restrict__ ids_src1, int32_t * __restrict__ ids_dst, int32_t * __restrict__ expert_bounds,
+        const float * __restrict__ scales, float * __restrict__ scales_src1,
         const int n_experts, const int n_tokens, const int n_expert_used, const int nchannels_y, const int si1, const int sis1, cudaStream_t stream) {
     switch (n_expert_used) {
         case  2:
-            launch_mm_ids_helper< 2>(ids, ids_src1, ids_dst, expert_bounds, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+            launch_mm_ids_helper< 2>(ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
             break;
         case  4:
-            launch_mm_ids_helper< 4>(ids, ids_src1, ids_dst, expert_bounds, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+            launch_mm_ids_helper< 4>(ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
             break;
         case  6:
-            launch_mm_ids_helper< 6>(ids, ids_src1, ids_dst, expert_bounds, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+            launch_mm_ids_helper< 6>(ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
             break;
         case  8:
-            launch_mm_ids_helper< 8>(ids, ids_src1, ids_dst, expert_bounds, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+            launch_mm_ids_helper< 8>(ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
             break;
         case 16:
-            launch_mm_ids_helper<16>(ids, ids_src1, ids_dst, expert_bounds, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+            launch_mm_ids_helper<16>(ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
             break;
         case 32:
-            launch_mm_ids_helper<32>(ids, ids_src1, ids_dst, expert_bounds, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+            launch_mm_ids_helper<32>(ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
             break;
         default:
-            launch_mm_ids_helper< 0>(ids, ids_src1, ids_dst, expert_bounds, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
+            launch_mm_ids_helper< 0>(ids, ids_src1, ids_dst, expert_bounds, scales, scales_src1, n_experts, n_tokens, n_expert_used, nchannels_y, si1, sis1, stream);
             break;
     }
 }
@@ -2,4 +2,5 @@
 
 void ggml_cuda_launch_mm_ids_helper(
         const int32_t * ids, int32_t * ids_src1, int32_t * ids_dst, int32_t * expert_bounds,
+        const float * scales, float * scales_src1,
         int n_experts, int n_tokens, int n_expert_used, int nchannels_y, int si1, int sis1, cudaStream_t stream);
@@ -123,6 +123,9 @@ void ggml_cuda_mul_mat_q(
 
     // TODO: tighter pool buffer size vs q8 path
     const bool use_native_fp4 = blackwell_mma_available(cc) && (src0->type == GGML_TYPE_MXFP4 || src0->type == GGML_TYPE_NVFP4);
+    const ggml_tensor * scale_activations = src0->type == GGML_TYPE_NVFP4 ? (ids ? dst->src[4] : dst->src[3]) : nullptr;
+    const float * scale_activations_d = scale_activations ? (const float *) scale_activations->data : nullptr;
+    const int64_t n_scale_activations = scale_activations ? ggml_nelements(scale_activations) : 0;
 
     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -135,7 +138,7 @@ void ggml_cuda_mul_mat_q(
             const int64_t s13 = src1->nb[3] / ts_src1;
             if (use_native_fp4) {
                 static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
-                quantize_mmq_fp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
+                quantize_mmq_fp4_cuda(src1_d, nullptr, scale_activations_d, n_scale_activations, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
                                         ne11, ne12, ne13, stream);
 
             } else {
@@ -152,7 +155,9 @@ void ggml_cuda_mul_mat_q(
         const int64_t s13 = ne12*s12;
 
         const mmq_args args = {
-            src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d,
+            src0_d, src0->type, (const int *) src1_q8_1.ptr,
+            use_native_fp4 ? scale_activations_d : nullptr, use_native_fp4 ? n_scale_activations : 0,
+            nullptr, nullptr, dst_d,
             ne00, ne01, ne1, s01, ne11, s1,
             ne02, ne12, s02, s12, s2,
             ne03, ne13, s03, s13, s3,
@@ -172,13 +177,25 @@ void ggml_cuda_mul_mat_q(
     ggml_cuda_pool_alloc<int32_t> ids_src1(ctx.pool(), ne_get_rows);
     ggml_cuda_pool_alloc<int32_t> ids_dst(ctx.pool(), ne_get_rows);
     ggml_cuda_pool_alloc<int32_t> expert_bounds(ctx.pool(), ne02 + 1);
+    ggml_cuda_pool_alloc<float> scale_activations_src1(ctx.pool());
+    const float * scale_activations_q = scale_activations_d;
+    int64_t n_scale_activations_q = n_scale_activations;
+    if (scale_activations) {
+        GGML_ASSERT(n_scale_activations == 1 || n_scale_activations == ne02);
+        if (n_scale_activations != 1) {
+            scale_activations_src1.alloc(ctx.pool(), ne_get_rows);
+            scale_activations_q = scale_activations_src1.get();
+            n_scale_activations_q = ne_get_rows;
+        }
+    }
 
     {
         GGML_ASSERT(ids->nb[0] == ggml_element_size(ids));
         const int si1  = ids->nb[1] / ggml_element_size(ids);
         const int sis1 = nb12 / nb11;
 
         ggml_cuda_launch_mm_ids_helper((const int32_t *) ids->data, ids_src1.get(), ids_dst.get(), expert_bounds.get(),
+            n_scale_activations == 1 ? nullptr : scale_activations_d, n_scale_activations == 1 ? nullptr : scale_activations_src1.get(),
             ne02, ne12, n_expert_used, ne11, si1, sis1, stream);
         CUDA_CHECK(cudaGetLastError());
     }
@@ -197,7 +214,7 @@ void ggml_cuda_mul_mat_q(
         const int64_t s13 = src1->nb[3] / ts_src1;
 
         if (use_native_fp4) {
-            quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
+            quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), scale_activations_q, n_scale_activations_q, src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
                                     ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
         } else {
             quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
@@ -213,7 +230,9 @@ void ggml_cuda_mul_mat_q(
 
     // Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
     const mmq_args args = {
-        src0_d, src0->type, (const int *) src1_q8_1.get(), ids_dst.get(), expert_bounds.get(), dst_d,
+        src0_d, src0->type, (const int *) src1_q8_1.get(),
+        use_native_fp4 ? scale_activations_q : nullptr, use_native_fp4 ? n_scale_activations_q : 0,
+        ids_dst.get(), expert_bounds.get(), dst_d,
         ne00, ne01, ne_get_rows, s01, ne_get_rows, s1,
         ne02, ne02, s02, s12, s2,
         ne03, ne13, s03, s13, s3,
@@ -253,7 +272,7 @@ void ggml_cuda_op_mul_mat_q(
                             || GGML_CUDA_CC_IS_CDNA(cc))
                             && src1_ncols == ne11;
     const mmq_args args = {
-        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
+        src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, 0, nullptr, nullptr, dst_dd_i,
         ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
         1, 1, 0, 0, 0,
         1, 1, 0, 0, 0,