@@ -255,11 +255,69 @@ __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
255255 return static_cast <uint8_t >(biased);
256256}
257257
258+ static __device__ __forceinline__ void quantize_nvfp4_subblock_16_from_scale_code (
259+ const float * vals, const uint8_t scale_code, uint32_t & q0, uint32_t & q1) {
260+ const float scale = ggml_cuda_ue4m3_to_fp32 (scale_code);
261+ const float inv_scale = scale > 0 .0f ? 0 .5f / scale : 0 .0f ;
262+ q0 = 0 ;
263+ q1 = 0 ;
264+
265+ #pragma unroll
266+ for (int k = 0 ; k < QK_NVFP4_SUB / 4 ; ++k) {
267+ q0 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals[k + 0 ], inv_scale) << (8 * k);
268+ q0 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals[k + 8 ], inv_scale) << (8 * k + 4 );
269+ q1 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals[k + 4 ], inv_scale) << (8 * k);
270+ q1 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals[k + 12 ], inv_scale) << (8 * k + 4 );
271+ }
272+ }
273+
274+ static __device__ __forceinline__ float nvfp4_subblock_16_mse_from_scale_code (
275+ const float * vals, const uint8_t scale_code) {
276+ const float scale = ggml_cuda_ue4m3_to_fp32 (scale_code);
277+ const float inv_scale = scale > 0 .0f ? 0 .5f / scale : 0 .0f ;
278+ float mse = 0 .0f ;
279+
280+ #pragma unroll
281+ for (int k = 0 ; k < QK_NVFP4_SUB; ++k) {
282+ const uint8_t q = ggml_cuda_float_to_fp4_e2m1 (vals[k], inv_scale);
283+ const float err_diff = fabsf (vals[k]) - fabsf (kvalues_mxfp4[q & 0x7 ]) * scale;
284+ mse = fmaf (err_diff, err_diff, mse);
285+ }
286+
287+ return mse;
288+ }
289+
290+ static __device__ __forceinline__ void nvfp4_add_unique_scale_code (
291+ int * codes, int & n_codes, const int n_max_codes, const int code) {
292+ if (code < 0 || code > 0x7e || n_codes >= n_max_codes) {
293+ return ;
294+ }
295+
296+ #pragma unroll
297+ for (int i = 0 ; i < 5 ; ++i) {
298+ if (i >= n_codes) {
299+ break ;
300+ }
301+ if (codes[i] == code) {
302+ return ;
303+ }
304+ }
305+
306+ codes[n_codes] = code;
307+ n_codes++;
308+ }
309+
310+ static bool ggml_cuda_nvfp4_activity_adaptive_enabled () {
311+ static const char * env = getenv (" GGML_CUDA_NVFP4_ACTIVITY_ADAPTIVE" );
312+ static const bool enabled = env != nullptr && strcmp (env, " 0" ) != 0 ;
313+ return enabled;
314+ }
315+
258316
259317static __global__ void quantize_mmq_nvfp4 (
260318 const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
261319 const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
262- const int64_t ne0, const int64_t ne1, const int64_t ne2) {
320+ const int64_t ne0, const int64_t ne1, const int64_t ne2, const bool activity_adaptive ) {
263321#if defined(BLACKWELL_MMA_AVAILABLE)
264322
265323 const int64_t i0_base = ((int64_t ) blockDim .x * blockIdx .y + threadIdx .x ) * QK_NVFP4_SUB;
@@ -298,52 +356,58 @@ static __global__ void quantize_mmq_nvfp4(
298356 }
299357 }
300358
301- static constexpr int test_offsets[5 ] = { 0 , -1 , 1 , -2 , 2 };
302- const int first_fp8_code = (int ) ggml_cuda_fp32_to_ue4m3 (amax_raw / 6 .0f );
303-
304359 float best_err = FLT_MAX;
305360 uint8_t fp8_code = 0 ;
306- float subblock_scale = 0 .0f ;
307-
308- #pragma unroll // Check +/- 2 to find best code to reduce NVFP4 activation loss. Negligible overhead on Blackwell.
309- for (int i = 0 ; i < 5 ; i++) {
310- const int test_code = first_fp8_code + test_offsets[i];
311- if (test_code < 0 || test_code > 0x7e ) {
312- continue ;
313- }
314- const uint8_t code = (uint8_t ) test_code;
315- const float test_scale = ggml_cuda_ue4m3_to_fp32 (code);
316- const float test_inv_scale = test_scale > 0 .0f ? 0 .5f / test_scale : 0 .0f ;
317- float cur_err = 0 .0f ;
361+ uint32_t best_q0 = 0 ;
362+ uint32_t best_q1 = 0 ;
363+
364+ if (activity_adaptive) {
365+ static constexpr int n_max_codes = 5 ;
366+ int test_codes[n_max_codes];
367+ int n_test_codes = 0 ;
368+ const int code4 = (int ) ggml_cuda_fp32_to_ue4m3 (amax_raw / 4 .0f );
369+ const int code5 = (int ) ggml_cuda_fp32_to_ue4m3 (amax_raw / 5 .0f );
370+ const int code6 = (int ) ggml_cuda_fp32_to_ue4m3 (amax_raw / 6 .0f );
371+ nvfp4_add_unique_scale_code (test_codes, n_test_codes, n_max_codes, code4);
372+ nvfp4_add_unique_scale_code (test_codes, n_test_codes, n_max_codes, code5);
373+ nvfp4_add_unique_scale_code (test_codes, n_test_codes, n_max_codes, code6);
374+ nvfp4_add_unique_scale_code (test_codes, n_test_codes, n_max_codes, code6 - 1 );
375+ nvfp4_add_unique_scale_code (test_codes, n_test_codes, n_max_codes, code6 + 1 );
318376#pragma unroll
319- for (int k = 0 ; k < QK_NVFP4_SUB; ++k) {
320- const float v = vals_raw[k];
321- const uint8_t q = ggml_cuda_float_to_fp4_e2m1 (v, test_inv_scale);
322- const float err_diff = fabsf (v) - fabsf (kvalues_mxfp4[q & 0x7 ]) * test_scale;
323- cur_err = fmaf (err_diff, err_diff, cur_err);
377+ for (int i = 0 ; i < n_max_codes; i++) {
378+ if (i >= n_test_codes) {
379+ break ;
380+ }
381+ const uint8_t code = (uint8_t ) test_codes[i];
382+ const float cur_err = nvfp4_subblock_16_mse_from_scale_code (vals_raw, code);
383+ if (cur_err < best_err) {
384+ best_err = cur_err;
385+ fp8_code = code;
386+ }
324387 }
325-
326- if (cur_err < best_err) {
327- best_err = cur_err;
328- fp8_code = test_code;
329- subblock_scale = test_scale;
388+ } else {
389+ static constexpr int test_offsets[5 ] = { 0 , -1 , 1 , -2 , 2 };
390+ const int first_fp8_code = (int ) ggml_cuda_fp32_to_ue4m3 (amax_raw / 6 .0f );
391+ #pragma unroll
392+ for (int i = 0 ; i < 5 ; i++) {
393+ const int test_code = first_fp8_code + test_offsets[i];
394+ if (test_code < 0 || test_code > 0x7e ) {
395+ continue ;
396+ }
397+ const uint8_t code = (uint8_t ) test_code;
398+ const float cur_err = nvfp4_subblock_16_mse_from_scale_code (vals_raw, code);
399+ if (cur_err < best_err) {
400+ best_err = cur_err;
401+ fp8_code = code;
402+ }
330403 }
331404 }
332405
333- const float inv_scale = subblock_scale > 0 .0f ? 0 .5f / subblock_scale : 0 .0f ;
334- uint32_t q0 = 0 ;
335- uint32_t q1 = 0 ;
336- #pragma unroll // this is faster than the previous __nv_fp4x4_e2m1
337- for (int k = 0 ; k < QK_NVFP4_SUB / 4 ; ++k) {
338- q0 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals_raw[k + 0 ], inv_scale) << (8 * k);
339- q0 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals_raw[k + 8 ], inv_scale) << (8 * k + 4 );
340- q1 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals_raw[k + 4 ], inv_scale) << (8 * k);
341- q1 |= (uint32_t ) ggml_cuda_float_to_fp4_e2m1 (vals_raw[k + 12 ], inv_scale) << (8 * k + 4 );
342- }
406+ quantize_nvfp4_subblock_16_from_scale_code (vals_raw, fp8_code, best_q0, best_q1);
343407
344408 uint32_t * yqs = reinterpret_cast <uint32_t *>(yb->qs );
345- yqs[2 * sub + 0 ] = q0 ;
346- yqs[2 * sub + 1 ] = q1 ;
409+ yqs[2 * sub + 0 ] = best_q0 ;
410+ yqs[2 * sub + 1 ] = best_q1 ;
347411 reinterpret_cast <uint8_t *>(yb->d4 )[sub] = fp8_code;
348412#else
349413 NO_DEVICE_CODE; // This is for Blackwell NVFP4 activations only.
@@ -829,7 +893,7 @@ void quantize_mmq_fp4_cuda(
829893 const dim3 block_size (nvfp4_block_size, 1 , 1 );
830894 const dim3 num_blocks (ne1, block_num_y, ne2 * ne3);
831895 quantize_mmq_nvfp4<<<num_blocks, block_size, 0 , stream>>> (
832- x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
896+ x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2, ggml_cuda_nvfp4_activity_adaptive_enabled () );
833897 } else {
834898 GGML_ASSERT (ne0 % (2 * QK_MXFP4) == 0 );
835899
0 commit comments