Skip to content

Commit 278d8a6

Browse files
committed
quant: add enhanced NVFP4 quantization modes
Add NVFP4 adaptive scale selection with M={4,5,6}, optional imatrix-aware two-objective selection, and the NVFP4_M mixed tensor policy. Keep the output format as standard GGML_TYPE_NVFP4. Also add experimental CUDA A4 activation adaptive scaling for native NVFP4 W4A4, controlled by an environment option, and preserve W4Q8 / MMVQ fallback behavior.
1 parent 1ad1a80 commit 278d8a6

9 files changed

Lines changed: 957 additions & 62 deletions

File tree

ggml/src/ggml-cuda/common.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1619,4 +1619,3 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
16191619
kernel<<<launch_params.block_nums, launch_params.block_dims, launch_params.shmem, launch_params.stream>>>(std::forward<Args>(args)... );
16201620
CUDA_CHECK(cudaGetLastError());
16211621
}
1622-

ggml/src/ggml-cuda/quantize.cu

Lines changed: 103 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -255,11 +255,69 @@ __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
255255
return static_cast<uint8_t>(biased);
256256
}
257257

258+
static __device__ __forceinline__ void quantize_nvfp4_subblock_16_from_scale_code(
259+
const float * vals, const uint8_t scale_code, uint32_t & q0, uint32_t & q1) {
260+
const float scale = ggml_cuda_ue4m3_to_fp32(scale_code);
261+
const float inv_scale = scale > 0.0f ? 0.5f / scale : 0.0f;
262+
q0 = 0;
263+
q1 = 0;
264+
265+
#pragma unroll
266+
for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
267+
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals[k + 0], inv_scale) << (8 * k);
268+
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals[k + 8], inv_scale) << (8 * k + 4);
269+
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals[k + 4], inv_scale) << (8 * k);
270+
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals[k + 12], inv_scale) << (8 * k + 4);
271+
}
272+
}
273+
274+
static __device__ __forceinline__ float nvfp4_subblock_16_mse_from_scale_code(
275+
const float * vals, const uint8_t scale_code) {
276+
const float scale = ggml_cuda_ue4m3_to_fp32(scale_code);
277+
const float inv_scale = scale > 0.0f ? 0.5f / scale : 0.0f;
278+
float mse = 0.0f;
279+
280+
#pragma unroll
281+
for (int k = 0; k < QK_NVFP4_SUB; ++k) {
282+
const uint8_t q = ggml_cuda_float_to_fp4_e2m1(vals[k], inv_scale);
283+
const float err_diff = fabsf(vals[k]) - fabsf(kvalues_mxfp4[q & 0x7]) * scale;
284+
mse = fmaf(err_diff, err_diff, mse);
285+
}
286+
287+
return mse;
288+
}
289+
290+
static __device__ __forceinline__ void nvfp4_add_unique_scale_code(
291+
int * codes, int & n_codes, const int n_max_codes, const int code) {
292+
if (code < 0 || code > 0x7e || n_codes >= n_max_codes) {
293+
return;
294+
}
295+
296+
#pragma unroll
297+
for (int i = 0; i < 5; ++i) {
298+
if (i >= n_codes) {
299+
break;
300+
}
301+
if (codes[i] == code) {
302+
return;
303+
}
304+
}
305+
306+
codes[n_codes] = code;
307+
n_codes++;
308+
}
309+
310+
static bool ggml_cuda_nvfp4_activity_adaptive_enabled() {
311+
static const char * env = getenv("GGML_CUDA_NVFP4_ACTIVITY_ADAPTIVE");
312+
static const bool enabled = env != nullptr && strcmp(env, "0") != 0;
313+
return enabled;
314+
}
315+
258316

259317
static __global__ void quantize_mmq_nvfp4(
260318
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
261319
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
262-
const int64_t ne0, const int64_t ne1, const int64_t ne2) {
320+
const int64_t ne0, const int64_t ne1, const int64_t ne2, const bool activity_adaptive) {
263321
#if defined(BLACKWELL_MMA_AVAILABLE)
264322

265323
const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
@@ -298,52 +356,58 @@ static __global__ void quantize_mmq_nvfp4(
298356
}
299357
}
300358

301-
static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2};
302-
const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
303-
304359
float best_err = FLT_MAX;
305360
uint8_t fp8_code = 0;
306-
float subblock_scale = 0.0f;
307-
308-
#pragma unroll // Check +/- 2 to find best code to reduce NVFP4 activation loss. Negligible overhead on Blackwell.
309-
for (int i = 0; i < 5; i++) {
310-
const int test_code = first_fp8_code + test_offsets[i];
311-
if (test_code < 0 || test_code > 0x7e) {
312-
continue;
313-
}
314-
const uint8_t code = (uint8_t) test_code;
315-
const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
316-
const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
317-
float cur_err = 0.0f;
361+
uint32_t best_q0 = 0;
362+
uint32_t best_q1 = 0;
363+
364+
if (activity_adaptive) {
365+
static constexpr int n_max_codes = 5;
366+
int test_codes[n_max_codes];
367+
int n_test_codes = 0;
368+
const int code4 = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 4.0f);
369+
const int code5 = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 5.0f);
370+
const int code6 = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
371+
nvfp4_add_unique_scale_code(test_codes, n_test_codes, n_max_codes, code4);
372+
nvfp4_add_unique_scale_code(test_codes, n_test_codes, n_max_codes, code5);
373+
nvfp4_add_unique_scale_code(test_codes, n_test_codes, n_max_codes, code6);
374+
nvfp4_add_unique_scale_code(test_codes, n_test_codes, n_max_codes, code6 - 1);
375+
nvfp4_add_unique_scale_code(test_codes, n_test_codes, n_max_codes, code6 + 1);
318376
#pragma unroll
319-
for (int k = 0; k < QK_NVFP4_SUB; ++k) {
320-
const float v = vals_raw[k];
321-
const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
322-
const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
323-
cur_err = fmaf(err_diff, err_diff, cur_err);
377+
for (int i = 0; i < n_max_codes; i++) {
378+
if (i >= n_test_codes) {
379+
break;
380+
}
381+
const uint8_t code = (uint8_t) test_codes[i];
382+
const float cur_err = nvfp4_subblock_16_mse_from_scale_code(vals_raw, code);
383+
if (cur_err < best_err) {
384+
best_err = cur_err;
385+
fp8_code = code;
386+
}
324387
}
325-
326-
if (cur_err < best_err) {
327-
best_err = cur_err;
328-
fp8_code = test_code;
329-
subblock_scale = test_scale;
388+
} else {
389+
static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2 };
390+
const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
391+
#pragma unroll
392+
for (int i = 0; i < 5; i++) {
393+
const int test_code = first_fp8_code + test_offsets[i];
394+
if (test_code < 0 || test_code > 0x7e) {
395+
continue;
396+
}
397+
const uint8_t code = (uint8_t) test_code;
398+
const float cur_err = nvfp4_subblock_16_mse_from_scale_code(vals_raw, code);
399+
if (cur_err < best_err) {
400+
best_err = cur_err;
401+
fp8_code = code;
402+
}
330403
}
331404
}
332405

333-
const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
334-
uint32_t q0 = 0;
335-
uint32_t q1 = 0;
336-
#pragma unroll // this is faster than the previous __nv_fp4x4_e2m1
337-
for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
338-
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 0], inv_scale) << (8 * k);
339-
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 8], inv_scale) << (8 * k + 4);
340-
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 4], inv_scale) << (8 * k);
341-
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
342-
}
406+
quantize_nvfp4_subblock_16_from_scale_code(vals_raw, fp8_code, best_q0, best_q1);
343407

344408
uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
345-
yqs[2 * sub + 0] = q0;
346-
yqs[2 * sub + 1] = q1;
409+
yqs[2 * sub + 0] = best_q0;
410+
yqs[2 * sub + 1] = best_q1;
347411
reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
348412
#else
349413
NO_DEVICE_CODE; // This is for Blackwell NVFP4 activations only.
@@ -829,7 +893,7 @@ void quantize_mmq_fp4_cuda(
829893
const dim3 block_size(nvfp4_block_size, 1, 1);
830894
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
831895
quantize_mmq_nvfp4<<<num_blocks, block_size, 0, stream>>>(
832-
x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
896+
x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2, ggml_cuda_nvfp4_activity_adaptive_enabled());
833897
} else {
834898
GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
835899

0 commit comments

Comments
 (0)