NVIDIA
diff --git a/docs/envvars.rst b/docs/envvars.rst
Lines changed: 24 additions & 0 deletions
diff --git a/tests/cpp/operator/test_cast_nvfp4_transpose.cu b/tests/cpp/operator/test_cast_nvfp4_transpose.cu
Lines changed: 520 additions & 96 deletions
diff --git a/tests/cpp/operator/test_dequantize_nvfp4.cu b/tests/cpp/operator/test_dequantize_nvfp4.cu
Lines changed: 54 additions & 14 deletions
diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu
Lines changed: 12 additions & 0 deletions
diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h
Lines changed: 3 additions & 0 deletions
@@ -287,6 +287,30 @@ Kernel Configuration
    :Default: ``0``
    :Description: Enable row-scaled NVFP4 tensors for forward activation quantizers in the ``NVFP4BlockScaling`` recipe. When set to ``1`` (or when ``NVFP4BlockScaling(row_scaled_activation=True)`` is used), rowwise ``amax`` metadata is stored as one FP32 value per tensor row instead of a single scalar.
 
+.. envvar:: NVTE_NVFP4_4OVER6
+
+   :Type: ``str`` (``none``, ``weights``, ``activations``, or ``all``)
+   :Default: ``none``
+   :Description: Enable 4over6 adaptive NVFP4 block scaling for weights, activations, or both in the ``NVFP4BlockScaling`` recipe. For each selected FP4 block, quantization compares the map-to-4 and map-to-6 candidates and stores the one with the lower error under the configured metric (see :envvar:`NVTE_NVFP4_4OVER6_ERR_MODE`). ``none`` keeps standard NVFP4. Current 4over6 support targets reinforcement learning (RL) and post-training scenarios; pre-training paths that combine 4over6 with the random Hadamard transform (RHT) are not yet implemented.
+
+.. envvar:: NVTE_NVFP4_4OVER6_E4M3_USE_256
+
+   :Type: ``str`` (``none``, ``weights``, ``activations``, or ``all``)
+   :Default: ``all``
+   :Description: Select which NVFP4 4over6 quantizers use 256 instead of 448 as the global E4M3 scale bound. By default, all 4over6 quantizers use 256. Set the environment variable to ``none`` (or set ``NVFP4BlockScaling(nvfp4_4over6_e4m3_use_256="none")``) to use the standard NVFP4 bound of 448 for all 4over6 quantizers. This option is only meaningful for tensor roles that also enable :envvar:`NVTE_NVFP4_4OVER6`.
+
+.. envvar:: NVTE_NVFP4_4OVER6_ERR_MODE
+
+   :Type: ``str`` (``MAE`` or ``MSE``)
+   :Default: ``MAE``
+   :Description: Select the input-domain error metric, mean absolute error (``MAE``) or mean squared error (``MSE``), used to choose between the map-to-4 and map-to-6 candidates for NVFP4 4over6 in the ``NVFP4BlockScaling`` recipe.
+
+.. envvar:: NVTE_NVFP4_4OVER6_ERR_USE_FAST_MATH
+
+   :Type: ``int`` (0 or 1)
+   :Default: ``0``
+   :Description: Allow the NVFP4 4over6 candidate error computation to use faster non-strict floating-point expressions. By default, 4over6 error comparison uses strict expressions; ``NVTE_USE_FAST_MATH`` does not control this error-comparison path.
+
 Torch Compilation and Fusion
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 
@@ -46,8 +46,9 @@ void compute_ref_dequantize_nvfp4(const uint8_t *packed_data,
                                   OType *output,
                                   size_t rows,
                                   size_t cols,
-                                  size_t scale_stride) {
-    constexpr float factor_inv = 1.0f / (6.0f * 448.0f);
+                                  size_t scale_stride,
+                                  int e4m3_max) {
+    const float factor_inv = 1.0f / (6.0f * static_cast<float>(e4m3_max));
     constexpr size_t BLOCK_SIZE = 16;
     const size_t Mread = cols / BLOCK_SIZE;
     const size_t bytes_per_block = BLOCK_SIZE / 2;
@@ -86,11 +87,18 @@ float compute_amax(test::Tensor &t, size_t rows, size_t cols) {
     return amax;
 }
 
+struct NVFP4DequantizeTestConfig {  // Per-case test settings: the 4over6 mode plus the E4M3 max value.
+  NVTENVFP44Over6Mode mode = kNVTENVFP44Over6Disabled;  // 4over6 mode handed to the quantization config; defaults to disabled
+  int e4m3_max = 448;  // E4M3 max set on the quantized tensor and used by the CPU reference; defaults to 448
+};
+
 // Quantize a high-precision input to NVFP4, then dequantize and compare
 // against a CPU reference computed from the quantized data.
 template <typename OutputType>
 void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
-                                  const bool row_scaled_nvfp4) {
+                                  const bool row_scaled_nvfp4,
+                                  const NVTENVFP44Over6Mode mode,
+                                  const int e4m3_max) {
     using namespace test;
     DType otype = TypeInfo<OutputType>::dtype;
 
@@ -105,6 +113,8 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
 
     // Configure quantized tensor amax
     size_t amax_size = 1;
+    quantized.set_nvfp4_e4m3_max(e4m3_max);
+    ASSERT_EQ(quantized.nvfp4_e4m3_max(), e4m3_max);
     if (row_scaled_nvfp4) {
       quantized.set_row_scaled_nvfp4(true);
       amax_size = rows;
@@ -116,7 +126,9 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
 
     // Quantize
     if (rows > 0 && cols > 0) {
-        nvte_quantize(input.data(), quantized.data(), 0);
+        QuantizationConfigWrapper quant_config;
+        quant_config.set_nvfp4_4over6_mode(mode);
+        nvte_quantize_v2(input.data(), quantized.data(), quant_config, 0);
         cudaDeviceSynchronize();
         auto err = cudaGetLastError();
         ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
@@ -146,7 +158,7 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
       std::make_unique<OutputType[]>(rows * cols);
     compute_ref_dequantize_nvfp4<OutputType>(
       fp4_data, scales, amax_vals, ref_output.get(),
-      rows, cols, scale_stride);
+      rows, cols, scale_stride, e4m3_max);
 
     // Compare results from TE and reference impls
     auto [atol, rtol] = getTolerances(otype);
@@ -156,7 +168,9 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
 // Dequantize NVFP4 with GEMM-swizzled scales and compare against compact path.
 template <typename OutputType>
 void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols,
-                                           const bool row_scaled_nvfp4) {
+                                           const bool row_scaled_nvfp4,
+                                           const NVTENVFP44Over6Mode mode,
+                                           const int e4m3_max) {
     using namespace test;
     DType otype = TypeInfo<OutputType>::dtype;
 
@@ -165,6 +179,8 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols,
 
     Tensor quantized_compact("quantized_compact", std::vector<size_t>{rows, cols},
                              DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING);
+    quantized_compact.set_nvfp4_e4m3_max(e4m3_max);
+    ASSERT_EQ(quantized_compact.nvfp4_e4m3_max(), e4m3_max);
     if (row_scaled_nvfp4) {
         quantized_compact.set_row_scaled_nvfp4(true);
     } else if (rows > 0 && cols > 0) {
@@ -174,7 +190,9 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols,
     }
 
     if (rows > 0 && cols > 0) {
-        nvte_quantize(input.data(), quantized_compact.data(), 0);
+        QuantizationConfigWrapper quant_config;
+        quant_config.set_nvfp4_4over6_mode(mode);
+        nvte_quantize_v2(input.data(), quantized_compact.data(), quant_config, 0);
         cudaDeviceSynchronize();
     }
 
@@ -186,6 +204,8 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols,
     // Create tensor with same FP4 data but swizzled scales
     Tensor quantized_swizzled("quantized_swizzled", std::vector<size_t>{rows, cols},
                               DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING);
+    quantized_swizzled.set_nvfp4_e4m3_max(e4m3_max);
+    ASSERT_EQ(quantized_swizzled.nvfp4_e4m3_max(), e4m3_max);
     if (row_scaled_nvfp4) {
         quantized_swizzled.set_row_scaled_nvfp4(true);
     } else {
@@ -260,7 +280,8 @@ std::vector<std::pair<size_t, size_t>> nvfp4_tensor_dims = {
 class DequantizeNVFP4TestSuite : public ::testing::TestWithParam
     <std::tuple<std::pair<size_t, size_t>,
                 transformer_engine::DType,
-                bool>> {};
+                bool,
+                NVFP4DequantizeTestConfig>> {};
 
 TEST_P(DequantizeNVFP4TestSuite, TestDequantizeNVFP4)
 {
@@ -271,10 +292,12 @@ TEST_P(DequantizeNVFP4TestSuite, TestDequantizeNVFP4)
     const auto tensor_size = std::get<0>(GetParam());
     const DType output_type = std::get<1>(GetParam());
     const bool row_scaled_nvfp4 = std::get<2>(GetParam());
+    const NVFP4DequantizeTestConfig config = std::get<3>(GetParam());
 
     TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(output_type, OutputType,
         performTest_dequantize_nvfp4<OutputType>(
-            tensor_size.first, tensor_size.second, row_scaled_nvfp4);
+            tensor_size.first, tensor_size.second, row_scaled_nvfp4, config.mode,
+            config.e4m3_max);
     );
 }
 
@@ -284,21 +307,29 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Combine(
         ::testing::ValuesIn(nvfp4_tensor_dims),
         ::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
-        ::testing::Bool()),
+        ::testing::Bool(),
+        ::testing::Values(NVFP4DequantizeTestConfig{},
+                          NVFP4DequantizeTestConfig{kNVTENVFP44Over6MinMAE, 448},
+                          NVFP4DequantizeTestConfig{kNVTENVFP44Over6MinMAE, 256})),
     [](const testing::TestParamInfo<DequantizeNVFP4TestSuite::ParamType>& info)
     {
+        const NVFP4DequantizeTestConfig config = std::get<3>(info.param);
+        const bool use_4over6 = config.mode != kNVTENVFP44Over6Disabled;
         std::string name = std::to_string(std::get<0>(info.param).first) + "X" +
                            std::to_string(std::get<0>(info.param).second) + "X" +
                            test::typeName(std::get<1>(info.param)) + "X" +
-                           (std::get<2>(info.param) ? "RowScaled" : "PerTensor");
+                           (std::get<2>(info.param) ? "RowScaled" : "PerTensor") + "X" +
+                           (use_4over6 ? "FourOverSix" : "Default") + "X" +
+                           (config.e4m3_max == 256 ? "E4M3Max256" : "E4M3Max448");
         return name;
     }
 );
 
 class DequantizeNVFP4SwizzledTestSuite : public ::testing::TestWithParam
     <std::tuple<std::pair<size_t, size_t>,
                 transformer_engine::DType,
-                bool>> {};
+                bool,
+                NVFP4DequantizeTestConfig>> {};
 
 TEST_P(DequantizeNVFP4SwizzledTestSuite, TestDequantizeNVFP4Swizzled)
 {
@@ -309,10 +340,12 @@ TEST_P(DequantizeNVFP4SwizzledTestSuite, TestDequantizeNVFP4Swizzled)
     const auto tensor_size = std::get<0>(GetParam());
     const DType output_type = std::get<1>(GetParam());
     const bool row_scaled_nvfp4 = std::get<2>(GetParam());
+    const NVFP4DequantizeTestConfig config = std::get<3>(GetParam());
 
     TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(output_type, OutputType,
         performTest_dequantize_nvfp4_swizzled<OutputType>(
-            tensor_size.first, tensor_size.second, row_scaled_nvfp4);
+            tensor_size.first, tensor_size.second, row_scaled_nvfp4, config.mode,
+            config.e4m3_max);
     );
 }
 
@@ -322,13 +355,20 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Combine(
         ::testing::ValuesIn(nvfp4_tensor_dims),
         ::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
-        ::testing::Bool()),
+        ::testing::Bool(),
+        ::testing::Values(NVFP4DequantizeTestConfig{},
+                          NVFP4DequantizeTestConfig{kNVTENVFP44Over6MinMAE, 448},
+                          NVFP4DequantizeTestConfig{kNVTENVFP44Over6MinMAE, 256})),
     [](const testing::TestParamInfo<DequantizeNVFP4SwizzledTestSuite::ParamType>& info)
     {
+        const NVFP4DequantizeTestConfig config = std::get<3>(info.param);
+        const bool use_4over6 = config.mode != kNVTENVFP44Over6Disabled;
         std::string name = std::to_string(std::get<0>(info.param).first) + "X" +
                            std::to_string(std::get<0>(info.param).second) + "X" +
                            test::typeName(std::get<1>(info.param)) + "X" +
                            (std::get<2>(info.param) ? "RowScaled" : "PerTensor") + "X" +
+                           (use_4over6 ? "FourOverSix" : "Default") + "X" +
+                           (config.e4m3_max == 256 ? "E4M3Max256" : "E4M3Max448") + "X" +
                            "Swizzled";
         return name;
     }
 
@@ -440,6 +440,18 @@ void Tensor::set_row_scaled_nvfp4(bool row_scaled_nvfp4) {
   }
 }
 
+void Tensor::set_nvfp4_e4m3_max(int nvfp4_e4m3_max) {  // Store the NVFP4 E4M3 max on the wrapped tensor_.
+  NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING,  // reject tensors that are not NVFP4 1D-scaled
+             "NVFP4 E4M3 max is only supported for NVFP4 tensors.");
+  tensor_.set_nvfp4_e4m3_max(nvfp4_e4m3_max);  // forward the caller's value unchanged
+}
+
+int Tensor::nvfp4_e4m3_max() const {  // Read back the NVFP4 E4M3 max from the wrapped tensor_.
+  NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING,  // same NVFP4 1D-scaling guard as the setter
+             "NVFP4 E4M3 max is only supported for NVFP4 tensors.");
+  return tensor_.get_nvfp4_e4m3_max();  // value comes straight from the wrapped tensor_
+}
+
 void Tensor::to_cpu() {
   if (data_rowwise_) { data_rowwise_->to_cpu(); }
   if (data_columnwise_) { data_columnwise_->to_cpu(); }
 
@@ -293,10 +293,13 @@ class Tensor {
     return columnwise_;
   }
 
+  int nvfp4_e4m3_max() const;
+
   void set_tensor_amax_nullptr();
 
   void set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales);
   void set_row_scaled_nvfp4(bool row_scaled_nvfp4);
+  void set_nvfp4_e4m3_max(int nvfp4_e4m3_max);
 
   void to_cpu();
   void from_cpu();