Tencent · StromNoNo · Sep 15, 2025 · Sep 11, 2025 · Sep 15, 2025 · Sep 15, 2025
diff --git a/angelslim/compressor/quant/core/config.py b/angelslim/compressor/quant/core/config.py
@@ -148,6 +148,30 @@ def __init__(self, config, global_config=None):
                 "group_size": group_size,
                 "ignore_layers": quantization_args.ignore_layers,
             }
+        elif "nvfp4" in self.quant_algo:
+            is_dynamic = "dynamic" if "dynamic" in self.quant_algo else "static"
+            assert (
+                is_dynamic or act_quant_method is not None
+            ), "[Error] nvfp4 need act_quant_method"
+            self.act_observer = (
+                AbsmaxPertensorObserver if "static" in is_dynamic else None
+            )
+            self.weight_observer = AbsmaxPertensorObserver
+            self.kv_cache_observer = None
+            block_size = (
+                16
+                if quantization_args.quant_method["group_size"] == -1
+                else quantization_args.quant_method["group_size"]
+            )
+
+            self.quant_algo_info = {
+                "w": f"nvfp4_{weight_quant_method}",
+                "ignore_layers": quantization_args.ignore_layers,
+                "block_size": block_size,
+            }
+
+            if act_quant_method is not None:
+                self.quant_algo_info["a"] = f"nvfp4_{act_quant_method}-{is_dynamic}"
 
         if "smooth" in self.quant_helpers:
             self.smooth_alpha = quantization_args.smooth_alpha

diff --git a/angelslim/compressor/quant/core/quant_func.py b/angelslim/compressor/quant/core/quant_func.py
@@ -15,6 +15,7 @@
 from typing import Tuple
 
 import torch
+import torch.nn.functional as F
 import triton
 import triton.language as tl
 
@@ -429,3 +430,40 @@ def per_block_weight_quant(
     weight_quant[grid](x, y, s, M, N, BLOCK_SIZE=block_size)
 
     return y, s
+
+
+def reduce_block_padding(input: torch.Tensor, block_sizes: dict, pad_value: float = 0):
+    """Pad the input so each given dimension is a multiple of its block size.
+
+    Args:
+        input (torch.Tensor): The input tensor.
+        block_sizes (dict): A dictionary specifying the block size for
+            padding each dimension. Example: `{-1: 128, -2: 128}` pads
+            the input over 2D blocks.
+    """
+    with torch.no_grad():
+        padded_tensor = input
+        num_dims = padded_tensor.dim()
+        # Process each specified dimension independently
+        for dim, block in block_sizes.items():
+            # Convert negative dimension to positive index
+            pos_dim = dim if dim >= 0 else num_dims + dim
+
+            # Calculate how many elements are missing along that dimension
+            current_size = padded_tensor.size(pos_dim)
+            remainder = current_size % block
+            pad_amt = 0 if remainder == 0 else block - remainder
+
+            if pad_amt > 0:
+                # F.pad reads (left, right) pairs starting from the last dimension.
+                pad = [0] * (2 * num_dims)
+                # For dimension pos_dim, the right padding is at index:
+                # (num_dims - 1 - pos_dim)*2 + 1.
+                pad_index = (num_dims - 1 - pos_dim) * 2
+                pad[pad_index + 1] = (
+                    pad_amt  # Set padding on the right side of the target dimension
+                )
+
+                padded_tensor = F.pad(padded_tensor, pad, value=pad_value)
+
+        return padded_tensor
diff --git a/angelslim/compressor/quant/core/save.py b/angelslim/compressor/quant/core/save.py
@@ -154,6 +154,23 @@ def save(self, save_path):
                 "dynamic": False,
                 "type": "int",
             }
+        elif "nvfp4" in self.quant_model.quant_config.quant_algo:
+            quant_format = "naive-quantized"
+            group_size = self.quant_model.quant_config.quant_algo_info["block_size"]
+            trtllm_config["quantization"]["quant_algo"] = "NVFP4"
+            trtllm_config["quantization"]["group_size"] = group_size
+            act_config = {
+                "num_bits": 4,
+                "group_size": group_size,
+                "dynamic": "dynamic" in a_quant_algo,
+                "type": "float",
+            }
+            weight_config = {
+                "num_bits": 4,
+                "group_size": group_size,
+                "dynamic": False,
+                "type": "float",
+            }
         else:
             raise ValueError(
                 f"{self.quant_model.quant_config.quant_algo} not supported"

diff --git a/angelslim/compressor/quant/modules/__init__.py b/angelslim/compressor/quant/modules/__init__.py
@@ -19,10 +19,12 @@
 from .gptq.gptq import GPTQ  # noqa: F401
 from .gptq.gptq_module import GPTQModule  # noqa: F401
 from .helper_layer import GPTQQuantLinear  # noqa: F401
+from .helper_layer import NVFP4QDQModule  # noqa: F401
 from .helper_layer import QDQModule  # noqa: F401
 from .helper_layer import QDQSingleModule  # noqa: F401
 from .helper_layer import QLinear  # noqa: F401
 from .helper_layer import SmoothHelpModule  # noqa: F401
 from .helper_layer import WQLinearGEMM  # noqa: F401
 from .int8.int8 import INT8  # noqa: F401
+from .nvfp4.nvfp4 import NVFP4  # noqa: F401
 from .smooth.smooth import SmoothQuant  # noqa: F401