Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions angelslim/compressor/quant/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,30 @@ def __init__(self, config, global_config=None):
"group_size": group_size,
"ignore_layers": quantization_args.ignore_layers,
}
elif "nvfp4" in self.quant_algo:
is_dynamic = "dynamic" if "dynamic" in self.quant_algo else "static"
assert (
is_dynamic or act_quant_method is not None
), "[Error] nvfp4 need act_quant_method"
self.act_observer = (
AbsmaxPertensorObserver if "static" in is_dynamic else None
)
self.weight_observer = AbsmaxPertensorObserver
self.kv_cache_observer = None
block_size = (
16
if quantization_args.quant_method["group_size"] == -1
else quantization_args.quant_method["group_size"]
)

self.quant_algo_info = {
"w": f"nvfp4_{weight_quant_method}",
"ignore_layers": quantization_args.ignore_layers,
"block_size": block_size,
}

if act_quant_method is not None:
self.quant_algo_info["a"] = f"nvfp4_{act_quant_method}-{is_dynamic}"

if "smooth" in self.quant_helpers:
self.smooth_alpha = quantization_args.smooth_alpha
Expand Down
38 changes: 38 additions & 0 deletions angelslim/compressor/quant/core/quant_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from typing import Tuple

import torch
import torch.nn.functional as F
import triton
import triton.language as tl

Expand Down Expand Up @@ -429,3 +430,40 @@ def per_block_weight_quant(
weight_quant[grid](x, y, s, M, N, BLOCK_SIZE=block_size)

return y, s


def reduce_block_padding(input: torch.Tensor, block_sizes: dict, pad_value: float = 0):
    """Right-pad selected dimensions of ``input`` up to a multiple of a block size.

    Each entry of ``block_sizes`` is handled in iteration order. A dimension
    whose size is already a multiple of its block size is left untouched;
    otherwise the missing elements are appended at the end of that dimension
    and filled with ``pad_value``. The work runs under ``torch.no_grad()``.

    Args:
        input (torch.Tensor): The tensor to pad.
        block_sizes (dict): Maps a dimension index (negative indices count
            from the last dimension) to the block size that dimension must be
            a multiple of. Example: ``{-1: 128, -2: 128}`` pads the two
            innermost dimensions to multiples of 128.
        pad_value (float): Fill value for the appended elements.

    Returns:
        torch.Tensor: The padded tensor, or ``input`` itself when no listed
        dimension needed padding.
    """
    with torch.no_grad():
        result = input
        rank = result.dim()

        for dim, block in block_sizes.items():
            # Normalise a negative dimension index to its non-negative form.
            axis = rank + dim if dim < 0 else dim

            # Elements missing to reach the next multiple of ``block``.
            shortfall = -result.size(axis) % block

            if shortfall > 0:
                # F.pad takes (left, right) pairs ordered from the LAST
                # dimension backwards, so the "right" slot belonging to
                # ``axis`` lives at index 2 * (rank - 1 - axis) + 1.
                pad_spec = [0] * (2 * rank)
                pad_spec[2 * (rank - 1 - axis) + 1] = shortfall
                result = F.pad(result, pad_spec, value=pad_value)

        return result
17 changes: 17 additions & 0 deletions angelslim/compressor/quant/core/save.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,23 @@ def save(self, save_path):
"dynamic": False,
"type": "int",
}
elif "nvfp4" in self.quant_model.quant_config.quant_algo:
quant_format = "naive-quantized"
group_size = self.quant_model.quant_config.quant_algo_info["block_size"]
trtllm_config["quantization"]["quant_algo"] = "NVFP4"
trtllm_config["quantization"]["group_size"] = group_size
act_config = {
"num_bits": 4,
"group_size": group_size,
"dynamic": "dynamic" in a_quant_algo,
"type": "float",
}
weight_config = {
"num_bits": 4,
"group_size": group_size,
"dynamic": False,
"type": "float",
}
else:
raise ValueError(
f"{self.quant_model.quant_config.quant_algo} not supported"
Expand Down
2 changes: 2 additions & 0 deletions angelslim/compressor/quant/modules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@
from .gptq.gptq import GPTQ # noqa: F401
from .gptq.gptq_module import GPTQModule # noqa: F401
from .helper_layer import GPTQQuantLinear # noqa: F401
from .helper_layer import NVFP4QDQModule # noqa: F401
from .helper_layer import QDQModule # noqa: F401
from .helper_layer import QDQSingleModule # noqa: F401
from .helper_layer import QLinear # noqa: F401
from .helper_layer import SmoothHelpModule # noqa: F401
from .helper_layer import WQLinearGEMM # noqa: F401
from .int8.int8 import INT8 # noqa: F401
from .nvfp4.nvfp4 import NVFP4 # noqa: F401
from .smooth.smooth import SmoothQuant # noqa: F401
Loading