Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions angelslim/compressor/quant/core/save.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,6 @@ def save(self, save_path):

if fused_act_scale_dict:
for k, v in fused_act_scale_dict.items():
torch.distributed.all_reduce(v, op=torch.distributed.ReduceOp.MAX)
_save_path = os.path.join(
save_path, "{}.input_scale.{}.pt".format(k, _index)
)
Expand All @@ -412,13 +411,17 @@ def save(self, save_path):
)
torch.save(v, _save_path)
else:
torch.distributed.all_reduce(v, op=torch.distributed.ReduceOp.MAX)
if self.rank == 0:
torch.save(v, _save_path)
print_info("save act scales done.")

if self.quant_model.weight_scales_dict:
for k, v in self.quant_model.weight_scales_dict.items():
max_value_group_wise = v
# fp8 per-tensor scale
fused_max_value = fused_weight_fp8_scale_dict[k]

# if weight quant is int4 and act quant is fp8, additionally save the int4 absmax
if (
self.quant_model.quant_algo_dict["w_quant_algo"] == "int4"
Expand All @@ -442,21 +445,23 @@ def save(self, save_path):
_save_path,
self.quant_model.quant_algo_dict["all_reduce"],
)
scale = (fused_max_value.max() / 448.0).to(fused_max_value.dtype)
elif self.quant_model.quant_algo_dict["w_quant_algo"] == "fp8":
scale = fused_max_value.max().to(fused_max_value.dtype)

# fp8 per-tensor scale
fused_max_value = fused_weight_fp8_scale_dict[k]
scale = (fused_max_value.max() / 448.0).to(fused_max_value.dtype)
assert scale.numel() == 1
print_info(f"before all reduce scale = {scale}")
torch.distributed.all_reduce(scale, op=torch.distributed.ReduceOp.MAX)
print_info(f"after all reduce scale = {scale}")

if "experts" in k and "shared_experts" not in k:
_save_path = os.path.join(
save_path, "{}.weight_scale.{}.pt".format(k, self.rank)
)
torch.save(scale, _save_path)
else:
print_info(f"before all reduce scale = {scale}")
torch.distributed.all_reduce(
scale, op=torch.distributed.ReduceOp.MAX
)
print_info(f"after all reduce scale = {scale}")
_save_path = os.path.join(
save_path, "{}.weight_scale.{}.pt".format(k, _index)
)
Expand Down Expand Up @@ -492,6 +497,8 @@ def save(self, save_path):

if os.path.exists(tmp_path):
shutil.rmtree(tmp_path)
if os.path.exists(save_path):
shutil.rmtree(save_path)
parent_dir = os.path.dirname(
self.quant_model.model.ori_model_path.rstrip("/")
)
Expand Down
122 changes: 104 additions & 18 deletions angelslim/compressor/quant/modules/awq/auto_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import torch

from .....utils import get_op_by_name, get_op_name, print_info, set_op_by_name
from ...core import mse_loss
from ...core import mse_loss, per_block_weight_quant, weight_dequant
from ...modules.helper_layer import SmoothHelpModule
from .search import AWQSearch

Expand Down Expand Up @@ -75,7 +75,13 @@ def apply_scale(self, module, scales_list, input_feat_dict=None):
assert torch.isnan(p).sum() == 0, f"nan in {prev_op_name} weight"

for layer in layers:
layer.weight.mul_(scales.view(1, -1))
if layer.weight.dtype == torch.float8_e4m3fn:
weight = weight_dequant(layer.weight, layer.weight_scale_inv)
weight.mul_(scales.view(1, -1))
weight, _ = per_block_weight_quant(weight)
layer.weight.data.copy_(weight)
else:
layer.weight.mul_(scales.view(1, -1))
for p in layer.parameters():
assert torch.isnan(p).sum() == 0, f"nan in {layer_names} weight"

Expand Down Expand Up @@ -144,31 +150,59 @@ def _auto_get_scale(

scales_list = []
print_info(input_feat.keys())
scales_list.append(
_auto_get_scale(
layer_name="attn.qkv",
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
cache=cache,
if self.model_type == "deepseek_v3":
scales_list.append(
_auto_get_scale(
layer_name="attn.qkv",
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_a_proj,
module.self_attn.kv_a_proj_with_mqa,
],
inp=input_feat["self_attn.q_a_proj"],
module2inspect=module.self_attn,
cache=cache,
)
)
)

# attention output
if module.self_attn.v_proj.weight.shape == module.self_attn.o_proj.weight.shape:
# attention output
scales_list.append(
_auto_get_scale(
layer_name="attn.o",
prev_op=module.self_attn.v_proj,
prev_op=module.self_attn.kv_b_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)
else:
scales_list.append(
_auto_get_scale(
layer_name="attn.qkv",
prev_op=module.input_layernorm,
layers=[
module.self_attn.q_proj,
module.self_attn.k_proj,
module.self_attn.v_proj,
],
inp=input_feat["self_attn.q_proj"],
module2inspect=module.self_attn,
cache=cache,
)
)

# attention output
if (
module.self_attn.v_proj.weight.shape
== module.self_attn.o_proj.weight.shape
):
scales_list.append(
_auto_get_scale(
layer_name="attn.o",
prev_op=module.self_attn.v_proj,
layers=[module.self_attn.o_proj],
inp=input_feat["self_attn.o_proj"],
)
)

if hasattr(module.mlp, "gate"):
print_info("auto scale -> MoeAWQ")
Expand Down Expand Up @@ -221,6 +255,58 @@ def _auto_get_scale(
inp=input_feat[f"mlp.experts.{i}.down_proj"],
)
)
elif self.model_type == "deepseek_v3":
# shared experts fc1 (gate_proj / up_proj)
scales_list.append(
_auto_get_scale(
layer_name="shared_experts.gate_proj",
prev_op=module.post_attention_layernorm,
layers=[
module.mlp.shared_experts.gate_proj,
module.mlp.shared_experts.up_proj,
],
inp=input_feat["mlp"],
module2inspect=module.mlp,
cache=cache,
)
)
# shared experts fc2 (down_proj)
scales_list.append(
_auto_get_scale(
layer_name="shared_experts.down_proj",
prev_op=module.mlp.shared_experts.up_proj,
layers=[module.mlp.shared_experts.down_proj],
inp=input_feat["mlp.shared_experts.down_proj"].view(
input_feat["mlp"].shape[0], input_feat["mlp"].shape[1], -1
),
)
)
# fc1
scales_list.append(
_auto_get_scale(
layer_name="expert.gate_proj",
prev_op=module.post_attention_layernorm,
layers=[
w
for expert in module.mlp.experts
for w in [expert.gate_proj, expert.up_proj]
]
+ [module.mlp.gate],
inp=input_feat["mlp"],
module2inspect=module.mlp,
cache=cache,
)
)
# fc2
for i, expert in enumerate(module.mlp.experts):
scales_list.append(
_auto_get_scale(
layer_name=f"expert.{i}.down_proj",
prev_op=expert.up_proj,
layers=[expert.down_proj],
inp=input_feat[f"mlp.experts.{i}.down_proj"].unsqueeze(0),
)
)
else:
# fc1
scales_list.append(
Expand Down
6 changes: 3 additions & 3 deletions angelslim/compressor/quant/modules/awq/awq.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,9 @@ def run(self, dataloader):
if not self.low_memory:
outs = outs.to(dev)
self.inps = self.inps.to(dev)
subset = find_layers(layer)
subset = find_layers(layer, layers=self.observer_layer_classes)

if self.model_arch_type in ["qwen3_moe", "hunyuan_v1_moe"]:
if self.model_arch_type in ["qwen3_moe", "hunyuan_v1_moe", "deepseek_v3"]:
subset = {
**subset,
"mlp": layer.mlp,
Expand Down Expand Up @@ -334,7 +334,7 @@ def _apply_quant(self, module, named_linears: Dict[str, nn.Linear]):

def _convert_llm(self):
for i in tqdm(range(len(self.layers)), desc="AWQ"):
subset = find_layers(self.layers[i])
subset = find_layers(self.layers[i], layers=self.observer_layer_classes)
self._apply_quant(self.layers[i], subset)

def convert(self):
Expand Down
15 changes: 13 additions & 2 deletions angelslim/compressor/quant/modules/awq/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,12 @@
from torch.nn import Linear

from .....utils import get_best_device, print_info
from ...core import mse_loss, pseudo_quantize_tensor
from ...core import (
mse_loss,
per_block_weight_quant,
pseudo_quantize_tensor,
weight_dequant,
)

print_func = print_info

Expand Down Expand Up @@ -95,7 +100,13 @@ def search_by_block(
scales = act_abs_max_tmp.pow(ratio).clamp(min=1e-4).view(-1)
scales = scales / (scales.max() * scales.min()).sqrt()
for layer in layers:
layer.weight.mul_(scales.view(1, -1))
if layer.weight.dtype == torch.float8_e4m3fn:
weight = weight_dequant(layer.weight, layer.weight_scale_inv)
weight.mul_(scales.view(1, -1))
weight, _ = per_block_weight_quant(weight)
layer.weight.data.copy_(weight)
else:
layer.weight.mul_(scales.view(1, -1))
if type(layer) in [Linear]:
quant_dequant_weight = pseudo_quantize_tensor(
layer.weight,
Expand Down
3 changes: 1 addition & 2 deletions angelslim/compressor/quant/ptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
# limitations under the License.

import torch
import torch.nn as nn

from ...utils import find_parent_layer_and_sub_name, print_info
from ..compressor_factory import CompressorFactory
Expand Down Expand Up @@ -63,7 +62,7 @@ def __init__(self, model, slim_config=None):
hidden_size=hidden_size,
model_arch_type=model_arch_type,
mse_range=self.quant_model.quant_config.quant_algo_info["mse_range"],
observer_layer_classes=[nn.Linear],
observer_layer_classes=self.quant_model.observer_layer_classes,
low_memory=self.quant_model.quant_config.low_memory,
)
elif "fp8" in self.quant_algo:
Expand Down
1 change: 1 addition & 0 deletions angelslim/models/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
self.tokenizer = None
self.modal_type = "LLM"
self.pre_transformer_module_names = ["model.embed_tokens"]
self.observer_layer_classes = [torch.nn.Linear]

def from_pretrained(
self,
Expand Down
3 changes: 1 addition & 2 deletions angelslim/models/diffusion/flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,8 @@ def get_observer_layers(self):
"norm1_context.linear",
]
self.quant_module = self.model.transformer
obs_layers = [nn.Linear]
observer_layers_dict = {}
layers_dict = find_layers(self.quant_module, layers=obs_layers)
layers_dict = find_layers(self.quant_module, layers=self.observer_layer_classes)

ignore_layers = self.skip_layer_names()
for name, module in layers_dict.items():
Expand Down
6 changes: 4 additions & 2 deletions angelslim/models/llm/deepseek.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(
self.block_name = "model.layers"
self.column_parallel_linear_class = ColumnParallelLinear
self.row_parallel_linear_class = RowParallelLinear
self.observer_layer_classes = [nn.Linear, Linear]
torch.set_default_dtype(torch.bfloat16)

def from_pretrained(
Expand Down Expand Up @@ -96,8 +97,9 @@ def from_pretrained(

def get_observer_layers(self):
names = self.quant_config.quant_algo_info["ignore_layers"]
obs_layers = [nn.Linear, Linear]
observer_layers_dict = find_layers(self.model, layers=obs_layers)
observer_layers_dict = find_layers(
self.model, layers=self.observer_layer_classes
)
observer_layers_dict = {
k: v
for k, v in observer_layers_dict.items()
Expand Down
7 changes: 3 additions & 4 deletions angelslim/models/llm/hunyuan_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.nn as nn

from ...compressor.quant.core import PTQSaveVllmHF
from ...utils.utils import find_layers
from ..base_model import BaseLLMModel
Expand Down Expand Up @@ -45,8 +43,9 @@ def get_observer_layers(self):
"mlp.down_proj",
"mlp.gate_and_up_proj",
]
obs_layers = [nn.Linear]
observer_layers_dict = find_layers(self.model, layers=obs_layers)
observer_layers_dict = find_layers(
self.model, layers=self.observer_layer_classes
)

observer_layers_dict = {
k: v
Expand Down
7 changes: 3 additions & 4 deletions angelslim/models/llm/hunyuan_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@

import re

import torch.nn as nn

from ...compressor.quant.core import PTQSaveVllmHF
from ...utils.utils import find_layers
from ..base_model import BaseLLMModel
Expand Down Expand Up @@ -51,8 +49,9 @@ def get_observer_layers(self):
r"model\.layers\.\d+\.mlp\.experts\.\d+\.down_proj",
]

obs_layers = [nn.Linear]
observer_layers_dict = find_layers(self.model, layers=obs_layers)
observer_layers_dict = find_layers(
self.model, layers=self.observer_layer_classes
)

compiled_patterns = [re.compile(pattern) for pattern in expert_pattern]

Expand Down
7 changes: 3 additions & 4 deletions angelslim/models/llm/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.nn as nn

from ...compressor.quant.core import PTQSaveVllmHF
from ...utils.utils import find_layers
from ..base_model import BaseLLMModel
Expand Down Expand Up @@ -43,8 +41,9 @@ def get_observer_layers(self):
"mlp.gate_proj",
"mlp.down_proj",
]
obs_layers = [nn.Linear]
observer_layers_dict = find_layers(self.model, layers=obs_layers)
observer_layers_dict = find_layers(
self.model, layers=self.observer_layer_classes
)
observer_layers_dict = {
k: v
for k, v in observer_layers_dict.items()
Expand Down
Loading