Tencent
diff --git a/angelslim/compressor/quant/core/vllm_calibrate_utils/__init__.py b/angelslim/compressor/quant/core/vllm_calibrate_utils/__init__.py
Lines changed: 3 additions & 3 deletions
diff --git a/angelslim/compressor/quant/core/vllm_calibrate_utils/hooks.py b/angelslim/compressor/quant/core/vllm_calibrate_utils/hooks.py
Lines changed: 0 additions & 4 deletions
diff --git a/angelslim/compressor/quant/core/vllm_calibrate_utils/search.py b/angelslim/compressor/quant/core/vllm_calibrate_utils/search.py
Lines changed: 3 additions & 3 deletions
diff --git a/angelslim/compressor/transform/smooth/__init__.py b/angelslim/compressor/transform/smooth/__init__.py
Lines changed: 37 additions & 0 deletions
diff --git a/angelslim/compressor/transform/smooth/config.py b/angelslim/compressor/transform/smooth/config.py
Lines changed: 51 additions & 0 deletions
diff --git a/angelslim/compressor/transform/smooth/convert/__init__.py b/angelslim/compressor/transform/smooth/convert/__init__.py
Lines changed: 67 additions & 0 deletions
@@ -13,9 +13,9 @@
 * :mod:`.search`   – KV-cache FP8 scale grid-search (per-tensor and
   per-head) with the value-capture hooks needed by the searchers.
 
-The vLLM ``fused_moe.py`` patch only imports
-``collect_fused_moe_internal_stats`` from this package, which is
-re-exported via :mod:`.hooks`.
+Smooth / Smooth-Alpha-Search APIs have been moved to
+:mod:`angelslim.compressor.transform.smooth.vllm` — import from there
+directly.
 """
 
 from .hooks import (
 
@@ -232,10 +232,6 @@ def setup_activation_hooks(model, kv_granularity="per-tensor"):
                 if hasattr(layer, "w13_weight") and layer.w13_weight is not None:
                     layer.w13_weight._vllm_layer_name = name
                     layer.w13_weight._moe_activation_stats_of_model = model._moe_activation_stats
-                    print(
-                        f"[DEBUG] Set w13_weight._vllm_layer_name = {name}, "
-                        f"type={type(layer.w13_weight)}"
-                    )
                 else:
                     print(
                         f"[DEBUG] Cannot set w13_weight._vllm_layer_name: "
 
@@ -13,6 +13,9 @@
 are kept module-private with the underscore prefix.
 """
 
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
 import torch
 
 from ._common import _compute_perhead_layout, _find_layers, _get_dist_info, _get_kv_role
@@ -283,9 +286,6 @@ def __init__(
         self.num_steps = num_steps
 
     def __call__(self, model):
-        import os
-        from concurrent.futures import ThreadPoolExecutor, as_completed
-
         fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0
 
         # Collect raw kv tensors stored by the value hook
 
@@ -0,0 +1,37 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""SmoothQuant transform module.
+
+Three sub-packages; both pipelines build on the shared :mod:`.core` algorithm layer:
+
+* :mod:`.core`     — backend-agnostic tensor primitives (formulas, QDQ,
+  RoPE-aware pairing, GQA expansion, alpha-search inner loop, smooth-stats
+  serialisation).  Imported by both the vLLM and convert pipelines.
+* :mod:`.vllm`     — online stat collection on a live vLLM model: hook
+  classes, ``setup_smooth_hooks`` / ``get_smooth_stats``, the TP-aware
+  ``SmoothAlphaSearcher``, and FusedMoE kernel-injection entry points.
+* :mod:`.convert`  — offline weight conversion on a HuggingFace model:
+  ``apply_qk_smooth`` / ``apply_vo_smooth`` / ``apply_down_proj_smooth``
+  (+ alpha-search variant), plus snapshot/verify utilities.
+
+Top-level :mod:`.config` holds the dataclasses that travel with both
+pipelines (:class:`SmoothAlphaSearchConfig`).
+"""
+
+from .config import SmoothAlphaSearchConfig
+
+__all__ = [
+    "SmoothAlphaSearchConfig",
+]
@@ -0,0 +1,51 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration dataclasses shared by the vLLM (online) and convert
+(offline) smooth pipelines.
+
+These are *pure data containers* — keep the module free of any heavy
+imports so it can be loaded from CLI scripts without pulling in torch
+distributed / vLLM machinery.
+"""
+
+from dataclasses import dataclass
+
+__all__ = [
+    "SmoothAlphaSearchConfig",
+]
+
+
+@dataclass
+class SmoothAlphaSearchConfig:
+    """Configuration for smooth alpha grid search."""
+
+    alpha_min: float = 0.3
+    alpha_max: float = 1.0
+    alpha_steps: int = 8  # [0.3, 0.4, ..., 1.0]
+    act_quant_method: str = "per_token"  # "per_tensor" | "per_token"
+    act_quant_type: str = "int8"  # "int8" | "fp8"
+    weight_quant_method: str = (
+        "per_channel"  # "per_tensor" | "per_channel" | "per_group" | "per_block"
+    )
+    weight_quant_type: str = "int8"  # "int8" | "int4" | "fp8"
+    weight_quant_bits: int = 8
+    weight_group_size: int = 128  # group size for per_group; -1 = per_channel
+    block_size: int = 128  # block size for per_block fp8
+    use_ema_for_absmax: bool = False
+    smooth_search_mode: str = "default"  # "default" | "per-tensor-act-first"
+    act_mul_min: float = 0.1  # per-tensor-act-first: multiplier range min
+    act_mul_max: float = 1.0  # per-tensor-act-first: multiplier range max
+    smooth_min: float = 1e-6  # per-tensor-act-first: smooth clamp lower bound
+    smooth_max: float = 1e6  # per-tensor-act-first: smooth clamp upper bound
@@ -0,0 +1,67 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""HuggingFace-side smooth pipeline (offline weight conversion).
+
+Re-exports the public API consumed by
+``tools/smooth/convert_smooth_weights.py`` (Phase 2 driver).
+"""
+
+from .apply_funcs import (
+    apply_down_proj_smooth,
+    apply_down_proj_smooth_from_search,
+    apply_qk_smooth,
+    apply_vo_smooth,
+)
+from .utils import (
+    DEFAULT_KEY_MAP,
+    HY_V3_KEY_MAP,
+    LLAMA_KEY_MAP,
+    MIXTRAL_KEY_MAP,
+    PREDEFINED_KEY_MAPS,
+    QWEN3_MOE_KEY_MAP,
+    attn_key_to_hf_prefix,
+    find_first_attn_module,
+    get_submodule_safe,
+    maybe_materialize,
+    snapshot_attn_output_before,
+    snapshot_mlp_outputs_before,
+    verify_attn_output_diff,
+    verify_mlp_output_diff,
+)
+
+__all__ = [
+    # apply
+    "apply_qk_smooth",
+    "apply_vo_smooth",
+    "apply_down_proj_smooth",
+    "apply_down_proj_smooth_from_search",
+    # key maps
+    "DEFAULT_KEY_MAP",
+    "HY_V3_KEY_MAP",
+    "LLAMA_KEY_MAP",
+    "MIXTRAL_KEY_MAP",
+    "QWEN3_MOE_KEY_MAP",
+    "PREDEFINED_KEY_MAPS",
+    # helpers
+    "get_submodule_safe",
+    "maybe_materialize",
+    "attn_key_to_hf_prefix",
+    # snapshot / verify
+    "find_first_attn_module",
+    "snapshot_attn_output_before",
+    "snapshot_mlp_outputs_before",
+    "verify_attn_output_diff",
+    "verify_mlp_output_diff",
+]