diff --git a/angelslim/compressor/quant/core/fp8_analyse_tools.py b/angelslim/compressor/quant/core/fp8_analyse_tools.py
index 49eb9cb8..4c2fac86 100644
--- a/angelslim/compressor/quant/core/fp8_analyse_tools.py
+++ b/angelslim/compressor/quant/core/fp8_analyse_tools.py
@@ -158,7 +158,7 @@ def get_weight_dict(model_path):
 def draw_hist(uniform_data, ax, name):
     uniform_data.sort()
     s = pd.Series(uniform_data)
-    ax.hist(s, bins=50, rwidth=1)
+    ax.hist(s, bins=1000, rwidth=1)
     ax.set_title(name + "_histgram")
     ax.grid(True)
 
diff --git a/angelslim/compressor/quant/modules/__init__.py b/angelslim/compressor/quant/modules/__init__.py
index 6bfd736a..d655c9e7 100644
--- a/angelslim/compressor/quant/modules/__init__.py
+++ b/angelslim/compressor/quant/modules/__init__.py
@@ -14,6 +14,7 @@
 
 from .awq.awq import AWQ  # noqa: F401
 from .fp8.fp8 import FP8  # noqa: F401
+from .fp8.lepto_fp8 import LeptoFP8  # noqa: F401
 from .gptq.gptq import GPTQ  # noqa: F401
 from .gptq.gptq_module import GPTQModule  # noqa: F401
 from .helper_layer import GPTQQuantLinear  # noqa: F401
diff --git a/angelslim/compressor/quant/modules/fp8/lepto_fp8.py b/angelslim/compressor/quant/modules/fp8/lepto_fp8.py
new file mode 100644
index 00000000..fee006ba
--- /dev/null
+++ b/angelslim/compressor/quant/modules/fp8/lepto_fp8.py
@@ -0,0 +1,261 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+from collections import defaultdict
+
+import torch
+import torch.nn as nn
+
+from .....utils import find_parent_layer_and_sub_name, get_best_device, print_info
+from ...core.quant_func import get_fp_maxval
+from ...modules.catcher import Catcher
+from .lepto_scale import AutoLayerScale
+
+__all__ = ["LeptoFP8"]
+
+
+class LeptoFP8:
+    def __init__(
+        self,
+        ptq_hook,
+        model,
+        seq_length=2048,
+        hidden_size=2560,
+        model_arch_type=None,
+        low_memory=False,
+    ):
+        """
+        Args:
+            ptq_hook: PTQ hook; its observers and layer dict are read in convert().
+            model: Quant-model wrapper exposing .model.model.layers and .quant_config.
+            seq_length(int, optional): The length of the sequence. Default: 2048.
+            hidden_size(int, optional): The size of the hidden layer. Default: 2560.
+            model_arch_type(str, optional): Model arch type. Default: None.
+            low_memory(bool, optional): Stored but not read here. Default: False."""
+        super(LeptoFP8, self).__init__()
+        self.ptq_hook = ptq_hook
+        self.quant_model = model  # quant-model wrapper, not the bare nn.Module
+        self.modal_type = self.quant_model.modal_type
+        self.layers = self.quant_model.model.model.layers
+        self.quant_bits = self.quant_model.quant_config.quant_bit
+        self.seq_length = seq_length
+        self.hidden_size = hidden_size
+        self.model_arch_type = model_arch_type
+        self.low_memory = low_memory
+        self.dtype = torch.bfloat16  # dtype of the activation buffer built in run()
+        self.scales_dict = {}  # "model.layers.<i>.<name>" -> scale, filled by run()
+        self.inps = None
+        self.observer_layer_classes = [nn.Linear]
+        self.scale_function = AutoLayerScale(
+            model_type=self.model_arch_type,
+            observer_layer_classes=self.observer_layer_classes,
+        )
+
+    def move_embed(self, model, device: str):
+        print_info(model)  # log the wrapper before relocating its sub-modules
+        for sub in ("embed_tokens", "rotary_emb"):  # embedding first, then rotary
+            setattr(model.model.model, sub, getattr(model.model.model, sub).to(device))
+
+    @torch.no_grad()
+    def run(self, dataloader):
+        """Search a per-tensor activation scale for every decoder layer.
+
+        The first layer is wrapped in a Catcher to record its inputs. Each layer is
+        then moved to the device in turn, its observed sub-module inputs are hooked,
+        and the searched scales land in self.scales_dict["model.layers.<i>.<name>"].
+
+        Args:
+            dataloader: Calibration data; len(dataloader) sizes the input buffer.
+        """
+        for model_module in self.layers:
+            model_module.eval()
+        layers = self.layers
+        dev = get_best_device()
+        nsamples = len(dataloader)
+        self.inps = torch.zeros(
+            (int(nsamples), self.seq_length, self.hidden_size),
+            device=dev,
+            dtype=self.dtype,
+        )
+        cache = {"i": 0}
+        layers[0] = layers[0].to(dev)
+        self.quant_model.model.model.embed_tokens = (
+            self.quant_model.model.model.embed_tokens.to(dev)
+        )
+        layers[0] = Catcher(layers[0], self.inps, cache)
+        self.quant_model.model_forward(dataloader)
+        layer_kwargs = layers[0].layer_kwargs
+        for k, v in layer_kwargs.items():
+            # position embeddings
+            if isinstance(v, tuple):
+                layer_kwargs[k] = tuple(
+                    (
+                        item.to(dev)
+                        if isinstance(item, (torch.Tensor, nn.Module))
+                        else item
+                    )
+                    for item in v
+                )
+
+        print_info("cache['i']:{}".format(cache["i"]))
+        print_info(len(layers))
+        layers[0] = layers[0].module
+        print_info(self.inps.shape)
+        outs = torch.zeros_like(self.inps)
+        # begin the lepto process
+        print_info("Ready.")
+        layers = layers.cpu()
+        torch.cuda.empty_cache()
+
+        outs = outs.to("cpu")
+        self.inps = self.inps.to("cpu")
+        print_info(layer_kwargs)
+
+        for i in range(len(layers)):
+            if torch.cuda.is_available():
+                print_info(
+                    f"GPU Memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+                )
+
+            layer = layers[i].to(dev)
+            outs = outs.to(dev)
+            self.inps = self.inps.to(dev)
+            subset = self._find_layers(layer)
+
+            if self.model_arch_type == "qwen3_moe":
+                subset = {
+                    **subset,
+                    "mlp": layer.mlp,
+                }
+
+            # firstly, get input features of all linear layers
+            def cache_input_hook(m, x, y, name, feat_dict, layer):
+                x = x[0]
+                x = x.detach().cpu()
+                feat_dict[name].append(x)
+
+            input_feat = defaultdict(list)
+            handles = []
+            for name in subset:
+                handles.append(
+                    subset[name].register_forward_hook(
+                        functools.partial(
+                            cache_input_hook,
+                            name=name,
+                            feat_dict=input_feat,
+                            layer=subset[name],
+                        )
+                    )
+                )
+            # being hook (run() already executes under torch.no_grad())
+            for j in range(min(self.inps.shape[0], nsamples)):
+                outs[j, :, :] = layer(
+                    hidden_states=self.inps[j, :, :].unsqueeze(0), **layer_kwargs
+                )[0].squeeze(1)
+
+            # remove duplicate
+            def deduplicate_tensors(tensor_list):
+                # A hook that fired twice stores each input as an adjacent pair.
+                assert len(tensor_list) % 2 == 0
+                firsts, seconds = tensor_list[0::2], tensor_list[1::2]
+                for first, second in zip(firsts, seconds):
+                    if not torch.equal(first, second):
+                        raise ValueError
+                return firsts
+
+            for k, v in input_feat.items():
+                if len(v) > nsamples:
+                    print_info(f"Warning: repetition hook {k}")
+                    input_feat[k] = deduplicate_tensors(v)
+            print_info("HOOK Step{}".format(j))
+            for h in handles:
+                h.remove()
+            # now solve for scaling and clipping
+            input_feat = {k: torch.cat(v, dim=0) for k, v in input_feat.items()}
+            torch.cuda.empty_cache()  # Clear GPU memory
+
+            scales_list = self.scale_function.auto_scale(
+                self.ptq_hook, layer, input_feat, layer_kwargs
+            )
+
+            for scales in scales_list:
+                for kn in scales[0]:
+                    name = "model.layers.{}.{}".format(i, kn)
+                    self.scales_dict[name] = scales[1]
+
+            layers[i] = layers[i].cpu()
+            layer = layer.cpu()
+            torch.cuda.empty_cache()
+            self.inps, outs = outs, self.inps
+            print_info("LEPTO FP8 end layer {}\n".format(i))
+
+        print_info(self.scales_dict)
+
+    def _find_layers(self, module, layers=None, name=""):
+        """Map dotted sub-module names to the modules whose type is observed.
+
+        `layers` falls back to self.observer_layer_classes when None or empty;
+        `name` is the dotted prefix accumulated so far by the recursion.
+        """
+        wanted = layers if layers else self.observer_layer_classes
+        # Exact type match: subclasses of an observed class are not collected.
+        if type(module) in wanted:
+            return {name: module}
+        found = {}
+        for child_name, child in module.named_children():
+            path = child_name if name == "" else name + "." + child_name
+            found.update(self._find_layers(child, layers=wanted, name=path))
+        return found
+
+    def convert(self):
+        """Publish weight/activation scales, then swap in the QDQ modules."""
+        # 1. get act, weight and kv-cache scale
+        old_list = []
+        new_list = []
+        fp8_max = get_fp_maxval(bits=8)  # loop-invariant, fetched once
+        for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
+            observer = self.ptq_hook.observer_dict[sub_layer]
+            weight_scales = self.quant_model.get_weight_scales(
+                sub_layer, observer.weight_observer
+            )
+            fp8_max_t = fp8_max.type(weight_scales.dtype)
+            self.quant_model.weight_scales_dict[name] = weight_scales / fp8_max_t
+            old_scale = observer.act_observer.scales()
+            static_scale = old_scale / fp8_max_t
+            searched = self.scales_dict.pop(name, None)
+            if searched is None:  # layer was not covered by the search in run()
+                print_info(f"Warning: no lepto scale for {name}, using static scale")
+                lepto_scale = static_scale
+            else:
+                searched = searched.squeeze().detach().to(old_scale.device)
+                lepto_scale = torch.clamp(searched, min=0, max=99999)
+            self.quant_model.act_scales_dict[name] = lepto_scale
+            print_info(
+                f"{name} , {old_scale}, {old_scale / fp8_max_t.item()} "
+                f"{lepto_scale.item()}"
+            )
+            old_list.append(static_scale)
+            new_list.append(lepto_scale)
+        print_info(sum(old_list))
+        print_info(sum(new_list))
+        self.ptq_hook.remove_hook()
+        torch.cuda.empty_cache()
+        # 2. insert qdq module
+        layers = self.quant_model.get_model()
+        for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
+            parent_layer, sub_name = find_parent_layer_and_sub_name(layers, name)
+            qdq_module = self.quant_model.get_qdq_module(sub_layer, name)
+            setattr(parent_layer, sub_name, qdq_module)
+        self.quant_model.quantized = True
diff --git a/angelslim/compressor/quant/modules/fp8/lepto_scale.py b/angelslim/compressor/quant/modules/fp8/lepto_scale.py
new file mode 100644
index 00000000..57d2dd4d
--- /dev/null
+++ b/angelslim/compressor/quant/modules/fp8/lepto_scale.py
@@ -0,0 +1,290 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import functools
+
+import torch
+
+from .....utils import get_op_name, print_info
+from ...core import get_fp_maxval, mse_loss
+from ...core.quant_func import quantize_weight_per_tensor_fp8, tensor_quant_dequant_fp8
+
+
+class AutoLayerScale:
+    def __init__(
+        self,
+        loss_function=mse_loss,
+        merge_samples=True,
+        model_type="dense",
+        observer_layer_classes=None,
+    ):
+        """Store the loss function and search settings used by auto_scale()."""
+        self.loss_function = loss_function
+        self.merge_samples = merge_samples
+        self.model_type = model_type
+        self.layer_count = 0  # incremented once per auto_scale() call
+        self.observer_layer_classes = observer_layer_classes
+        self.n_exponent = 5  # not read anywhere else in this class
+        self.search_step = 10  # divisor for the step size in lepto_qdq_fp8_tensor_v2
+
+    def auto_scale(self, ptq_hook, module, input_feat, cache):
+        """Search activation scales for the four linear groups of one decoder layer.
+
+        Args:
+            ptq_hook: Accepted for interface compatibility; not used in this method.
+            module: Decoder layer with self_attn.*_proj and mlp.*_proj linears.
+            input_feat: Recorded inputs keyed by sub-module name.
+            cache: Keyword arguments passed on to the scale search for each group.
+
+        Returns:
+            List of (layer-name tuple, CPU scale tensor), one entry per group.
+        """
+        print_info("[auto scale] start")
+
+        def _auto_get_scale(layer_name, layers, inp, module2inspect=None, cache=None):
+            if module2inspect is None:
+                assert len(layers) == 1
+                module2inspect = layers[0]
+
+            inp = inp.to(layers[0].weight.device)
+            # self.merge_samples does not change this statistic: per-channel mean(|x|).
+            act_abs_max = inp.abs().reshape(-1, inp.shape[-1]).mean(0).reshape(1, -1)
+            print_info(f"[auto scale] {layer_name} act_abs_max: {act_abs_max}")
+
+            scales = self.search_by_block(
+                layer_name, inp, act_abs_max, layers, module2inspect, cache
+            )
+            scales = scales.detach().cpu()
+            print_info(f"[auto scale] {layer_name} scales: {scales}")
+            inp = inp.cpu()
+            torch.cuda.empty_cache()
+
+            # prev_op_name, [layer_name], scale
+            return (
+                tuple([get_op_name(module, m) for m in layers]),
+                scales,
+            )
+
+        scales_list = []
+        print_info(input_feat.keys())
+        scales_list.append(
+            _auto_get_scale(
+                layer_name="attn.qkv",
+                layers=[
+                    module.self_attn.q_proj,
+                    module.self_attn.k_proj,
+                    module.self_attn.v_proj,
+                ],
+                inp=input_feat["self_attn.q_proj"],
+                module2inspect=module.self_attn,
+                cache=cache,
+            )
+        )
+
+        # attention output
+        # NOTE(review): candidates are derived from the q_proj input although the
+        # hook quantizes o_proj's own input -- confirm this is intended.
+        scales_list.append(
+            _auto_get_scale(
+                layer_name="attn.o",
+                layers=[module.self_attn.o_proj],
+                inp=input_feat["self_attn.q_proj"],
+                module2inspect=module.self_attn,
+                cache=cache,
+            )
+        )
+
+        print_info("auto scale -> Denselepto")
+        # fc1
+        scales_list.append(
+            _auto_get_scale(
+                layer_name="mlp.gate_proj",
+                layers=[module.mlp.gate_proj, module.mlp.up_proj],
+                inp=input_feat["mlp.gate_proj"],
+                module2inspect=module.mlp,
+                cache=cache,
+            )
+        )
+        # fc2
+        # NOTE(review): same question here -- the gate_proj input drives down_proj.
+        scales_list.append(
+            _auto_get_scale(
+                layer_name="mlp.down_proj",
+                layers=[module.mlp.down_proj],
+                inp=input_feat["mlp.gate_proj"],
+                module2inspect=module.mlp,
+                cache=cache,
+            )
+        )
+        self.layer_count += 1
+        print_info("[auto scale] end")
+        return scales_list
+
+    def _get_out(self, layer_name, act, block, cache):
+        """Call block(act), plus **cache for "att" names; return out[0].squeeze(1)."""
+        extra = cache if "att" in layer_name else {}
+        result = block(act, **extra)
+        return result[0].squeeze(1)
+
+    def lepto_qdq_fp8_tensor(self, tensor, ratio):
+        """Return abs-max divided by the FP8 magnitude of a high-quantile element.
+
+        The element sits at quantile 0.999 + 0.00005 * ratio of sorted |tensor|.
+        """
+        assert len(tensor.shape) == 1, f"tensor.device:{tensor.device}"
+        magnitude = tensor.abs()  # shared by the max and the sort below
+        abs_max = magnitude.max()
+        w_scale = abs_max / get_fp_maxval(bits=8)
+        orig_fp8w, _ = quantize_weight_per_tensor_fp8(tensor, w_scale)
+        outlier_point = 0.999 + 0.00005 * ratio
+        n = min(round(len(tensor) * outlier_point), len(tensor) - 1)
+        cut_np_fp8w1 = orig_fp8w[torch.argsort(magnitude)[n]].float().abs()
+        adapt_scale = abs_max / cut_np_fp8w1.type(tensor.dtype)
+        print_info(
+            f"w_scale:{w_scale.item()}, adapt_scale:{adapt_scale.item()},"
+            f" cut_np_fp8w1: {cut_np_fp8w1.item()}"
+        )
+        return adapt_scale.to(tensor.dtype)
+
+    def lepto_qdq_fp8_tensor_v2(self, tensor, ratio):
+        """Variant of lepto_qdq_fp8_tensor that steps the cut point linearly.
+
+        The cut starts at the larger of the 0.999-quantile FP8 magnitude and FP8
+        max / 7, then moves (ratio + 1) / search_step of the way to FP8 max (capped).
+        """
+        assert len(tensor.shape) == 1, f"tensor.device:{tensor.device}"
+        w_scale = tensor.abs().max() / get_fp_maxval(bits=8)
+        orig_fp8w, _ = quantize_weight_per_tensor_fp8(tensor, w_scale)
+        outlier_point = 0.999
+        n = min(round(len(tensor) * outlier_point), len(tensor) - 1)  # capped index
+        sorted_indices = torch.argsort(tensor.abs())
+        closest_indices = sorted_indices[n]
+        cut_np_fp8w1 = max(
+            orig_fp8w[closest_indices].float().abs(), get_fp_maxval(bits=8) / 7
+        )
+        step = (get_fp_maxval(bits=8) - cut_np_fp8w1) / self.search_step
+        break_point = min(cut_np_fp8w1 + (step * (ratio + 1)), get_fp_maxval(bits=8))
+        # FP8-list: disabled alternative that looks the break point up by ratio
+        # r_list = [22, 30, 44, 60, 88, 120, 176, 224, 288, 320, 352, 384, 416, 448]
+        # break_point = torch.tensor(r_list[ratio])
+
+        adapt_scale = tensor.abs().max() / break_point.type(tensor.dtype)
+        print_info(
+            f"{w_scale.item()}, {adapt_scale.item()}, "
+            f"{tensor.abs().max().item()}, "
+            f"{break_point.type(tensor.dtype).item()}"
+        )
+        return adapt_scale.to(tensor.dtype)
+
+    def lepto_input_hook(self, module, input, scale):
+        """Forward pre-hook: FP8 quant-dequant the first positional input only.
+
+        Remaining positional inputs are passed through unchanged.
+        """
+        modified_input = tensor_quant_dequant_fp8(input[0], scale)
+        return (modified_input,) + tuple(input[1:])
+
+    def search_by_block(
+        self,
+        layer_name,
+        act_input,
+        act_abs_max,
+        layers,
+        block,
+        cache,
+    ):
+        """Pick the candidate activation scale that minimises the block output error.
+
+        For ratio in 8..20 a candidate from lepto_qdq_fp8_tensor is applied to the
+        inputs of `layers` through forward pre-hooks, `block` is re-run sample by
+        sample and its output is compared with the unquantized reference.
+
+        Args:
+            layer_name: Label for logging; names containing "att" get **cache.
+            act_input: Recorded block inputs, indexed per sample on dim 0.
+            act_abs_max: Only its shape is logged here.
+            layers: Modules whose first input is quant-dequantized by the hook.
+            block: Module that is executed to measure the error.
+            cache: Extra forward kwargs; None switches the output buffer shape.
+
+        Returns:
+            Best scale on CPU, or ones when no candidate's loss was below infinity.
+        """
+        print_info(f"inp.shape:{act_input.shape}")
+        print_info(f"block:{block}")
+        print_info(f"act_abs_max.shape:{act_abs_max.shape}")
+        act = act_input
+        print_info("[lepto search] search input of %s" % layer_name)
+        best_error = float("inf")
+        best_ratio = -1
+        best_scales = None
+
+        with torch.no_grad():
+            if cache is not None:
+                out_shape = act.shape
+            else:
+                out_shape = (act.shape[0], act.shape[1], layers[0].weight.shape[0])
+            origin_out = torch.ones(out_shape, dtype=act.dtype, device=act.device)
+            new_out = torch.ones(out_shape, dtype=act.dtype, device=act.device)
+
+            # Reference pass: no quantization hook is registered yet.
+            for j in range(act.shape[0]):
+                origin_out[j, :, :] = self._get_out(
+                    layer_name, act[j, :, :].unsqueeze(0), block, cache
+                )
+            print_info(f"origin_out.shape:{origin_out.shape}")
+
+            # The hooks only rewrite layer inputs; weights are never modified here,
+            # so no weight backup or restore is needed between candidates.
+            flat_act = act.unsqueeze(0).view(-1)  # loop-invariant
+            for ratio in range(8, 21):
+                adapt_scale = self.lepto_qdq_fp8_tensor(flat_act, ratio).unsqueeze(0)
+                handles = [
+                    layer.register_forward_pre_hook(
+                        functools.partial(self.lepto_input_hook, scale=adapt_scale)
+                    )
+                    for layer in layers
+                ]
+                try:
+                    for j in range(act.shape[0]):
+                        new_act = act[j, :, :].unsqueeze(0)
+                        new_out[j, :, :] = self._get_out(
+                            layer_name, new_act, block, cache
+                        )
+                finally:
+                    # Never leave quantizing hooks on the model if a forward fails.
+                    for h in handles:
+                        h.remove()
+
+                loss = self.loss_function(origin_out, new_out).to(torch.float32)
+                print_info(
+                    "ratio: {}, adscale: {}, loss: {}".format(ratio, adapt_scale, loss)
+                )
+                # NaN losses compare False and never become the best candidate.
+                if loss < best_error:
+                    best_error = loss
+                    best_ratio = ratio
+                    best_scales = adapt_scale
+
+        if best_scales is None:
+            best_scales = torch.ones(adapt_scale.shape, dtype=act.dtype)
+            print_info("Cannot find better ratio.")
+        else:
+            print_info(
+                "Best ratio :{}, minimal loss : {}, best_scales:{}.".format(
+                    best_ratio, best_error, best_scales
+                )
+            )
+        return best_scales.detach().cpu()
diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
index d2ac1ee1..3f860b59 100644
--- a/angelslim/compressor/quant/ptq.py
+++ b/angelslim/compressor/quant/ptq.py
@@ -18,7 +18,7 @@
 from ...utils import find_parent_layer_and_sub_name, print_info
 from ..compressor_factory import CompressorFactory
 from .core import PTQHook
-from .modules import AWQ, FP8, GPTQ, INT8, SmoothQuant
+from .modules import AWQ, FP8, GPTQ, INT8, LeptoFP8, SmoothQuant
 
 __all__ = ["PTQ"]
 
@@ -69,13 +69,23 @@ def __init__(self, model, slim_config=None):
             max_seq_length = self.quant_model.quant_config.max_seq_length
             hidden_size = self.quant_model.quant_config.hidden_size
             model_arch_type = self.quant_model.quant_config.model_arch_type
-            self.fp8 = FP8(
-                self.quant_model,
-                seq_length=max_seq_length,
-                hidden_size=hidden_size,
-                model_arch_type=model_arch_type,
-                low_memory=self.quant_model.quant_config.low_memory,
-            )
+            if "lepto" in self.quant_algo:
+                self.fp8 = LeptoFP8(
+                    self.ptq_hook,
+                    self.quant_model,
+                    seq_length=max_seq_length,
+                    hidden_size=hidden_size,
+                    model_arch_type=model_arch_type,
+                    low_memory=self.quant_model.quant_config.low_memory,
+                )
+            else:
+                self.fp8 = FP8(
+                    self.quant_model,
+                    seq_length=max_seq_length,
+                    hidden_size=hidden_size,
+                    model_arch_type=model_arch_type,
+                    low_memory=self.quant_model.quant_config.low_memory,
+                )
         if "int8" in self.quant_algo:
             max_seq_length = self.quant_model.quant_config.max_seq_length
             hidden_size = self.quant_model.quant_config.hidden_size
@@ -117,6 +127,8 @@ def convert(self):
             self.gptq.convert()
         elif "awq" in self.quant_algo:
             self.awq.convert()
+        elif "lepto" in self.quant_algo:
+            self.fp8.convert()
         else:
             if self.modal_type in ["LLM", "VLM"]:
                 if "smooth" in self.quant_helpers:
@@ -139,7 +151,7 @@ def save(self, save_path: str):
             for k in self.quant_model.act_scales_dict.keys():
                 act_scales_data = self.quant_model.act_scales_dict[k].data
                 if act_scales_data > 1.5:
-                    print(
+                    print_info(
                         f"[AngelSlim Warning] Act_scales {k}: "
                         f"The weight is too high:{act_scales_data}. "
                         f"It is recommended to clip it to 1.5 "
@@ -147,7 +159,7 @@ def save(self, save_path: str):
             for k in self.quant_model.weight_scales_dict.keys():
                 weight_scales_data = self.quant_model.weight_scales_dict[k].data
                 if weight_scales_data > 1.5:
-                    print(
+                    print_info(
                         f"[AngelSlim Warning] Weight_scales {k}: "
                         f"The weight is too high:{weight_scales_data}. "
                         f"It is recommended to clip it to 1.5 "
diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py
index 55457e8d..91b13406 100644
--- a/angelslim/utils/config_parser.py
+++ b/angelslim/utils/config_parser.py
@@ -256,6 +256,7 @@ def __init__(self):
         self.supported_quant_methods = [
             "fp8_static",
             "fp8_dynamic",
+            "fp8_lepto",
             "int4_awq",
             "int4_gptq",
             "int8_dynamic",
@@ -395,6 +396,7 @@ def get_default_config() -> FullConfig:
             ),
             dataset_config=None,
             global_config=global_config,
+            infer_config=None,
         )
 
 
diff --git a/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml
new file mode 100644
index 00000000..e3cd159a
--- /dev/null
+++ b/configs/hunyuan/fp8_static/hunyuan_2b_dense_lepto_fp8_static.yaml
@@ -0,0 +1,34 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: HunyuanDense
+  model_path:  tencent/Hunyuan-1.8B-Instruct
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_lepto     # Supported: fp8_static, fp8_dynamic, fp8_lepto, int4_awq, int4_gptq
+    bits: 8                # Quantization bits (4/8)
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+
+# Dataset for calibration
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 4096
+  num_samples: 32
+  batch_size: 1
diff --git a/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml b/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml
new file mode 100644
index 00000000..b538b588
--- /dev/null
+++ b/configs/hunyuan/fp8_static/hunyuan_4b_dense_lepto_fp8_static.yaml
@@ -0,0 +1,34 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: HunyuanDense
+  model_path: tencent/Hunyuan-4B-Instruct
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_lepto     # Supported: fp8_static, fp8_dynamic, fp8_lepto, int4_awq, int4_gptq
+    bits: 8                # Quantization bits (4/8)
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+
+# Dataset for calibration
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 4096
+  num_samples: 32
+  batch_size: 1
diff --git a/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml
new file mode 100644
index 00000000..1a89a022
--- /dev/null
+++ b/configs/qwen2_5/fp8_static/qwen2_5-1_5b_instruct_ados_fp8_static.yaml
@@ -0,0 +1,34 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: Qwen
+  model_path: Qwen/Qwen2.5-1.5B-Instruct
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_lepto     # Supported: fp8_static, fp8_dynamic, fp8_lepto, int4_awq, int4_gptq
+    bits: 8                # Quantization bits (4/8)
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+
+# Dataset for calibration
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 4096
+  num_samples: 32
+  batch_size: 1
diff --git a/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml
new file mode 100644
index 00000000..fe876fab
--- /dev/null
+++ b/configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml
@@ -0,0 +1,34 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: Qwen
+  model_path:  Qwen/Qwen3-0.6B
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_lepto
+    bits: 8
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+
+# Dataset for calibration
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 4096
+  num_samples: 32
+  batch_size: 1
diff --git a/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml
new file mode 100644
index 00000000..814c1462
--- /dev/null
+++ b/configs/qwen3/fp8_static/qwen3-4b_lepto_fp8_static.yaml
@@ -0,0 +1,34 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: Qwen
+  model_path: Qwen/Qwen3-4B
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_lepto
+    bits: 8
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+
+# Dataset for calibration
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 4096
+  num_samples: 32
+  batch_size: 1
diff --git a/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml b/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml
new file mode 100644
index 00000000..108576de
--- /dev/null
+++ b/configs/qwen3/fp8_static/qwen3-8b_lepto_fp8_static.yaml
@@ -0,0 +1,34 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: Qwen
+  model_path: Qwen/Qwen3-8B
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_lepto
+    bits: 8
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+
+# Dataset for calibration
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 4096
+  num_samples: 32
+  batch_size: 1
diff --git a/docs/source/features/quantization/fp8_lepto.md b/docs/source/features/quantization/fp8_lepto.md
new file mode 100644
index 00000000..4ee5d54c
--- /dev/null
+++ b/docs/source/features/quantization/fp8_lepto.md
@@ -0,0 +1,63 @@
+(fp8_lepto)=
+
+# FP8_LEPTO量化
+
+通常情况下PTQ统计Activation和Weight的abs Max值作为量化缩放系数。通过观察FP8 PTQ量化后有损的模型的数值分布发现，相较于量化无损模型会出现激活值分布方差过大的情况，这种数值分布会使得量化数值落在FP8难以表达的量化范围，导致模型在一些数学难题或文本格式要求严格的任务上损失过大。
+
+
+通过观察原始精度数值分布发现，该权重整体数值呈尖峰分布，存在明显的outlier且大部分数据集中在0点附近，数值之间的相对距离较小，导致计算过程中对数值精度的要求更高。FP8 QDQ量化后的权重分布如右图所示，可以发现量化后的分布相比原始精度更为平滑。由于FP8-E4M3越靠近零点可表示的数值越密集，对于趋近于正态分布的原始精度权重Worigin，传统FP8量化会把原本分布较密集的数值平滑到表达能力较差的FP8精度范围，导致精度表达能力下降，带来效果损失。
+
+针对上述FP8量化问题，我们推出了Leptokurtic Quant（LeptoQuant），一种通过隔离outlier将FP8权重映射范围集中至高精度区域的搜索策略。通常情况下对于权重和激活的量化难易度上激活值难度更高，因此我们着重优化激活的FP8。LEPTO将原始FP8的outlier值作为FP8精度表达上限，从而计算出新的Scale将数值分布压缩至高精度分布范围，使得量化后的激活值具有更好的精度表达。
+
+
+运行示例如下：
+
+```shell
+python3 tools/run.py -c configs/qwen3/fp8_static/qwen3-0_6b_lepto_fp8_static.yaml
+```
+
+该配置文件中，量化相关参数如下：
+- `name`：压缩策略。
+- `quantization.name`：压缩算法填`fp8_lepto`。
+- `quantization.bits`：fp8量化对应填写8bit。
+- `quantization.quant_method`：主要指定权重和激活的量化粒度为`per-tensor`。
+- `quantization.ignore_layers`：需要忽略不进行量化的线性层。
+
+```yaml
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_lepto
+    bits: 8
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
+```
+
+激活静态量化需要指定校准数据集，例如使用`sharegpt`数据集：
+
+```yaml
+dataset:
+  name: TextDataset
+  data_path: ./dataset/sharegpt_gpt4/sharegpt_gpt4_256.jsonl
+  max_seq_length: 4096
+  num_samples: 256
+  batch_size: 1
+```
+
+数据集相关参数如下：
+- `name`：校准数据集类型，文生文任务选择`TextDataset`。
+- `data_path`：校准数据JSONL文件位置。
+- `max_seq_length`：校准截断最大上下文长度。
+- `num_samples`：校准最大样本个数。
+- `batch_size`：校准批量大小。
+
+支持数据格式详见[数据准备文档](../design/prepare_dataset.md)。
+
+
+## 产出模型&部署
+
+同FP8
diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
index d5dd153d..49636faa 100644
--- a/docs/source/features/quantization/index.md
+++ b/docs/source/features/quantization/index.md
@@ -11,4 +11,5 @@ fp8
 int8
 awq
 gptq
+fp8_lepto
 :::
diff --git a/requirements.txt b/requirements.txt
index c6e65511..e8c8abf2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,12 +2,12 @@ torch>=2.6.0
 torchvision>=0.21.0
 transformers>=4.52.0
 safetensors>=0.5.3
-diffusers
+diffusers==0.34.0
 numpy
 tqdm
 pyarrow
 threadpoolctl
-qwen_vl_utils
+qwen_vl_utils==0.0.11
 tiktoken
 triton
 datasets