diff --git a/README.md b/README.md
index 9768d2d0..3875893a 100644
--- a/README.md
+++ b/README.md
@@ -169,11 +169,13 @@ A more accessible, comprehensive, and efficient toolkit for large model compress
diff --git a/README_cn.md b/README_cn.md
index 6085e27d..8b9d262e 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -170,11 +170,13 @@
diff --git a/angelslim/data/audio_dataset.py b/angelslim/data/audio_dataset.py
new file mode 100644
index 00000000..0a66e468
--- /dev/null
+++ b/angelslim/data/audio_dataset.py
@@ -0,0 +1,200 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from typing import Dict, List, Union
+
+import requests
+from transformers import ProcessorMixin
+from transformers.pipelines.audio_utils import ffmpeg_read
+
+from .base_dataset import BaseDataset
+
+
+class AudioDataset(BaseDataset):
+    """Calibration dataset of audio + text chat samples read from a JSONL file.
+
+    Each line of the data file is a JSON object providing ``audio_path``,
+    ``question`` and ``answer``. Every record becomes a user/assistant chat
+    that is rendered with the processor's chat template; the processor
+    output is appended to ``self.data``.
+    """
+
+    def __init__(
+        self,
+        processor: ProcessorMixin,
+        device: str = "cpu",
+        max_length: int = 4096,
+        num_samples: int = -1,
+        data_source: Union[str, Dict] = None,
+        is_hf_dataset: bool = False,
+        model_name: str = None,
+    ):
+        """Create the dataset and eagerly load every sample.
+
+        Args:
+            processor: Processor used for chat templating and feature
+                extraction (stored by ``BaseDataset`` -- presumably as
+                ``self.processor``; confirm against the base class).
+            device: Forwarded to ``BaseDataset``.
+            max_length: Forwarded to ``BaseDataset``.
+            num_samples: Maximum number of records to load; a value <= 0
+                loads the whole file.
+            data_source: Path of the JSONL data file.
+            is_hf_dataset: Stored on the instance but not consulted here;
+                only file-based loading is implemented.
+            model_name: Stored on the instance.
+        """
+        super().__init__(processor, device, max_length)
+        self.is_hf_dataset = is_hf_dataset
+        self.model_name = model_name
+
+        self._load_file_based_dataset(data_source, num_samples)
+
+    def _load_file_based_dataset(self, data_path: str, num_samples: int):
+        """Load up to ``num_samples`` records from the JSONL file ``data_path``.
+
+        Relative ``audio_path`` values are resolved against an ``audios``
+        directory located next to the data file; ``http(s)://`` URLs are
+        used unchanged.
+        """
+        audio_dir = os.path.join(os.path.dirname(data_path), "audios")
+        line_count = 0
+
+        # Explicit UTF-8: records may hold non-ASCII text, and the platform
+        # default encoding is not guaranteed to decode it.
+        with open(data_path, "r", encoding="utf-8") as f:
+            for line in f:
+                if num_samples > 0 and line_count >= num_samples:
+                    break
+
+                line = line.strip()
+                if not line:
+                    # Tolerate blank lines instead of failing in json.loads.
+                    continue
+
+                data = json.loads(line)
+                if data["audio_path"].startswith(("http://", "https://")):
+                    audio_path = data["audio_path"]
+                else:
+                    audio_path = os.path.join(audio_dir, data["audio_path"])
+
+                # Prepare chat messages with audio
+                messages = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "audio", "audio_url": audio_path},
+                            {
+                                "type": "text",
+                                # NOTE(review): as written this replace is a
+                                # no-op; presumably it was meant to strip a
+                                # placeholder tag -- confirm.
+                                "text": data["question"].replace("", ""),
+                            },
+                        ],
+                    },
+                    {
+                        "role": "assistant",
+                        "content": [{"type": "text", "text": data["answer"]}],
+                    },
+                ]
+
+                self._process_and_append(messages)
+                line_count += 1
+
+    def _process_and_append(self, messages: List[Dict]):
+        """Render ``messages``, run the processor and append the result.
+
+        The chat template is applied untokenized with a generation prompt;
+        the resulting text and the decoded audio are passed to the
+        processor, whose PyTorch-tensor output is added to ``self.data``.
+        """
+
+        input_text = self.processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=False
+        )
+        input_audios = self._extract_audio_info(messages)
+
+        # Process inputs
+        inputs = self.processor(
+            text=input_text,
+            audio=input_audios,
+            sampling_rate=self.processor.feature_extractor.sampling_rate,
+            return_tensors="pt",
+            padding=True,
+        )
+        self.data.append(inputs)
+
+    @staticmethod
+    def read_audio(audio_path, timeout: float = 30.0):
+        """Return the raw bytes of an audio file given a path or http(s) URL.
+
+        Args:
+            audio_path: Local file path or ``http://``/``https://`` URL.
+            timeout: Seconds to wait for the HTTP server before giving up;
+                ignored for local files.
+
+        Raises:
+            requests.RequestException: On timeout, connection failure or a
+                non-success HTTP status.
+        """
+        # Only a real protocol prefix counts as a URL, so a local file whose
+        # name merely starts with "http" can still be opened.
+        if audio_path.startswith(("http://", "https://")):
+            # Without a timeout a stalled server would hang loading forever.
+            response = requests.get(audio_path, timeout=timeout)
+            # Fail here rather than hand an HTML error page to the decoder.
+            response.raise_for_status()
+            return response.content
+        with open(audio_path, "rb") as f:
+            return f.read()
+
+    def _extract_audio_info(self, messages: List[Dict]) -> List:
+        """Decode every audio item referenced in ``messages``.
+
+        Returns:
+            One ``ffmpeg_read`` result per audio item, in message order,
+            decoded at the feature extractor's sampling rate.
+
+        Raises:
+            ValueError: If an audio source cannot be downloaded or decoded.
+        """
+        decoded_audios = []
+        sampling_rate = self.processor.feature_extractor.sampling_rate
+
+        for message in messages:
+            content = message.get("content", [])
+            if not isinstance(content, list):
+                continue
+
+            for item in content:
+                if item.get("type") != "audio":
+                    continue
+                # Only string sources (file paths or URLs) are handled.
+                if not isinstance(item["audio_url"], str):
+                    continue
+                try:
+                    decoded_audios.append(
+                        ffmpeg_read(
+                            self.read_audio(item["audio_url"]),
+                            sampling_rate=sampling_rate,
+                        )
+                    )
+                except (ValueError, requests.RequestException) as e:
+                    raise ValueError(
+                        f"Could not open audio file: {item['audio_url']}, {e}"
+                    ) from e
+        return decoded_audios
diff --git a/angelslim/data/dataloader.py b/angelslim/data/dataloader.py
index c41b755d..1e7b9160 100644
--- a/angelslim/data/dataloader.py
+++ b/angelslim/data/dataloader.py
@@ -18,6 +18,7 @@
from torch.utils.data import DataLoader
from transformers import ProcessorMixin
+from .audio_dataset import AudioDataset
from .base_dataset import BaseDataset
from .multimodal_dataset import MultiModalDataset
from .omni_dataset import OmniDataset
@@ -110,6 +111,16 @@ def create_data_loader(
is_hf_dataset=not os.path.isfile(data_source),
use_audio_in_video=use_audio_in_video,
)
+ elif data_type == "AudioDataset":
+ dataset = AudioDataset(
+ processor=processor,
+ device=device,
+ max_length=max_length,
+ num_samples=num_samples,
+ data_source=data_source,
+ is_hf_dataset=not os.path.isfile(data_source),
+ model_name=model_name,
+ )
else:
raise ValueError(f"Unsupported data type: {data_type}")
diff --git a/angelslim/engine.py b/angelslim/engine.py
index 1d435811..3ec3e3af 100644
--- a/angelslim/engine.py
+++ b/angelslim/engine.py
@@ -106,7 +106,7 @@ def prepare_model(
self.series = SlimModelFactory.get_series_by_models(model_name)
- if self.series in ["LLM", "VLM"]:
+ if self.series in ["LLM", "VLM", "Audio"]:
if model:
assert tokenizer, " If model is set, tokenizer must be also set."
self.slim_model.tokenizer = tokenizer
@@ -162,7 +162,7 @@ def prepare_data(
data_type=data_type,
processor=(
self.slim_model.processor
- if self.series == "VLM" or self.series == "Omni"
+ if self.series in ["VLM", "Omni", "Audio"]
else self.slim_model.tokenizer
),
device=self.slim_model.model.device,
@@ -205,7 +205,7 @@ def prepare_compressor(
f"Compression method '{method_name}' not registered. "
f"Available methods: {CompressorFactory.get_available_compressor()}"
)
- if self.series in ["LLM", "VLM", "Omni"]:
+ if self.series in ["LLM", "VLM", "Omni", "Audio"]:
global_config.update(self.model_path, self.max_seq_length)
if default_method:
diff --git a/angelslim/models/__init__.py b/angelslim/models/__init__.py
index 4036371a..b73a4521 100644
--- a/angelslim/models/__init__.py
+++ b/angelslim/models/__init__.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+from .audio import * # noqa: F401 F403
from .diffusion import * # noqa: F401 F403
from .llm import * # noqa: F401 F403
from .model_factory import SlimModelFactory # noqa: F401
diff --git a/angelslim/models/audio/__init__.py b/angelslim/models/audio/__init__.py
new file mode 100644
index 00000000..54e1528f
--- /dev/null
+++ b/angelslim/models/audio/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .qwen2_audio import Qwen2_Audio # noqa: F401
diff --git a/angelslim/models/audio/qwen2_audio.py b/angelslim/models/audio/qwen2_audio.py
new file mode 100644
index 00000000..5afaeaba
--- /dev/null
+++ b/angelslim/models/audio/qwen2_audio.py
@@ -0,0 +1,241 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+import torch
+from tqdm import tqdm
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    Qwen2AudioForConditionalGeneration,
+)
+
+from ...compressor.quant.core import PTQVLMSaveVllmHF
+from ...utils import find_layers, print_info
+from ..base_model import BaseLLMModel
+from ..model_factory import SlimModelFactory
+
+
+@SlimModelFactory.register
+class Qwen2_Audio(BaseLLMModel):
+    """Qwen2-Audio model wrapper exposing the hooks used for quantization."""
+
+    def __init__(
+        self,
+        model=None,
+        deploy_backend="vllm",
+    ):
+        """Initialise the wrapper and record the module-name prefixes.
+
+        Args:
+            model: Optional already-loaded model, forwarded to the base class.
+            deploy_backend: Target deployment backend, forwarded to the base
+                class; ``get_save_func`` reads ``self.deploy_backend``.
+        """
+        super().__init__(
+            model=model,
+            deploy_backend=deploy_backend,
+        )
+        self.modal_type = "Audio"
+        # Name prefixes of the decoder and audio-encoder layer stacks.
+        self.block_name = "language_model.model.layers"
+        self.audio_block_name = "audio_tower.layers"
+
+    def from_pretrained(
+        self,
+        model_path,
+        torch_dtype="auto",
+        device_map="auto",
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        use_cache=False,
+        using_multi_nodes=False,
+    ):
+        """Load the model, tokenizer and processor from ``model_path``.
+
+        ``use_cache`` and ``using_multi_nodes`` are accepted but are not
+        used by this implementation.
+        """
+        self.model = Qwen2AudioForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch_dtype,
+            device_map=device_map,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+        )
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=trust_remote_code
+        )
+
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            model_path, trust_remote_code=trust_remote_code
+        )
+
+    def get_observer_layers(self):
+        """Select the layers to observe and record all others as ignored.
+
+        A layer is observed when its name starts with ``block_name`` (or
+        ``audio_block_name`` when ``quant_config.quant_audio`` is truthy) and
+        its last two name components form one of the known projection names.
+        Every other layer found is appended to the ignore list stored in
+        ``quant_config.quant_algo_info["ignore_layers"]``.
+
+        Returns:
+            Dict mapping layer name to module, additionally narrowed by
+            ``quant_config.custom_observe_layers_names`` unless that is
+            ``"default"``.
+        """
+        names = [
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.q_proj",
+            "self_attn.o_proj",
+            "mlp.up_proj",
+            "mlp.gate_proj",
+            "mlp.down_proj",
+        ]
+
+        quant_audio = bool(getattr(self.quant_config, "quant_audio", False))
+        if quant_audio:
+            # NOTE(review): fc1/fc2 may not live under ``self_attn`` in the
+            # audio encoder layer; if so these two entries never match --
+            # confirm against the model's module names.
+            audio_tower_names = [
+                "self_attn.k_proj",
+                "self_attn.v_proj",
+                "self_attn.q_proj",
+                "self_attn.out_proj",
+                "self_attn.fc1",
+                "self_attn.fc2",
+            ]
+            names.extend(audio_tower_names)
+
+        observer_layers_dict = {}
+        layers_dict = find_layers(self.model, layers=self.observer_layer_classes)
+
+        ignore_layers = self.skip_layer_names()
+        for name, module in layers_dict.items():
+            block_condition = name.startswith(self.block_name) or (
+                quant_audio and name.startswith(self.audio_block_name)
+            )
+            parts = name.split(".")
+            result = ".".join(parts[-2:])
+            if block_condition and result in names:
+                observer_layers_dict[name] = module
+            else:
+                ignore_layers.append(name)
+        self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers
+
+        custom_names = self.quant_config.custom_observe_layers_names
+        if custom_names != "default":
+            for custom_observe_name in custom_names:
+                # Iterate over a snapshot of the keys: popping from a dict
+                # while iterating its live key view raises RuntimeError.
+                for default_name in list(observer_layers_dict):
+                    if custom_observe_name not in default_name:
+                        observer_layers_dict.pop(default_name)
+        return observer_layers_dict
+
+    def get_smooth_mapping_layers(self, smooth_config, mappings=None):
+        """Return the smoothing layer mappings built by the base class.
+
+        When ``mappings`` is None, the attention projections are paired with
+        ``input_layernorm`` and the gate/up projections with
+        ``post_attention_layernorm``.
+        """
+        if mappings is None:
+            mappings = [
+                (["q_proj", "k_proj", "v_proj"], "input_layernorm"),
+                (["gate_proj", "up_proj"], "post_attention_layernorm"),
+            ]
+        print(f"smooth mappings={mappings}")
+        assert len(mappings) == 2
+        assert smooth_config.smooth_first_linears or smooth_config.smooth_last_linears
+        # TODO: support smooth_last_linears
+        return super().get_smooth_mapping_layers(smooth_config, mappings)
+
+    def get_parent_dict(self, observer_layers_dict):
+        """Map per-expert layer names to their shared ``experts`` parent.
+
+        Only names changed by the ``experts.<index>`` -> ``experts``
+        substitution are included in the returned dict.
+        """
+        parent_mapping = {r"experts\.\d+": "experts"}
+        parent_dict = {}
+        for layer_name in observer_layers_dict.keys():
+            parent_name = layer_name
+            for k, v in parent_mapping.items():
+                parent_name = re.sub(k, v, layer_name)
+            if parent_name != layer_name:
+                parent_dict[layer_name] = parent_name
+        return parent_dict
+
+    def model_forward(self, dataloader, **kwargs):
+        """Run every calibration batch through the model without gradients.
+
+        Batches are moved to ``cuda:0`` for GPTQ/AWQ/GPTAQ algorithms and to
+        ``self.model.device`` otherwise. No batches are run when
+        ``dataloader`` is None.
+        """
+        self.model.use_cache = False
+
+        calibrated_cnt = 0
+        quant_algo = self.quant_config.quant_algo
+        if any(algo in quant_algo for algo in ("gptq", "awq", "gptaq")):
+            device = "cuda:0"
+        else:
+            device = self.model.device
+        print_info(f"device is {device}")
+        if dataloader is None:
+            return
+
+        with torch.no_grad():
+            for batch in tqdm(
+                dataloader, desc="calibrating...", total=len(dataloader)
+            ):
+                inputs = {k: v.to(device) for k, v in batch.items()}
+                inputs["use_cache"] = False
+                try:
+                    self.model(**inputs)
+                except ValueError:
+                    # NOTE(review): ValueError is deliberately tolerated,
+                    # presumably because calibration hooks raise it to stop
+                    # the forward pass early -- confirm before narrowing.
+                    pass
+                calibrated_cnt += 1
+        print_info(f"processed {calibrated_cnt} calibration batches")
+
+    def get_quant_module(self):
+        """
+        Returns the module that will be quantized: the layer stack at
+        ``self.model.language_model.model.layers``.
+        """
+        return self.model.language_model.model.layers
+
+    def get_save_func(self):
+        """Return the save class for the configured deployment backend.
+
+        Raises:
+            NotImplementedError: If ``deploy_backend`` is neither ``"vllm"``
+                nor ``"huggingface"``.
+        """
+        if self.deploy_backend in ["vllm", "huggingface"]:
+            return PTQVLMSaveVllmHF
+        raise NotImplementedError(
+            f"deploy_backend {self.deploy_backend} is not supported for saving."
+        )
diff --git a/angelslim/models/model_factory.py b/angelslim/models/model_factory.py
index 8e029961..111f6114 100644
--- a/angelslim/models/model_factory.py
+++ b/angelslim/models/model_factory.py
@@ -22,7 +22,7 @@ class SlimModelFactory:
registry: Dict[str, Type] = {}
series_registry: Dict[str, str] = {}
- ALLOWED_SERIES = ("LLM", "VLM", "Diffusion", "Omni")
+ ALLOWED_SERIES = ("LLM", "VLM", "Diffusion", "Omni", "Audio")
@classmethod
def register(cls, model_class: Type) -> Type:
@@ -41,6 +41,8 @@ def register(cls, model_class: Type) -> Type:
series = "Diffusion"
elif "omni" in module_path:
series = "Omni"
+ elif "audio" in module_path:
+ series = "Audio"
else:
raise ValueError(
f"model_class '{class_name}' is not in a valid series: {cls.ALLOWED_SERIES}" # noqa: E501
diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py
index f7aa6548..5c048f73 100644
--- a/angelslim/utils/config_parser.py
+++ b/angelslim/utils/config_parser.py
@@ -60,7 +60,7 @@ class GlobalConfig:
save_path: str = field(default="./output")
# Shared max_seq_length configuration
max_seq_length: int = field(default=2048)
- hidden_size: int = field(default=2048)
+ hidden_size: int = field(default=4096)
model_arch_type: str = field(default=None)
absolute_model_path: str = field(default=None)
deploy_backend: str = field(default="vllm")
@@ -91,16 +91,24 @@ def get_max_seq_length(self) -> int:
def set_model_hidden_size(self, model_path) -> int:
json_data = get_hf_config(model_path)
- if json_data["model_type"] in ["qwen3_vl"]:
- self.hidden_size = json_data["text_config"]["hidden_size"]
- elif (
- json_data["architectures"][0]
- if isinstance(json_data["architectures"], list)
- else json_data["architectures"]
- ) == "Qwen3OmniMoeForConditionalGeneration":
- self.hidden_size = json_data["thinker_config"]["text_config"]["hidden_size"]
- else:
- self.hidden_size = json_data["hidden_size"]
+ try:
+ if json_data["model_type"] in ["qwen3_vl"]:
+ self.hidden_size = json_data["text_config"]["hidden_size"]
+ elif (
+ json_data["architectures"][0]
+ if isinstance(json_data["architectures"], list)
+ else json_data["architectures"]
+ ) == "Qwen3OmniMoeForConditionalGeneration":
+ self.hidden_size = json_data["thinker_config"]["text_config"][
+ "hidden_size"
+ ]
+ else:
+ self.hidden_size = json_data["hidden_size"]
+ except KeyError:
+ print(
+ "Warning: Failed to set model hidden size from config.json. "
+ f"Using default hidden size {self.hidden_size}."
+ )
def set_model_arch_type(self, model_path) -> str:
json_data = get_hf_config(model_path)
@@ -247,9 +255,6 @@ def need_dataset(self) -> bool:
for method in self.name:
# PTQ/QAT usually need calibration dataset
if method in ["PTQ", "QAT"]:
- # Check if dynamic quantization (usually doesn't need dataset)
- if self.quantization and "dynamic" in self.quantization.name:
- continue
# Check if specific quantization helpers need dataset
if (
self.quantization
@@ -257,6 +262,9 @@ def need_dataset(self) -> bool:
and "smooth" in self.quantization.quant_helpers
):
return True
+ # Check if dynamic quantization (usually doesn't need dataset)
+ if self.quantization and "dynamic" in self.quantization.name:
+ continue
# Default PTQ/QAT needs dataset
return True
return False
diff --git a/configs/qwen2_audio/fp8_dynamic/qwen2_audio_7b_fp8_dynamic.yaml b/configs/qwen2_audio/fp8_dynamic/qwen2_audio_7b_fp8_dynamic.yaml
new file mode 100644
index 00000000..d62d6b6a
--- /dev/null
+++ b/configs/qwen2_audio/fp8_dynamic/qwen2_audio_7b_fp8_dynamic.yaml
@@ -0,0 +1,25 @@
+# Global configuration of pipeline
+global:
+ save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+ name: Qwen2_Audio
+ model_path: Qwen/Qwen2-Audio-7B
+ trust_remote_code: true
+ low_cpu_mem_usage: true
+ use_cache: false
+ torch_dtype: auto
+ device_map: auto
+
+# Compression configuration
+compression:
+ name: PTQ
+ quantization:
+ name: fp8_dynamic
+ bits: 8
+ quant_method:
+ weight: "per-tensor"
+ activation: "per-tensor"
+ ignore_layers: # Skip quantization for these layers
+ - "lm_head"
diff --git a/configs/qwen2_audio/fp8_static/qwen2_audio_7b_fp8_static.yaml b/configs/qwen2_audio/fp8_static/qwen2_audio_7b_fp8_static.yaml
new file mode 100644
index 00000000..8a50e75c
--- /dev/null
+++ b/configs/qwen2_audio/fp8_static/qwen2_audio_7b_fp8_static.yaml
@@ -0,0 +1,33 @@
+# Global configuration of pipeline
+global:
+ save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+ name: Qwen2_Audio
+ model_path: Qwen/Qwen2-Audio-7B
+ trust_remote_code: true
+ low_cpu_mem_usage: true
+ use_cache: false
+ torch_dtype: auto
+ device_map: auto
+
+# Compression configuration
+compression:
+ name: PTQ
+ quantization:
+ name: fp8_static
+ bits: 8
+ quant_method:
+ weight: "per-tensor"
+ activation: "per-tensor"
+ ignore_layers: # Skip quantization for these layers
+ - "lm_head"
+
+# Dataset for calibration
+dataset:
+ name: AudioDataset
+ data_path: ./dataset/audio_fake_data/fake_data.json
+ max_seq_length: 4096
+ num_samples: 256
+ batch_size: 1
diff --git a/configs/qwen2_audio/int8_dynamic/qwen2_audio_7b_int8_dynamic.yaml b/configs/qwen2_audio/int8_dynamic/qwen2_audio_7b_int8_dynamic.yaml
new file mode 100644
index 00000000..50ca9bac
--- /dev/null
+++ b/configs/qwen2_audio/int8_dynamic/qwen2_audio_7b_int8_dynamic.yaml
@@ -0,0 +1,25 @@
+# Global configuration of pipeline
+global:
+ save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+ name: Qwen2_Audio
+ model_path: Qwen/Qwen2-Audio-7B
+ trust_remote_code: true
+ low_cpu_mem_usage: true
+ use_cache: false
+ torch_dtype: auto
+ device_map: auto
+
+# Compression configuration
+compression:
+ name: PTQ
+ quantization:
+ name: int8_dynamic
+ bits: 8
+ quant_method:
+ weight: "per-channel"
+ activation: "per-token"
+ ignore_layers: # Skip quantization for these layers
+ - "lm_head"
diff --git a/configs/qwen2_audio/smooth_int8/qwen2_audio_7b_int8_dynamic_smooth.yaml b/configs/qwen2_audio/smooth_int8/qwen2_audio_7b_int8_dynamic_smooth.yaml
new file mode 100644
index 00000000..76493d4e
--- /dev/null
+++ b/configs/qwen2_audio/smooth_int8/qwen2_audio_7b_int8_dynamic_smooth.yaml
@@ -0,0 +1,35 @@
+# Global configuration of pipeline
+global:
+ save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+ name: Qwen2_Audio
+ model_path: Qwen/Qwen2-Audio-7B
+ trust_remote_code: true
+ low_cpu_mem_usage: true
+ use_cache: false
+ torch_dtype: auto
+ device_map: auto
+
+# Compression configuration
+compression:
+ name: PTQ
+ quantization:
+ name: int8_dynamic
+ bits: 8
+ quant_method:
+ weight: "per-channel"
+ activation: "per-token"
+ quant_helpers:
+ - "smooth"
+ ignore_layers:
+ - "lm_head"
+
+# Dataset for calibration
+dataset:
+ name: AudioDataset
+ data_path: ./dataset/audio_fake_data/fake_data.json
+ max_seq_length: 4096
+ num_samples: 256
+ batch_size: 1
diff --git a/dataset/audio_fake_data/audios/1.wav b/dataset/audio_fake_data/audios/1.wav
new file mode 100644
index 00000000..256e4afd
Binary files /dev/null and b/dataset/audio_fake_data/audios/1.wav differ
diff --git a/dataset/audio_fake_data/audios/2.wav b/dataset/audio_fake_data/audios/2.wav
new file mode 100644
index 00000000..1f662557
Binary files /dev/null and b/dataset/audio_fake_data/audios/2.wav differ
diff --git a/dataset/audio_fake_data/fake_data.json b/dataset/audio_fake_data/fake_data.json
new file mode 100755
index 00000000..130dd12e
--- /dev/null
+++ b/dataset/audio_fake_data/fake_data.json
@@ -0,0 +1,2 @@
+{"question": "Detect the language and recognize the speech: ", "answer": "甚至出现交易几乎停滞的情况","audio_path": "./1.wav","type": "multimodal"}
+{"question": "Detect the language and recognize the speech: ","answer": "换一首歌","audio_path": "./2.wav","type": "multimodal"}
\ No newline at end of file
diff --git a/docs/source/index.md b/docs/source/index.md
index 51656fba..ad98e76d 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -98,7 +98,9 @@ AngelSlim是腾讯自研的,致力于打造更易用、更全面和更高效
- 建设中
* - **语音(TTS/ASR)**
- - Qwen3-Omni
+ - Qwen2-Audio
- - FP8-Static/Dynamic
+ - INT8-Dynamic
- - 建设中
- - **Token剪枝**