Tencent · irisliu10 · Jan 8, 2026 · Jan 7, 2026 · Jan 8, 2026
diff --git a/README.md b/README.md
@@ -169,11 +169,13 @@ A more accessible, comprehensive, and efficient toolkit for large model compress
       <td>
         <ul style="padding-left: 0; list-style-position: inside;">
           <li><a href="https://huggingface.co/collections/Qwen/qwen3-omni">Qwen3-Omni</a></li>
+          <li><a href="https://huggingface.co/collections/Qwen/qwen2-audio">Qwen2-Audio</a></li>
         </ul>
       </td>
       <td>
         <ul style="padding-left: 0; list-style-position: inside;">
           <li><a href="https://github.com/Tencent/AngelSlim/blob/main/docs/source/models/qwen3_omni/qwen3_omni_quant.md">FP8-Static/Dynamic</a></li>
+          <li><a href="https://github.com/Tencent/AngelSlim/tree/main/configs/qwen2_audio">INT8-Dynamic</a></li>
         </ul>
       </td>
       <td>

diff --git a/README_cn.md b/README_cn.md
@@ -170,11 +170,13 @@
       <td>
         <ul style="padding-left: 0; list-style-position: inside;">
           <li><a href="https://huggingface.co/collections/Qwen/qwen3-omni">Qwen3-Omni</a></li>
+          <li><a href="https://huggingface.co/collections/Qwen/qwen2-audio">Qwen2-Audio</a></li>
         </ul>
       </td>
       <td>
         <ul style="padding-left: 0; list-style-position: inside;">
           <li><a href="https://github.com/Tencent/AngelSlim/blob/main/docs/source/models/qwen3_omni/qwen3_omni_quant.md">FP8-Static/Dynamic</a></li>
+          <li><a href="https://github.com/Tencent/AngelSlim/tree/main/configs/qwen2_audio">INT8-Dynamic</a></li>
         </ul>
       </td>
       <td>

diff --git a/angelslim/data/audio_dataset.py b/angelslim/data/audio_dataset.py
@@ -0,0 +1,139 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+from typing import Dict, List, Union
+
+import requests
+from transformers import ProcessorMixin
+from transformers.pipelines.audio_utils import ffmpeg_read
+
+from .base_dataset import BaseDataset
+
+
+class AudioDataset(BaseDataset):
+    """Dataset for audio-text (audio + text) chat data"""
+
+    def __init__(
+        self,
+        processor: ProcessorMixin,
+        device: str = "cpu",
+        max_length: int = 4096,
+        num_samples: int = -1,
+        data_source: Union[str, Dict] = None,
+        is_hf_dataset: bool = False,
+        model_name: str = None,
+    ):
+        super().__init__(processor, device, max_length)
+        self.is_hf_dataset = is_hf_dataset
+        self.model_name = model_name
+
+        self._load_file_based_dataset(data_source, num_samples)
+
+    def _load_file_based_dataset(self, data_path: str, num_samples: int):
+        """Load dataset from local file system"""
+        audio_dir = os.path.join(os.path.dirname(data_path), "audios")
+        line_count = 0
+
+        with open(data_path, "r") as f:
+            for line in f:
+                if num_samples > 0 and line_count >= num_samples:
+                    break
+
+                data = json.loads(line.strip())
+                if data["audio_path"].startswith("http://") or data[
+                    "audio_path"
+                ].startswith("https://"):
+                    audio_path = data["audio_path"]
+                else:
+                    audio_path = os.path.join(audio_dir, data["audio_path"])
+
+                # Prepare chat messages with audio
+                messages = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "audio", "audio_url": audio_path},
+                            {
+                                "type": "text",
+                                "text": data["question"].replace("<audio>", ""),
+                            },
+                        ],
+                    },
+                    {
+                        "role": "assistant",
+                        "content": [{"type": "text", "text": data["answer"]}],
+                    },
+                ]
+
+                self._process_and_append(messages)
+                line_count += 1
+
+    def _process_and_append(self, messages: List[Dict]):
+        """Process messages and append to dataset"""
+
+        input_text = self.processor.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=False
+        )
+        input_audios = self._extract_audio_info(messages)
+
+        # Process inputs
+        inputs = self.processor(
+            text=input_text,
+            audio=input_audios,
+            sampling_rate=self.processor.feature_extractor.sampling_rate,
+            return_tensors="pt",
+            padding=True,
+        )
+        self.data.append(inputs)
+
+    @staticmethod
+    def read_audio(audio_path):
+        if audio_path.startswith("http://") or audio_path.startswith("https://"):
+            # We need to actually check for a real protocol,
+            # otherwise it's impossible to use a local file
+            # like http_huggingface_co.wav
+            inputs = requests.get(audio_path).content
+        else:
+            with open(audio_path, "rb") as f:
+                inputs = f.read()
+        return inputs
+
+    def _extract_audio_info(self, messages: List[Dict]) -> tuple:
+        """Read and decode (via ffmpeg_read) the audio items in messages"""
+        audio_paths = []
+        sampling_rate = self.processor.feature_extractor.sampling_rate
+
+        for message in messages:
+            content = message.get("content", [])
+            if not isinstance(content, list):
+                continue
+
+            for item in content:
+                if item.get("type") == "audio":
+                    # Only string paths/URLs are loaded; other values are skipped
+                    if isinstance(item["audio_url"], str):
+                        try:
+                            audio_paths.append(
+                                ffmpeg_read(
+                                    self.read_audio(item["audio_url"]),
+                                    sampling_rate=sampling_rate,
+                                )
+                            )
+                        except ValueError as e:
+                            raise ValueError(
+                                f"Could not open audio file: {item['audio_url']}, {e}"
+                            )
+        return audio_paths
diff --git a/angelslim/data/dataloader.py b/angelslim/data/dataloader.py
@@ -18,6 +18,7 @@
 from torch.utils.data import DataLoader
 from transformers import ProcessorMixin
 
+from .audio_dataset import AudioDataset
 from .base_dataset import BaseDataset
 from .multimodal_dataset import MultiModalDataset
 from .omni_dataset import OmniDataset
@@ -110,6 +111,16 @@ def create_data_loader(
                 is_hf_dataset=not os.path.isfile(data_source),
                 use_audio_in_video=use_audio_in_video,
             )
+        elif data_type == "AudioDataset":
+            dataset = AudioDataset(
+                processor=processor,
+                device=device,
+                max_length=max_length,
+                num_samples=num_samples,
+                data_source=data_source,
+                is_hf_dataset=not os.path.isfile(data_source),
+                model_name=model_name,
+            )
         else:
             raise ValueError(f"Unsupported data type: {data_type}")
 

diff --git a/angelslim/engine.py b/angelslim/engine.py
@@ -106,7 +106,7 @@ def prepare_model(
 
         self.series = SlimModelFactory.get_series_by_models(model_name)
 
-        if self.series in ["LLM", "VLM"]:
+        if self.series in ["LLM", "VLM", "Audio"]:
             if model:
                 assert tokenizer, " If model is set, tokenizer must be also set."
                 self.slim_model.tokenizer = tokenizer
@@ -162,7 +162,7 @@ def prepare_data(
             data_type=data_type,
             processor=(
                 self.slim_model.processor
-                if self.series == "VLM" or self.series == "Omni"
+                if self.series in ["VLM", "Omni", "Audio"]
                 else self.slim_model.tokenizer
             ),
             device=self.slim_model.model.device,
@@ -205,7 +205,7 @@ def prepare_compressor(
                     f"Compression method '{method_name}' not registered. "
                     f"Available methods: {CompressorFactory.get_available_compressor()}"
                 )
-        if self.series in ["LLM", "VLM", "Omni"]:
+        if self.series in ["LLM", "VLM", "Omni", "Audio"]:
             global_config.update(self.model_path, self.max_seq_length)
 
         if default_method:

diff --git a/angelslim/models/__init__.py b/angelslim/models/__init__.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .audio import *  # noqa: F401 F403
 from .diffusion import *  # noqa: F401 F403
 from .llm import *  # noqa: F401 F403
 from .model_factory import SlimModelFactory  # noqa: F401

diff --git a/angelslim/models/audio/__init__.py b/angelslim/models/audio/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .qwen2_audio import Qwen2_Audio  # noqa: F401