From 800343f99f1ab98baa74456207b5918e884bc48b Mon Sep 17 00:00:00 2001 From: root Date: Wed, 7 Jan 2026 22:18:50 +0800 Subject: [PATCH 1/2] feature: support qwen2_audio quantization --- README.md | 2 + README_cn.md | 2 + angelslim/data/audio_dataset.py | 139 +++++++++++++ angelslim/data/dataloader.py | 11 ++ angelslim/engine.py | 6 +- angelslim/models/__init__.py | 1 + angelslim/models/audio/__init__.py | 16 ++ angelslim/models/audio/qwen2_audio.py | 185 ++++++++++++++++++ angelslim/models/model_factory.py | 4 +- angelslim/utils/config_parser.py | 8 +- .../qwen2_audio_7b_fp8_dynamic.yaml | 25 +++ .../fp8_static/qwen2_audio_7b_fp8_static.yaml | 33 ++++ .../qwen2_audio_7b_int8_dynamic.yaml | 25 +++ .../qwen2_audio_7b_int8_dynamic_smooth.yaml | 35 ++++ dataset/audio_fake_data/audios/1.wav | Bin 0 -> 134570 bytes dataset/audio_fake_data/audios/2.wav | Bin 0 -> 63822 bytes dataset/audio_fake_data/fake_data.json | 2 + docs/source/index.md | 2 + 18 files changed, 489 insertions(+), 7 deletions(-) create mode 100644 angelslim/data/audio_dataset.py create mode 100644 angelslim/models/audio/__init__.py create mode 100644 angelslim/models/audio/qwen2_audio.py create mode 100644 configs/qwen2_audio/fp8_dynamic/qwen2_audio_7b_fp8_dynamic.yaml create mode 100644 configs/qwen2_audio/fp8_static/qwen2_audio_7b_fp8_static.yaml create mode 100644 configs/qwen2_audio/int8_dynamic/qwen2_audio_7b_int8_dynamic.yaml create mode 100644 configs/qwen2_audio/smooth_int8/qwen2_audio_7b_int8_dynamic_smooth.yaml create mode 100644 dataset/audio_fake_data/audios/1.wav create mode 100644 dataset/audio_fake_data/audios/2.wav create mode 100755 dataset/audio_fake_data/fake_data.json diff --git a/README.md b/README.md index 9768d2d0..3875893a 100644 --- a/README.md +++ b/README.md @@ -169,11 +169,13 @@ A more accessible, comprehensive, and efficient toolkit for large model compress diff --git a/README_cn.md b/README_cn.md index 6085e27d..8b9d262e 100644 --- a/README_cn.md +++ b/README_cn.md @@ 
-170,11 +170,13 @@ diff --git a/angelslim/data/audio_dataset.py b/angelslim/data/audio_dataset.py new file mode 100644 index 00000000..0a66e468 --- /dev/null +++ b/angelslim/data/audio_dataset.py @@ -0,0 +1,139 @@ +# Copyright 2025 Tencent Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from typing import Dict, List, Union + +import requests +from transformers import ProcessorMixin +from transformers.pipelines.audio_utils import ffmpeg_read + +from .base_dataset import BaseDataset + + +class AudioDataset(BaseDataset): + """Dataset for multimodal (text + audio) data""" + + def __init__( + self, + processor: ProcessorMixin, + device: str = "cpu", + max_length: int = 4096, + num_samples: int = -1, + data_source: Union[str, Dict] = None, + is_hf_dataset: bool = False, + model_name: str = None, + ): + super().__init__(processor, device, max_length) + self.is_hf_dataset = is_hf_dataset + self.model_name = model_name + + self._load_file_based_dataset(data_source, num_samples) + + def _load_file_based_dataset(self, data_path: str, num_samples: int): + """Load dataset from local file system""" + audio_dir = os.path.join(os.path.dirname(data_path), "audios") + line_count = 0 + + with open(data_path, "r") as f: + for line in f: + if num_samples > 0 and line_count >= num_samples: + break + + data = json.loads(line.strip()) + if data["audio_path"].startswith("http://") or data[ + "audio_path" + ].startswith("https://"): + 
audio_path = data["audio_path"] + else: + audio_path = os.path.join(audio_dir, data["audio_path"]) + + # Prepare chat messages with audio + messages = [ + { + "role": "user", + "content": [ + {"type": "audio", "audio_url": audio_path}, + { + "type": "text", + "text": data["question"].replace("