From a56612dbcea1e7534d5f7177f5f8523a96ee6cf7 Mon Sep 17 00:00:00 2001 From: ali-12388 <1940747290@qq.com> Date: Mon, 3 Nov 2025 18:44:33 +0800 Subject: [PATCH 1/6] add qwen3-omni --- angelslim/data/__init__.py | 1 + angelslim/data/dataloader.py | 12 ++ angelslim/data/omni_dataset.py | 127 +++++++++++++++ angelslim/engine.py | 17 +- angelslim/models/__init__.py | 1 + angelslim/models/model_factory.py | 4 +- angelslim/models/omni/__init__.py | 16 ++ angelslim/models/omni/qwen_omni.py | 149 ++++++++++++++++++ angelslim/utils/config_parser.py | 7 + .../fp8_dynamic/qwen3_omn_fp8_dynamic.yaml | 23 +++ .../fp8_static/qwen3_omni_fp8_static.yaml | 32 ++++ dataset/omni_fake_data/fake_data.json | 3 + dataset/omni_fake_data/images/0.png | Bin 0 -> 44922 bytes requirements/requirements.txt | 1 + tools/run.py | 4 + 15 files changed, 394 insertions(+), 3 deletions(-) create mode 100644 angelslim/data/omni_dataset.py create mode 100644 angelslim/models/omni/__init__.py create mode 100644 angelslim/models/omni/qwen_omni.py create mode 100644 configs/qwen3_omni/fp8_dynamic/qwen3_omn_fp8_dynamic.yaml create mode 100644 configs/qwen3_omni/fp8_static/qwen3_omni_fp8_static.yaml create mode 100755 dataset/omni_fake_data/fake_data.json create mode 100755 dataset/omni_fake_data/images/0.png diff --git a/angelslim/data/__init__.py b/angelslim/data/__init__.py index 97d05f98..7e5c29c5 100644 --- a/angelslim/data/__init__.py +++ b/angelslim/data/__init__.py @@ -6,5 +6,6 @@ from .dataloader import DataLoaderFactory # noqa: F401 from .multimodal_dataset import MultiModalDataset # noqa: F401 +from .omni_dataset import OmniDataset # noqa: F401 from .text2image_dataset import Text2ImageDataset # noqa: F401 from .text_dataset import TextDataset # noqa: F401 diff --git a/angelslim/data/dataloader.py b/angelslim/data/dataloader.py index 1c92cddb..c41b755d 100644 --- a/angelslim/data/dataloader.py +++ b/angelslim/data/dataloader.py @@ -20,6 +20,7 @@ from .base_dataset import BaseDataset from .multimodal_dataset import MultiModalDataset +from .omni_dataset import OmniDataset from .text2image_dataset import Text2ImageDataset from .text_dataset import TextDataset @@ -39,6 +40,7 @@ def create_data_loader( data_type: str = "auto", num_workers: int = 0, inference_settings: Dict = None, + use_audio_in_video: bool = False, model_name: str = None, ) -> DataLoader: """ @@ -98,6 +100,16 @@ def create_data_loader( num_samples=num_samples, inference_settings=inference_settings, ) + elif data_type == "OmniDataset": + dataset = OmniDataset( + processor=processor, + device=device, + max_length=max_length, + num_samples=num_samples, + data_source=data_source, + is_hf_dataset=not os.path.isfile(data_source), + use_audio_in_video=use_audio_in_video, + ) else: raise ValueError(f"Unsupported data type: {data_type}") diff --git a/angelslim/data/omni_dataset.py b/angelslim/data/omni_dataset.py new file mode 100644 index 00000000..6f2f6fd7 --- /dev/null +++ b/angelslim/data/omni_dataset.py @@ -0,0 +1,127 @@ +# Copyright 2025 Tencent Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from pathlib import Path +from typing import Dict, List, Union + +from qwen_omni_utils import process_mm_info +from transformers import ProcessorMixin + +from .base_dataset import BaseDataset + + +class OmniDataset(BaseDataset): + """Dataset for multimodal (text + image) data""" + + def __init__( + self, + processor: ProcessorMixin, + device: str = "cpu", + max_length: int = 4096, + num_samples: int = -1, + data_source: Union[str, Dict] = None, + is_hf_dataset: bool = False, + use_audio_in_video: bool = False, + ): + super().__init__(processor, device, max_length) + self.is_hf_dataset = is_hf_dataset + self.use_audio_in_video = use_audio_in_video + + self._load_file_based_dataset(data_source, num_samples) + + def _load_file_based_dataset(self, data_path: str, num_samples: int): + """Load dataset from local file system""" + path_obj = Path(data_path) + data_dir = path_obj.parent + + line_count = 0 + with open(data_path, "r") as f: + for line in f: + if num_samples > 0 and line_count >= num_samples: + break + data = json.loads(line.strip()) + video_path = None + audio_path = None + image_path = None + + if "video_path" in data: + video_path = os.path.normpath( + os.path.join(data_dir, data["video_path"]) + ) + if "audio_path" in data: + audio_path = os.path.normpath( + os.path.join(data_dir, data["audio_path"]) + ) + if "image_path" in data: + image_path = os.path.normpath( + os.path.join(data_dir, data["image_path"]) + ) + + ms = data.get("messages") + + conversation = [] + for m in ms: + if m["role"] == "system": + conversation.append( + { + "role": "system", + "content": [{"type": "text", "text": m["content"]}], + } + ) + elif m["role"] == "user": + content = [] + text_content = m["content"] + text_content = ( + text_content.replace("