Skip to content

Commit 3f12551

Browse files
committed
Added script to run with vllm
Signed-off-by: ajrasane <131806219+ajrasane@users.noreply.github.com>
1 parent 4f92fbf commit 3f12551

1 file changed

Lines changed: 134 additions & 0 deletions

File tree

examples/llm_ptq/run_qwen_vllm.py

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Qwen3-Omni-30B-A3B text inference with vLLM.
17+
18+
Usage:
19+
python qwen3_omni_vllm.py
20+
python qwen3_omni_vllm.py --model /path/to/model --tp 4
21+
"""
22+
23+
from __future__ import annotations
24+
25+
import argparse
26+
import os
27+
import shutil
28+
29+
from huggingface_hub import snapshot_download
30+
from transformers import Qwen3OmniMoeProcessor
31+
from vllm import LLM, SamplingParams
32+
33+
MODEL_ID = "Qwen/Qwen3-Omni-30B-A3B-Thinking"
34+
35+
# Files needed for tokenizer/processor that vLLM loads from model path
36+
TOKENIZER_FILES = [
37+
"vocab.json",
38+
"merges.txt",
39+
"tokenizer.json",
40+
"tokenizer_config.json",
41+
"special_tokens_map.json",
42+
"preprocessor_config.json",
43+
"chat_template.json",
44+
]
45+
46+
47+
def ensure_tokenizer_files(model_path: str, source_model_id: str) -> None:
    """Copy tokenizer files from HF model to local quantized model dir if missing.

    Args:
        model_path: Candidate local checkpoint directory. Anything that is not
            an existing directory (e.g. a hub model ID) is left untouched.
        source_model_id: Hub repo to pull the tokenizer/processor files from.
    """
    # Only a local checkpoint directory can be patched in place.
    if not os.path.isdir(model_path):
        return  # Not a local path, nothing to do

    # Nothing to do when every tokenizer/processor file is already present.
    if all(os.path.exists(os.path.join(model_path, fname)) for fname in TOKENIZER_FILES):
        return

    print(f"Copying missing tokenizer files from {source_model_id}...")
    # Pull only the tokenizer/processor files from the Hub (allow_patterns
    # keeps the download small — no weights).
    snapshot_dir = snapshot_download(
        source_model_id,
        allow_patterns=TOKENIZER_FILES,
    )

    # Copy each file that exists in the snapshot but not yet in the target.
    for fname in TOKENIZER_FILES:
        source = os.path.join(snapshot_dir, fname)
        target = os.path.join(model_path, fname)
        if os.path.exists(source) and not os.path.exists(target):
            shutil.copy2(source, target)
            print(f" Copied {fname}")
72+
def main():
    """Parse CLI options, build a vLLM engine, and run a text-only generation demo."""
    parser = argparse.ArgumentParser(description="Run Qwen3-Omni text inference with vLLM")
    parser.add_argument("--model", default=MODEL_ID, help="Model ID or path")
    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("--max-model-len", type=int, default=32768, help="Max model length")
    args = parser.parse_args()

    # The processor always comes from the original HF repo so the chat template
    # is available even when --model points at a local quantized checkpoint.
    processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_ID)

    # One text-only conversation — no audio/image/video inputs.
    conversations = [
        [
            {
                "role": "user",
                "content": [{"type": "text", "text": "What are the key features of Qwen3-Omni?"}],
            }
        ],
    ]

    # Render the prompt(s) via the chat template with thinking disabled.
    texts = processor.apply_chat_template(
        conversations,
        add_generation_prompt=True,
        tokenize=False,
        enable_thinking=False,
    )

    # vLLM loads the processor from the model path, so a local quantized dir
    # must contain the tokenizer files; copy any that are missing.
    ensure_tokenizer_files(args.model, MODEL_ID)

    print(f"Loading model: {args.model}")
    llm = LLM(
        model=args.model,
        tokenizer=MODEL_ID,  # Always use original tokenizer from HF
        tensor_parallel_size=args.tp,
        max_model_len=args.max_model_len,
        trust_remote_code=True,
    )

    sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=512)

    print("Running inference...")
    for output in llm.generate(texts, sampling_params):
        print("-" * 80)
        print(f"Generated: {output.outputs[0].text}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)