From bf90887cc9f8ceb3b70175ef2b3ce66488a4b1bd Mon Sep 17 00:00:00 2001
From: yushuosun
Date: Sun, 17 May 2026 23:54:14 +0000
Subject: [PATCH] [bugfix] fix qwen2.5-vl vllm video infer fps validation (#9357)

In vllm mode, Qwen2VLTemplate.replace_tag was passing the local fps
probe (a list) through mm_processor_kwargs to vllm's
Qwen2_5_VLProcessor, which under transformers v5 validates fps as a
scalar (int|float|None) and rejects the list with
StrictDataclassFieldValidationError.

The v3 branch immediately below already guards 'video_metadata' with
'if self.mode != "vllm":' for the same reason. Apply the same guard to
the v2_5 fps append so vllm computes fps itself from the video input.

The non-vllm _encode path is unaffected: it still receives fps in
mm_processor_kwargs to compute second_per_grid_ts.

Fixes #9357

Co-authored-by: Claude
---
 swift/template/templates/qwen.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/swift/template/templates/qwen.py b/swift/template/templates/qwen.py
index 29cb53db5f..d90c8ecb90 100644
--- a/swift/template/templates/qwen.py
+++ b/swift/template/templates/qwen.py
@@ -344,7 +344,8 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int
             video, video_kwargs = fetch_video(video_inputs, return_video_sample_fps=True, **kwargs)
             tokens = ['<|vision_start|><|video_pad|><|vision_end|>']
             if self.version == 'v2_5':
-                inputs.mm_processor_kwargs.setdefault('fps', []).append(video_kwargs)
+                if self.mode != 'vllm':
+                    inputs.mm_processor_kwargs.setdefault('fps', []).append(video_kwargs)
             elif self.version == 'v3':
                 if self.mode != 'vllm':
                     video, video_metadata = video