vllm-project · AndreasKaratzas · May 3, 2026 · May 3, 2026 · May 4, 2026 · May 4, 2026
diff --git a/setup.py b/setup.py
@@ -1177,6 +1177,7 @@ def _read_requirements(filename: str) -> list[str]:
             "av",
             "scipy",
             "soundfile",
+            "soxr",
             "mistral_common[audio]",
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility

@@ -152,7 +152,11 @@ def test_models(
             "def add(a, b):\n    return a + b\n\ndef sub(a, b):\n    return a - "
         )
 
-    with hf_runner(model) as hf_model:
+    with hf_runner(
+        model,
+        revision=model_info.revision,
+        trust_remote_code=model_info.trust_remote_code,
+    ) as hf_model:
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs
         )
@@ -188,6 +192,7 @@ def test_models(
         model,
         tokenizer_name=model_info.tokenizer or model,
         tokenizer_mode=model_info.tokenizer_mode,
+        revision=model_info.revision,
         trust_remote_code=model_info.trust_remote_code,
         # Remove the effects of batch variance on ROCm since batch invariance
         # is not yet supported.

@@ -59,6 +59,12 @@ def get_fixture_path(filename):
     )
 
 
+def load_expected_fixture(filename):
+    """Return the parsed JSON content of the named expected-results fixture."""
+    with open(get_fixture_path(filename), encoding="utf-8") as f:
+        return json.load(f)
+
+
 def assert_output_matches(output, expected_text, expected_token_ids):
     generated = output.outputs[0]
     assert generated.text == expected_text
@@ -76,7 +82,7 @@ def llm():
     model_info.check_transformers_version(on_fail="skip")
 
     try:
-        return LLM(
+        llm = LLM(
             model=MODEL_NAME,
             dtype="bfloat16",
             enforce_eager=True,
@@ -86,14 +92,19 @@ def llm():
     except Exception as e:
         pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
 
+    # ROCm may compile decoder kernels on the first inference pass; warm up
+    # once so exact fixture assertions cover the steady-state path.
+    llm.chat(
+        messages=SINGLE_CONVERSATION,
+        sampling_params=SamplingParams(temperature=0.0, max_tokens=1),
+        use_tqdm=False,
+    )
 
-def test_single_generation(llm):
-    fixture_path = get_fixture_path("expected_results_single.json")
-    if not os.path.exists(fixture_path):
-        pytest.skip(f"Fixture not found: {fixture_path}")
+    return llm
 
-    with open(fixture_path) as f:
-        expected = json.load(f)
+
+def test_single_generation(llm):
+    expected = load_expected_fixture("expected_results_single.json")
 
     outputs = llm.chat(
         messages=SINGLE_CONVERSATION,
@@ -108,12 +119,7 @@ def test_single_generation(llm):
 
 
 def test_batched_generation(llm):
-    fixture_path = get_fixture_path("expected_results_batched.json")
-    if not os.path.exists(fixture_path):
-        pytest.skip(f"Fixture not found: {fixture_path}")
-
-    with open(fixture_path) as f:
-        expected = json.load(f)
+    expected = load_expected_fixture("expected_results_batched.json")
 
     outputs = llm.chat(
         messages=BATCHED_CONVERSATIONS,

@@ -42,15 +42,30 @@ def __init__(self):
         self.audio_token_id = 12345
         self.max_audio_len = 60
         self.feature_extractor = MockFeatureExtractor()
+        self.tokenizer = self._tokenize
 
-    def __call__(self, text=None, audios=None, **kwargs):
-        return {"input_ids": [1, 2, 3], "input_features": [np.zeros((3000, 80))]}
+    def __call__(self, text=None, audio=None, **kwargs):
+        """Return fixed token ids plus zero features and an all-ones mask."""
+        ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
+        feats = torch.zeros((3, 80, 3000))
+        mask = torch.ones((3, 3000), dtype=torch.long)
+        return dict(input_ids=ids, input_features=feats, input_features_mask=mask)
+
+    def _tokenize(self, text, **kwargs):
+        return dict(input_ids=torch.tensor([[1, 2, 3]], dtype=torch.long))
 
 
 class MockFeatureExtractor:
     def __init__(self):
         self.sampling_rate = 16000
         self.chunk_length = 30
+        self.hop_length = 160
+
+    def __call__(self, audios, **kwargs):
+        """Return zero features and an all-ones mask, one row per input audio."""
+        rows = len(audios)
+        mask = torch.ones((rows, 3000), dtype=torch.long)
+        return dict(input_features=torch.zeros((rows, 80, 3000)), attention_mask=mask)
 
 
 @pytest.fixture
@@ -60,6 +75,9 @@ def mock_ctx():
     ctx = MagicMock()
     ctx.get_hf_config.return_value = config
     ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
+    ctx.call_hf_processor.side_effect = lambda processor, data, kwargs: processor(
+        **data, **kwargs
+    )
     ctx.model_config.hf_config = config
     return ctx
 
@@ -89,21 +107,14 @@ def test_audio_chunk_counting(mock_ctx):
     mm_data = {"audio": [audio_1, audio_2]}
     prompt = "<|user|>Listen.<|end|>"
 
-    from vllm.multimodal.processing import BaseMultiModalProcessor
-
-    def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
-        return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}
-
-    with pytest.MonkeyPatch.context() as mp:
-        mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
-
-        processed = processor._call_hf_processor(prompt, mm_data, {}, {})
+    processed = processor._call_hf_processor(prompt, mm_data, {}, {})
 
-        chunk_counts = processed["chunk_counts"]
+    chunk_counts = processed["chunk_counts"]
 
-        assert chunk_counts[0].item() == 1
-        assert chunk_counts[1].item() == 2
-        assert len(chunk_counts) == 2
+    assert chunk_counts[0].item() == 1
+    assert chunk_counts[1].item() == 2
+    assert len(chunk_counts) == 2
+    assert processed["feature_attention_mask"].shape == (3, 3000)
 
 
 def test_dummy_data_generation(mock_ctx):

@@ -49,6 +49,13 @@ def __init__(self):
         self.max_audio_len = 1200
         self.feature_extractor = MockFeatureExtractor()
 
+    def __call__(self, text=None, audio=None, **kwargs):
+        """Return fixed token ids plus zero features and an all-ones mask."""
+        token_ids = torch.tensor([[1, 2, 3]], dtype=torch.long)
+        feats = torch.zeros((3, 80, 3000))
+        mask = torch.ones((3, 3000), dtype=torch.long)
+        return dict(input_ids=token_ids, input_features=feats, input_features_mask=mask)
+
 
 class MockFeatureExtractor:
     def __init__(self):
@@ -63,6 +70,9 @@ def mock_ctx():
     ctx = MagicMock()
     ctx.get_hf_config.return_value = config
     ctx.get_hf_processor.return_value = MockMusicFlamingoProcessor()
+    ctx.call_hf_processor.side_effect = lambda processor, data, kwargs: processor(
+        **data, **kwargs
+    )
     ctx.model_config.hf_config = config
     return ctx
 
@@ -73,7 +83,7 @@ def check_transformers_version():
     model_info.check_transformers_version(on_fail="skip")
 
 
-def test_musicflamingo_chunk_counting_uses_rote_timestamps(mock_ctx, monkeypatch):
+def test_musicflamingo_chunk_counting_without_rote_timestamps(mock_ctx):
     from vllm.model_executor.models.musicflamingo import (
         MusicFlamingoDummyInputsBuilder,
         MusicFlamingoMultiModalProcessor,
@@ -92,24 +102,13 @@ def test_musicflamingo_chunk_counting_uses_rote_timestamps(mock_ctx, monkeypatch
     mm_data = {"audio": [audio_1, audio_2]}
     prompt = "<|user|>Listen.<|end|>"
 
-    from vllm.multimodal.processing import BaseMultiModalProcessor
-
-    def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
-        del self, prompt, mm_data, mm_kwargs, tok_kwargs
-        return {
-            "input_ids": [1, 2, 3],
-            "input_features": torch.randn(3, 80, 3000),
-            "rote_timestamps": torch.randn(3, 750),
-        }
-
-    monkeypatch.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)
-
     processed = processor._call_hf_processor(prompt, mm_data, {}, {})
 
     chunk_counts = processed["chunk_counts"]
 
     assert chunk_counts.tolist() == [1, 2]
-    assert "rote_timestamps" in processed
+    assert "rote_timestamps" not in processed
+    assert processed["feature_attention_mask"].shape == (3, 3000)
 
 
 def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):

@@ -145,22 +145,28 @@ def check_transformers_version(
         # Only check the base version for the min/max version, otherwise preview
         # models cannot be run because `x.yy.0.dev0`<`x.yy.0`
         if min_version and Version(cur_base_version) < Version(min_version):
-            is_version_valid = not check_min_version
+            is_version_valid = False
+            should_check_version = check_min_version
             msg += f">={min_version}` is required to run this model."
         elif max_version and Version(cur_base_version) > Version(max_version):
-            is_version_valid = not check_max_version
+            is_version_valid = False
+            should_check_version = check_max_version
             msg += f"<={max_version}` is required to run this model."
         else:
             is_version_valid = True
+            should_check_version = False
 
-        # check if Transformers version breaks the corresponding model runner,
-        # skip test when model runner not compatible
-        is_reason_valid = not (
-            check_version_reason
-            and self.transformers_version_reason
+        # Reasons explain a known incompatibility with a violated version
+        # bound. They should not skip models when the installed version is
+        # already within the requested range.
+        is_reason_applicable = (
+            not is_version_valid
+            and self.transformers_version_reason is not None
             and check_version_reason in self.transformers_version_reason
         )
-        is_transformers_valid = is_version_valid and is_reason_valid
+        is_transformers_valid = is_version_valid or (
+            not should_check_version and not is_reason_applicable
+        )
         if is_transformers_valid:
             return None
         elif self.transformers_version_reason:
@@ -335,6 +341,12 @@ def check_available_online(
     "HYV3ForCausalLM": _HfExamplesInfo("tencent/Hy3-preview", trust_remote_code=True),
     "HyperCLOVAXForCausalLM": _HfExamplesInfo(
         "naver-hyperclovax/HyperCLOVAX-SEED-Think-14B",
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": "HF remote code indexes ROPE_INIT_FUNCTIONS['default']; "
+            "Transformers v5 supports default RoPE but handles it outside "
+            "ROPE_INIT_FUNCTIONS."
+        },
         trust_remote_code=True,
     ),
     "InternLMForCausalLM": _HfExamplesInfo(
@@ -419,7 +431,15 @@ def check_available_online(
         "openbmb/MiniCPM3-4B", trust_remote_code=True
     ),
     "MiniCPM4ForCausalLM": _HfExamplesInfo(
-        "openbmb/MiniCPM4.1-8B", trust_remote_code=True
+        "openbmb/MiniCPM4.1-8B",
+        min_transformers_version="4.56",
+        max_transformers_version="4.57",
+        transformers_version_reason={
+            "hf": "HF remote code imports removed `is_torch_fx_available`; "
+            "the upstream compatibility shim request was closed as not planned: "
+            "https://github.com/huggingface/transformers/issues/44561"
+        },
+        trust_remote_code=True,
     ),
     "MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"),
     "MiniMaxText01ForCausalLM": _HfExamplesInfo(
@@ -819,7 +839,7 @@ def check_available_online(
     ),
     "MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
         "nvidia/music-flamingo-2601-hf",
-        min_transformers_version="5.3.0",
+        min_transformers_version="5.5.0",
         transformers_version_reason={
             "vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
         },

diff --git a/vllm/model_executor/models/audioflamingo3.py b/vllm/model_executor/models/audioflamingo3.py
@@ -202,7 +202,7 @@ def get_data_parser(self) -> MultiModalDataParser:
         )
 
     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
-        return {"audio": None}
+        return {"audio": 1}
 
 
 class AudioFlamingo3DummyInputsBuilder(
@@ -379,32 +379,37 @@ def _call_hf_processor(
         mm_kwargs: Mapping[str, Any],
         tok_kwargs: Mapping[str, object],
     ) -> BatchFeature:
-        audios = mm_data.pop("audios", [])
-        if audios:
-            mm_data["audio"] = audios
+        processor_mm_data = dict(mm_data)
+        audios = processor_mm_data.pop("audios", None)
+        if audios is not None:
+            processor_mm_data["audio"] = audios
 
-        if not mm_data.get("audio", []):
-            prompt_ids = self.info.get_tokenizer().encode(prompt)
-            prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
-            return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
-
-        processor = self.info.get_hf_processor(**mm_kwargs)
-        feature_extractor = processor.feature_extractor
-        mm_kwargs = dict(
-            **mm_kwargs,
-            sampling_rate=feature_extractor.sampling_rate,
+        outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=processor_mm_data,
+            mm_kwargs=mm_kwargs,
+            tok_kwargs=tok_kwargs,
         )
 
-        audio_list = mm_data.get("audio")
-        if not isinstance(audio_list, list):
-            audio_list = [audio_list]
+        if "input_features_mask" in outputs:
+            outputs["feature_attention_mask"] = outputs.pop("input_features_mask")
 
-        chunk_counts = []
+        audio_data = processor_mm_data.get("audio")
+        if audio_data is None:
+            return outputs
+
+        audio_list = audio_data if isinstance(audio_data, list) else [audio_data]
+        if len(audio_list) == 0:
+            return outputs
+
+        processor = self.info.get_hf_processor(**mm_kwargs)
+        feature_extractor = processor.feature_extractor
         sampling_rate = feature_extractor.sampling_rate
         chunk_length = feature_extractor.chunk_length
         window_size = int(sampling_rate * chunk_length)
         max_windows = int(processor.max_audio_len // chunk_length)
 
+        chunk_counts = []
         for audio in audio_list:
             # audio is numpy array or list
             n_samples = len(audio) if isinstance(audio, list) else audio.shape[0]
@@ -414,18 +419,7 @@ def _call_hf_processor(
                 n_win = max_windows
             chunk_counts.append(n_win)
 
-        outputs = super()._call_hf_processor(
-            prompt=prompt,
-            mm_data=mm_data,
-            mm_kwargs=mm_kwargs,
-            tok_kwargs=tok_kwargs,
-        )
-
-        if "input_features_mask" in outputs:
-            outputs["feature_attention_mask"] = outputs.pop("input_features_mask")
-
         outputs["chunk_counts"] = torch.tensor(chunk_counts, dtype=torch.long)
-
         return outputs
 
     def _get_mm_fields_config(
@@ -611,6 +605,10 @@ def _encode_audio_features(
         input_features: torch.Tensor,
         feature_attention_mask: torch.Tensor,
     ) -> torch.Tensor:
+        input_features = input_features.to(
+            dtype=self.audio_tower.conv1.weight.dtype,
+            device=self.audio_tower.conv1.weight.device,
+        )
         audio_attention_mask = _build_audio_encoder_attention_mask(
             feature_attention_mask,
             dtype=self.audio_tower.conv1.weight.dtype,