Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1177,6 +1177,7 @@ def _read_requirements(filename: str) -> list[str]:
"av",
"scipy",
"soundfile",
"soxr",
"mistral_common[audio]",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
Expand Down
7 changes: 6 additions & 1 deletion tests/models/language/generation/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,11 @@ def test_models(
"def add(a, b):\n return a + b\n\ndef sub(a, b):\n return a - "
)

with hf_runner(model) as hf_model:
with hf_runner(
model,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs
)
Expand Down Expand Up @@ -188,6 +192,7 @@ def test_models(
model,
tokenizer_name=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
# Remove the effects of batch variance on ROCm since batch invariance
# is not yet supported.
Expand Down
32 changes: 19 additions & 13 deletions tests/models/multimodal/generation/test_musicflamingo.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ def get_fixture_path(filename):
)


def load_expected_fixture(filename):
    """Load and return the parsed JSON fixture named *filename*.

    The file location is resolved with ``get_fixture_path``. Errors raised
    while opening or parsing the file (for example a missing fixture or
    invalid JSON) propagate to the caller.
    """
    fixture_path = get_fixture_path(filename)
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's locale default encoding.
    with open(fixture_path, encoding="utf-8") as f:
        return json.load(f)


def assert_output_matches(output, expected_text, expected_token_ids):
generated = output.outputs[0]
assert generated.text == expected_text
Expand All @@ -76,7 +82,7 @@ def llm():
model_info.check_transformers_version(on_fail="skip")

try:
return LLM(
llm = LLM(
model=MODEL_NAME,
dtype="bfloat16",
enforce_eager=True,
Expand All @@ -86,14 +92,19 @@ def llm():
except Exception as e:
pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")

# ROCm may compile decoder kernels on the first inference pass; warm up
# once so exact fixture assertions cover the steady-state path.
llm.chat(
messages=SINGLE_CONVERSATION,
sampling_params=SamplingParams(temperature=0.0, max_tokens=1),
use_tqdm=False,
)

def test_single_generation(llm):
fixture_path = get_fixture_path("expected_results_single.json")
if not os.path.exists(fixture_path):
pytest.skip(f"Fixture not found: {fixture_path}")
return llm

with open(fixture_path) as f:
expected = json.load(f)

def test_single_generation(llm):
expected = load_expected_fixture("expected_results_single.json")

outputs = llm.chat(
messages=SINGLE_CONVERSATION,
Expand All @@ -108,12 +119,7 @@ def test_single_generation(llm):


def test_batched_generation(llm):
fixture_path = get_fixture_path("expected_results_batched.json")
if not os.path.exists(fixture_path):
pytest.skip(f"Fixture not found: {fixture_path}")

with open(fixture_path) as f:
expected = json.load(f)
expected = load_expected_fixture("expected_results_batched.json")

outputs = llm.chat(
messages=BATCHED_CONVERSATIONS,
Expand Down
41 changes: 26 additions & 15 deletions tests/models/multimodal/processing/test_audioflamingo3.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,30 @@ def __init__(self):
self.audio_token_id = 12345
self.max_audio_len = 60
self.feature_extractor = MockFeatureExtractor()
self.tokenizer = self._tokenize

def __call__(self, text=None, audios=None, **kwargs):
return {"input_ids": [1, 2, 3], "input_features": [np.zeros((3000, 80))]}
def __call__(self, text=None, audio=None, **kwargs):
return {
"input_ids": torch.tensor([[1, 2, 3]], dtype=torch.long),
"input_features": torch.zeros((3, 80, 3000)),
"input_features_mask": torch.ones((3, 3000), dtype=torch.long),
}

def _tokenize(self, text, **kwargs):
return {"input_ids": torch.tensor([[1, 2, 3]], dtype=torch.long)}


class MockFeatureExtractor:
    """Minimal stand-in for an audio feature extractor used in tests.

    Exposes the ``sampling_rate``, ``chunk_length`` and ``hop_length``
    attributes and, when called, returns all-zero features plus an all-ones
    attention mask with one row per input audio.
    """

    # Size of the second dimension of the dummy ``input_features`` tensor
    # (presumably the number of mel bins -- confirm against the real
    # extractor).
    _FEATURE_SIZE = 80

    def __init__(self):
        self.sampling_rate = 16000
        self.chunk_length = 30
        self.hop_length = 160

    def __call__(self, audios, **kwargs):
        """Return dummy features for *audios*; extra keyword args are ignored.

        The result maps ``"input_features"`` to a zero tensor of shape
        ``(len(audios), 80, n_frames)`` and ``"attention_mask"`` to a ones
        tensor of shape ``(len(audios), n_frames)`` with dtype ``torch.long``.
        """
        batch_size = len(audios)
        # Derive the frame count from the extractor attributes instead of
        # repeating the literal: 16000 * 30 // 160 == 3000.
        n_frames = self.sampling_rate * self.chunk_length // self.hop_length
        return {
            "input_features": torch.zeros(
                (batch_size, self._FEATURE_SIZE, n_frames)
            ),
            "attention_mask": torch.ones(
                (batch_size, n_frames), dtype=torch.long
            ),
        }


@pytest.fixture
Expand All @@ -60,6 +75,9 @@ def mock_ctx():
ctx = MagicMock()
ctx.get_hf_config.return_value = config
ctx.get_hf_processor.return_value = MockAudioFlamingo3Processor()
ctx.call_hf_processor.side_effect = lambda processor, data, kwargs: processor(
**data, **kwargs
)
ctx.model_config.hf_config = config
return ctx

Expand Down Expand Up @@ -89,21 +107,14 @@ def test_audio_chunk_counting(mock_ctx):
mm_data = {"audio": [audio_1, audio_2]}
prompt = "<|user|>Listen.<|end|>"

from vllm.multimodal.processing import BaseMultiModalProcessor

def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
return {"input_ids": [1, 2, 3], "input_features": torch.randn(1, 80, 3000)}

with pytest.MonkeyPatch.context() as mp:
mp.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)

processed = processor._call_hf_processor(prompt, mm_data, {}, {})
processed = processor._call_hf_processor(prompt, mm_data, {}, {})

chunk_counts = processed["chunk_counts"]
chunk_counts = processed["chunk_counts"]

assert chunk_counts[0].item() == 1
assert chunk_counts[1].item() == 2
assert len(chunk_counts) == 2
assert chunk_counts[0].item() == 1
assert chunk_counts[1].item() == 2
assert len(chunk_counts) == 2
assert processed["feature_attention_mask"].shape == (3, 3000)


def test_dummy_data_generation(mock_ctx):
Expand Down
27 changes: 13 additions & 14 deletions tests/models/multimodal/processing/test_musicflamingo.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ def __init__(self):
self.max_audio_len = 1200
self.feature_extractor = MockFeatureExtractor()

def __call__(self, text=None, audio=None, **kwargs):
return {
"input_ids": torch.tensor([[1, 2, 3]], dtype=torch.long),
"input_features": torch.zeros((3, 80, 3000)),
"input_features_mask": torch.ones((3, 3000), dtype=torch.long),
}


class MockFeatureExtractor:
def __init__(self):
Expand All @@ -63,6 +70,9 @@ def mock_ctx():
ctx = MagicMock()
ctx.get_hf_config.return_value = config
ctx.get_hf_processor.return_value = MockMusicFlamingoProcessor()
ctx.call_hf_processor.side_effect = lambda processor, data, kwargs: processor(
**data, **kwargs
)
ctx.model_config.hf_config = config
return ctx

Expand All @@ -73,7 +83,7 @@ def check_transformers_version():
model_info.check_transformers_version(on_fail="skip")


def test_musicflamingo_chunk_counting_uses_rote_timestamps(mock_ctx, monkeypatch):
def test_musicflamingo_chunk_counting_without_rote_timestamps(mock_ctx):
from vllm.model_executor.models.musicflamingo import (
MusicFlamingoDummyInputsBuilder,
MusicFlamingoMultiModalProcessor,
Expand All @@ -92,24 +102,13 @@ def test_musicflamingo_chunk_counting_uses_rote_timestamps(mock_ctx, monkeypatch
mm_data = {"audio": [audio_1, audio_2]}
prompt = "<|user|>Listen.<|end|>"

from vllm.multimodal.processing import BaseMultiModalProcessor

def mock_base_call(self, prompt, mm_data, mm_kwargs, tok_kwargs):
del self, prompt, mm_data, mm_kwargs, tok_kwargs
return {
"input_ids": [1, 2, 3],
"input_features": torch.randn(3, 80, 3000),
"rote_timestamps": torch.randn(3, 750),
}

monkeypatch.setattr(BaseMultiModalProcessor, "_call_hf_processor", mock_base_call)

processed = processor._call_hf_processor(prompt, mm_data, {}, {})

chunk_counts = processed["chunk_counts"]

assert chunk_counts.tolist() == [1, 2]
assert "rote_timestamps" in processed
assert "rote_timestamps" not in processed
assert processed["feature_attention_mask"].shape == (3, 3000)


def test_musicflamingo_dummy_text_uses_plain_audio_tokens(mock_ctx):
Expand Down
40 changes: 30 additions & 10 deletions tests/models/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,22 +145,28 @@ def check_transformers_version(
# Only check the base version for the min/max version, otherwise preview
# models cannot be run because `x.yy.0.dev0`<`x.yy.0`
if min_version and Version(cur_base_version) < Version(min_version):
is_version_valid = not check_min_version
is_version_valid = False
should_check_version = check_min_version
msg += f">={min_version}` is required to run this model."
elif max_version and Version(cur_base_version) > Version(max_version):
is_version_valid = not check_max_version
is_version_valid = False
should_check_version = check_max_version
msg += f"<={max_version}` is required to run this model."
else:
is_version_valid = True
should_check_version = False

# check if Transformers version breaks the corresponding model runner,
# skip test when model runner not compatible
is_reason_valid = not (
check_version_reason
and self.transformers_version_reason
# Reasons explain a known incompatibility with a violated version
# bound. They should not skip models when the installed version is
# already within the requested range.
is_reason_applicable = (
not is_version_valid
and self.transformers_version_reason is not None
and check_version_reason in self.transformers_version_reason
)
is_transformers_valid = is_version_valid and is_reason_valid
is_transformers_valid = is_version_valid or (
not should_check_version and not is_reason_applicable
)
if is_transformers_valid:
return None
elif self.transformers_version_reason:
Expand Down Expand Up @@ -335,6 +341,12 @@ def check_available_online(
"HYV3ForCausalLM": _HfExamplesInfo("tencent/Hy3-preview", trust_remote_code=True),
"HyperCLOVAXForCausalLM": _HfExamplesInfo(
"naver-hyperclovax/HyperCLOVAX-SEED-Think-14B",
max_transformers_version="4.57",
transformers_version_reason={
"hf": "HF remote code indexes ROPE_INIT_FUNCTIONS['default']; "
"Transformers v5 supports default RoPE but handles it outside "
"ROPE_INIT_FUNCTIONS."
},
Comment thread
AndreasKaratzas marked this conversation as resolved.
trust_remote_code=True,
),
"InternLMForCausalLM": _HfExamplesInfo(
Expand Down Expand Up @@ -419,7 +431,15 @@ def check_available_online(
"openbmb/MiniCPM3-4B", trust_remote_code=True
),
"MiniCPM4ForCausalLM": _HfExamplesInfo(
"openbmb/MiniCPM4.1-8B", trust_remote_code=True
"openbmb/MiniCPM4.1-8B",
min_transformers_version="4.56",
max_transformers_version="4.57",
Comment thread
AndreasKaratzas marked this conversation as resolved.
transformers_version_reason={
"hf": "HF remote code imports removed `is_torch_fx_available`; "
"the upstream compatibility shim request was closed as not planned: "
"https://github.com/huggingface/transformers/issues/44561"
},
trust_remote_code=True,
),
"MiniMaxForCausalLM": _HfExamplesInfo("MiniMaxAI/MiniMax-Text-01-hf"),
"MiniMaxText01ForCausalLM": _HfExamplesInfo(
Expand Down Expand Up @@ -819,7 +839,7 @@ def check_available_online(
),
"MusicFlamingoForConditionalGeneration": _HfExamplesInfo(
"nvidia/music-flamingo-2601-hf",
min_transformers_version="5.3.0",
min_transformers_version="5.5.0",
transformers_version_reason={
"vllm": "Needs https://github.com/huggingface/transformers/pull/43538"
},
Expand Down
56 changes: 27 additions & 29 deletions vllm/model_executor/models/audioflamingo3.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ def get_data_parser(self) -> MultiModalDataParser:
)

def get_supported_mm_limits(self) -> Mapping[str, int | None]:
return {"audio": None}
return {"audio": 1}


class AudioFlamingo3DummyInputsBuilder(
Expand Down Expand Up @@ -379,32 +379,37 @@ def _call_hf_processor(
mm_kwargs: Mapping[str, Any],
tok_kwargs: Mapping[str, object],
) -> BatchFeature:
audios = mm_data.pop("audios", [])
if audios:
mm_data["audio"] = audios
processor_mm_data = dict(mm_data)
audios = processor_mm_data.pop("audios", None)
if audios is not None:
processor_mm_data["audio"] = audios

if not mm_data.get("audio", []):
prompt_ids = self.info.get_tokenizer().encode(prompt)
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")

processor = self.info.get_hf_processor(**mm_kwargs)
feature_extractor = processor.feature_extractor
mm_kwargs = dict(
**mm_kwargs,
sampling_rate=feature_extractor.sampling_rate,
outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=processor_mm_data,
mm_kwargs=mm_kwargs,
tok_kwargs=tok_kwargs,
)

audio_list = mm_data.get("audio")
if not isinstance(audio_list, list):
audio_list = [audio_list]
if "input_features_mask" in outputs:
outputs["feature_attention_mask"] = outputs.pop("input_features_mask")

chunk_counts = []
audio_data = processor_mm_data.get("audio")
if audio_data is None:
return outputs

audio_list = audio_data if isinstance(audio_data, list) else [audio_data]
if len(audio_list) == 0:
return outputs

processor = self.info.get_hf_processor(**mm_kwargs)
feature_extractor = processor.feature_extractor
sampling_rate = feature_extractor.sampling_rate
chunk_length = feature_extractor.chunk_length
window_size = int(sampling_rate * chunk_length)
max_windows = int(processor.max_audio_len // chunk_length)

chunk_counts = []
for audio in audio_list:
# audio is numpy array or list
n_samples = len(audio) if isinstance(audio, list) else audio.shape[0]
Expand All @@ -414,18 +419,7 @@ def _call_hf_processor(
n_win = max_windows
chunk_counts.append(n_win)

outputs = super()._call_hf_processor(
prompt=prompt,
mm_data=mm_data,
mm_kwargs=mm_kwargs,
tok_kwargs=tok_kwargs,
)

if "input_features_mask" in outputs:
outputs["feature_attention_mask"] = outputs.pop("input_features_mask")

outputs["chunk_counts"] = torch.tensor(chunk_counts, dtype=torch.long)

return outputs

def _get_mm_fields_config(
Expand Down Expand Up @@ -611,6 +605,10 @@ def _encode_audio_features(
input_features: torch.Tensor,
feature_attention_mask: torch.Tensor,
) -> torch.Tensor:
input_features = input_features.to(
dtype=self.audio_tower.conv1.weight.dtype,
device=self.audio_tower.conv1.weight.device,
)
audio_attention_mask = _build_audio_encoder_attention_mask(
feature_attention_mask,
dtype=self.audio_tower.conv1.weight.dtype,
Expand Down
Loading
Loading