Skip to content

Commit 94e1afa

Browse files
authored
Fix AudioFlamingo3/MusicFlamingo HF parity and RoTE handling (vllm-project#37643)
Signed-off-by: Lasha <26011196+lashahub@users.noreply.github.com>
1 parent 8fc571e commit 94e1afa

12 files changed

Lines changed: 1159 additions & 245 deletions

File tree

docs/models/supported_models.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
535535
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
536536
| ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
537537
| `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
538-
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
538+
| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
539539
| `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
540540
| `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
541541
| `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
@@ -586,6 +586,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
586586
| `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
587587
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
588588
| `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B` | ✅︎ | ✅︎ |
589+
| `MusicFlamingoForConditionalGeneration` | MusicFlamingo | T + A | `nvidia/music-flamingo-2601-hf`, `nvidia/music-flamingo-think-2601-hf` | ✅︎ | ✅︎ |
589590
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
590591
| `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
591592
| `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |

examples/offline_inference/audio_language.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,22 @@ def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
104104
enforce_eager=True,
105105
)
106106

107-
# MusicFlamingo uses <sound> token for audio
107+
# MusicFlamingo prompt placeholders use <sound>; vLLM's MusicFlamingo
108+
# multimodal processor expands each one into <|sound_bos|> + audio tokens +
109+
# <|sound_eos|> based on extracted audio feature lengths.
108110
audio_placeholder = "<sound>" * audio_count
111+
system_prompt = (
112+
"You are Music Flamingo, a multimodal assistant for language and music. "
113+
"On each turn you receive an audio clip which contains music and optional "
114+
"text, you will receive at least one or both; use your world knowledge and "
115+
"reasoning to help the user with any task. Interpret the entirety of the "
116+
"content any input music--regardlenss of whether the user calls it audio, "
117+
"music, or sound."
118+
)
109119

110120
prompt = (
111121
"<|im_start|>system\n"
112-
"You are a helpful assistant.<|im_end|>\n"
122+
f"{system_prompt}<|im_end|>\n"
113123
"<|im_start|>user\n"
114124
f"{audio_placeholder}{question}<|im_end|>\n"
115125
"<|im_start|>assistant\n"
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
{"transcriptions": ["The content of the input audio is 'you can ask why over and over and over again forever even if one day we explain every physical interaction and scientific law and hope and dream and regret with a single elegant equation'."], "token_ids": [[785, 2213, 315, 279, 1946, 7699, 374, 364, 9330, 646, 2548, 3170, 916, 323, 916, 323, 916, 1549, 15683, 1496, 421, 825, 1899, 582, 10339, 1449, 6961, 16230, 323, 12344, 2329, 323, 3900, 323, 7904, 323, 22231, 448, 264, 3175, 25777, 23606, 4427, 151645]]}
1+
{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other."], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645]]}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"transcriptions": ["This track is an energetic Eurodance / Dance‑Pop anthem that blends the bright, melodic sensibilities of mainstream pop with the driving, club‑ready pulse of classic Eurodance. The duration of the piece is ", "**Verse 1**\nMidnight cravings in bloom, lights flicker in the room, pepperoni dreams arise, pizza party on your skies\n\n**Verse 2**\nCheese melts on the crust, in flavor we trust, boxes stacked to the"], "token_ids": [[1986, 3754, 374, 458, 44855, 19461, 98875, 378, 107, 14, 378, 107, 35, 681, 55964, 11598, 55564, 429, 57843, 279, 9906, 11, 10581, 52760, 6097, 13450, 315, 20729, 2420, 448, 279, 9842, 11, 6335, 55964, 2307, 27235, 315, 11416, 19461, 98875, 13, 220, 576, 8090, 315, 279, 6573, 374, 220], [334, 68043, 220, 16, 1019, 33648, 9287, 88828, 304, 51454, 11, 12711, 28347, 261, 304, 279, 3054, 11, 24353, 20783, 18707, 30789, 11, 22502, 4614, 389, 697, 49293, 271, 334, 68043, 220, 17, 1019, 26843, 2367, 98091, 389, 279, 39612, 11, 304, 17172, 582, 6950, 11, 14697, 41315, 311, 279]]}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"transcriptions": ["This track is an energetic Eurodance / Dance‑Pop anthem that blends the bright, melodic sensibilities of mainstream pop with the driving, club‑ready pulse of classic Eurodance. The duration of the piece is "], "token_ids": [[1986, 3754, 374, 458, 44855, 19461, 98875, 378, 107, 14, 378, 107, 35, 681, 55964, 11598, 55564, 429, 57843, 279, 9906, 11, 10581, 52760, 6097, 13450, 315, 20729, 2420, 448, 279, 9842, 11, 6335, 55964, 2307, 27235, 315, 11416, 19461, 98875, 13, 220, 576, 8090, 315, 279, 6573, 374, 220]]}

tests/models/multimodal/generation/test_audioflamingo3.py

Lines changed: 89 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,54 @@
2626
from vllm import LLM, SamplingParams
2727

2828
MODEL_NAME = "nvidia/audio-flamingo-3-hf"
29+
# One user turn: a text question followed by a single remote audio clip.
SINGLE_CONVERSATION = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "What is surprising about the relationship between "
                "the barking and the music?",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
                    "resolve/main/assets/"
                    "dogs_barking_in_sync_with_the_music.wav",
                },
            },
        ],
    }
]

# Two conversations sent as one batch. The first entry is SINGLE_CONVERSATION
# itself (same object), so the single and batched results for that request can
# be compared directly.
BATCHED_CONVERSATIONS = [
    SINGLE_CONVERSATION,
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Why is the philosopher's name mentioned in the "
                    "lyrics? (A) To express a sense of nostalgia "
                    "(B) To indicate that language cannot express clearly, "
                    "satirizing the inversion of black and white in the world "
                    "(C) To add depth and complexity to the lyrics "
                    "(D) To showcase the wisdom and influence of the "
                    "philosopher",
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://huggingface.co/datasets/nvidia/"
                        "AudioSkills/resolve/main/assets/"
                        "Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
                    },
                },
            ],
        }
    ],
]
2977

3078

3179
def get_fixture_path(filename):
@@ -34,21 +82,29 @@ def get_fixture_path(filename):
3482
)
3583

3684

85+
def assert_output_matches(output, expected_text, expected_token_ids):
    """Assert that the first completion of ``output`` matches the fixture.

    Args:
        output: Object exposing ``outputs[0].text`` and
            ``outputs[0].token_ids`` (as returned per request by ``llm.chat``).
        expected_text: Expected text; compared against the generated text
            after stripping leading/trailing whitespace from the latter.
        expected_token_ids: Expected token ids as a list.

    Raises:
        AssertionError: If the text or the token ids do not match.
    """
    generated = output.outputs[0]
    actual_text = generated.text.strip()
    assert actual_text == expected_text, (
        f"text mismatch: expected {expected_text!r}, got {actual_text!r}"
    )

    actual_token_ids = list(generated.token_ids)
    # The ids may differ by exactly one trailing token in either direction.
    # NOTE(review): presumably this tolerates an end-of-sequence token that is
    # present on only one side - confirm.
    assert (
        actual_token_ids == expected_token_ids
        or actual_token_ids == expected_token_ids[:-1]
        or actual_token_ids[:-1] == expected_token_ids
    ), (
        f"token id mismatch: expected {expected_token_ids}, "
        f"got {actual_token_ids}"
    )
94+
95+
3796
@pytest.fixture(scope="module")
3897
def llm():
39-
# Check if the model is supported by the current transformers version
4098
model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
4199
model_info.check_transformers_version(on_fail="skip")
42100

43101
try:
44-
llm = LLM(
102+
return LLM(
45103
model=MODEL_NAME,
46-
trust_remote_code=True,
47104
dtype="bfloat16",
48105
enforce_eager=True,
49106
limit_mm_per_prompt={"audio": 1},
50107
)
51-
return llm
52108
except Exception as e:
53109
pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
54110

@@ -61,29 +117,17 @@ def test_single_generation(llm):
61117
with open(fixture_path) as f:
62118
expected = json.load(f)
63119

64-
audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
65-
66-
messages = [
67-
{
68-
"role": "user",
69-
"content": [
70-
{"type": "audio_url", "audio_url": {"url": audio_url}},
71-
{"type": "text", "text": "Transcribe the input speech."},
72-
],
73-
}
74-
]
75-
76120
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
77121

78122
outputs = llm.chat(
79-
messages=messages,
123+
messages=SINGLE_CONVERSATION,
80124
sampling_params=sampling_params,
81125
)
82-
generated_text = outputs[0].outputs[0].text.strip()
83-
84-
expected_text = expected["transcriptions"][0]
85-
86-
assert expected_text in generated_text or generated_text in expected_text
126+
assert_output_matches(
127+
outputs[0],
128+
expected["transcriptions"][0],
129+
expected["token_ids"][0],
130+
)
87131

88132

89133
def test_batched_generation(llm):
@@ -94,49 +138,34 @@ def test_batched_generation(llm):
94138
with open(fixture_path) as f:
95139
expected = json.load(f)
96140

97-
items = [
98-
{
99-
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
100-
"question": "What is surprising about the relationship "
101-
"between the barking and the music?",
102-
"expected_idx": 0,
103-
},
104-
{
105-
"audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
106-
"question": (
107-
"Why is the philosopher's name mentioned in the lyrics? "
108-
"(A) To express a sense of nostalgia "
109-
"(B) To indicate that language cannot express clearly, "
110-
"satirizing the inversion of black and white in the world "
111-
"(C) To add depth and complexity to the lyrics "
112-
"(D) To showcase the wisdom and influence of the philosopher"
113-
),
114-
"expected_idx": 1,
115-
},
116-
]
117-
118-
conversations = []
119-
for item in items:
120-
messages = [
121-
{
122-
"role": "user",
123-
"content": [
124-
{"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
125-
{"type": "text", "text": item["question"]},
126-
],
127-
}
128-
]
129-
conversations.append(messages)
130-
131141
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
132142

133143
outputs = llm.chat(
134-
messages=conversations,
144+
messages=BATCHED_CONVERSATIONS,
135145
sampling_params=sampling_params,
136146
)
137147

138148
for i, output in enumerate(outputs):
139-
generated_text = output.outputs[0].text.strip()
140-
expected_text = expected["transcriptions"][i]
149+
assert_output_matches(
150+
output,
151+
expected["transcriptions"][i],
152+
expected["token_ids"][i],
153+
)
154+
155+
156+
def test_single_and_batched_generation_match(llm):
157+
sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
141158

142-
assert expected_text in generated_text or generated_text in expected_text
159+
single_output = llm.chat(
160+
messages=SINGLE_CONVERSATION,
161+
sampling_params=sampling_params,
162+
)[0]
163+
batched_output = llm.chat(
164+
messages=BATCHED_CONVERSATIONS,
165+
sampling_params=sampling_params,
166+
)[0]
167+
168+
assert single_output.outputs[0].text == batched_output.outputs[0].text
169+
assert list(single_output.outputs[0].token_ids) == list(
170+
batched_output.outputs[0].token_ids
171+
)
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
4+
import json
5+
import os
6+
7+
import pytest
8+
9+
from tests.models.registry import HF_EXAMPLE_MODELS
10+
from vllm import LLM, SamplingParams
11+
12+
# Hugging Face checkpoint loaded by the tests in this module.
MODEL_NAME = "nvidia/music-flamingo-2601-hf"

# One user turn: a text instruction followed by a single remote audio clip.
SINGLE_CONVERSATION = [
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Describe this track in full detail - tell me the "
                "genre, tempo, and key, then dive into the instruments, "
                "production style, and overall mood it creates.",
            },
            {
                "type": "audio_url",
                "audio_url": {
                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
                    "resolve/main/assets/song_1.mp3",
                },
            },
        ],
    }
]

# Two conversations sent as one batch. The first entry is SINGLE_CONVERSATION
# itself (same object), so the single and batched results for that request can
# be compared directly.
BATCHED_CONVERSATIONS = [
    SINGLE_CONVERSATION,
    [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Generate a structured lyric sheet from the input music.",
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://huggingface.co/datasets/nvidia/"
                        "AudioSkills/resolve/main/assets/song_2.mp3",
                    },
                },
            ],
        }
    ],
]
54+
55+
56+
def get_fixture_path(filename: str) -> str:
    """Return the path of a MusicFlamingo fixture file.

    The path is built from this module's directory joined with
    ``../../fixtures/musicflamingo`` and ``filename``. It is not normalized,
    so the ``..`` components remain in the returned string, and the file is
    not checked for existence.
    """
    return os.path.join(
        os.path.dirname(__file__), "../../fixtures/musicflamingo", filename
    )
60+
61+
62+
def assert_output_matches(output, expected_text, expected_token_ids):
    """Assert that the first completion of ``output`` matches the fixture.

    Args:
        output: Object exposing ``outputs[0].text`` and
            ``outputs[0].token_ids`` (as returned per request by ``llm.chat``).
        expected_text: Expected text; compared exactly, without stripping
            whitespace.
        expected_token_ids: Expected token ids as a list.

    Raises:
        AssertionError: If the text or the token ids do not match.
    """
    generated = output.outputs[0]
    assert generated.text == expected_text, (
        f"text mismatch: expected {expected_text!r}, got {generated.text!r}"
    )

    actual_token_ids = list(generated.token_ids)
    # The ids may differ by exactly one trailing token in either direction.
    # NOTE(review): presumably this tolerates an end-of-sequence token that is
    # present on only one side - confirm.
    assert (
        actual_token_ids == expected_token_ids
        or actual_token_ids == expected_token_ids[:-1]
        or actual_token_ids[:-1] == expected_token_ids
    ), (
        f"token id mismatch: expected {expected_token_ids}, "
        f"got {actual_token_ids}"
    )
71+
72+
73+
@pytest.fixture(scope="module")
def llm():
    """Build the MusicFlamingo ``LLM`` once per test module.

    The transformers version check is invoked with ``on_fail="skip"``, and any
    exception raised while constructing the engine is converted into a pytest
    skip rather than a test failure.
    """
    model_info = HF_EXAMPLE_MODELS.get_hf_info("MusicFlamingoForConditionalGeneration")
    model_info.check_transformers_version(on_fail="skip")

    try:
        return LLM(
            model=MODEL_NAME,
            dtype="bfloat16",
            enforce_eager=True,
            max_model_len=8192,
            limit_mm_per_prompt={"audio": 1},
        )
    except Exception as e:
        # Deliberately broad: a load failure of any kind skips the module's
        # tests instead of failing them.
        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
88+
89+
90+
def test_single_generation(llm):
91+
fixture_path = get_fixture_path("expected_results_single.json")
92+
if not os.path.exists(fixture_path):
93+
pytest.skip(f"Fixture not found: {fixture_path}")
94+
95+
with open(fixture_path) as f:
96+
expected = json.load(f)
97+
98+
outputs = llm.chat(
99+
messages=SINGLE_CONVERSATION,
100+
sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
101+
)
102+
103+
assert_output_matches(
104+
outputs[0],
105+
expected["transcriptions"][0],
106+
expected["token_ids"][0],
107+
)
108+
109+
110+
def test_batched_generation(llm):
    """Check a batched chat request against ``expected_results_batched.json``.

    Skips when the fixture file is missing. Output ``i`` is compared with
    entry ``i`` of the fixture's ``transcriptions`` and ``token_ids`` lists.
    """
    fixture_path = get_fixture_path("expected_results_batched.json")
    if not os.path.exists(fixture_path):
        pytest.skip(f"Fixture not found: {fixture_path}")

    # Read as UTF-8 explicitly: the expected text can contain non-ASCII
    # characters, and the locale default encoding is platform dependent.
    with open(fixture_path, encoding="utf-8") as f:
        expected = json.load(f)

    outputs = llm.chat(
        messages=BATCHED_CONVERSATIONS,
        sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
    )

    # Without this check the loop below would pass vacuously if fewer outputs
    # came back than conversations were submitted.
    assert len(outputs) == len(BATCHED_CONVERSATIONS), (
        f"expected {len(BATCHED_CONVERSATIONS)} outputs, got {len(outputs)}"
    )

    for i, output in enumerate(outputs):
        assert_output_matches(
            output,
            expected["transcriptions"][i],
            expected["token_ids"][i],
        )
129+
130+
131+
def test_single_and_batched_generation_match(llm):
132+
sampling_params = SamplingParams(temperature=0.0, max_tokens=50)
133+
134+
single_output = llm.chat(
135+
messages=SINGLE_CONVERSATION,
136+
sampling_params=sampling_params,
137+
)[0]
138+
batched_output = llm.chat(
139+
messages=BATCHED_CONVERSATIONS,
140+
sampling_params=sampling_params,
141+
)[0]
142+
143+
assert single_output.outputs[0].text == batched_output.outputs[0].text
144+
assert list(single_output.outputs[0].token_ids) == list(
145+
batched_output.outputs[0].token_ids
146+
)

0 commit comments

Comments
 (0)