my-other-github-account
diff --git a/‎docs/models/supported_models.md‎
Lines changed: 2 additions & 1 deletion b/‎docs/models/supported_models.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/offline_inference/audio_language.py‎
Lines changed: 12 additions & 2 deletions b/‎examples/offline_inference/audio_language.py‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎tests/models/fixtures/audioflamingo3/expected_results_single.json‎
Lines changed: 1 addition & 1 deletion b/‎tests/models/fixtures/audioflamingo3/expected_results_single.json‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎tests/models/fixtures/musicflamingo/expected_results_batched.json‎
Lines changed: 1 addition & 0 deletions b/‎tests/models/fixtures/musicflamingo/expected_results_batched.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/models/fixtures/musicflamingo/expected_results_single.json‎
Lines changed: 1 addition & 0 deletions b/‎tests/models/fixtures/musicflamingo/expected_results_single.json‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/models/multimodal/generation/test_audioflamingo3.py‎
Lines changed: 89 additions & 60 deletions b/‎tests/models/multimodal/generation/test_audioflamingo3.py‎
Lines changed: 89 additions & 60 deletions
diff --git a/‎tests/models/multimodal/generation/test_musicflamingo.py‎
Lines changed: 146 additions & 0 deletions b/‎tests/models/multimodal/generation/test_musicflamingo.py‎
Lines changed: 146 additions & 0 deletions
@@ -535,7 +535,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
 | ------------ | ------ | ------ | ----------------- | -------------------- | ------------------------- |
 | `AriaForConditionalGeneration` | Aria | T + I<sup>+</sup> | `rhymes-ai/Aria` | | |
-| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-2601-hf` | ✅︎ | ✅︎ |
+| `AudioFlamingo3ForConditionalGeneration` | AudioFlamingo3 | T + A | `nvidia/audio-flamingo-3-hf`, `nvidia/music-flamingo-hf` | ✅︎ | ✅︎ |
 | `AyaVisionForConditionalGeneration` | Aya Vision | T + I<sup>+</sup> | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ |
 | `BagelForConditionalGeneration` | BAGEL | T + I<sup>+</sup> | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ |
 | `BeeForConditionalGeneration` | Bee-8B | T + I<sup>E+</sup> | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ |
@@ -586,6 +586,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `Mistral3ForConditionalGeneration` | Mistral3 (HF Transformers) | T + I<sup>+</sup> | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc. | ✅︎ | ✅︎ |
 | `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ |
 | `Molmo2ForConditionalGeneration` | Molmo2 | T + I<sup>+</sup> / V | `allenai/Molmo2-4B`, `allenai/Molmo2-8B`, `allenai/Molmo2-O-7B` | ✅︎ | ✅︎ |
+| `MusicFlamingoForConditionalGeneration` | MusicFlamingo | T + A | `nvidia/music-flamingo-2601-hf`, `nvidia/music-flamingo-think-2601-hf` | ✅︎ | ✅︎ |
 | `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ |
 | `OpenCUAForConditionalGeneration` | OpenCUA-7B | T + I<sup>E+</sup> | `xlangai/OpenCUA-7B` | ✅︎ | ✅︎ |
 | `OpenPanguVLForConditionalGeneration` | openpangu-VL | T + I<sup>E+</sup> + V<sup>E+</sup> | `FreedomIntelligence/openPangu-VL-7B` | ✅︎ | ✅︎ |
 
@@ -104,12 +104,22 @@ def run_musicflamingo(question: str, audio_count: int) -> ModelRequestData:
         enforce_eager=True,
     )
 
-    # MusicFlamingo uses <sound> token for audio
+    # MusicFlamingo prompt placeholders use <sound>; vLLM's MusicFlamingo
+    # multimodal processor expands each one into <|sound_bos|> + audio tokens +
+    # <|sound_eos|> based on extracted audio feature lengths.
     audio_placeholder = "<sound>" * audio_count
+    system_prompt = (
+        "You are Music Flamingo, a multimodal assistant for language and music. "
+        "On each turn you receive an audio clip which contains music and optional "
+        "text, you will receive at least one or both; use your world knowledge and "
+        "reasoning to help the user with any task. Interpret the entirety of the "
+        "content any input music--regardlenss of whether the user calls it audio, "
+        "music, or sound."
+    )
 
     prompt = (
         "<|im_start|>system\n"
-        "You are a helpful assistant.<|im_end|>\n"
+        f"{system_prompt}<|im_end|>\n"
         "<|im_start|>user\n"
         f"{audio_placeholder}{question}<|im_end|>\n"
         "<|im_start|>assistant\n"
 
@@ -1 +1 @@
-{"transcriptions": ["The content of the input audio is 'you can ask why over and over and over again forever even if one day we explain every physical interaction and scientific law and hope and dream and regret with a single elegant equation'."], "token_ids": [[785, 2213, 315, 279, 1946, 7699, 374, 364, 9330, 646, 2548, 3170, 916, 323, 916, 323, 916, 1549, 15683, 1496, 421, 825, 1899, 582, 10339, 1449, 6961, 16230, 323, 12344, 2329, 323, 3900, 323, 7904, 323, 22231, 448, 264, 3175, 25777, 23606, 4427, 151645]]}
+{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other."], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645]]}
@@ -0,0 +1 @@
+{"transcriptions": ["This track is an energetic Eurodance / Dance‑Pop anthem that blends the bright, melodic sensibilities of mainstream pop with the driving, club‑ready pulse of classic Eurodance.  The duration of the piece is ", "**Verse 1**\nMidnight cravings in bloom, lights flicker in the room, pepperoni dreams arise, pizza party on your skies\n\n**Verse 2**\nCheese melts on the crust, in flavor we trust, boxes stacked to the"], "token_ids": [[1986, 3754, 374, 458, 44855, 19461, 98875, 378, 107, 14, 378, 107, 35, 681, 55964, 11598, 55564, 429, 57843, 279, 9906, 11, 10581, 52760, 6097, 13450, 315, 20729, 2420, 448, 279, 9842, 11, 6335, 55964, 2307, 27235, 315, 11416, 19461, 98875, 13, 220, 576, 8090, 315, 279, 6573, 374, 220], [334, 68043, 220, 16, 1019, 33648, 9287, 88828, 304, 51454, 11, 12711, 28347, 261, 304, 279, 3054, 11, 24353, 20783, 18707, 30789, 11, 22502, 4614, 389, 697, 49293, 271, 334, 68043, 220, 17, 1019, 26843, 2367, 98091, 389, 279, 39612, 11, 304, 17172, 582, 6950, 11, 14697, 41315, 311, 279]]}
@@ -0,0 +1 @@
+{"transcriptions": ["This track is an energetic Eurodance / Dance‑Pop anthem that blends the bright, melodic sensibilities of mainstream pop with the driving, club‑ready pulse of classic Eurodance.  The duration of the piece is "], "token_ids": [[1986, 3754, 374, 458, 44855, 19461, 98875, 378, 107, 14, 378, 107, 35, 681, 55964, 11598, 55564, 429, 57843, 279, 9906, 11, 10581, 52760, 6097, 13450, 315, 20729, 2420, 448, 279, 9842, 11, 6335, 55964, 2307, 27235, 315, 11416, 19461, 98875, 13, 220, 576, 8090, 315, 279, 6573, 374, 220]]}
@@ -26,6 +26,54 @@
 from vllm import LLM, SamplingParams
 
 MODEL_NAME = "nvidia/audio-flamingo-3-hf"
+SINGLE_CONVERSATION = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What is surprising about the relationship between "
+                "the barking and the music?",
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
+                    "resolve/main/assets/"
+                    "dogs_barking_in_sync_with_the_music.wav",
+                },
+            },
+        ],
+    }
+]
+BATCHED_CONVERSATIONS = [
+    SINGLE_CONVERSATION,
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Why is the philosopher's name mentioned in the "
+                    "lyrics? (A) To express a sense of nostalgia "
+                    "(B) To indicate that language cannot express clearly, "
+                    "satirizing the inversion of black and white in the world "
+                    "(C) To add depth and complexity to the lyrics "
+                    "(D) To showcase the wisdom and influence of the "
+                    "philosopher",
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": "https://huggingface.co/datasets/nvidia/"
+                        "AudioSkills/resolve/main/assets/"
+                        "Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
+                    },
+                },
+            ],
+        }
+    ],
+]
 
 
 def get_fixture_path(filename):
@@ -34,21 +82,29 @@ def get_fixture_path(filename):
     )
 
 
+def assert_output_matches(output, expected_text, expected_token_ids):
+    generated = output.outputs[0]
+    assert generated.text.strip() == expected_text
+    actual_token_ids = list(generated.token_ids)
+    assert (
+        actual_token_ids == expected_token_ids
+        or actual_token_ids == expected_token_ids[:-1]
+        or actual_token_ids[:-1] == expected_token_ids
+    )
+
+
 @pytest.fixture(scope="module")
 def llm():
-    # Check if the model is supported by the current transformers version
     model_info = HF_EXAMPLE_MODELS.get_hf_info("AudioFlamingo3ForConditionalGeneration")
     model_info.check_transformers_version(on_fail="skip")
 
     try:
-        llm = LLM(
+        return LLM(
             model=MODEL_NAME,
-            trust_remote_code=True,
             dtype="bfloat16",
             enforce_eager=True,
             limit_mm_per_prompt={"audio": 1},
         )
-        return llm
     except Exception as e:
         pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
 
@@ -61,29 +117,17 @@ def test_single_generation(llm):
     with open(fixture_path) as f:
         expected = json.load(f)
 
-    audio_url = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Why_do_we_ask_questions_converted.wav"
-
-    messages = [
-        {
-            "role": "user",
-            "content": [
-                {"type": "audio_url", "audio_url": {"url": audio_url}},
-                {"type": "text", "text": "Transcribe the input speech."},
-            ],
-        }
-    ]
-
     sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
 
     outputs = llm.chat(
-        messages=messages,
+        messages=SINGLE_CONVERSATION,
         sampling_params=sampling_params,
     )
-    generated_text = outputs[0].outputs[0].text.strip()
-
-    expected_text = expected["transcriptions"][0]
-
-    assert expected_text in generated_text or generated_text in expected_text
+    assert_output_matches(
+        outputs[0],
+        expected["transcriptions"][0],
+        expected["token_ids"][0],
+    )
 
 
 def test_batched_generation(llm):
@@ -94,49 +138,34 @@ def test_batched_generation(llm):
     with open(fixture_path) as f:
         expected = json.load(f)
 
-    items = [
-        {
-            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/dogs_barking_in_sync_with_the_music.wav",
-            "question": "What is surprising about the relationship "
-            "between the barking and the music?",
-            "expected_idx": 0,
-        },
-        {
-            "audio_url": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/Ch6Ae9DT6Ko_00-04-03_00-04-31.wav",
-            "question": (
-                "Why is the philosopher's name mentioned in the lyrics? "
-                "(A) To express a sense of nostalgia "
-                "(B) To indicate that language cannot express clearly, "
-                "satirizing the inversion of black and white in the world "
-                "(C) To add depth and complexity to the lyrics "
-                "(D) To showcase the wisdom and influence of the philosopher"
-            ),
-            "expected_idx": 1,
-        },
-    ]
-
-    conversations = []
-    for item in items:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "audio_url", "audio_url": {"url": item["audio_url"]}},
-                    {"type": "text", "text": item["question"]},
-                ],
-            }
-        ]
-        conversations.append(messages)
-
     sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
 
     outputs = llm.chat(
-        messages=conversations,
+        messages=BATCHED_CONVERSATIONS,
         sampling_params=sampling_params,
     )
 
     for i, output in enumerate(outputs):
-        generated_text = output.outputs[0].text.strip()
-        expected_text = expected["transcriptions"][i]
+        assert_output_matches(
+            output,
+            expected["transcriptions"][i],
+            expected["token_ids"][i],
+        )
+
+
+def test_single_and_batched_generation_match(llm):
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=128)
 
-        assert expected_text in generated_text or generated_text in expected_text
+    single_output = llm.chat(
+        messages=SINGLE_CONVERSATION,
+        sampling_params=sampling_params,
+    )[0]
+    batched_output = llm.chat(
+        messages=BATCHED_CONVERSATIONS,
+        sampling_params=sampling_params,
+    )[0]
+
+    assert single_output.outputs[0].text == batched_output.outputs[0].text
+    assert list(single_output.outputs[0].token_ids) == list(
+        batched_output.outputs[0].token_ids
+    )
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+import os
+
+import pytest
+
+from tests.models.registry import HF_EXAMPLE_MODELS
+from vllm import LLM, SamplingParams
+
+MODEL_NAME = "nvidia/music-flamingo-2601-hf"
+SINGLE_CONVERSATION = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "Describe this track in full detail - tell me the "
+                "genre, tempo, and key, then dive into the instruments, "
+                "production style, and overall mood it creates.",
+            },
+            {
+                "type": "audio_url",
+                "audio_url": {
+                    "url": "https://huggingface.co/datasets/nvidia/AudioSkills/"
+                    "resolve/main/assets/song_1.mp3",
+                },
+            },
+        ],
+    }
+]
+BATCHED_CONVERSATIONS = [
+    SINGLE_CONVERSATION,
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Generate a structured lyric sheet from the input music.",
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": "https://huggingface.co/datasets/nvidia/"
+                        "AudioSkills/resolve/main/assets/song_2.mp3",
+                    },
+                },
+            ],
+        }
+    ],
+]
+
+
+def get_fixture_path(filename):
+    return os.path.join(
+        os.path.dirname(__file__), "../../fixtures/musicflamingo", filename
+    )
+
+
+def assert_output_matches(output, expected_text, expected_token_ids):
+    generated = output.outputs[0]
+    assert generated.text == expected_text
+    actual_token_ids = list(generated.token_ids)
+    assert (
+        actual_token_ids == expected_token_ids
+        or actual_token_ids == expected_token_ids[:-1]
+        or actual_token_ids[:-1] == expected_token_ids
+    )
+
+
+@pytest.fixture(scope="module")
+def llm():
+    model_info = HF_EXAMPLE_MODELS.get_hf_info("MusicFlamingoForConditionalGeneration")
+    model_info.check_transformers_version(on_fail="skip")
+
+    try:
+        return LLM(
+            model=MODEL_NAME,
+            dtype="bfloat16",
+            enforce_eager=True,
+            max_model_len=8192,
+            limit_mm_per_prompt={"audio": 1},
+        )
+    except Exception as e:
+        pytest.skip(f"Failed to load model {MODEL_NAME}: {e}")
+
+
+def test_single_generation(llm):
+    fixture_path = get_fixture_path("expected_results_single.json")
+    if not os.path.exists(fixture_path):
+        pytest.skip(f"Fixture not found: {fixture_path}")
+
+    with open(fixture_path) as f:
+        expected = json.load(f)
+
+    outputs = llm.chat(
+        messages=SINGLE_CONVERSATION,
+        sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
+    )
+
+    assert_output_matches(
+        outputs[0],
+        expected["transcriptions"][0],
+        expected["token_ids"][0],
+    )
+
+
+def test_batched_generation(llm):
+    fixture_path = get_fixture_path("expected_results_batched.json")
+    if not os.path.exists(fixture_path):
+        pytest.skip(f"Fixture not found: {fixture_path}")
+
+    with open(fixture_path) as f:
+        expected = json.load(f)
+
+    outputs = llm.chat(
+        messages=BATCHED_CONVERSATIONS,
+        sampling_params=SamplingParams(temperature=0.0, max_tokens=50),
+    )
+
+    for i, output in enumerate(outputs):
+        assert_output_matches(
+            output,
+            expected["transcriptions"][i],
+            expected["token_ids"][i],
+        )
+
+
+def test_single_and_batched_generation_match(llm):
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=50)
+
+    single_output = llm.chat(
+        messages=SINGLE_CONVERSATION,
+        sampling_params=sampling_params,
+    )[0]
+    batched_output = llm.chat(
+        messages=BATCHED_CONVERSATIONS,
+        sampling_params=sampling_params,
+    )[0]
+
+    assert single_output.outputs[0].text == batched_output.outputs[0].text
+    assert list(single_output.outputs[0].token_ids) == list(
+        batched_output.outputs[0].token_ids
+    )
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		-{"transcriptions": ["The content of the input audio is 'you can ask why over and over and over again forever even if one day we explain every physical interaction and scientific law and hope and dream and regret with a single elegant equation'."], "token_ids": [[785, 2213, 315, 279, 1946, 7699, 374, 364, 9330, 646, 2548, 3170, 916, 323, 916, 323, 916, 1549, 15683, 1496, 421, 825, 1899, 582, 10339, 1449, 6961, 16230, 323, 12344, 2329, 323, 3900, 323, 7904, 323, 22231, 448, 264, 3175, 25777, 23606, 4427, 151645]]}
	`1`	`+{"transcriptions": ["There is no clear relationship between the barking and the music, as they seem to be independent of each other."], "token_ids": [[3862, 374, 902, 2797, 5025, 1948, 279, 293, 33452, 323, 279, 4627, 11, 438, 807, 2803, 311, 387, 9489, 315, 1817, 1008, 13, 151645]]}`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"transcriptions": ["This track is an energetic Eurodance / Dance‑Pop anthem that blends the bright, melodic sensibilities of mainstream pop with the driving, club‑ready pulse of classic Eurodance.  The duration of the piece is ", "**Verse 1**\nMidnight cravings in bloom, lights flicker in the room, pepperoni dreams arise, pizza party on your skies\n\n**Verse 2**\nCheese melts on the crust, in flavor we trust, boxes stacked to the"], "token_ids": [[1986, 3754, 374, 458, 44855, 19461, 98875, 378, 107, 14, 378, 107, 35, 681, 55964, 11598, 55564, 429, 57843, 279, 9906, 11, 10581, 52760, 6097, 13450, 315, 20729, 2420, 448, 279, 9842, 11, 6335, 55964, 2307, 27235, 315, 11416, 19461, 98875, 13, 220, 576, 8090, 315, 279, 6573, 374, 220], [334, 68043, 220, 16, 1019, 33648, 9287, 88828, 304, 51454, 11, 12711, 28347, 261, 304, 279, 3054, 11, 24353, 20783, 18707, 30789, 11, 22502, 4614, 389, 697, 49293, 271, 334, 68043, 220, 17, 1019, 26843, 2367, 98091, 389, 279, 39612, 11, 304, 17172, 582, 6950, 11, 14697, 41315, 311, 279]]}