Skip to content

Commit f528800

Browse files
committed
LCORE-2282: Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
Fixes a 500 error when using Vertex AI models with llama-stack 0.6.x. Root cause: the llama-stack 0.6.x inline::meta-reference responses provider normalizes model IDs before checking allowed_models, but doesn't normalize the allowed_models list itself. This causes validation to fail: - Model registered as: publishers/google/models/gemini-2.5-flash - llama-stack strips to: google/gemini-2.5-flash internally - Checks against allowed list with full prefix - Mismatch → 500 error. Solution: Replace the publishers/google/models/ prefix with google/ before passing the model ID to llama-stack, matching what it expects internally. This workaround can be removed when upgrading to llama-stack 0.7.0+, which fixes the underlying bug via ogx-ai/ogx#5169. Signed-off-by: Anik Bhattacharjee <anbhatta@redhat.com>
1 parent d531855 commit f528800

5 files changed

Lines changed: 84 additions & 5 deletions

File tree

src/app/endpoints/rlsapi_v1.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
extract_provider_and_model_from_model_id,
5252
handle_known_apistatus_errors,
5353
is_context_length_error,
54+
normalize_vertex_ai_model_id,
5455
)
5556
from utils.quota import check_tokens_available
5657
from utils.responses import (
@@ -343,9 +344,12 @@ async def _call_llm(
343344

344345
logger.debug("Using model %s for rlsapi v1 inference", resolved_model_id)
345346

347+
# Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
348+
normalized_model = normalize_vertex_ai_model_id(resolved_model_id)
349+
346350
response = await client.responses.create(
347351
input=question,
348-
model=resolved_model_id,
352+
model=normalized_model,
349353
instructions=instructions,
350354
tools=tools or [],
351355
stream=False,

src/utils/compaction.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434

3535
from log import get_logger
3636
from models.compaction import ConversationSummary
37+
from utils.query import normalize_vertex_ai_model_id
3738
from utils.token_estimator import (
3839
estimate_conversation_tokens,
3940
estimate_tokens,
@@ -266,10 +267,14 @@ async def summarize_chunk(
266267
# by utils.responses.get_topic_summary and protects the directives from
267268
# prompt-injection via user message content that ends up in the
268269
# transcript.
270+
271+
# Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
272+
normalized_model = normalize_vertex_ai_model_id(model)
273+
269274
response = await client.responses.create(
270275
input=f"Conversation:\n{transcript}",
271276
instructions=SUMMARIZATION_PROMPT,
272-
model=model,
277+
model=normalized_model,
273278
stream=False,
274279
store=False,
275280
)
@@ -374,10 +379,14 @@ async def recursively_resummarize(
374379
model,
375380
)
376381
# Same instructions/input split as summarize_chunk — see comment there.
382+
383+
# Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
384+
normalized_model = normalize_vertex_ai_model_id(model)
385+
377386
response = await client.responses.create(
378387
input=transcript,
379388
instructions=RECURSIVE_RESUMMARIZATION_PROMPT,
380-
model=model,
389+
model=normalized_model,
381390
stream=False,
382391
store=False,
383392
)

src/utils/query.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,34 @@ def extract_provider_and_model_from_model_id(model_id: str) -> tuple[str, str]:
534534
return "", model_id
535535

536536

537+
def normalize_vertex_ai_model_id(model_id: str) -> str:
    """Rewrite a Vertex AI model ID into the short form llama-stack 0.6.x expects.

    llama-stack 0.6.x has a bug in the inline::meta-reference responses provider
    where it normalizes model IDs before checking against allowed_models, but
    doesn't normalize the allowed_models list itself. This causes Vertex AI
    models to fail validation because:
    - Model is registered as: publishers/google/models/gemini-2.5-flash
    - llama-stack strips to: google/gemini-2.5-flash internally
    - Checks against allowed list: ['publishers/google/models/gemini-2.5-flash']
    - Mismatch -> 500 error

    This workaround replaces a leading ``publishers/google/models/`` with
    ``google/`` so the ID already has the form llama-stack uses internally.
    Only a prefix at the very start of the ID is rewritten; any other ID
    (including one that merely contains the prefix further in) is returned
    unchanged.

    Fixed in llama-stack 0.7.0 via https://github.com/ogx-ai/ogx/pull/5169

    Args:
        model_id: The model ID, possibly in Vertex AI format.

    Returns:
        The model ID with a leading ``publishers/google/models/`` replaced by
        ``google/``, or the original ID if it does not start with that prefix.
    """
    vertex_prefix = "publishers/google/models/"
    if not model_id.startswith(vertex_prefix):
        return model_id
    # Drop only the leading prefix and re-root the remainder under "google/".
    return "google/" + model_id[len(vertex_prefix):]
563+
564+
537565
def handle_known_apistatus_errors(
538566
error: LLSApiStatusError | OpenAIAPIStatusError, model_id: str
539567
) -> AbstractErrorResponse:

src/utils/responses.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@
118118
from utils.query import (
119119
extract_provider_and_model_from_model_id,
120120
handle_known_apistatus_errors,
121+
normalize_vertex_ai_model_id,
121122
prepare_input,
122123
)
123124
from utils.suid import to_llama_stack_conversation_id
@@ -178,11 +179,14 @@ async def get_topic_summary( # pylint: disable=too-many-nested-blocks
178179
The topic summary for the question
179180
"""
180181
try:
182+
# Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
183+
normalized_model = normalize_vertex_ai_model_id(model_id)
184+
181185
response = cast(
182186
ResponseObject,
183187
await client.responses.create(
184188
input=question,
185-
model=model_id,
189+
model=normalized_model,
186190
instructions=get_topic_summary_system_prompt(),
187191
stream=False,
188192
store=False, # Don't store topic summary requests
@@ -389,9 +393,13 @@ async def prepare_responses_params( # pylint: disable=too-many-arguments,too-ma
389393

390394
# Build x-llamastack-provider-data header from MCP tool headers
391395
extra_headers = _build_provider_data_headers(tools)
396+
397+
# Normalize Vertex AI model IDs to work around llama-stack 0.6.x bug
398+
normalized_model = normalize_vertex_ai_model_id(model)
399+
392400
return ResponsesApiParams(
393401
input=input_text,
394-
model=model,
402+
model=normalized_model,
395403
instructions=system_prompt,
396404
tools=tools,
397405
conversation=llama_stack_conv_id,

tests/unit/utils/test_responses.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
from models.api.requests import QueryRequest
5858
from models.common.responses.types import InputTool, InputToolMCP
5959
from models.config import ApprovalFilter, ByokRag, ModelContextProtocolServer
60+
from utils.query import normalize_vertex_ai_model_id
6061
from utils.responses import (
6162
_build_chunk_attributes,
6263
_merge_tools,
@@ -3577,3 +3578,32 @@ async def test_merge_header_no_server_tools_returns_client_only(
35773578
)
35783579
assert tools is not None
35793580
assert len(tools) == 1
3581+
3582+
3583+
class TestNormalizeVertexAIModelId:
    """Unit tests covering normalize_vertex_ai_model_id."""

    def test_normalizes_vertex_ai_model_id(self) -> None:
        """A Vertex AI model ID is rewritten to the ``google/`` form."""
        result = normalize_vertex_ai_model_id(
            "publishers/google/models/gemini-2.5-flash"
        )
        assert result == "google/gemini-2.5-flash"

    def test_normalizes_vertex_ai_model_id_with_version(self) -> None:
        """A version suffix on a Vertex AI model ID survives normalization."""
        result = normalize_vertex_ai_model_id(
            "publishers/google/models/gemini-1.5-pro-001"
        )
        assert result == "google/gemini-1.5-pro-001"

    def test_preserves_non_vertex_ai_model_ids(self) -> None:
        """Model IDs without the Vertex AI prefix come back untouched."""
        # Checked in the same order as before: bare, OpenAI-style, watsonx-style.
        for untouched in ("gpt-4", "openai/gpt-4", "watsonx/model"):
            assert normalize_vertex_ai_model_id(untouched) == untouched

    def test_preserves_gemini_api_format(self) -> None:
        """The Gemini API format (models/...) is left as-is."""
        # This format has no publishers prefix, so nothing should be rewritten.
        model_id = "models/gemini-2.5-flash"
        assert normalize_vertex_ai_model_id(model_id) == model_id

0 commit comments

Comments
 (0)