Skip to content
1 change: 1 addition & 0 deletions changes/11491.enhance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add `AppProxyClientRegistry` to expose app-proxy domain clients (`DeploymentChatClient`) with the same lazy `@cached_property` pattern as `BackendAIClientRegistry`.
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,8 @@ split-on-trailing-comma = true
"src/ai/backend/client/utils.py" = ["PLC0415"]
# - Client v2 registry uses lazy imports to avoid circular dependencies with domain clients
"src/ai/backend/client/v2/registry.py" = ["PLC0415"]
# - Client v2 app-proxy registry uses lazy imports to avoid circular dependencies with domain clients
"src/ai/backend/client/v2/approxy_registry.py" = ["PLC0415"]
# - vfolder shim uses lazy imports to avoid circular dependency with api/rest/vfolder
"src/ai/backend/manager/api/vfolder.py" = ["PLC0415"]
# - Client func vfolder has progress printing
Expand Down
33 changes: 15 additions & 18 deletions src/ai/backend/client/cli/v2/deployment/chat/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@
DeploymentChatConfig,
DeploymentChatHistory,
)
from ai.backend.client.cli.v2.helpers import create_v2_registry, load_v2_config
from ai.backend.client.cli.v2.helpers import (
create_appproxy_registry,
create_v2_registry,
load_v2_config,
)
from ai.backend.common.dto.clients.openai_compat import ChatCompletionRequest
from ai.backend.common.identifier.deployment import DeploymentID

Expand Down Expand Up @@ -96,8 +100,6 @@ def chat(
temperature and top_p differ between runtime variants — pass them
through ``--params``.
"""
from ai.backend.client.v2.config import ClientConfig
from ai.backend.client.v2.deployment_chat import DeploymentChatClient
from ai.backend.client.v2.exceptions import DeploymentAuthError

connection_config = load_v2_config()
Expand Down Expand Up @@ -135,22 +137,15 @@ async def _run() -> None:
cache.save()

token = chat_config.get_token(deployment_id)
# ``endpoint`` is required on ClientConfig but unused by AppProxyClient
# (deployment URLs are passed per-request); pass through the manager
# endpoint so the rest of the connection knobs (TLS, timeouts) match.
client_config = ClientConfig(
endpoint=connection_config.endpoint,
endpoint_type=connection_config.endpoint_type,
api_version=connection_config.api_version,
skip_ssl_verification=connection_config.skip_ssl_verification,
)
async with DeploymentChatClient(client_config) as client:
appproxy_registry = await create_appproxy_registry(connection_config)
try:
client = appproxy_registry.deployment_chat
# Resolution: --model > config.model (user-set) >
# cache.default_model (auto) > GET /v1/models (auto, cached).
request_model = (
model or chat_config.get_model(deployment_id) or endpoint_entry.default_model
)
try:
# Resolution: --model > config.model (user-set) >
# cache.default_model (auto) > GET /v1/models (auto, cached).
request_model = (
model or chat_config.get_model(deployment_id) or endpoint_entry.default_model
)
if request_model is None:
# No explicit --model, no user-set config, no cached
# default — ask the OpenAI-compat endpoint itself which
Expand Down Expand Up @@ -199,6 +194,8 @@ async def _run() -> None:
f"re-register with:\n"
f" ./bai deployment chat-config set {deployment_id} --token <token>"
) from e
finally:
await appproxy_registry.close()
# Only persist when both sides of the round are present, so the file
# never carries half-conversations that would skew future context.
assistant_message = response.assistant_message
Expand Down
21 changes: 21 additions & 0 deletions src/ai/backend/client/cli/v2/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from yarl import URL

if TYPE_CHECKING:
from ai.backend.client.v2.approxy_registry import AppProxyClientRegistry
from ai.backend.client.v2.v2_registry import V2ClientRegistry

CONFIG_DIR = Path.home() / ".backend.ai"
Expand Down Expand Up @@ -126,6 +127,26 @@ async def create_v2_registry(config: V2ConnectionConfig) -> V2ClientRegistry:
return await V2ClientRegistry.create(client_config, auth)


async def create_appproxy_registry(config: V2ConnectionConfig) -> AppProxyClientRegistry:
    """Construct an ``AppProxyClientRegistry`` out of a ``V2ConnectionConfig``.

    Deployment runtimes (vLLM / SGLang / NIM / TGI) sit behind the
    app-proxy rather than the manager API, so the manager's HMAC
    credentials and cookie jar are deliberately left out here — only the
    TLS / timeout knobs carried by the manager-side config are reused.
    """
    # Lazy imports keep this CLI helper free of a hard (circular) dependency
    # on the v2 client package at module-import time.
    from ai.backend.client.v2.approxy_registry import AppProxyClientRegistry
    from ai.backend.client.v2.config import ClientConfig

    proxy_config = ClientConfig(
        endpoint=config.endpoint,
        endpoint_type=config.endpoint_type,
        api_version=config.api_version,
        skip_ssl_verification=config.skip_ssl_verification,
    )
    return await AppProxyClientRegistry.create(proxy_config)


def parse_order_options(
order_by: tuple[str, ...],
order_field_enum: type,
Expand Down
43 changes: 43 additions & 0 deletions src/ai/backend/client/v2/approxy_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""App-proxy client registry.

Provides ``AppProxyClientRegistry`` which lazy-loads domain clients that
target inference runtimes fronted by Backend.AI's app-proxy (vLLM, SGLang,
NIM, TGI in Messages API mode, etc.). Mirrors the
:class:`BackendAIClientRegistry` pattern but uses
:class:`BackendAIAppProxyClient` (token-based, deployment URL per request)
instead of :class:`BackendAIAuthClient` (HMAC-signed manager API).
"""

from __future__ import annotations

from functools import cached_property
from typing import TYPE_CHECKING

from .base_client import BackendAIAppProxyClient
from .config import ClientConfig

if TYPE_CHECKING:
from .deployment_chat import DeploymentChatClient


class AppProxyClientRegistry:
    """Lazy registry of domain clients that reach deployment runtimes via app-proxy.

    Mirrors the ``BackendAIClientRegistry`` pattern: domain clients are
    created on first access and memoized via ``cached_property``, all
    sharing one underlying :class:`BackendAIAppProxyClient` transport.
    """

    # Shared token-based HTTP transport; the registry owns its lifecycle.
    _client: BackendAIAppProxyClient

    def __init__(self, client: BackendAIAppProxyClient) -> None:
        self._client = client

    @classmethod
    async def create(cls, config: ClientConfig) -> AppProxyClientRegistry:
        """Alternate constructor: build the shared transport from *config*."""
        return cls(BackendAIAppProxyClient(config))

    async def close(self) -> None:
        """Release the underlying HTTP session."""
        await self._client.close()

    @cached_property
    def deployment_chat(self) -> DeploymentChatClient:
        # Imported lazily to break the circular dependency with the domain client.
        from .deployment_chat import DeploymentChatClient

        return DeploymentChatClient(self._client)
13 changes: 13 additions & 0 deletions src/ai/backend/client/v2/base_appproxy_domain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from .base_client import BackendAIAppProxyClient


class BaseAppProxyDomainClient:
    """Common base for domain clients that speak to deployment runtimes.

    Holds the shared :class:`BackendAIAppProxyClient` transport injected by
    the registry; subclasses issue their contract-specific requests through it.
    """

    # Low-level HTTP client; owned by the registry, merely borrowed here.
    _client: BackendAIAppProxyClient

    def __init__(self, client: BackendAIAppProxyClient) -> None:
        self._client = client
12 changes: 3 additions & 9 deletions src/ai/backend/client/v2/base_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,16 +464,10 @@ async def typed_request(


class BackendAIAppProxyClient:
"""HTTP client base for direct-to-deployment endpoints fronted by Backend.AI's app-proxy.
"""HTTP client for endpoints fronted by Backend.AI's app-proxy.

Unlike :class:`BackendAIAuthClient` (which signs requests with HMAC against
the Backend.AI manager API), this client targets the runtime's own HTTP
surface (vLLM / SGLang / NIM / TGI / custom) and uses an optional
``Authorization: Bearer <token>`` header. The deployment endpoint URL is
supplied per-request, not via :attr:`ClientConfig.endpoint`.

Subclasses add the contract-specific request methods (e.g. chat-completions,
/generate, etc.).
Uses an optional ``Authorization: Bearer <token>`` header; the target
URL is supplied per-request, not via :attr:`ClientConfig.endpoint`.
"""

_config: ClientConfig
Expand Down
12 changes: 7 additions & 5 deletions src/ai/backend/client/v2/deployment_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from typing import Any

from ai.backend.client.v2.base_client import BackendAIAppProxyClient
from ai.backend.client.v2.base_appproxy_domain import BaseAppProxyDomainClient
from ai.backend.common.dto.clients.openai_compat import (
ChatCompletionResponse,
ListModelsResponse,
Expand All @@ -26,14 +26,14 @@
_OPENAI_COMPATIBLE_MODELS_PATH = "/v1/models"


class DeploymentChatClient(BackendAIAppProxyClient):
class DeploymentChatClient(BaseAppProxyDomainClient):
"""OpenAI Chat Completions client for direct-to-deployment inference traffic.

Sends ``POST /v1/chat/completions`` with an OpenAI-shaped
``{model, messages, ...}`` JSON body. Compatible runtimes: vLLM,
SGLang, NVIDIA NIM, and TGI in Messages API mode. Vanilla TGI
(``/generate``) and arbitrary custom containers need a different
:class:`BackendAIAppProxyClient` subclass.
:class:`BaseAppProxyDomainClient` subclass.
"""

async def chat_completion(
Expand All @@ -42,7 +42,7 @@ async def chat_completion(
token: str | None,
body: dict[str, Any],
) -> ChatCompletionResponse:
payload = await self._request(
payload = await self._client._request(
"POST", endpoint_url, _OPENAI_COMPATIBLE_CHAT_PATH, token, body=body
)
return ChatCompletionResponse.model_validate(payload)
Expand All @@ -57,5 +57,7 @@ async def list_models(
Used to auto-derive a default model name when the caller did not
pass ``--model`` and no cached default is known.
"""
payload = await self._request("GET", endpoint_url, _OPENAI_COMPATIBLE_MODELS_PATH, token)
payload = await self._client._request(
"GET", endpoint_url, _OPENAI_COMPATIBLE_MODELS_PATH, token
)
return ListModelsResponse.model_validate(payload)
7 changes: 4 additions & 3 deletions tests/unit/client/v2/test_deployment_chat_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from yarl import URL

from ai.backend.client.exceptions import BackendAPIError, BackendClientError
from ai.backend.client.v2.base_client import BackendAIAppProxyClient
from ai.backend.client.v2.config import ClientConfig
from ai.backend.client.v2.deployment_chat import DeploymentChatClient
from ai.backend.client.v2.exceptions import DeploymentAuthError
Expand All @@ -22,11 +23,11 @@
async def chat_client() -> AsyncIterator[DeploymentChatClient]:
    """Yield a ``DeploymentChatClient`` wrapping a real ``BackendAIAppProxyClient``.

    The transport session is always closed after the consuming test
    finishes, even when the test body raises.
    """
    # ``endpoint`` is required on ClientConfig but unused by AppProxyClient.
    config = ClientConfig(endpoint=URL("http://manager.unused"))
    appproxy_client = BackendAIAppProxyClient(config)
    try:
        yield DeploymentChatClient(appproxy_client)
    finally:
        # Close the shared session regardless of test outcome.
        await appproxy_client.close()


def _make_body() -> dict[str, Any]:
Expand Down
58 changes: 58 additions & 0 deletions tests/unit/client_v2/test_approxy_registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from yarl import URL

from ai.backend.client.v2.approxy_registry import AppProxyClientRegistry
from ai.backend.client.v2.base_appproxy_domain import BaseAppProxyDomainClient
from ai.backend.client.v2.base_client import BackendAIAppProxyClient
from ai.backend.client.v2.config import ClientConfig
from ai.backend.client.v2.deployment_chat import DeploymentChatClient


def _build_appproxy_client(session: MagicMock | None = None) -> BackendAIAppProxyClient:
    """Construct a ``BackendAIAppProxyClient`` whose aiohttp session is a mock.

    Patches ``_create_aiohttp_session`` around the constructor call so that
    no running event loop is needed (aiohttp >= 3.13 raises otherwise).
    """
    mocked_session = MagicMock() if session is None else session
    with patch(
        "ai.backend.client.v2.base_client._create_aiohttp_session",
        return_value=mocked_session,
    ):
        client = BackendAIAppProxyClient(ClientConfig(endpoint=URL("https://api.example.com")))
    return client


class TestAppProxyClientRegistry:
    """Behavioral checks for ``AppProxyClientRegistry``."""

    @pytest.fixture
    def registry(self) -> AppProxyClientRegistry:
        # Registry over a transport whose aiohttp session is mocked out.
        return AppProxyClientRegistry(_build_appproxy_client())

    async def test_create_factory(self) -> None:
        config = ClientConfig(endpoint=URL("https://api.example.com"))
        with patch(
            "ai.backend.client.v2.base_client._create_aiohttp_session",
            return_value=MagicMock(),
        ):
            registry = await AppProxyClientRegistry.create(config)
        assert isinstance(registry._client, BackendAIAppProxyClient)

    def test_domain_clients_return_correct_types(self, registry: AppProxyClientRegistry) -> None:
        assert isinstance(registry.deployment_chat, DeploymentChatClient)

    def test_domain_clients_inherit_base(self, registry: AppProxyClientRegistry) -> None:
        assert isinstance(registry.deployment_chat, BaseAppProxyDomainClient)

    def test_cached_property_returns_same_instance(self, registry: AppProxyClientRegistry) -> None:
        # ``cached_property`` must hand back the identical object on re-access.
        assert registry.deployment_chat is registry.deployment_chat

    async def test_close_delegates_to_client(self) -> None:
        fake_session = AsyncMock()
        fake_session.closed = False
        registry = AppProxyClientRegistry(_build_appproxy_client(fake_session))
        await registry.close()
        fake_session.close.assert_awaited_once()
Loading