fix(BA-5967): pre-validate slot types in sokovan validator chain

fregataa · claude · fregataa · commit 410ebf872c0d · 2026-05-08T23:54:27.000+09:00
Add two SessionSpec validator rules so slot keys absent from the
target resource group's agent inventory are caught with a clear 4xx
before they reach ResourceLimitRule, where the downstream humanizer
would otherwise escalate the missing key into a 500.

ImageSlotTypeRule rejects images that declare a resource_spec slot
the target resource group has no agent for. RequestedSlotTypeRule
rejects caller-supplied resource entries whose resource_type is
likewise not served by any agent in the target resource group. Both
rules also reject when the RG has no non-terminated agents at all.

The per-RG slot inventory is sourced from the same ScalingGroupRow
fetch via selectinload over agents -> agent_resource_rows, with
TERMINATED agents filtered out in Python. The new bundle dataclass
_ScalingGroupWithSlotInventory keeps the SG row together with the
slot-name set derived from its non-terminated agents, so the
inventory snapshot stays consistent with the SG row read in the same
readonly transaction. The fetch helper raises
ScalingGroupNotFound on missing RG, so the caller no longer needs a
separate None check.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/changes/11515.fix.md b/changes/11515.fix.md
@@ -0,0 +1 @@
+Pre-validate slot types in the Sokovan SessionSpec validator chain so a session request whose image or caller declares a slot the target resource group does not provide is rejected with a clear 4xx, instead of crashing later in error-message humanization with a 500.
diff --git a/src/ai/backend/manager/repositories/scheduler/db_source/db_source.py b/src/ai/backend/manager/repositories/scheduler/db_source/db_source.py
@@ -6,6 +6,7 @@
 from collections import defaultdict
 from collections.abc import AsyncIterator, Mapping, Sequence
 from contextlib import asynccontextmanager as actxmgr
+from dataclasses import dataclass
 from datetime import datetime
 from decimal import Decimal
 from typing import TYPE_CHECKING, Any, cast
@@ -187,6 +188,20 @@ def _create_resource_slot_from_policy(
     return ResourceSlot.from_policy(resource_policy_map, cast(Mapping[str, Any], known_slot_types))
 
 
+@dataclass(frozen=True)
+class _ScalingGroupWithSlotInventory:
+    """Scaling group bundled with the slot-name set served by its agents.
+
+    ``active_slot_names`` is the set of slot names served by agents in
+    this scaling group whose status is not TERMINATED — i.e. the per-RG
+    slot inventory the validator chain consults to reject session
+    requests for slots the resource group does not provide.
+    """
+
+    sg_row: ScalingGroupRow
+    active_slot_names: frozenset[SlotName]
+
+
 class ScheduleDBSource:
     """
     Database source for schedule-related operations.
@@ -289,6 +304,44 @@ async def get_scheduling_data(self, scaling_group: str, spec: SchedulingSpec) ->
                 spec=spec,
             )
 
+    async def _fetch_scaling_group_with_slot_inventory(
+        self,
+        db_sess: SASession,
+        name: str,
+    ) -> _ScalingGroupWithSlotInventory:
+        """Load a scaling group together with its per-RG slot inventory.
+
+        Eager-loads ``agents`` and per-agent ``agent_resource_rows`` via
+        ``selectinload``, filters out TERMINATED agents, and projects the
+        remaining ``agent_resource_rows`` into a ``slot_name`` set. The
+        ``AgentRow`` instances themselves are not exposed — callers only
+        see the SG row and the derived inventory.
+
+        Raises:
+            ScalingGroupNotFound: when the scaling group does not exist.
+        """
+        sg_row = (
+            await db_sess.scalars(
+                sa.select(ScalingGroupRow)
+                .options(
+                    selectinload(ScalingGroupRow.agents).selectinload(AgentRow.agent_resource_rows)
+                )
+                .where(ScalingGroupRow.name == name)
+            )
+        ).one_or_none()
+        if sg_row is None:
+            raise ScalingGroupNotFound(f"Resource group {name} not found")
+        active_slot_names: frozenset[SlotName] = frozenset(
+            SlotName(ar.slot_name)
+            for agent in sg_row.agents
+            if agent.status != AgentStatus.TERMINATED
+            for ar in agent.agent_resource_rows
+        )
+        return _ScalingGroupWithSlotInventory(
+            sg_row=sg_row,
+            active_slot_names=active_slot_names,
+        )
+
     async def _fetch_scaling_group(
         self, db_sess: SASession, scaling_group: str
     ) -> ScalingGroupMeta:
@@ -1463,16 +1516,13 @@ async def fetch_session_spec_contexts(
             network_info: ScalingGroupNetworkInfo | None = None
             rg_defaults = None
             resource_group_allow_fractional = False
+            resource_group_slot_names: frozenset[SlotName] = frozenset()
             if resource_group_name:
-                sg_row = (
-                    await db_sess.scalars(
-                        sa.select(ScalingGroupRow).where(
-                            ScalingGroupRow.name == resource_group_name
-                        )
-                    )
-                ).one_or_none()
-                if sg_row is None:
-                    raise ScalingGroupNotFound(f"Resource group {resource_group_name} not found")
+                rg_bundle = await self._fetch_scaling_group_with_slot_inventory(
+                    db_sess, resource_group_name
+                )
+                sg_row = rg_bundle.sg_row
+                resource_group_slot_names = rg_bundle.active_slot_names
                 # Every production caller of ``enqueue_session_from_draft`` populates
                 # access_key/domain_name/project_id alongside resource_group_name; this
                 # branch flags the contract violation rather than letting the RG
@@ -1632,6 +1682,7 @@ async def fetch_session_spec_contexts(
             dotfile_data=dotfile_bundle,
             active_session_count=active_session_count,
             keypair_resource_policy=keypair_policy,
+            resource_group_slot_names=resource_group_slot_names,
         )
 
     async def pick_default_resource_group(
diff --git a/src/ai/backend/manager/repositories/scheduler/types/session_creation.py b/src/ai/backend/manager/repositories/scheduler/types/session_creation.py
@@ -10,6 +10,7 @@
 from ai.backend.common.types import (
     AccessKey,
     SessionId,
+    SlotName,
     VFolderMount,
 )
 from ai.backend.manager.data.dotfile.types import DotfileBundle
@@ -143,4 +144,5 @@ class SessionSpecContextFetch:
     vfolder_mounts_by_role: dict[str, tuple[VFolderMount, ...]]
     dotfile_data: DotfileBundle
     keypair_resource_policy: Any | None  # KeyPairResourcePolicyData
+    resource_group_slot_names: frozenset[SlotName] = field(default_factory=frozenset)
     active_session_count: int = 0
diff --git a/src/ai/backend/manager/sokovan/scheduling_controller/scheduling_controller.py b/src/ai/backend/manager/sokovan/scheduling_controller/scheduling_controller.py
@@ -48,8 +48,10 @@
     ConcurrentSessionLimitRule,
     ContainerLimitRule,
     DotfileVFolderConflictRule,
+    ImageSlotTypeRule,
     InferenceModelFolderRule,
     MountNameValidationRule,
+    RequestedSlotTypeRule,
     ResourceLimitRule,
     ServicePortRule,
     SessionSpecValidationContext,
@@ -125,6 +127,8 @@ def __init__(self, args: SchedulingControllerArgs) -> None:
         self._spec_validator = SessionSpecValidator([
             ConcurrentSessionLimitRule(),
             ContainerLimitRule(),
+            ImageSlotTypeRule(),
+            RequestedSlotTypeRule(),
             ResourceLimitRule(),
             ServicePortRule(),
             MountNameValidationRule(),
@@ -187,6 +191,7 @@ async def enqueue_session_from_draft(
             keypair_resource_policy=fetched.keypair_resource_policy,
             image_infos=fetched.image_infos,
             known_slot_types=known_slot_types,
+            resource_group_slot_names=fetched.resource_group_slot_names,
             dotfile_data=fetched.dotfile_data,
             active_session_count=fetched.active_session_count,
         )
diff --git a/src/ai/backend/manager/sokovan/scheduling_controller/validators/__init__.py b/src/ai/backend/manager/sokovan/scheduling_controller/validators/__init__.py
@@ -3,8 +3,10 @@
 from .concurrent_session_limit_rule import ConcurrentSessionLimitRule
 from .container_limit_rule import ContainerLimitRule
 from .dotfile_vfolder_conflict_rule import DotfileVFolderConflictRule
+from .image_slot_type_rule import ImageSlotTypeRule
 from .inference_model_folder_rule import InferenceModelFolderRule
 from .mount_name_validation_rule import MountNameValidationRule
+from .requested_slot_type_rule import RequestedSlotTypeRule
 from .resource_limit_rule import ResourceLimitRule
 from .service_port_rule import ServicePortRule
 from .session_spec_base import (
@@ -17,8 +19,10 @@
     "ConcurrentSessionLimitRule",
     "ContainerLimitRule",
     "DotfileVFolderConflictRule",
+    "ImageSlotTypeRule",
     "InferenceModelFolderRule",
     "MountNameValidationRule",
+    "RequestedSlotTypeRule",
     "ResourceLimitRule",
     "ServicePortRule",
     "SessionSpecValidationContext",
diff --git a/src/ai/backend/manager/sokovan/scheduling_controller/validators/image_slot_type_rule.py b/src/ai/backend/manager/sokovan/scheduling_controller/validators/image_slot_type_rule.py
@@ -0,0 +1,63 @@
+"""Image-declared slot-type compatibility validator.
+
+Every slot key declared in an image's ``resource_spec`` must be served
+by some non-terminated agent in the requested resource group. The
+context's ``resource_group_slot_names`` is sourced from
+``agent_resources`` joined with ``agents`` (status != TERMINATED), so it
+reflects the RG's hardware inventory rather than the system-wide etcd
+slot registry.
+
+When the RG has no non-terminated agents the request is rejected
+outright — an empty inventory cannot satisfy any image declaration and
+would otherwise let the session reach the scheduler only to fail there.
+"""
+
+from __future__ import annotations
+
+from ai.backend.manager.data.session.spec import SessionSpec
+from ai.backend.manager.errors.api import InvalidAPIParameters
+from ai.backend.manager.sokovan.scheduling_controller.validators.session_spec_base import (
+    SessionSpecValidationContext,
+    SessionSpecValidatorRule,
+)
+
+
+class ImageSlotTypeRule(SessionSpecValidatorRule):
+    """Image-declared slot keys must be served by an agent in the target RG."""
+
+    def name(self) -> str:
+        return "image_slot_type"
+
+    def validate(
+        self,
+        spec: SessionSpec,
+        context: SessionSpecValidationContext,
+    ) -> None:
+        rg_slot_names = context.resource_group_slot_names
+        if not rg_slot_names:
+            raise InvalidAPIParameters(
+                extra_msg=(
+                    f"resource group '{spec.scope.resource_group_name}' has no "
+                    f"agents serving any resource slot."
+                ),
+            )
+        for idx, kernel in enumerate(spec.kernel_specs):
+            image_info = context.image_infos.get(kernel.execution_spec.image_id)
+            if image_info is None:
+                continue
+            unknown = sorted(
+                slot_name
+                for slot_name in image_info.resource_spec
+                if slot_name not in rg_slot_names
+            )
+            if unknown:
+                raise InvalidAPIParameters(
+                    extra_msg=(
+                        f"kernel_specs[{idx}]: image '{image_info.canonical}' "
+                        f"requires resource slot(s) {unknown} that resource "
+                        f"group '{spec.scope.resource_group_name}' does not "
+                        f"serve. Pick an image whose required slots are "
+                        f"available here, or switch to a resource group that "
+                        f"supports these slots."
+                    ),
+                )
diff --git a/src/ai/backend/manager/sokovan/scheduling_controller/validators/requested_slot_type_rule.py b/src/ai/backend/manager/sokovan/scheduling_controller/validators/requested_slot_type_rule.py
@@ -0,0 +1,60 @@
+"""User-requested slot-type compatibility validator.
+
+Every ``resource_type`` in a kernel's requested resource list must be
+served by some non-terminated agent in the requested resource group.
+The context's ``resource_group_slot_names`` is sourced from
+``agent_resources`` joined with ``agents`` (status != TERMINATED), so it
+reflects the RG's hardware inventory rather than the system-wide etcd
+slot registry.
+
+When the RG has no non-terminated agents the request is rejected
+outright — an empty inventory cannot satisfy any caller-supplied
+request and would otherwise let the session reach the scheduler only
+to fail there.
+"""
+
+from __future__ import annotations
+
+from ai.backend.manager.data.session.spec import SessionSpec
+from ai.backend.manager.errors.api import InvalidAPIParameters
+from ai.backend.manager.sokovan.scheduling_controller.validators.session_spec_base import (
+    SessionSpecValidationContext,
+    SessionSpecValidatorRule,
+)
+
+
+class RequestedSlotTypeRule(SessionSpecValidatorRule):
+    """Requested slot keys must be served by an agent in the target RG."""
+
+    def name(self) -> str:
+        return "requested_slot_type"
+
+    def validate(
+        self,
+        spec: SessionSpec,
+        context: SessionSpecValidationContext,
+    ) -> None:
+        rg_slot_names = context.resource_group_slot_names
+        if not rg_slot_names:
+            raise InvalidAPIParameters(
+                extra_msg=(
+                    f"resource group '{spec.scope.resource_group_name}' has no "
+                    f"agents serving any resource slot."
+                ),
+            )
+        for idx, kernel in enumerate(spec.kernel_specs):
+            unknown = sorted({
+                entry.resource_type
+                for entry in kernel.execution_spec.resources
+                if entry.resource_type not in rg_slot_names
+            })
+            if unknown:
+                raise InvalidAPIParameters(
+                    extra_msg=(
+                        f"kernel_specs[{idx}]: the request asks for resource "
+                        f"slot(s) {unknown} that resource group "
+                        f"'{spec.scope.resource_group_name}' does not serve. "
+                        f"Drop these slots from the request or switch to a "
+                        f"resource group that supports them."
+                    ),
+                )
diff --git a/src/ai/backend/manager/sokovan/scheduling_controller/validators/session_spec_base.py b/src/ai/backend/manager/sokovan/scheduling_controller/validators/session_spec_base.py
@@ -57,6 +57,7 @@ class SessionSpecValidationContext:
     keypair_resource_policy: KeyPairResourcePolicyData | None = None
     image_infos: Mapping[ImageID, ImageInfo] = field(default_factory=dict)
     known_slot_types: Mapping[SlotName, SlotTypes] = field(default_factory=dict)
+    resource_group_slot_names: frozenset[SlotName] = field(default_factory=frozenset)
     dotfile_data: DotfileBundle = field(default_factory=DotfileBundle)
     active_session_count: int = 0
 
diff --git a/tests/unit/manager/repositories/scheduler/test_owner_resource_group_access.py b/tests/unit/manager/repositories/scheduler/test_owner_resource_group_access.py
@@ -33,6 +33,8 @@
     SessionSpecDraft,
 )
 from ai.backend.manager.errors.api import InvalidAPIParameters
+from ai.backend.manager.models.agent import AgentRow
+from ai.backend.manager.models.resource_slot import AgentResourceRow, ResourceSlotTypeRow
 from ai.backend.manager.models.scaling_group import ScalingGroupOpts, ScalingGroupRow
 from ai.backend.manager.models.utils import ExtendedAsyncSAEngine
 from ai.backend.manager.repositories.scheduler.db_source.db_source import ScheduleDBSource
@@ -110,7 +112,13 @@ async def db_with_rg(
         short-circuit on ``ScalingGroupNotFound`` and never exercise the
         invariant under test.
         """
-        async with with_tables(database_connection, [ScalingGroupRow]):
+        # Include the agent tables so the SG fetch's
+        # ``selectinload(agents).selectinload(agent_resource_rows)`` chain
+        # has tables to query, even though we seed no rows below.
+        async with with_tables(
+            database_connection,
+            [ScalingGroupRow, ResourceSlotTypeRow, AgentRow, AgentResourceRow],
+        ):
             async with database_connection.begin_session() as db_sess:
                 db_sess.add(
                     ScalingGroupRow(
diff --git a/tests/unit/manager/sokovan/scheduling_controller/test_enqueue_session_from_draft.py b/tests/unit/manager/sokovan/scheduling_controller/test_enqueue_session_from_draft.py
@@ -208,6 +208,7 @@ def _fetch_bundle(image_id: ImageID) -> SessionSpecContextFetch:
         vfolder_mounts_by_role={"main": (_vfolder_mount(),)},
         dotfile_data=DotfileBundle(),
         keypair_resource_policy=_keypair_policy(),
+        resource_group_slot_names=frozenset({SlotName("cpu"), SlotName("mem")}),
     )
 
 
diff --git a/tests/unit/manager/sokovan/scheduling_controller/validators/test_session_spec_rules.py b/tests/unit/manager/sokovan/scheduling_controller/validators/test_session_spec_rules.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Pre-validate slot types in the Sokovan SessionSpec validator chain so a session request whose image or caller declares a slot the target resource group does not provide is rejected with a clear 4xx, instead of crashing later in error-message humanization with a 500.`
Original file line number	Diff line number	Diff line change
`@@ -208,6 +208,7 @@ def _fetch_bundle(image_id: ImageID) -> SessionSpecContextFetch:`
`208`	`208`	`vfolder_mounts_by_role={"main": (_vfolder_mount(),)},`
`209`	`209`	`dotfile_data=DotfileBundle(),`
`210`	`210`	`keypair_resource_policy=_keypair_policy(),`
	`211`	`+ resource_group_slot_names=frozenset({SlotName("cpu"), SlotName("mem")}),`
`211`	`212`	`)`
`212`	`213`
`213`	`214`