fix(ai-plane): make a fresh irm|iex install bring up MiOS AI operational on GPU

mios-dev · claude · mios-dev · commit 600df69a0009 · 2026-06-21T17:05:19.000-04:00
Live-verified on a fresh dev VM: the :8640 front door now runs the full orchestration pipeline end-to-end and returns a clean answer from granite-4.1-8b on the RTX 4090 (181 tok/s). Root-caused + fixed the whole inert-AI chain: - system-sync-env.sh: generate_env ended on a false `[[ -n "$SECRET" ]] && echo` -> non-zero under set -e -> the install.env write was aborted BEFORE the mv, so the env bridge silently produced nothing on every secret-less host. Force `return 0`. Also emit resolved MIOS_PORT_* as their own numeric vars (systemd EnvironmentFile + Python don't expand ${...} from sibling lines). - mios-hermes-firstboot: VRAM probe used `command -v nvidia-smi`, but WSL2 ships it at /usr/lib/wsl/lib/nvidia-smi which is NOT on systemd's PATH -> a 24GB RTX 4090 read as 0GB -> small tier. Probe explicit candidate paths. - agent-pipe server.py: (1) _toml_section + the [agents.*] registry now expand ${MIOS_PORT_*} endpoint templates (systemd/Python don't); (2) _pick_agent is degrade-open -- a health_gate primary the liveness cache can't confirm has its endpoint blanked (-> BACKEND) and model reset; (3) at the proxy chokepoint, when dispatch resolves to the BACKEND light lane the model is pinned to BACKEND_MODEL (else llama-swap "no router for requested model"). - mios.toml: the :8643 heavy hermes-worker is health_gate=true so the orchestrator drops it when the heavy lane is gated off (degrade-open) instead of 502-ing the front door; it auto-rejoins when the lane is enabled. (firstboot EnvironmentFile=/etc/mios/install.env + userenv.sh-deploy gaps landed in a7cca48 / the overlay.) install-robustness 2026-06-21. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
diff --git a/usr/lib/mios/agent-pipe/server.py b/usr/lib/mios/agent-pipe/server.py
@@ -404,7 +404,23 @@ def _toml_section(section: str) -> dict:
                 out.update(_layer)
     except Exception:  # noqa: BLE001 -- best-effort; callers fall to literals
         log.warning("Failed to load overlay config section %s", section, exc_info=True)
-    return out
+    # Expand ${MIOS_PORT_*}/$VAR placeholders in string values against the
+    # process env (install.env supplies MIOS_PORT_*). mios.toml stores endpoint
+    # URLs as deferred-expansion templates ("http://localhost:${MIOS_PORT_HERMES_WORKER}/v1");
+    # systemd EnvironmentFile and Python do NOT expand ${...}, so without this
+    # the agent registry got a LITERAL "${MIOS_PORT_HERMES_WORKER}" port ->
+    # httpx InvalidURL -> the :8640 front door 500'd on every request. expandvars
+    # only touches $-prefixed tokens (ordinary values untouched; an unknown var
+    # is left verbatim). install-robustness 2026-06-21.
+    def _xpand(v):
+        if isinstance(v, str):
+            return os.path.expandvars(v) if "$" in v else v
+        if isinstance(v, dict):
+            return {k: _xpand(x) for k, x in v.items()}
+        if isinstance(v, list):
+            return [_xpand(x) for x in v]
+        return v
+    return _xpand(out)
 
 
 def _cfg_num(table: dict, env: str, key: str, default, cast=int):
@@ -3829,7 +3845,12 @@ def _load_agent_registry() -> dict[str, dict]:
             if not isinstance(cfg, dict):
                 continue
             registry[name] = {
-                "endpoint": str(cfg.get("endpoint", "")).rstrip("/"),
+                # expandvars: [agents.*].endpoint is stored as a deferred
+                # ${MIOS_PORT_*} template (e.g. the :8643 hermes-worker); the
+                # env supplies the numeric port (install.env). Without this the
+                # registry kept a literal "${MIOS_PORT_HERMES_WORKER}" -> httpx
+                # InvalidURL -> :8640 500 on every request. install-robustness 2026-06-21.
+                "endpoint": os.path.expandvars(str(cfg.get("endpoint", ""))).rstrip("/"),
                 "model":    str(cfg.get("model", name)),
                 "role":     str(cfg.get("role", "general")),
                 "default":  bool(cfg.get("default", False)),
@@ -6765,18 +6786,44 @@ def _validate_enum_args(tool: str, args: dict) -> Optional[str]:
 
 def _pick_agent(role: str) -> tuple[str, dict]:
     """Pick a sub-agent by role match. Order: exact-role -> default
-    -> first registered. Returns (name, cfg)."""
+    -> first registered. Returns (name, cfg).
+
+    Degrade-open (install-robustness 2026-06-21): when the chosen agent is a
+    health_gate (come-and-go) node -- e.g. the :8643 hermes-worker bound to the
+    heavy GPU lane, gated off by default -- and the liveness cache does NOT
+    confirm it reachable, return a COPY of its cfg with the endpoint blanked (so
+    the caller's `endpoint or BACKEND` uses the always-on local lane) and the
+    model reset to MIOS_AI_MODEL when set. Without this the PRIMARY dispatch hit
+    a dead gated worker -> httpx "All connection attempts failed" -> 502 on EVERY
+    turn on any host where that lane is down (a fresh dev VM, a CPU host). The
+    worker is used again once the probe confirms it live (heavy lane enabled)."""
     role = (role or "").lower().strip()
+    chosen = None
     if role:
         for name, cfg in _AGENT_REGISTRY.items():
             if cfg.get("role", "").lower() == role:
-                return name, cfg
-    for name, cfg in _AGENT_REGISTRY.items():
-        if cfg.get("default"):
-            return name, cfg
-    # Whatever is first.
-    name = next(iter(_AGENT_REGISTRY))
-    return name, _AGENT_REGISTRY[name]
+                chosen = (name, cfg)
+                break
+    if chosen is None:
+        for name, cfg in _AGENT_REGISTRY.items():
+            if cfg.get("default"):
+                chosen = (name, cfg)
+                break
+    if chosen is None:
+        _n = next(iter(_AGENT_REGISTRY))
+        chosen = (_n, _AGENT_REGISTRY[_n])
+    name, cfg = chosen
+    if cfg.get("health_gate"):
+        _c = _NODE_LIVE.get(name)
+        if not (_c and _c[1]):  # not confirmed reachable -> fall back to BACKEND
+            # Blank the endpoint AND swap the model: this agent's model (e.g. the
+            # worker's heavy "mios-heavy") is NOT served by BACKEND (the light
+            # llama-swap lane), so keeping it yields llama-swap "no router for
+            # requested model". Reset to MIOS_AI_MODEL (light-lane default) so the
+            # fallback routes; if unset/blank the model is kept as-is. install-robustness 2026-06-21.
+            _fb_model = (os.environ.get("MIOS_AI_MODEL") or "").strip()
+            cfg = {**cfg, "endpoint": "", **({"model": _fb_model} if _fb_model else {})}
+    return name, cfg
 
 
 # Trivial-input bypass regex -- short messages with no question
@@ -28911,6 +28958,15 @@ async def _stream_backend() -> AsyncGenerator[bytes, None]:
     # Non-streaming: run the enrich passes (no live emits on this path) and
     # build the proxy body -- same _finalize the streaming generator runs live.
     _sys_prefix, proxy_body = await _finalize()
+    # Pin the model to the lane this request is ACTUALLY dispatched to. The front
+    # door advertises a single virtual model ("MiOS AI") and sub-agents carry
+    # lane-specific models (e.g. the heavy worker's "mios-heavy"); when the
+    # primary resolves to the BACKEND light lane -- including the health-gate
+    # fallback when the heavy worker is down -- that incoming/heavy model is NOT
+    # served there, so llama-swap returns "no router for requested model". Force
+    # BACKEND_MODEL so the fallback request routes. install-robustness 2026-06-21.
+    if str(target_endpoint).rstrip("/") == str(BACKEND).rstrip("/"):
+        proxy_body["model"] = BACKEND_MODEL
     proxy_bytes = json.dumps(proxy_body).encode("utf-8")
     client = await _get_client()
     # Council fan-out on the NON-streaming path too (operator 2026-05-22
diff --git a/usr/libexec/mios/mios-hermes-firstboot b/usr/libexec/mios/mios-hermes-firstboot
@@ -149,12 +149,27 @@ _small_m="$(_mios_toml_value 'ai.host_thresholds' 'small_ram_model' "$_AI_MODEL_
 # neither exists or returns 0, the host is CPU-only and we stay
 # on small_ram_model.
 _vram_gb=0
-if command -v nvidia-smi >/dev/null 2>&1; then
-    _vram_mib="$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null \
+# WSL2 ships nvidia-smi at /usr/lib/wsl/lib/nvidia-smi, which is NOT on the
+# systemd unit PATH (/usr/local/bin:/usr/bin). So `command -v nvidia-smi`
+# returned nothing under firstboot's systemd context and a 24 GB RTX 4090 was
+# mis-detected as 0 GB -> small tier instead of mid (operator-confirmed
+# 2026-06-21: "WHAT ARE YOU TALKING ABOUT CPU-ONLY"). Probe explicit
+# locations (incl. the WSL path) so detection works regardless of PATH.
+# install-robustness 2026-06-21.
+_nvsmi=""
+for _c in nvidia-smi /usr/lib/wsl/lib/nvidia-smi /usr/bin/nvidia-smi /opt/cuda/bin/nvidia-smi; do
+    if command -v "$_c" >/dev/null 2>&1; then _nvsmi="$_c"; break; fi
+done
+_rocmsmi=""
+for _c in rocm-smi /opt/rocm/bin/rocm-smi /usr/bin/rocm-smi; do
+    if command -v "$_c" >/dev/null 2>&1; then _rocmsmi="$_c"; break; fi
+done
+if [[ -n "$_nvsmi" ]]; then
+    _vram_mib="$("$_nvsmi" --query-gpu=memory.total --format=csv,noheader,nounits 2>/dev/null \
         | awk 'BEGIN{m=0} {if ($1+0 > m) m=$1+0} END{print m}')"
     _vram_gb=$(( ${_vram_mib:-0} / 1024 ))
-elif command -v rocm-smi >/dev/null 2>&1; then
-    _vram_mib="$(rocm-smi --showmeminfo vram --csv 2>/dev/null \
+elif [[ -n "$_rocmsmi" ]]; then
+    _vram_mib="$("$_rocmsmi" --showmeminfo vram --csv 2>/dev/null \
         | awk -F, 'NR>1 {gsub(/[^0-9]/,"",$2); if ($2+0 > m) m=$2+0} END{print m+0}')"
     # rocm-smi reports bytes; convert
     _vram_gb=$(( ${_vram_mib:-0} / 1024 / 1024 / 1024 ))
diff --git a/usr/libexec/mios/system-sync-env.sh b/usr/libexec/mios/system-sync-env.sh
@@ -151,6 +151,18 @@ EOF
     [[ -n "${MIOS_PORT_SGLANG:-}" ]] && echo "MIOS_AI_HEAVY_ENDPOINT=\"http://localhost:${MIOS_PORT_SGLANG}/v1\""
     [[ -n "${MIOS_PORT_VLLM:-}" ]]   && echo "MIOS_AI_HEAVY_ALT_ENDPOINT=\"http://localhost:${MIOS_PORT_VLLM}/v1\""
 
+    # Resolved service ports (SSOT [ports].*). Emitted as NUMERIC vars so
+    # EnvironmentFile= consumers (agent-pipe, hermes) AND ${MIOS_PORT_*}
+    # templates in mios.toml endpoint URLs can resolve -- systemd and Python
+    # do NOT expand ${...} from sibling env lines, so the ports must exist as
+    # their own vars. Without this the agent-pipe read a LITERAL
+    # "${MIOS_PORT_HERMES_WORKER}" worker port -> httpx InvalidURL -> :8640 500.
+    # install-robustness 2026-06-21.
+    for _pk in MIOS_PORT_LLM_LIGHT MIOS_PORT_HERMES MIOS_PORT_HERMES_WORKER MIOS_PORT_AGENT_PIPE MIOS_PORT_PREFILTER MIOS_PORT_OPENCODE MIOS_PORT_SGLANG MIOS_PORT_VLLM; do
+        _pv="${!_pk:-}"
+        if [[ -n "$_pv" ]]; then echo "${_pk}=\"${_pv}\""; fi
+    done
+
     # Image
     [[ -n "${MIOS_IMAGE_REF:-}" ]]    && echo "MIOS_IMAGE_REF=\"${MIOS_IMAGE_REF}\""
     [[ -n "${MIOS_BRANCH:-}" ]]       && echo "MIOS_BRANCH=\"${MIOS_BRANCH}\""
diff --git a/usr/share/mios/mios.toml b/usr/share/mios/mios.toml
@@ -876,6 +876,15 @@ font_size          = 14
 endpoint = "http://localhost:${MIOS_PORT_HERMES_WORKER}/v1"
 model    = "mios-heavy"
 role     = "general"
+# health_gate: the :8643 hermes-worker is a SEPARATE service bound to the heavy
+# GPU lane (mios-heavy), which is gated OFF by default (VRAM / operator opt-in).
+# Marking it health-gated makes the orchestrator liveness-probe it and DROP it
+# when unreachable (degrade-open) instead of dispatching the FINAL answer to a
+# dead endpoint -> "All connection attempts failed" / 502. It auto-rejoins once
+# the operator enables the heavy lane and the worker comes up. Without this the
+# :8640 front door 502'd on every turn on any host where the worker is down
+# (e.g. a fresh dev VM with the heavy lane gated). install-robustness 2026-06-21.
+health_gate = true
 # WS-2 per-agent RBAC (optional; default = NO restriction = unchanged behaviour):
 # cap THIS agent's tool surface to what its role should touch, enforced by
 # _agent_rbac_filter at dispatch. denied_verbs drops the named verbs; allowed_verbs