
Commit 064c99b

Merge upstream main and refine provider adapters
2 parents ff1c7b4 + 5ab7c4e commit 064c99b

Showing 23 changed files with 1,345 additions and 414 deletions.

agent/config.py

Lines changed: 9 additions & 0 deletions
@@ -33,6 +33,15 @@ class Config(BaseModel):
     confirm_cpu_jobs: bool = True
     auto_file_upload: bool = False
 
+    # Reasoning effort for models that support it (GPT-5 / o-series, Claude
+    # extended thinking, HF reasoning models like MiniMax M2 / Kimi K2).
+    # Defaults to "high" — we'd rather spend tokens thinking than ship a
+    # wrong ML recipe. Users can dial down with `/effort low|medium|off`.
+    # "minimal" is an OpenAI-only level and is normalized to "low" for HF
+    # router models (MiniMax requires ≥low). Ignored for non-reasoning models.
+    # Valid values: None | "minimal" | "low" | "medium" | "high"
+    reasoning_effort: str | None = "high"
+
 
 def substitute_env_vars(obj: Any) -> Any:
     """

agent/context_manager/manager.py

Lines changed: 6 additions & 2 deletions
@@ -12,7 +12,7 @@
 from jinja2 import Template
 from litellm import Message, acompletion
 
-from agent.llm import resolve_llm_params
+from agent.core.llm_params import _resolve_llm_params
 
 logger = logging.getLogger(__name__)
 
@@ -308,7 +308,11 @@ async def compact(
             )
         )
 
-        llm_params = resolve_llm_params(model_name, session_hf_token=hf_token)
+        llm_params = _resolve_llm_params(
+            model_name,
+            session_hf_token=hf_token,
+            reasoning_effort="high",
+        )
         response = await acompletion(
             messages=messages_to_summarize,
             max_completion_tokens=self.compact_size,
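Compaction pins reasoning_effort="high" instead of inheriting the session's setting. For an HF-routed model the resolved kwargs would look roughly like this (a sketch: the model id and token are placeholders, and it assumes INFERENCE_TOKEN is unset so no billing header is added):

llm_params = _resolve_llm_params(
    "MiniMaxAI/MiniMax-M2",      # example routed model id
    session_hf_token="hf_xxx",   # placeholder token
    reasoning_effort="high",
)
# llm_params == {
#     "model": "openai/MiniMaxAI/MiniMax-M2",
#     "api_base": "https://router.huggingface.co/v1",
#     "api_key": "hf_xxx",
#     "extra_body": {"reasoning_effort": "high"},
# }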

agent/core/agent_loop.py

Lines changed: 23 additions & 11 deletions
@@ -12,23 +12,16 @@
 
 from agent.config import Config
 from agent.core.doom_loop import check_for_doom_loop
+from agent.core.llm_params import _resolve_llm_params
 from agent.core.session import Event, OpType, Session
 from agent.core.tools import ToolRouter
-from agent.llm import resolve_llm_params
 from agent.tools.jobs_tool import CPU_FLAVORS
 
 logger = logging.getLogger(__name__)
 
 ToolCall = ChatCompletionMessageToolCall
 
 
-def _resolve_hf_router_params(
-    model_name: str, session_hf_token: str | None = None
-) -> dict:
-    """Back-compat wrapper for the shared provider resolver."""
-    return resolve_llm_params(model_name, session_hf_token=session_hf_token)
-
-
 def _validate_tool_args(tool_args: dict) -> tuple[bool, str | None]:
     """
     Validate tool arguments structure.
@@ -181,6 +174,23 @@ def _friendly_error_message(error: Exception) -> str | None:
             "at your model provider's dashboard."
         )
 
+    if "not supported by provider" in err_str or "no provider supports" in err_str:
+        return (
+            "The model isn't served by the provider you pinned.\n\n"
+            "Drop the ':<provider>' suffix to let the HF router auto-pick a "
+            "provider, or use '/model' (no arg) to see which providers host "
+            "which models."
+        )
+
+    if "model_not_found" in err_str or (
+        "model" in err_str and ("not found" in err_str or "does not exist" in err_str)
+    ):
+        return (
+            "Model not found. Use '/model' to list suggestions, or paste an "
+            "HF model id like 'MiniMaxAI/MiniMax-M2.7'. Availability is shown "
+            "when you switch."
+        )
+
     return None
 
 
@@ -529,8 +539,10 @@ async def run_agent(
         tools = session.tool_router.get_tool_specs_for_llm()
         try:
             # ── Call the LLM (streaming or non-streaming) ──
-            llm_params = _resolve_hf_router_params(
-                session.config.model_name, session.hf_token
+            llm_params = _resolve_llm_params(
+                session.config.model_name,
+                session.hf_token,
+                reasoning_effort=session.config.reasoning_effort,
             )
             if session.stream:
                 llm_result = await _call_llm_streaming(
@@ -746,7 +758,7 @@ async def _exec_tool(
     if not valid:
         return (tc, name, args, err, False)
     out, ok = await session.tool_router.call_tool(
-        name, args, session=session
+        name, args, session=session, tool_call_id=tc.id
     )
     return (tc, name, args, out, ok)

agent/core/hf_router_catalog.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
+"""Fetch and cache the HF Inference Router model catalog.
+
+The router exposes an OpenAI-compatible listing at
+``https://router.huggingface.co/v1/models`` with per-provider availability,
+pricing, context length, and tool-use support. We use it to:
+
+• Validate ``/model`` switches with live data instead of a hard-coded allowlist.
+• Show the user which providers serve a model, at what price, and whether they
+  support tool calls.
+• Derive a reasonable context-window limit for any routed model.
+
+The listing is cached in-memory for a few minutes so repeated lookups during a
+session are free. On fetch failure we return stale data if we have it, or an
+empty catalog otherwise.
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+from difflib import get_close_matches
+from typing import Optional
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+_CATALOG_URL = "https://router.huggingface.co/v1/models"
+_CACHE_TTL_SECONDS = 300
+_HTTP_TIMEOUT_SECONDS = 5.0
+
+_cache: Optional[dict] = None
+_cache_time: float = 0.0
+
+
+@dataclass
+class ProviderInfo:
+    provider: str
+    status: str
+    context_length: Optional[int]
+    input_price: Optional[float]
+    output_price: Optional[float]
+    supports_tools: bool
+    supports_structured_output: bool
+
+
+@dataclass
+class ModelInfo:
+    id: str
+    providers: list[ProviderInfo]
+
+    @property
+    def live_providers(self) -> list[ProviderInfo]:
+        return [p for p in self.providers if p.status == "live"]
+
+    @property
+    def max_context_length(self) -> Optional[int]:
+        lengths = [p.context_length for p in self.live_providers if p.context_length]
+        return max(lengths) if lengths else None
+
+    @property
+    def any_supports_tools(self) -> bool:
+        return any(p.supports_tools for p in self.live_providers)
+
+
+def _fetch_catalog(force: bool = False) -> dict:
+    global _cache, _cache_time
+    now = time.time()
+    if not force and _cache is not None and now - _cache_time < _CACHE_TTL_SECONDS:
+        return _cache
+    try:
+        resp = httpx.get(_CATALOG_URL, timeout=_HTTP_TIMEOUT_SECONDS)
+        resp.raise_for_status()
+        _cache = resp.json()
+        _cache_time = now
+    except Exception as e:
+        logger.warning("Failed to fetch HF router catalog: %s", e)
+        if _cache is None:
+            _cache = {"data": []}
+            _cache_time = now
+    return _cache
+
+
+def _parse_entry(entry: dict) -> ModelInfo:
+    providers = []
+    for p in entry.get("providers", []) or []:
+        pricing = p.get("pricing") or {}
+        providers.append(
+            ProviderInfo(
+                provider=p.get("provider", ""),
+                status=p.get("status", ""),
+                context_length=p.get("context_length"),
+                input_price=pricing.get("input"),
+                output_price=pricing.get("output"),
+                supports_tools=bool(p.get("supports_tools", False)),
+                supports_structured_output=bool(p.get("supports_structured_output", False)),
+            )
+        )
+    return ModelInfo(id=entry.get("id", ""), providers=providers)
+
+
+def lookup(model_id: str) -> Optional[ModelInfo]:
+    """Find a model in the router catalog.
+
+    Accepts ``<org>/<model>`` or ``<org>/<model>:<tag>`` — the tag is stripped
+    for lookup. Returns ``None`` if the model isn't listed.
+    """
+    bare = model_id.split(":", 1)[0]
+    catalog = _fetch_catalog()
+    for entry in catalog.get("data", []):
+        if entry.get("id") == bare:
+            return _parse_entry(entry)
+    return None
+
+
+def fuzzy_suggest(model_id: str, limit: int = 3) -> list[str]:
+    """Return the closest model ids from the catalog."""
+    bare = model_id.split(":", 1)[0]
+    catalog = _fetch_catalog()
+    ids = [e.get("id", "") for e in catalog.get("data", []) if e.get("id")]
+    return get_close_matches(bare, ids, n=limit, cutoff=0.4)
+
+
+def prewarm() -> None:
+    """Fetch the catalog so subsequent lookups are instant. Safe to call from
+    a background task — swallows failures."""
+    try:
+        _fetch_catalog(force=False)
+    except Exception:
+        pass
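A sketch of how a `/model` switch might consume this module (the command handler is not shown in this excerpt, and the model id is just an example):

info = lookup("MiniMaxAI/MiniMax-M2:novita")  # ':<provider>' tag is stripped
if info is None:
    print("Not in the router catalog. Closest:",
          ", ".join(fuzzy_suggest("MiniMaxAI/MiniMax-M2")))
else:
    print(f"{len(info.live_providers)} live provider(s), "
          f"max context {info.max_context_length}")
    if not info.any_supports_tools:
        print("Warning: no live provider advertises tool-call support.")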

agent/core/llm_params.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
+"""LiteLLM kwargs resolution for the model ids this agent accepts."""
+
+import os
+
+_HF_ALLOWED_EFFORTS = {"low", "medium", "high"}
+_LM_STUDIO_DEFAULT_BASE = "http://127.0.0.1:1234/v1"
+_DIRECT_PREFIXES = (
+    "anthropic/",
+    "openai/",
+    "openrouter/",
+    "lm_studio/",
+    "opencode/",
+    "opencode-go/",
+)
+
+
+def _looks_like_hf_router_model(model_name: str) -> bool:
+    bare = model_name.removeprefix("huggingface/").split(":", 1)[0]
+    parts = bare.split("/")
+    return len(parts) >= 2 and all(parts)
+
+
+def _custom_openai_compat_params(
+    model_name: str,
+    *,
+    prefix: str,
+    api_base: str,
+    api_key_env: str,
+    reasoning_effort: str | None = None,
+) -> dict:
+    actual_model = model_name[len(prefix) :]
+    params = {
+        "model": f"openai/{actual_model}",
+        "api_base": api_base,
+        "api_key": os.environ.get(api_key_env, ""),
+    }
+    if reasoning_effort:
+        params["extra_body"] = {"reasoning_effort": reasoning_effort}
+    return params
+
+
+def _resolve_llm_params(
+    model_name: str,
+    session_hf_token: str | None = None,
+    reasoning_effort: str | None = None,
+) -> dict:
+    """Build LiteLLM kwargs for supported direct, local, and routed models."""
+    if model_name.startswith(("anthropic/", "openai/")):
+        params: dict = {"model": model_name}
+        if reasoning_effort:
+            params["reasoning_effort"] = reasoning_effort
+        return params
+
+    if model_name.startswith("lm_studio/"):
+        return {
+            "model": model_name,
+            "api_base": os.environ.get(
+                "LMSTUDIO_BASE_URL", _LM_STUDIO_DEFAULT_BASE
+            ).rstrip("/"),
+            "api_key": os.environ.get("LMSTUDIO_API_KEY", "") or "lm-studio",
+        }
+
+    if model_name.startswith("openrouter/"):
+        return _custom_openai_compat_params(
+            model_name,
+            prefix="openrouter/",
+            api_base="https://openrouter.ai/api/v1",
+            api_key_env="OPENROUTER_API_KEY",
+            reasoning_effort=reasoning_effort,
+        )
+
+    if model_name.startswith("opencode/"):
+        return _custom_openai_compat_params(
+            model_name,
+            prefix="opencode/",
+            api_base="https://opencode.ai/zen/v1",
+            api_key_env="OPENCODE_ZEN_API_KEY",
+            reasoning_effort=reasoning_effort,
+        )
+
+    if model_name.startswith("opencode-go/"):
+        return _custom_openai_compat_params(
+            model_name,
+            prefix="opencode-go/",
+            api_base="https://opencode.ai/zen/go/v1",
+            api_key_env="OPENCODE_GO_API_KEY",
+            reasoning_effort=reasoning_effort,
+        )
+
+    if model_name.startswith(_DIRECT_PREFIXES):
+        raise ValueError(f"Unrecognized model prefix: {model_name}")
+
+    if not _looks_like_hf_router_model(model_name):
+        raise ValueError(f"Unrecognized model id: {model_name}")
+
+    hf_model = model_name.removeprefix("huggingface/")
+    api_key = (
+        os.environ.get("INFERENCE_TOKEN")
+        or session_hf_token
+        or os.environ.get("HF_TOKEN")
+    )
+    if not api_key:
+        raise ValueError(
+            "Missing Hugging Face token. Set INFERENCE_TOKEN or HF_TOKEN, or sign in so the session carries your HF token."
+        )
+    params = {
+        "model": f"openai/{hf_model}",
+        "api_base": "https://router.huggingface.co/v1",
+        "api_key": api_key,
+    }
+    if os.environ.get("INFERENCE_TOKEN"):
+        params["extra_headers"] = {"X-HF-Bill-To": "huggingface"}
+    if reasoning_effort:
+        hf_level = "low" if reasoning_effort == "minimal" else reasoning_effort
+        if hf_level in _HF_ALLOWED_EFFORTS:
+            params["extra_body"] = {"reasoning_effort": hf_level}
+    return params
