Agenta-AI · jp-agenta · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
diff --git a/api/oss/tests/pytest/unit/evaluators/test_catalog_types.py b/api/oss/tests/pytest/unit/evaluators/test_catalog_types.py
@@ -22,4 +22,31 @@ def test_catalog_types_include_message_messages_model_and_prompt_template():
         ]
         == "model"
     )
+    prompt_properties = by_key["prompt-template"]["properties"]
+    fallback_schema = prompt_properties["fallback_llm_configs"]
+    retry_schema = prompt_properties["retry_policy"]
+    fallback_policy_schema = prompt_properties["fallback_policy"]
+    fallback_array_schema = next(
+        option for option in fallback_schema["anyOf"] if option.get("type") == "array"
+    )
+    retry_object_schema = next(
+        option for option in retry_schema["anyOf"] if option.get("type") == "object"
+    )
+    assert fallback_schema["default"] is None
+    assert (
+        fallback_array_schema["items"]["properties"]["model"]["x-ag-type-ref"]
+        == "model"
+    )
+    assert "model" in fallback_array_schema["items"]["required"]
+    assert fallback_policy_schema["x-ag-type"] == "choice"
+    assert fallback_policy_schema["enum"] == [
+        "off",
+        "availability",
+        "capacity",
+        "access",
+        "any",
+    ]
+    assert set(retry_object_schema["properties"]) == {"max_retries", "delay_ms"}
+    assert "chat_template_kwargs" in prompt_properties["llm_config"]["properties"]
     assert by_key["llm"]["properties"]["model"]["x-ag-type-ref"] == "model"
+    assert "chat_template_kwargs" in by_key["llm"]["properties"]
diff --git a/api/run-tests.py b/api/run-tests.py
@@ -7,6 +7,10 @@
 from dotenv import load_dotenv
 
 
+ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))  # parent of the directory containing this script
+LOCAL_SDK_DIR = os.path.join(ROOT_DIR, "sdk")  # the "sdk" directory directly under ROOT_DIR
+
+
 TYPES = {
     "license": ["ee", "oss"],
     "coverage": ["smoke", "full"],
@@ -31,6 +35,14 @@ def _resolve_license() -> str:
     return "ee" if os.getenv("AGENTA_LICENSE") == "ee" else "oss"
 
 
+def _prepend_pythonpath(path: str) -> None:  # Put `path` at the front of the PYTHONPATH environment variable.
+    current = os.environ.get("PYTHONPATH")
+    paths = [path]  # new entry goes first, ahead of any existing entries
+    if current:  # skip an unset or empty PYTHONPATH so no empty segment is joined
+        paths.append(current)
+    os.environ["PYTHONPATH"] = os.pathsep.join(paths)
+
+
 @click.command()
 @click.option(
     "--env-file",
@@ -143,6 +155,9 @@ def run_tests(
     license = _resolve_license()
     click.echo(f"AGENTA_LICENSE={license}")
 
+    if os.path.isdir(LOCAL_SDK_DIR):
+        _prepend_pythonpath(LOCAL_SDK_DIR)
+
     # Set optional dimensions
     for name, value in [
         ("COVERAGE", coverage),

diff --git a/docs/designs/extend-prompt-templates/findings.md b/docs/designs/extend-prompt-templates/findings.md
@@ -0,0 +1,122 @@
+# Extend Prompt Templates Findings
+
+Scan scope: `973e80146..9420b8779` on `feat/extend-prompt-templates`
+
+Active path: `docs/designs/extend-prompt-templates`
+
+Sources reviewed:
+
+- `docs/designs/extend-prompt-templates/{gap,initial.specs,plan,proposal,research}.md`
+- `sdk/agenta/sdk/utils/types.py`
+- `sdk/agenta/sdk/engines/running/handlers.py`
+- `sdk/agenta/sdk/engines/running/interfaces.py`
+- `api/oss/src/resources/workflows/catalog.py`
+- `api/pyproject.toml`
+- `web/packages/agenta-entities/src/shared/execution/requestBodyBuilder.ts`
+- `web/packages/agenta-entity-ui/src/DrillInView/components/PlaygroundConfigSection.tsx`
+- `web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/PromptSchemaControl.tsx`
+- `web/oss/src/components/Playground/Components/Modals/RefinePromptModal/hooks/useRefinePrompt.ts`
+
+Verification run:
+
+- `pytest -q sdk/oss/tests/pytest/unit/test_prompt_template_extensions.py api/oss/tests/pytest/unit/evaluators/test_catalog_types.py` from repo root: SDK tests passed, API import failed with `ModuleNotFoundError: No module named 'oss.src'`.
+- `pytest -q oss/tests/pytest/unit/evaluators/test_catalog_types.py` from `api`: failed because `prompt-template` lacked `fallback_llm_configs`.
+- `PYTHONPATH=<repo-root>/sdk:<repo-root>/api pytest -q oss/tests/pytest/unit/evaluators/test_catalog_types.py` from `api` (where `<repo-root>` is the absolute path of the local monorepo checkout): passed.
+- Manual SDK repro confirmed `PromptTemplate.format()` raises when `chat_template_kwargs` contains an unresolved literal `{{...}}`.
+- After fixes, `pytest -q sdk/oss/tests/pytest/unit/test_prompt_template_extensions.py`: passed, 9 tests.
+- After fixes, `poetry run python run-tests.py oss/tests/pytest/unit/evaluators/test_catalog_types.py` from `api`: passed, 1 test.
+- After fixes, `pnpm --filter @agenta/entity-ui build` from `web`: passed.
+
+## Notes
+
+- No whitespace errors were found by `git diff --check HEAD~2..HEAD`.
+- User decision: `chat_template_kwargs` is a strict 1:1 provider pass-through field.
+- User decision: prompt fallback fields are normal `data.parameters` fields and must be editable in the web registry or playground like other parameter fields.
+- User decision: web tests are out of scope for this work.
+
+## Open Questions
+
+No open questions.
+
+## Open Findings
+
+### [OPEN] FPT-004: Runtime coverage is still narrower than the implementation risk
+
+- ID: `FPT-004`
+- Origin: `scan`
+- Lens: `verification`
+- Severity: `P2`
+- Confidence: `medium`
+- Status: `open`
+- Category: `Testing`
+- Summary: The tests now cover SDK data-model defaults, a single fallback success path, catalog shape, and 1:1 pass-through of `chat_template_kwargs` during formatting, but do not yet cover the full runtime fallback matrix: retry ordering, policy categories, no fallback on local errors, exhaustion behavior, and service/API smoke tests.
+- Evidence: `sdk/oss/tests/pytest/unit/test_prompt_template_extensions.py` covers default/null behavior, `chat_template_kwargs` in `to_openai_kwargs()`, fallback model validation, 404 policy classification, one 503 fallback success, and unchanged `chat_template_kwargs` through `PromptTemplate.format()`. The plan still lists additional tests for retry-before-fallback, 5xx/timeout/429/401/403/400/404/422 categories, local prompt errors, final exhaustion, service completion/chat, and API catalog endpoint exposure. The user explicitly excluded web tests from this work.
+- Files:
+  - `sdk/oss/tests/pytest/unit/test_prompt_template_extensions.py`
+  - `api/oss/tests/pytest/unit/evaluators/test_catalog_types.py`
+  - `docs/designs/extend-prompt-templates/plan.md`
+- Cause: The first implementation added narrow unit coverage but did not follow the full validation matrix for prompt fallback execution.
+- Explanation: The feature changes provider-call control flow. Without targeted tests around failure classification, retry boundaries, and final exhaustion, regressions can look like provider flakiness.
+- Suggested Fix: Add focused SDK tests for retry/exhaustion/local-error behavior and optional service/API smoke tests. Do not add web tests in this work.
+- Alternatives: Accept the remaining runtime matrix as follow-up coverage if this PR only needs the currently added unit guards.
+- Sources: `docs/designs/extend-prompt-templates/plan.md`, test scan, user decision.
+
+## Closed Findings
+
+### [CLOSED] FPT-001: API catalog verification needs the local SDK setup
+
+- ID: `FPT-001`
+- Origin: `scan`
+- Lens: `verification`
+- Severity: `P3`
+- Confidence: `high`
+- Status: `fixed`
+- Category: `Testing`
+- Summary: API catalog verification depended on running with the branch SDK on the import path. Without that setup, `run-tests.py` could import `api/.venv`'s installed SDK and report stale catalog contents.
+- Evidence: The user's full `poetry run python run-tests.py` failed with `KeyError: 'fallback_llm_configs'`. A direct import check showed `agenta.sdk.utils.types` resolving to `api/.venv/lib/python3.11/site-packages/agenta/sdk/utils/types.py`. After updating `api/run-tests.py` to prepend the monorepo `sdk` directory to `PYTHONPATH` for pytest subprocesses, `poetry run python run-tests.py oss/tests/pytest/unit/evaluators/test_catalog_types.py` passed.
+- Files:
+  - `api/run-tests.py`
+  - `api/oss/tests/pytest/unit/evaluators/test_catalog_types.py`
+- Resolution: Fixed by making the API test runner prefer the local monorepo SDK when invoking pytest.
+- Sources: User run output, focused verification run.
+
+### [CLOSED] FPT-002: `chat_template_kwargs` is not passed through unchanged during prompt formatting
+
+- ID: `FPT-002`
+- Origin: `scan`
+- Lens: `verification`
+- Severity: `P1`
+- Confidence: `high`
+- Status: `fixed`
+- Category: `Correctness`
+- Summary: `PromptTemplate.format()` recursively substituted variables inside `llm_config.chat_template_kwargs`, violating the confirmed 1:1 provider pass-through contract.
+- Evidence: The implementation called `_substitute_variables()` on `new_llm_config.chat_template_kwargs`, and a manual repro with `chat_template_kwargs={"literal": "{{provider_flag}}"}` raised `TemplateFormatError`.
+- Files:
+  - `sdk/agenta/sdk/utils/types.py`
+  - `sdk/oss/tests/pytest/unit/test_prompt_template_extensions.py`
+- Resolution: Fixed by excluding `chat_template_kwargs` from prompt substitution and adding a regression test that verifies primary and fallback `chat_template_kwargs` survive `PromptTemplate.format()` unchanged.
+- Sources: `pytest -q sdk/oss/tests/pytest/unit/test_prompt_template_extensions.py`.
+
+### [CLOSED] FPT-003: Fallback root fields are preserved but not editable in the prompt UI
+
+- ID: `FPT-003`
+- Origin: `scan`
+- Lens: `verification`
+- Severity: `P2`
+- Confidence: `high`
+- Status: `fixed`
+- Category: `Completeness`
+- Summary: The prompt editor preserved fallback root fields but did not expose `fallback_llm_configs`, `fallback_policy`, or `retry_policy` for editing.
+- Evidence: The user confirmed these fields must be editable in the web registry or playground like any other `data.parameters` field. `PromptSchemaControl` previously returned only messages, tools, response format, and template format controls.
+- Files:
+  - `web/packages/agenta-entity-ui/src/DrillInView/SchemaControls/PromptSchemaControl.tsx`
+- Resolution: Fixed by rendering prompt-root controls in `PromptSchemaControl` action-bar popovers. `Retry policy` appears to the right of `Prompt Syntax` and edits `max_retries` plus `delay_ms`; `Fallback policy` opens a popover with the fallback policy select and a list of fallback model dropdowns.
+- Sources: `pnpm --filter @agenta/entity-ui build`.
+
+## Triage Plan
+
+Recommended next step: decide whether to backfill the remaining non-web runtime test matrix.
+
+1. If coverage is expanded in this PR, add SDK tests for retry ordering, local-error no-fallback, policy-category coverage, and exhaustion behavior.
+2. If coverage is not expanded in this PR, keep `FPT-004` as a follow-up testing item.
+3. Do not add web tests in this work.
diff --git a/docs/designs/extend-prompt-templates/gap.md b/docs/designs/extend-prompt-templates/gap.md
@@ -0,0 +1,127 @@
+# Fallback Models Gap
+
+## Summary
+
+The current system supports a single prompt-template `llm_config` in legacy completion/chat handlers. The proposal adds optional root-level retry and fallback controls plus ordered fallback LLM configs.
+
+## SDK Type Gap
+
+Current:
+
+- `PromptTemplate` has `messages`, `template_format`, `input_keys`, and `llm_config`.
+- `ModelConfig` is only a primary config.
+- `ModelConfig` does not include `chat_template_kwargs`.
+- There are no prompt-template root fields for retry or fallback.
+- `to_openai_kwargs()` only serializes `self.llm_config`.
+
+Needed:
+
+- Add `fallback_llm_configs`, `retry_policy`, and `fallback_policy` to `PromptTemplate`.
+- Keep every new field optional/null in stored config.
+- Apply runtime behavior defaults outside the data model:
+  - `fallback_llm_configs: null` -> `[]`
+  - `retry_policy: null` -> built-in retry policy
+  - `fallback_policy: null` -> `off`
+  - `chat_template_kwargs: null` -> omitted from provider kwargs
+- Reuse the current `ModelConfig` shape for fallback entries while requiring `model`.
+- Add `chat_template_kwargs` to the reusable LLM config shape for primary and fallback configs.
+- Add enums/models for retry and fallback policy.
+- Add internal candidate-specific kwargs helpers.
+- Ensure `PromptTemplate.format()` preserves the fallback fields, applies variable substitution only to fallback fields that require it, and leaves `chat_template_kwargs` unchanged.
+
+## SDK Handler Gap
+
+Current:
+
+- `completion_v0` resolves provider settings for one model, formats once, and calls once.
+- `chat_v0` resolves provider settings for one model, formats once, appends messages, and calls once.
+- `_call_llm_with_fallback()` exists only for `llm_v0` and uses different secret/provider behavior.
+- Retry behavior exists only as fixed low-level `mockllm` recovery for closed HTTP clients.
+
+Needed:
+
+- Add a shared prompt fallback runner for `completion_v0` and `chat_v0`.
+- Retry each current LLM config before considering fallback.
+- Classify provider-call errors into `availability`, `capacity`, `access`, and `any`.
+- Keep local prompt/input validation outside fallback.
+- Resolve provider settings for each candidate via `SecretsManager.get_provider_settings_from_workflow()`.
+- Clean up `chat_v0` input normalization.
+
+## Interface And Catalog Gap
+
+Current:
+
+- `single_prompt_parameters_schema()` exposes `prompt` as `x-ag-type-ref: "prompt-template"`.
+- `CATALOG_TYPES` exposes `prompt-template`, `model`, `llm`, and `llms`.
+- Tests assert `prompt-template.llm_config.model` has `x-ag-type-ref: "model"`.
+- No catalog schema exists for prompt root fallback fields.
+
+Needed:
+
+- Update the generated/dereferenced prompt-template schema.
+- Ensure `fallback_llm_configs.items` carries the full LLM config schema.
+- Ensure fallback item `model` carries `x-ag-type-ref: "model"`.
+- Add or update catalog tests.
+
+## Services Gap
+
+Current:
+
+- Completion/chat services pass `PromptTemplate` through to SDK handlers.
+- Managed `llm_v0` service is separate and already has its own `llms` flow.
+
+Needed:
+
+- No explicit service code change should be needed, provided the SDK types parse and dump correctly.
+- Service smoke tests should include prompt fallback fields to catch serialization loss.
+
+## API Gap
+
+Current:
+
+- API catalog types are sourced from SDK `CATALOG_TYPES`.
+- API does not implement special prompt fallback behavior.
+- `llm_apps_service.py` only uses `x-ag-type-ref` for parameter inference.
+
+Needed:
+
+- Ensure catalog endpoint returns new prompt-template schema.
+- Ensure that neither default-stripping nor schema normalization incorrectly drops non-primitive defaults.
+- Add API catalog tests for fallback fields.
+
+## Web Schema/UI Gap
+
+Current:
+
+- Web resolves `x-ag-type-ref: "prompt-template"` dynamically.
+- Prompt controls know how to render `messages`, nested `llm_config`, tools, and response format.
+- Generic array/object controls can render arrays, but fallback entry add/remove/reorder UX needs confirmation.
+- Model popover only edits primary `llm_config` or `llms[0]`.
+- The model-parameters panel does not currently expose `chat_template_kwargs`, which issue #3996 requests.
+- Refine prompt modal only models/extracts messages and template format, and can drop extra root fields.
+- Registry/display helpers generally pick the first primary model.
+
+Needed:
+
+- Confirm or add a usable array editor for `fallback_llm_configs`.
+- Make fallback item `model` render through the grouped model selector.
+- Render `fallback_policy` as enum/choice.
+- Render `retry_policy` as a small object or inline advanced section.
+- Render `chat_template_kwargs` as a model parameter object field and preserve it unchanged.
+- Preserve fallback fields in prompt refine flows and execution payload building.
+- Optionally show fallback summary in registry/playground headers.
+
+## Test Gap
+
+Current:
+
+- Tests cover prompt-template catalog exposure, interface references, and basic storage roundtrip.
+- No tests cover fallback config storage, schema hints, handler fallback behavior, or web persistence.
+
+Needed:
+
+- SDK unit tests for Pydantic parsing/dumping and candidate construction.
+- SDK unit tests proving new data-model defaults are null while runtime defaults are normalized separately.
+- SDK handler tests for retry, fallback policy acceptance/rejection, and exhaustion.
+- API catalog tests for new schema fields and x-ag metadata.
+- Web tests for editing/preserving fallback fields.