aws-samples
diff --git a/agent/src/config.py b/agent/src/config.py
Lines changed: 8 additions & 0 deletions
diff --git a/agent/src/models.py b/agent/src/models.py
Lines changed: 16 additions & 2 deletions
diff --git a/agent/src/observability.py b/agent/src/observability.py
Lines changed: 19 additions & 0 deletions
diff --git a/agent/src/pipeline.py b/agent/src/pipeline.py
Lines changed: 12 additions & 3 deletions
diff --git a/agent/src/prompts/default_agent.py b/agent/src/prompts/default_agent.py
Lines changed: 7 additions & 1 deletion
diff --git a/agent/src/prompts/web_research.py b/agent/src/prompts/web_research.py
Lines changed: 5 additions & 0 deletions
diff --git a/agent/src/runner.py b/agent/src/runner.py
Lines changed: 81 additions & 14 deletions
diff --git a/agent/src/task_state.py b/agent/src/task_state.py
Lines changed: 17 additions & 2 deletions
@@ -453,6 +453,7 @@ def build_config(
     issue_number: str = "",
     github_token: str = "",
     anthropic_model: str = "",
+    haiku_model: str = "",
     max_turns: int = 10,
     max_budget_usd: float | None = None,
     aws_region: str = "",
@@ -484,6 +485,12 @@ def build_config(
     resolved_anthropic_model = anthropic_model or os.environ.get(
         "ANTHROPIC_MODEL", "us.anthropic.claude-sonnet-4-6"
     )
+    # Small/fast auxiliary model (WebFetch summarization etc.). Falls back to the
+    # deployed ANTHROPIC_DEFAULT_HAIKU_MODEL env, then the platform default. Must
+    # be an inference-profile id (us.*), not a bare model id (see runner).
+    resolved_haiku_model = haiku_model or os.environ.get(
+        "ANTHROPIC_DEFAULT_HAIKU_MODEL", "us.anthropic.claude-haiku-4-5-20251001-v1:0"
+    )
 
     # Resolve the workflow id (the create-task boundary already pinned it; local
     # batch runs default to the coding workflow). Required-input validation is
@@ -561,6 +568,7 @@ def build_config(
         github_token=resolved_github_token,
         aws_region=resolved_aws_region,
         anthropic_model=resolved_anthropic_model,
+        haiku_model=resolved_haiku_model,
         dry_run=dry_run,
         max_turns=max_turns,
         max_budget_usd=max_budget_usd,
 
@@ -154,6 +154,11 @@ class TaskConfig(BaseModel):
     github_token: str = ""
     aws_region: str
     anthropic_model: str = "us.anthropic.claude-sonnet-4-6"
+    # The "small/fast" model Claude Code uses for auxiliary work (e.g. WebFetch
+    # page summarization). Must be a cross-region INFERENCE-PROFILE id (``us.``
+    # prefix), not a bare foundation-model id — Claude 4.x cannot be invoked
+    # on-demand by bare id on Bedrock. Threaded to ANTHROPIC_DEFAULT_HAIKU_MODEL.
+    haiku_model: str = "us.anthropic.claude-haiku-4-5-20251001-v1:0"
     dry_run: bool = False
     max_turns: int = 10
     max_budget_usd: float | None = None
@@ -326,8 +331,13 @@ class TaskResult(BaseModel):
     status: str
     agent_status: str = "unknown"
     pr_url: str | None = None
-    build_passed: bool = False
-    lint_passed: bool = False
+    # Tri-state (#515): True/False once the post-run gate runs; None when it did
+    # not (repo-less workflow has no build/lint; a crash before post-hooks). The
+    # None case is persisted as "absent" by write_terminal's `is not None` guard,
+    # so the replay bundle reports verification:null rather than a fictional
+    # build_passed:false for a gate that never executed.
+    build_passed: bool | None = None
+    lint_passed: bool | None = None
     cost_usd: float | None = None
     # Rev-5 DATA-1: historically the `turns` field was set to the SDK's
     # `ResultMessage.num_turns`, which INCLUDES the attempted turn that
@@ -365,3 +375,7 @@ class TaskResult(BaseModel):
     # Phase 3), or ``None`` for coding tasks / when no artifact was delivered.
     # Surfaced on TaskDetail so the user can retrieve the knowledge-task output.
     artifact_uri: str | None = None
+    # OTEL trace id (32-char hex) of the task's root span, captured at terminal
+    # write so the replay bundle (#515) can correlate the task to its
+    # CloudWatch/X-Ray trace. ``None`` when tracing is unavailable (local/dev).
+    otel_trace_id: str | None = None
@@ -57,6 +57,25 @@ def task_span(
             raise
 
 
+def current_otel_trace_id() -> str | None:
+    """Return the active span's trace id as a 32-char lowercase hex string.
+
+    Used to persist a cross-plane correlation id on the TaskRecord (#515 replay
+    bundle) so operators can join the task to its CloudWatch/X-Ray trace. Returns
+    ``None`` when the current span's context is invalid (e.g. no active span
+    because tracing is disabled locally), so callers can treat it as optional.
+    """
+    span = trace.get_current_span()
+    ctx = span.get_span_context()
+    if not ctx.is_valid:
+        return None
+    # format_trace_id renders the 128-bit id as zero-padded 32-char hex — the
+    # OTEL format, so it joins directly in CloudWatch Transaction Search. The
+    # X-Ray console shows ids as ``1-{8hex}-{24hex}``: split after the first 8
+    # hex chars (a timestamp only if the X-Ray id generator is in use).
+    return trace.format_trace_id(ctx.trace_id)
+
+
 def set_session_id(session_id: str) -> None:
     """Propagate *session_id* via OTEL baggage for AgentCore session correlation.
 
 
@@ -26,7 +26,7 @@
 from jira_reactions import comment_task_finished, comment_task_started
 from linear_reactions import react_task_finished, react_task_started
 from models import AgentResult, HydratedContext, RepoSetup, TaskConfig, TaskResult
-from observability import task_span
+from observability import current_otel_trace_id, task_span
 from post_hooks import (
     _extract_agent_notes,
     ensure_committed,
@@ -363,6 +363,7 @@ def _run_repoless_task(
         cache_read_input_tokens=usage.cache_read_input_tokens if usage else None,
         cache_creation_input_tokens=usage.cache_creation_input_tokens if usage else None,
         trace_s3_uri=trace_s3_uri,
+        otel_trace_id=current_otel_trace_id(),
     )
     result_dict = result.model_dump()
 
@@ -1127,6 +1128,7 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
                 cache_read_input_tokens=usage.cache_read_input_tokens if usage else None,
                 cache_creation_input_tokens=usage.cache_creation_input_tokens if usage else None,
                 trace_s3_uri=trace_s3_uri,
+                otel_trace_id=current_otel_trace_id(),
             )
 
             result_dict = result.model_dump()
@@ -1137,8 +1139,11 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
                 root_span.set_attribute("agent.cost_usd", float(result.cost_usd))
             if result.turns:
                 root_span.set_attribute("agent.turns", int(result.turns))
-            root_span.set_attribute("build.passed", result.build_passed)
-            root_span.set_attribute("lint.passed", result.lint_passed)
+            # On the repo path these are always real bools (computed by the post
+            # hooks above); coalesce for the span attribute since the field type
+            # is now tri-state (bool | None) for the repo-less/crash case.
+            root_span.set_attribute("build.passed", bool(result.build_passed))
+            root_span.set_attribute("lint.passed", bool(result.lint_passed))
             root_span.set_attribute("pr.url", result.pr_url or "")
             root_span.set_attribute("task.duration_s", result.duration_s)
             if usage:
@@ -1192,6 +1197,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
                 task_id=config.task_id,
                 agent_status=agent_for_chain.status if agent_for_chain else "unknown",
                 trace_s3_uri=crash_trace_s3_uri,
+                # Still inside `with task_span()`, so the id is live — capture it
+                # here too or FAILED tasks (the primary post-mortem case for the
+                # replay bundle, #515) persist otel_trace_id: null.
+                otel_trace_id=current_otel_trace_id(),
             )
             task_state.write_terminal(config.task_id, "FAILED", crash_result.model_dump())
             # Best-effort ❌ on the Linear issue so the stale 👀 doesn't linger.
 
@@ -27,5 +27,11 @@
 - Read the task carefully and any attachments referenced in the user message.
 - Use your available tools to research, analyse, or draft as the task requires.
 - When you have completed the work, summarise the result clearly in your final
-  message — that summary is the deliverable.
+  message — that summary is the deliverable, uploaded as the task artifact, so
+  make it self-contained rather than a pointer to work elsewhere.
+- Do the work yourself in this session and finish before you stop. There is no
+  human watching and no follow-up turn: do NOT defer the work to a background
+  job, workflow, or sub-agent, and never end with "results will follow" or
+  "watch progress elsewhere". If you cannot complete it within the turn budget,
+  deliver your best partial answer with what you found — not a promise.
 """
@@ -44,4 +44,9 @@
 - Cite each non-obvious claim with the source it came from (URL or title).
 - Your final message IS the deliverable — it is uploaded as the task artifact,
   so make it self-contained and complete rather than a pointer to work elsewhere.
+- Do the research yourself in this session and finish before you stop. There is
+  no human watching and no follow-up turn: do NOT defer the work to a background
+  job, workflow, or sub-agent, and never end with "results will follow" or
+  "watch progress elsewhere". If you cannot complete it within the turn budget,
+  deliver your best partial answer with what you found — not a promise.
 """
@@ -27,7 +27,7 @@
 
 import os
 import subprocess
-from typing import Any
+from typing import Any, Literal
 from urllib.parse import quote
 
 from config import AGENT_WORKSPACE
@@ -132,7 +132,13 @@ def _setup_agent_env(config: TaskConfig) -> tuple[str | None, str | None]:
     # writes, while the SDK is waiting on stdout).  The stderr callback in
     # ClaudeAgentOptions cannot drain fast enough to prevent this.
     os.environ.pop("ANTHROPIC_LOG", None)
-    os.environ["ANTHROPIC_DEFAULT_HAIKU_MODEL"] = "anthropic.claude-haiku-4-5-20251001-v1:0"
+    # Small/fast auxiliary model (WebFetch summarization etc.), from config like
+    # ANTHROPIC_MODEL above — resolved from the deployed ANTHROPIC_DEFAULT_HAIKU_MODEL
+    # env (agent.ts) with a platform default in config.py. Must be a cross-region
+    # INFERENCE-PROFILE id (``us.`` prefix): Claude 4.x cannot be invoked on-demand
+    # by bare model id on Bedrock (400 "on-demand throughput isn't supported",
+    # seen on WebFetch's Haiku sub-calls); config.py resolves that default.
+    os.environ["ANTHROPIC_DEFAULT_HAIKU_MODEL"] = config.haiku_model
 
     # Save OTLP endpoint/protocol configured by ADOT auto-instrumentation
     # before stripping, so we can re-use it for Claude Code CLI telemetry.
@@ -335,31 +341,87 @@ def _initialize_policy_engine_and_hooks(
 # read-only workflow.
 _WRITE_TOOLS = frozenset(("Write", "Edit"))
 
+# Tools that DEFER work off-session and are hard-blocked for every task. These
+# launch detached / cross-session orchestration that a one-shot headless agent
+# has no supervisor to await: the ``Workflow`` tool returns a task id and runs
+# in the background (its result arrives via a notification into an interactive
+# session that does not exist here), and ``Task``/``Agent`` can spawn background
+# subagents. We saw a repo-less task launch a background ``Workflow`` and then
+# finalize on the first ResultMessage with a placeholder artifact while the real
+# research ran on, detached (task 01KWDEFQH6...). CRITICAL: ``allowed_tools`` is
+# only an auto-APPROVE list — per the Agent SDK docs it does NOT restrict the
+# surface; unlisted tools fall through to ``permission_mode``, and under
+# ``bypassPermissions`` they are simply allowed. ``disallowed_tools`` is the
+# only hard lock (it removes the tool from the model's context even under
+# bypass), so the block must live there, not in the allow-list.
+# ``Workflow`` (background multi-agent orchestration) is the one that bit us;
+# ``Task``/``Agent`` are the sub-agent spawners (name varies by CLI version, so
+# block both); ``Monitor`` streams a background command's output mid-turn;
+# ``SendMessage`` resumes/relaunches background agents; the ``Cron*`` tools
+# schedule deferred work. All are "return now, work continues off-session"
+# vectors a one-shot task cannot await. NOT blockable here: background ``Bash``
+# (a ``run_in_background`` PARAMETER of Bash, not a tool name) — but a detached
+# Bash child dies with the MicroVM on return, so it can't produce
+# arrives-later work the way a cloud Workflow does; the deliver-artifact
+# deferral guard (deliverers._reject_if_deferral) is the backstop for anything
+# that still ends in a placeholder.
+_DISALLOWED_TOOLS = [
+    "Workflow",
+    "Task",
+    "Agent",
+    "Monitor",
+    "SendMessage",
+    "CronCreate",
+    "CronDelete",
+    "CronList",
+]
 
-def _resolve_allowed_tools(config: TaskConfig) -> list[str]:
-    """Resolve the SDK ``allowed_tools`` list for a task.
 
-    This is the second enforcement layer the design promises alongside Cedar's
-    ``context.read_only`` (WORKFLOWS.md §"Agent configuration"):
+def _resolve_allowed_tools(config: TaskConfig) -> list[str]:
+    """Resolve the SDK ``allowed_tools`` (auto-approve) list for a task.
 
     - The resolved workflow's ``agent_config.allowed_tools`` (threaded onto
       ``config.allowed_tools``) is passed to the SDK verbatim. An empty list —
       legacy/batch callers that never resolved a workflow — falls back to the
       built-in full surface.
-    - ``Write``/``Edit`` are dropped whenever ``config.read_only`` is true, so a
-      read-only lane physically cannot mutate the tree even where Cedar's
-      ``read_only`` rules do not fire (e.g. a ``read_only:false`` default that
-      restricts tools by list alone, like ``default/agent-v1``).
-
-    The Cedar PreToolUse hooks still enforce per-task restrictions on top of
-    whatever is allowed here; this list only ever narrows the surface.
+    - ``Write``/``Edit`` are dropped whenever ``config.read_only`` is true.
+
+    IMPORTANT: this list only governs auto-approval, NOT the reachable surface.
+    Per the Agent SDK, a tool omitted here is not blocked — it falls through to
+    ``permission_mode`` (``bypassPermissions`` ⇒ allowed). The actual surface
+    lock is ``_DISALLOWED_TOOLS`` passed to ``disallowed_tools``. NOTE the Cedar
+    PreToolUse hooks are NOT a backstop for an unknown tool name: the engine
+    default-permits on no-match (``policy.py``), so it only denies the specific
+    actions it has ``forbid`` rules for (e.g. Write/Edit under read_only) —
+    ``Workflow``/``Task``/``Agent`` match nothing and would be allowed. So
+    ``disallowed_tools`` is the ONLY thing keeping them out; do not rely on this
+    allow-list, nor on Cedar, to remove a tool from the surface.
     """
     tools = list(config.allowed_tools) if config.allowed_tools else list(_FULL_TOOL_SURFACE)
     if config.read_only:
         tools = [t for t in tools if t not in _WRITE_TOOLS]
     return tools
 
 
+def _resolve_setting_sources(config: TaskConfig) -> list[Literal["user", "project", "local"]]:
+    """Which on-disk Claude Code settings the CLI may load for this task.
+
+    A task with a cloned repo loads ``["project"]`` so the repo's own
+    ``.claude/`` config is honored. A task with no repo loads nothing —
+    defense-in-depth that also stops a stray on-disk skill (e.g. one that spawns
+    a background Workflow) from being reachable. Kept as a named helper so the
+    policy is unit-testable without driving the SDK.
+
+    Keys on ``repo_url`` (repo presence), NOT ``requires_repo`` (a static
+    workflow property): a repo-optional workflow given a repo takes the
+    repo-bound clone path (``pipeline.py`` gates on ``not requires_repo and not
+    repo_url``), so keying on ``requires_repo`` would clone the repo but drop
+    its ``.claude/`` config. Mirrors ``create-task-core.ts`` keying
+    ``branch_name`` on repo presence for the same reason.
+    """
+    return ["project"] if config.repo_url else []
+
+
 async def run_agent(
     prompt: str,
     system_prompt: str,
@@ -439,10 +501,15 @@ def _on_stderr(line: str) -> None:
         model=config.anthropic_model,
         system_prompt=system_prompt,
         allowed_tools=allowed_tools,
+        # Hard surface lock (NOT allowed_tools — that is auto-approve only). Keeps
+        # off-session/defer vectors out of the model's context even under
+        # bypassPermissions, so a one-shot headless task cannot launch detached
+        # work it has no supervisor to await. See _DISALLOWED_TOOLS.
+        disallowed_tools=list(_DISALLOWED_TOOLS),
         permission_mode="bypassPermissions",
         cwd=cwd,
         max_turns=config.max_turns,
-        setting_sources=["project"],
+        setting_sources=_resolve_setting_sources(config),
         hooks=hooks,
         max_budget_usd=config.max_budget_usd,
         stderr=_on_stderr,
 
@@ -7,7 +7,7 @@
 
 import os
 import time
-from typing import TypedDict
+from typing import Any, TypedDict
 
 from shell import log, log_error_cw
 
@@ -246,7 +246,7 @@ def write_terminal(task_id: str, status: str, result: dict | None = None) -> Non
             return
         now = _now_iso()
         expr_names = {"#s": "status"}
-        expr_values = {
+        expr_values: dict[str, Any] = {
             ":s": status,
             ":t": now,
             ":sca": f"{status}#{now}",
@@ -294,6 +294,21 @@ def write_terminal(task_id: str, status: str, result: dict | None = None) -> Non
             if result.get("memory_written") is not None:
                 update_parts.append("memory_written = :mw")
                 expr_values[":mw"] = result["memory_written"]
+            # Verification verdict (#515 replay bundle). build_passed/lint_passed
+            # were historically dropped here (present on TaskResult but never
+            # written), so TaskDetail.build_passed was always null. Persist both
+            # so the replay bundle carries a structured verification signal.
+            if result.get("build_passed") is not None:
+                update_parts.append("build_passed = :bp")
+                expr_values[":bp"] = bool(result["build_passed"])
+            if result.get("lint_passed") is not None:
+                update_parts.append("lint_passed = :lp")
+                expr_values[":lp"] = bool(result["lint_passed"])
+            # OTEL trace id (#515) for cross-plane correlation. Absent on tasks
+            # that predate this field and when tracing is unavailable.
+            if result.get("otel_trace_id"):
+                update_parts.append("otel_trace_id = :otid")
+                expr_values[":otid"] = result["otel_trace_id"]
             # --trace artifact URI (design §10.1). Written atomically
             # with the terminal-status transition so a consumer that
             # reads TaskRecord.trace_s3_uri immediately after