Skip to content

Commit b895999

Browse files
authored
Merge branch 'main' into feat/282-dead-code-detection-gate
2 parents 9e57b44 + 37a5c88 commit b895999

37 files changed

Lines changed: 1798 additions & 47 deletions

agent/src/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ def build_config(
453453
issue_number: str = "",
454454
github_token: str = "",
455455
anthropic_model: str = "",
456+
haiku_model: str = "",
456457
max_turns: int = 10,
457458
max_budget_usd: float | None = None,
458459
aws_region: str = "",
@@ -484,6 +485,12 @@ def build_config(
484485
resolved_anthropic_model = anthropic_model or os.environ.get(
485486
"ANTHROPIC_MODEL", "us.anthropic.claude-sonnet-4-6"
486487
)
488+
# Small/fast auxiliary model (WebFetch summarization etc.). Falls back to the
489+
# deployed ANTHROPIC_DEFAULT_HAIKU_MODEL env, then the platform default. Must
490+
# be an inference-profile id (us.*), not a bare model id (see runner).
491+
resolved_haiku_model = haiku_model or os.environ.get(
492+
"ANTHROPIC_DEFAULT_HAIKU_MODEL", "us.anthropic.claude-haiku-4-5-20251001-v1:0"
493+
)
487494

488495
# Resolve the workflow id (the create-task boundary already pinned it; local
489496
# batch runs default to the coding workflow). Required-input validation is
@@ -561,6 +568,7 @@ def build_config(
561568
github_token=resolved_github_token,
562569
aws_region=resolved_aws_region,
563570
anthropic_model=resolved_anthropic_model,
571+
haiku_model=resolved_haiku_model,
564572
dry_run=dry_run,
565573
max_turns=max_turns,
566574
max_budget_usd=max_budget_usd,

agent/src/models.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,11 @@ class TaskConfig(BaseModel):
154154
github_token: str = ""
155155
aws_region: str
156156
anthropic_model: str = "us.anthropic.claude-sonnet-4-6"
157+
# The "small/fast" model Claude Code uses for auxiliary work (e.g. WebFetch
158+
# page summarization). Must be a cross-region INFERENCE-PROFILE id (``us.``
159+
# prefix), not a bare foundation-model id — Claude 4.x cannot be invoked
160+
# on-demand by bare id on Bedrock. Threaded to ANTHROPIC_DEFAULT_HAIKU_MODEL.
161+
haiku_model: str = "us.anthropic.claude-haiku-4-5-20251001-v1:0"
157162
dry_run: bool = False
158163
max_turns: int = 10
159164
max_budget_usd: float | None = None
@@ -326,8 +331,13 @@ class TaskResult(BaseModel):
326331
status: str
327332
agent_status: str = "unknown"
328333
pr_url: str | None = None
329-
build_passed: bool = False
330-
lint_passed: bool = False
334+
# Tri-state (#515): True/False once the post-run gate runs; None when it did
335+
# not (repo-less workflow has no build/lint; a crash before post-hooks). The
336+
# None case is persisted as "absent" by write_terminal's `is not None` guard,
337+
# so the replay bundle reports verification:null rather than a fictional
338+
# build_passed:false for a gate that never executed.
339+
build_passed: bool | None = None
340+
lint_passed: bool | None = None
331341
cost_usd: float | None = None
332342
# Rev-5 DATA-1: historically the `turns` field was set to the SDK's
333343
# `ResultMessage.num_turns`, which INCLUDES the attempted turn that
@@ -365,3 +375,7 @@ class TaskResult(BaseModel):
365375
# Phase 3), or ``None`` for coding tasks / when no artifact was delivered.
366376
# Surfaced on TaskDetail so the user can retrieve the knowledge-task output.
367377
artifact_uri: str | None = None
378+
# OTEL trace id (32-char hex) of the task's root span, captured at terminal
379+
# write so the replay bundle (#515) can correlate the task to its
380+
# CloudWatch/X-Ray trace. ``None`` when tracing is unavailable (local/dev).
381+
otel_trace_id: str | None = None

agent/src/observability.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,25 @@ def task_span(
5757
raise
5858

5959

60+
def current_otel_trace_id() -> str | None:
61+
"""Return the active span's trace id as a 32-char lowercase hex string.
62+
63+
Used to persist a cross-plane correlation id on the TaskRecord (#515 replay
64+
bundle) so operators can join the task to its CloudWatch/X-Ray trace. Returns
65+
``None`` when there is no active span (e.g. tracing disabled locally) or
66+
the context is invalid, so callers can treat it as a graceful-missing field.
67+
"""
68+
span = trace.get_current_span()
69+
ctx = span.get_span_context()
70+
if not ctx.is_valid:
71+
return None
72+
# format_trace_id renders the 128-bit id as zero-padded 32-char hex — the
73+
# OTEL format, so it joins directly in CloudWatch Transaction Search. Note
74+
# the X-Ray console renders trace ids as ``1-{8hex}-{24hex}``; to look this
75+
# up there, transform to that form (the timestamp is the first 8 hex chars).
76+
return trace.format_trace_id(ctx.trace_id)
77+
78+
6079
def set_session_id(session_id: str) -> None:
6180
"""Propagate *session_id* via OTEL baggage for AgentCore session correlation.
6281

agent/src/pipeline.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from jira_reactions import comment_task_finished, comment_task_started
2727
from linear_reactions import react_task_finished, react_task_started
2828
from models import AgentResult, HydratedContext, RepoSetup, TaskConfig, TaskResult
29-
from observability import task_span
29+
from observability import current_otel_trace_id, task_span
3030
from post_hooks import (
3131
_extract_agent_notes,
3232
ensure_committed,
@@ -363,6 +363,7 @@ def _run_repoless_task(
363363
cache_read_input_tokens=usage.cache_read_input_tokens if usage else None,
364364
cache_creation_input_tokens=usage.cache_creation_input_tokens if usage else None,
365365
trace_s3_uri=trace_s3_uri,
366+
otel_trace_id=current_otel_trace_id(),
366367
)
367368
result_dict = result.model_dump()
368369

@@ -1127,6 +1128,7 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
11271128
cache_read_input_tokens=usage.cache_read_input_tokens if usage else None,
11281129
cache_creation_input_tokens=usage.cache_creation_input_tokens if usage else None,
11291130
trace_s3_uri=trace_s3_uri,
1131+
otel_trace_id=current_otel_trace_id(),
11301132
)
11311133

11321134
result_dict = result.model_dump()
@@ -1137,8 +1139,11 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
11371139
root_span.set_attribute("agent.cost_usd", float(result.cost_usd))
11381140
if result.turns:
11391141
root_span.set_attribute("agent.turns", int(result.turns))
1140-
root_span.set_attribute("build.passed", result.build_passed)
1141-
root_span.set_attribute("lint.passed", result.lint_passed)
1142+
# On the repo path these are always real bools (computed by the post
1143+
# hooks above); coalesce for the span attribute since the field type
1144+
# is now tri-state (bool | None) for the repo-less/crash case.
1145+
root_span.set_attribute("build.passed", bool(result.build_passed))
1146+
root_span.set_attribute("lint.passed", bool(result.lint_passed))
11421147
root_span.set_attribute("pr.url", result.pr_url or "")
11431148
root_span.set_attribute("task.duration_s", result.duration_s)
11441149
if usage:
@@ -1192,6 +1197,10 @@ def _on_trace_truncated(max_bytes: int, first_dropped: int) -> None:
11921197
task_id=config.task_id,
11931198
agent_status=agent_for_chain.status if agent_for_chain else "unknown",
11941199
trace_s3_uri=crash_trace_s3_uri,
1200+
# Still inside `with task_span()`, so the id is live — capture it
1201+
# here too or FAILED tasks (the primary post-mortem case for the
1202+
# replay bundle, #515) persist otel_trace_id: null.
1203+
otel_trace_id=current_otel_trace_id(),
11951204
)
11961205
task_state.write_terminal(config.task_id, "FAILED", crash_result.model_dump())
11971206
# Best-effort ❌ on the Linear issue so the stale 👀 doesn't linger.

agent/src/prompts/default_agent.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,5 +27,11 @@
2727
- Read the task carefully and any attachments referenced in the user message.
2828
- Use your available tools to research, analyse, or draft as the task requires.
2929
- When you have completed the work, summarise the result clearly in your final
30-
message — that summary is the deliverable.
30+
message — that summary is the deliverable, uploaded as the task artifact, so
31+
make it self-contained rather than a pointer to work elsewhere.
32+
- Do the work yourself in this session and finish before you stop. There is no
33+
human watching and no follow-up turn: do NOT defer the work to a background
34+
job, workflow, or sub-agent, and never end with "results will follow" or
35+
"watch progress elsewhere". If you cannot complete it within the turn budget,
36+
deliver your best partial answer with what you found — not a promise.
3137
"""

agent/src/prompts/web_research.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,9 @@
4444
- Cite each non-obvious claim with the source it came from (URL or title).
4545
- Your final message IS the deliverable — it is uploaded as the task artifact,
4646
so make it self-contained and complete rather than a pointer to work elsewhere.
47+
- Do the research yourself in this session and finish before you stop. There is
48+
no human watching and no follow-up turn: do NOT defer the work to a background
49+
job, workflow, or sub-agent, and never end with "results will follow" or
50+
"watch progress elsewhere". If you cannot complete it within the turn budget,
51+
deliver your best partial answer with what you found — not a promise.
4752
"""

agent/src/runner.py

Lines changed: 81 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727

2828
import os
2929
import subprocess
30-
from typing import Any
30+
from typing import Any, Literal
3131
from urllib.parse import quote
3232

3333
from config import AGENT_WORKSPACE
@@ -132,7 +132,13 @@ def _setup_agent_env(config: TaskConfig) -> tuple[str | None, str | None]:
132132
# writes, while the SDK is waiting on stdout). The stderr callback in
133133
# ClaudeAgentOptions cannot drain fast enough to prevent this.
134134
os.environ.pop("ANTHROPIC_LOG", None)
135-
os.environ["ANTHROPIC_DEFAULT_HAIKU_MODEL"] = "anthropic.claude-haiku-4-5-20251001-v1:0"
135+
# Small/fast auxiliary model (WebFetch summarization etc.), from config like
136+
# ANTHROPIC_MODEL above — resolved from the deployed ANTHROPIC_DEFAULT_HAIKU_MODEL
137+
# env (agent.ts) with a platform default in config.py. Must be a cross-region
138+
# INFERENCE-PROFILE id (``us.`` prefix): Claude 4.x cannot be invoked on-demand
139+
# by bare model id on Bedrock (400 "on-demand throughput isn't supported",
140+
# seen on WebFetch's Haiku sub-calls); config.py resolves that default.
141+
os.environ["ANTHROPIC_DEFAULT_HAIKU_MODEL"] = config.haiku_model
136142

137143
# Save OTLP endpoint/protocol configured by ADOT auto-instrumentation
138144
# before stripping, so we can re-use it for Claude Code CLI telemetry.
@@ -335,31 +341,87 @@ def _initialize_policy_engine_and_hooks(
335341
# read-only workflow.
336342
_WRITE_TOOLS = frozenset(("Write", "Edit"))
337343

344+
# Tools that DEFER work off-session and are hard-blocked for every task. These
345+
# launch detached / cross-session orchestration that a one-shot headless agent
346+
# has no supervisor to await: the ``Workflow`` tool returns a task id and runs
347+
# in the background (its result arrives via a notification into an interactive
348+
# session that does not exist here), and ``Task``/``Agent`` can spawn background
349+
# subagents. We saw a repo-less task launch a background ``Workflow`` and then
350+
# finalize on the first ResultMessage with a placeholder artifact while the real
351+
# research ran on, detached (task 01KWDEFQH6...). CRITICAL: ``allowed_tools`` is
352+
# only an auto-APPROVE list — per the Agent SDK docs it does NOT restrict the
353+
# surface; unlisted tools fall through to ``permission_mode``, and under
354+
# ``bypassPermissions`` they are simply allowed. ``disallowed_tools`` is the
355+
# only hard lock (it removes the tool from the model's context even under
356+
# bypass), so the block must live there, not in the allow-list.
357+
# ``Workflow`` (background multi-agent orchestration) is the one that bit us;
358+
# ``Task``/``Agent`` are the sub-agent spawners (name varies by CLI version, so
359+
# block both); ``Monitor`` streams a background command's output mid-turn;
360+
# ``SendMessage`` resumes/relaunches background agents; the ``Cron*`` tools
361+
# schedule deferred work. All are "return now, work continues off-session"
362+
# vectors a one-shot task cannot await. NOT blockable here: background ``Bash``
363+
# (a ``run_in_background`` PARAMETER of Bash, not a tool name) — but a detached
364+
# Bash child dies with the MicroVM on return, so it can't produce
365+
# arrives-later work the way a cloud Workflow does; the deliver-artifact
366+
# deferral guard (deliverers._reject_if_deferral) is the backstop for anything
367+
# that still ends in a placeholder.
368+
_DISALLOWED_TOOLS = [
369+
"Workflow",
370+
"Task",
371+
"Agent",
372+
"Monitor",
373+
"SendMessage",
374+
"CronCreate",
375+
"CronDelete",
376+
"CronList",
377+
]
338378

339-
def _resolve_allowed_tools(config: TaskConfig) -> list[str]:
340-
"""Resolve the SDK ``allowed_tools`` list for a task.
341379

342-
This is the second enforcement layer the design promises alongside Cedar's
343-
``context.read_only`` (WORKFLOWS.md §"Agent configuration"):
380+
def _resolve_allowed_tools(config: TaskConfig) -> list[str]:
381+
"""Resolve the SDK ``allowed_tools`` (auto-approve) list for a task.
344382
345383
- The resolved workflow's ``agent_config.allowed_tools`` (threaded onto
346384
``config.allowed_tools``) is passed to the SDK verbatim. An empty list —
347385
legacy/batch callers that never resolved a workflow — falls back to the
348386
built-in full surface.
349-
- ``Write``/``Edit`` are dropped whenever ``config.read_only`` is true, so a
350-
read-only lane physically cannot mutate the tree even where Cedar's
351-
``read_only`` rules do not fire (e.g. a ``read_only:false`` default that
352-
restricts tools by list alone, like ``default/agent-v1``).
353-
354-
The Cedar PreToolUse hooks still enforce per-task restrictions on top of
355-
whatever is allowed here; this list only ever narrows the surface.
387+
- ``Write``/``Edit`` are dropped whenever ``config.read_only`` is true.
388+
389+
IMPORTANT: this list only governs auto-approval, NOT the reachable surface.
390+
Per the Agent SDK, a tool omitted here is not blocked — it falls through to
391+
``permission_mode`` (``bypassPermissions`` ⇒ allowed). The actual surface
392+
lock is ``_DISALLOWED_TOOLS`` passed to ``disallowed_tools``. NOTE the Cedar
393+
PreToolUse hooks are NOT a backstop for an unknown tool name: the engine
394+
default-permits on no-match (``policy.py``), so it only denies the specific
395+
actions it has ``forbid`` rules for (e.g. Write/Edit under read_only) —
396+
``Workflow``/``Task``/``Agent`` match nothing and would be allowed. So
397+
``disallowed_tools`` is the ONLY thing keeping them out; do not rely on this
398+
allow-list, nor on Cedar, to remove a tool from the surface.
356399
"""
357400
tools = list(config.allowed_tools) if config.allowed_tools else list(_FULL_TOOL_SURFACE)
358401
if config.read_only:
359402
tools = [t for t in tools if t not in _WRITE_TOOLS]
360403
return tools
361404

362405

406+
def _resolve_setting_sources(config: TaskConfig) -> list[Literal["user", "project", "local"]]:
407+
"""Which on-disk Claude Code settings the CLI may load for this task.
408+
409+
A task with a cloned repo loads ``["project"]`` so the repo's own
410+
``.claude/`` config is honored. A task with no repo loads nothing —
411+
defense-in-depth that also stops a stray on-disk skill (e.g. one that spawns
412+
a background Workflow) from being reachable. Kept as a named helper so the
413+
policy is unit-testable without driving the SDK.
414+
415+
Keys on ``repo_url`` (repo presence), NOT ``requires_repo`` (a static
416+
workflow property): a repo-optional workflow given a repo takes the
417+
repo-bound clone path (``pipeline.py`` gates on ``not requires_repo and not
418+
repo_url``), so keying on ``requires_repo`` would clone the repo but drop
419+
its ``.claude/`` config. Mirrors ``create-task-core.ts`` keying
420+
``branch_name`` on repo presence for the same reason.
421+
"""
422+
return ["project"] if config.repo_url else []
423+
424+
363425
async def run_agent(
364426
prompt: str,
365427
system_prompt: str,
@@ -439,10 +501,15 @@ def _on_stderr(line: str) -> None:
439501
model=config.anthropic_model,
440502
system_prompt=system_prompt,
441503
allowed_tools=allowed_tools,
504+
# Hard surface lock (NOT allowed_tools — that is auto-approve only). Keeps
505+
# off-session/defer vectors out of the model's context even under
506+
# bypassPermissions, so a one-shot headless task cannot launch detached
507+
# work it has no supervisor to await. See _DISALLOWED_TOOLS.
508+
disallowed_tools=list(_DISALLOWED_TOOLS),
442509
permission_mode="bypassPermissions",
443510
cwd=cwd,
444511
max_turns=config.max_turns,
445-
setting_sources=["project"],
512+
setting_sources=_resolve_setting_sources(config),
446513
hooks=hooks,
447514
max_budget_usd=config.max_budget_usd,
448515
stderr=_on_stderr,

agent/src/task_state.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
import os
99
import time
10-
from typing import TypedDict
10+
from typing import Any, TypedDict
1111

1212
from shell import log, log_error_cw
1313

@@ -246,7 +246,7 @@ def write_terminal(task_id: str, status: str, result: dict | None = None) -> Non
246246
return
247247
now = _now_iso()
248248
expr_names = {"#s": "status"}
249-
expr_values = {
249+
expr_values: dict[str, Any] = {
250250
":s": status,
251251
":t": now,
252252
":sca": f"{status}#{now}",
@@ -294,6 +294,21 @@ def write_terminal(task_id: str, status: str, result: dict | None = None) -> Non
294294
if result.get("memory_written") is not None:
295295
update_parts.append("memory_written = :mw")
296296
expr_values[":mw"] = result["memory_written"]
297+
# Verification verdict (#515 replay bundle). build_passed/lint_passed
298+
# were historically dropped here (present on TaskResult but never
299+
# written), so TaskDetail.build_passed was always null. Persist both
300+
# so the replay bundle carries a structured verification signal.
301+
if result.get("build_passed") is not None:
302+
update_parts.append("build_passed = :bp")
303+
expr_values[":bp"] = bool(result["build_passed"])
304+
if result.get("lint_passed") is not None:
305+
update_parts.append("lint_passed = :lp")
306+
expr_values[":lp"] = bool(result["lint_passed"])
307+
# OTEL trace id (#515) for cross-plane correlation. Absent on tasks
308+
# that predate this field and when tracing is unavailable.
309+
if result.get("otel_trace_id"):
310+
update_parts.append("otel_trace_id = :otid")
311+
expr_values[":otid"] = result["otel_trace_id"]
297312
# --trace artifact URI (design §10.1). Written atomically
298313
# with the terminal-status transition so a consumer that
299314
# reads TaskRecord.trace_s3_uri immediately after

0 commit comments

Comments
 (0)