diff --git a/.github/workflows/compare-trace-tests.yml b/.github/workflows/compare-trace-tests.yml
new file mode 100644
index 00000000..019cc5e1
--- /dev/null
+++ b/.github/workflows/compare-trace-tests.yml
@@ -0,0 +1,30 @@
+name: Compare Trace Skill Tests
+
+on:
+  push:
+    branches: [main]
+    paths:
+      - 'skills/compare-trace/**'
+      - '.github/workflows/compare-trace-tests.yml'
+  pull_request:
+    paths:
+      - 'skills/compare-trace/**'
+      - '.github/workflows/compare-trace-tests.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      # test_otel_spans.py exercises sources/otel_spans.py via subprocess.
+      # Stdlib only — no pip install needed.
+      - name: Run otel_spans normalizer tests
+        run: python3 skills/compare-trace/tests/test_otel_spans.py
diff --git a/.gitignore b/.gitignore
index 34092a97..402f58ce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,9 @@ temp/
 __pycache__/
 *.pyc
 .work/
+
+# Local install artifacts from running plugins/codex/scripts/install.sh
+# against this repo. Intended for downstream target repos, not this one.
+.agents/
+.codex/
+plugins/mc-agent-toolkit/
diff --git a/README.md b/README.md
index e0042465..b63a6fe6 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,12 @@ Skills are grouped by the job they help you do. Orchestrated workflows sequence
 | **Storage Cost Analysis** | Identifies storage waste (unread, zombie, dead-end tables); uses lineage to verify cleanup is safe and estimates savings. | [README](skills/storage-cost-analysis/README.md) |
 | **Performance Diagnosis** | Diagnoses slow pipelines and expensive queries across Airflow, dbt, Databricks, and other platforms. | [README](skills/performance-diagnosis/README.md) |
 
+### Evaluate — compare agent runs
+
+| Skill | Description | Details |
+|---|---|---|
+| **Compare Trace** | A/B compares two existing agent traces (by ID) — graph path, latency/tokens, tool-call sequence, plus LLM-based semantic and entity-overlap diffs over the final answers. Emits an HTML report. | [README](skills/compare-trace/README.md) |
+
 ### Setup — ingestion and connections
 
 | Skill | Description | Details |
diff --git a/plugins/claude-code/evals/compare-trace/trigger-evals.json b/plugins/claude-code/evals/compare-trace/trigger-evals.json
new file mode 100644
index 00000000..3e975edd
--- /dev/null
+++ b/plugins/claude-code/evals/compare-trace/trigger-evals.json
@@ -0,0 +1,126 @@
+{
+  "skill": "monte-carlo-compare-trace",
+  "description": "Trigger accuracy evals for the monte-carlo-compare-trace skill. Each case specifies whether the skill SHOULD or SHOULD NOT be triggered by the given prompt.",
+  "cases": [
+    {
+      "id": "should-01",
+      "prompt": "Compare these two agent conversations: 019e8f2a-24ae-7880-8901-cbc79aca43ed and 019e9319-e88e-7080-bc78-2aff46543849",
+      "expected": "trigger",
+      "rationale": "Direct A/B compare with two conversation IDs -- core skill use case"
+    },
+    {
+      "id": "should-02",
+      "prompt": "Diff these two agent runs and tell me what changed.",
+      "expected": "trigger",
+      "rationale": "Explicit 'diff two agent runs' phrasing -- listed trigger"
+    },
+    {
+      "id": "should-03",
+      "prompt": "I tweaked the system prompt for the coverage agent and re-ran it on the same conversation. Did the change cause a regression? Here are the two IDs.",
+      "expected": "trigger",
+      "rationale": "Prompt change regression check between two runs -- explicit use case"
+    },
+    {
+      "id": "should-04",
+      "prompt": "/compare-trace 019e8f2a-24ae-7880-8901-cbc79aca43ed 019e9319-e88e-7080-bc78-2aff46543849",
+      "expected": "trigger",
+      "rationale": "Explicit slash-command invocation -- skill's own command"
+    },
+    {
+      "id": "should-05",
+      "prompt": "Here are two conversation IDs from the chat agent — show me how the tool sequences differ.",
+      "expected": "trigger",
+      "rationale": "Tool-sequence diff between two conversations -- tool_call_diff evaluator territory"
+    },
+    {
+      "id": "should-06",
+      "prompt": "We swapped the agent model from claude-3-5 to claude-sonnet-4 and re-ran a fixed scenario. Compare the two runs.",
+      "expected": "trigger",
+      "rationale": "Model-swap A/B with two runs to compare -- explicit use case"
+    },
+    {
+      "id": "should-07",
+      "prompt": "Compare the OTel traces from these two agent runs and produce a side-by-side report.",
+      "expected": "trigger",
+      "rationale": "Trace-level comparison with HTML-report intent -- matches skill output"
+    },
+    {
+      "id": "should-08",
+      "prompt": "I have two agent traces I want to look at side-by-side. Conversation IDs are X and Y.",
+      "expected": "trigger",
+      "rationale": "Side-by-side trace comparison with both IDs supplied"
+    },
+    {
+      "id": "should-09",
+      "prompt": "Show me the difference in graph path between baseline and candidate runs of the coverage agent.",
+      "expected": "trigger",
+      "rationale": "Graph-path diff between two runs -- graph_path_diff evaluator territory"
+    },
+    {
+      "id": "should-10",
+      "prompt": "Did removing the get_use_cases tool change how the coverage agent handles 'what's my coverage gap?' Compare a before and after run.",
+      "expected": "trigger",
+      "rationale": "Tool-loadout change A/B between two runs -- explicit use case"
+    },
+    {
+      "id": "should-11",
+      "prompt": "Compare the latency and token usage between these two agent conversations.",
+      "expected": "trigger",
+      "rationale": "Latency/token diff between two runs -- latency_diff evaluator territory"
+    },
+    {
+      "id": "should-not-01",
+      "prompt": "What went wrong with this agent run? Here's the conversation ID.",
+      "expected": "no-trigger",
+      "rationale": "Single-trace troubleshooting -- not a comparison; routes to analyze-root-cause / incident-response"
+    },
+    {
+      "id": "should-not-02",
+      "prompt": "Investigate why this trace failed. The conversation ID is 019e8f2a-24ae-7880-8901-cbc79aca43ed.",
+      "expected": "no-trigger",
+      "rationale": "Single-trace failure investigation -- not an A/B comparison"
+    },
+    {
+      "id": "should-not-03",
+      "prompt": "Compare row counts between our staging and production orders tables.",
+      "expected": "no-trigger",
+      "rationale": "Cross-table data comparison -- routes to monitoring-advisor (comparison monitor); not agent A/B"
+    },
+    {
+      "id": "should-not-04",
+      "prompt": "How does my chat agent perform overall? Show me aggregate metrics.",
+      "expected": "no-trigger",
+      "rationale": "Aggregate performance question with no two specific conversation IDs to compare"
+    },
+    {
+      "id": "should-not-05",
+      "prompt": "Set up an evaluation monitor for my chat agent to track response quality over time.",
+      "expected": "no-trigger",
+      "rationale": "Agent eval monitor creation -- routes to monitoring-advisor"
+    },
+    {
+      "id": "should-not-06",
+      "prompt": "Diff these two SQL queries and tell me which one is more efficient.",
+      "expected": "no-trigger",
+      "rationale": "SQL comparison -- wrong domain (not agent traces)"
+    },
+    {
+      "id": "should-not-07",
+      "prompt": "Show me the trace for conversation 019e8f2a-24ae-7880-8901-cbc79aca43ed.",
+      "expected": "no-trigger",
+      "rationale": "Single-trace inspection -- not a comparison"
+    },
+    {
+      "id": "should-not-08",
+      "prompt": "Help me build a prompt eval framework for my LangGraph agent.",
+      "expected": "no-trigger",
+      "rationale": "Generic eval-framework engineering -- not an A/B compare on existing runs"
+    },
+    {
+      "id": "should-not-09",
+      "prompt": "Compare these two dbt models and tell me which one has more downstream tables.",
+      "expected": "no-trigger",
+      "rationale": "dbt model comparison -- wrong domain"
+    }
+  ]
+}
diff --git a/plugins/claude-code/skills/compare-trace b/plugins/claude-code/skills/compare-trace
new file mode 120000
index 00000000..51b49682
--- /dev/null
+++ b/plugins/claude-code/skills/compare-trace
@@ -0,0 +1 @@
+../../../skills/compare-trace
\ No newline at end of file
diff --git a/plugins/codex/skills/compare-trace b/plugins/codex/skills/compare-trace
new file mode 120000
index 00000000..51b49682
--- /dev/null
+++ b/plugins/codex/skills/compare-trace
@@ -0,0 +1 @@
+../../../skills/compare-trace
\ No newline at end of file
diff --git a/plugins/copilot/skills/compare-trace b/plugins/copilot/skills/compare-trace
new file mode 120000
index 00000000..51b49682
--- /dev/null
+++ b/plugins/copilot/skills/compare-trace
@@ -0,0 +1 @@
+../../../skills/compare-trace
\ No newline at end of file
diff --git a/plugins/cursor/skills/compare-trace b/plugins/cursor/skills/compare-trace
new file mode 120000
index 00000000..51b49682
--- /dev/null
+++ b/plugins/cursor/skills/compare-trace
@@ -0,0 +1 @@
+../../../skills/compare-trace
\ No newline at end of file
diff --git a/plugins/opencode/skills/compare-trace b/plugins/opencode/skills/compare-trace
new file mode 120000
index 00000000..51b49682
--- /dev/null
+++ b/plugins/opencode/skills/compare-trace
@@ -0,0 +1 @@
+../../../skills/compare-trace
\ No newline at end of file
diff --git a/skills/README.md b/skills/README.md
index fe27ed35..b082bcec 100644
--- a/skills/README.md
+++ b/skills/README.md
@@ -22,6 +22,7 @@ Skills are platform-agnostic instruction sets that tell an AI coding agent what
 | **[Tune Monitor](tune-monitor/)** | Analyzes a Monte Carlo metric monitor's alert history and recommends configuration changes to reduce noise — sensitivity, WHERE conditions, segment exclusions, schedule, and aggregation. |
 | **[Connection Auth Rules](connection-auth-rules/)** | Build a Connection Auth Rules configuration for a Monte Carlo connection type. Fetches live connector schemas and transform steps from the apollo-agent repo. |
 | **[Instrument Agent](instrument-agent/)** | Instruments a Python AI agent for Monte Carlo Agent Observability — detects AI libraries, installs the Monte Carlo OpenTelemetry SDK, sets up tracing, and verifies traces in Monte Carlo. Asks before editing. |
+| **[Compare Trace](compare-trace/)** | A/B compares two Monte Carlo agent traces by ID — runs graph-path, latency/token, and tool-call diffs plus LLM-based semantic and entity-overlap evals over the final answers, and opens an HTML report. |
 
 ## Standalone Installation
 
diff --git a/skills/compare-trace/README.md b/skills/compare-trace/README.md
new file mode 100644
index 00000000..7ff7bfaf
--- /dev/null
+++ b/skills/compare-trace/README.md
@@ -0,0 +1,44 @@
+# compare-trace
+
+A/B compare two Monte Carlo agent conversations by ID and produce an HTML report.
+
+Trace-driven backport of the [Agent A/B Evaluation Framework](https://github.com/monte-carlo-data/ai-agent/pull/1236) (PR #1236 in `ai-agent`). The original ran the agent itself against fixed scenarios; this skill operates on already-captured conversations fetched via the Monte Carlo MCP server.
+
+## Invocation
+
+```
+/compare-trace <conv_id_a> <conv_id_b>
+```
+
+Optional flags: `--mcon`, `--agent`, `--trace-ids a,b` (force specific OTel trace_ids when a conversation has multiple), `--labels A,B`, `--output path.html`.
+
+## ID model
+
+`conversation_id` is the user-facing identifier (per the OTel GenAI `gen_ai.conversation.id` semantic convention). It's stored as a span attribute, **not** as the OTel `trace_id`. One conversation can contain multiple OTel traces (retries, fan-outs, multi-turn).
+
+The skill resolves `conversation_id → trace_id` via `get_agent_conversation`. By default it skips traces listed in `turn_errors` and picks the remaining trace with the most edges (= the "main" execution); override with `--trace-ids` to compare specific sub-traces.
+
+## Signals
+
+| Signal | Type | Notes |
+|---|---|---|
+| Graph Path | deterministic | Jaccard on node sets + LCS/max ordering |
+| Latency & Tokens | deterministic | Per-metric ratios; flag if candidate > 1.5x baseline |
+| Tool Call Sequence + Args | deterministic | Levenshtein on tool-name sequences; matched calls also get a top-level arg-key diff (added / removed / changed) |
+| Semantic Diff | LLM (inline) | Claude runs prompt over both final-completion texts |
+| Entity Overlap | LLM (inline) | Extracts 8 entity types, computes per-type Jaccard |
+
+The two LLM signals require non-empty `final_output_text` for both sides (pulled from the last assistant completion in the trace picked for each conversation). Without that, the report ships with only the 3 structural signals.
+
+## Files
+
+- `SKILL.md` — full workflow Claude follows
+- `scripts/compare_traces.py` — driver that consumes normalized trace JSON + optional LLM-eval JSON and writes HTML
+- `scripts/evaluators/{graph_path_diff,latency_diff,tool_call_diff}.py` — pure-Python evaluators ported from PR #1236
+- `references/PR1236_MAPPING.md` — fields-and-signals mapping from PR #1236 to the trace API
+
+## Known limitations (v0.3)
+
+- Picks one trace per conversation (the largest non-error one by edge count). Multi-trace conversations (retries, fan-outs) currently get their other traces dropped — pass `--trace-ids` to override.
+- Arg-diff matches calls by name + nearest position (greedy). When a tool's count differs between A and B, the surplus calls go unmatched. v0.4 plan: stable-ID fallback using `tool_use_id` when present.
+- No "structured fields" diff (the 6th evaluator in PR #1236) — only meaningful when you control the agent's output schema, which we don't from trace-land.
diff --git a/skills/compare-trace/SKILL.md b/skills/compare-trace/SKILL.md
new file mode 100644
index 00000000..5a9fced5
--- /dev/null
+++ b/skills/compare-trace/SKILL.md
@@ -0,0 +1,326 @@
+---
+name: compare-trace
+description: Compare two Monte Carlo agent conversations side-by-side. Walks each conversation to its OTel trace, runs structural diffs (graph path, latency/tokens, tool-call sequence) plus LLM-based semantic and entity-overlap evals, and opens an HTML report.
+when_to_use: |
+  Invoke when the user wants to A/B compare two AI agent runs by conversation ID — e.g. "compare these two agent conversations", "diff these two agent runs", "did my prompt change cause a regression", or `/compare-trace <conv_id_a> <conv_id_b>`. Useful for evaluating prompt changes, graph changes, model swaps, or tool-loadout changes by replaying a fixed scenario and comparing the resulting traces.
+
+  Do NOT invoke for:
+  - Single-trace inspection or troubleshooting one agent run (use `analyze-root-cause` / `incident-response`).
+  - Comparing data tables, monitors, or alerts (different domain).
+  - Generic prompt evaluation without two existing conversation IDs to compare.
+bucket: Evaluate
+version: 0.4.0
+---
+
+# Compare Trace
+
+A/B compare two existing Monte Carlo agent conversations. Walks each conversation to its OTel trace via MCP, runs deterministic structural evaluators in a helper script, runs LLM-based semantic and entity evaluators inline, and emits an HTML report.
+
+**Arguments:** $ARGUMENTS
+
+Parse the arguments:
+- **conv_id_a** (required): first positional — the baseline conversation ID (UUIDv7 or whatever ID the MC UI exposes for the run/thread).
+- **conv_id_b** (required): second positional — the candidate conversation ID.
+- **`--mcon <mcon>`** (optional): trace-table MCON shared by both conversations. If omitted, discover via `get_agent_metadata`.
+- **`--agent <name>`** (optional): agent name to disambiguate when multiple agents are configured.
+- **`--trace-ids a,b`** (optional): if a conversation contains multiple traces (retries, fan-out, multi-turn), pass the specific OTel trace_ids to compare. If omitted, the skill picks the main trace from each conversation (see Phase 2 for the rule).
+- **`--labels A,B`** (optional): display labels (default `baseline`, `candidate`).
+- **`--output <path>`** (optional): output HTML path. Default `/tmp/compare-trace/<short_a>_vs_<short_b>.html`.
+
+> **Heritage:** This skill is the trace-driven backport of the [Agent A/B Evaluation Framework](https://github.com/monte-carlo-data/ai-agent/pull/1236) — same 5-signal idea, applied to already-captured traces rather than re-running the agent.
+
+> **ID model:** `conversation_id` is the user-facing identifier (`gen_ai.conversation.id` per the OTel GenAI semantic convention). It's stored as a span attribute, **not** as the OTel `trace_id`. A single conversation can contain multiple OTel traces (retries, parallel branches, multi-turn). This skill takes the conversation_id as primary input, walks to the trace, then compares.
+
+> **Field naming:** The MCP server returns **snake_case** in JSON responses (`trace_id`, `page_info`, `turn_errors`, `has_next_page`, `node_name`, `parent_span_id`, `is_tool_call`, `has_error`, `total_tokens`, `start_time`, `end_time`, `duration_seconds`). The schema *descriptions* sometimes show camelCase — trust the response, not the description.
+
+---
+
+## Trace sources
+
+The comparator works on **normalized traces**. Two ingestion paths produce that shape:
+
+- **MC-stored agent conversations** — the default. Phases 1–3 below walk each conversation via MCP to its OTel trace and assemble the normalized dict.
+- **Locally-collected OTel traces** — for A/B-testing changes (prompt, code, model) before they ship, with no production conversation to point at. The skill ships a local OTLP/HTTP receiver and a span-to-normalized converter. Run your agent twice, capture spans, normalize, then jump straight to Phase 4. See [`references/local-otel-collection.md`](references/local-otel-collection.md).
+
+Both paths produce the same normalized trace shape and feed the same Phase 4+ comparator and HTML report.
+
+---
+
+## Setup
+
+**Prerequisites:**
+- **`python3`** for the helper scripts (stdlib only for the default MC-conversation path).
+- Monte Carlo MCP server (`monte-carlo-mcp`) configured and authenticated.
+- *Local OTel path only:* `opentelemetry-proto` in the Python env that runs the receiver (already a transitive dep of `opentelemetry-sdk`).
+
+Helper scripts live under `${CLAUDE_PLUGIN_ROOT}/skills/compare-trace/scripts/`:
+- `compare_traces.py` — main driver; takes two normalized trace JSON files (+ optional LLM-eval results JSON) and writes the HTML report.
+- `evaluators/graph_path_diff.py`, `evaluators/latency_diff.py`, `evaluators/tool_call_diff.py` — deterministic evaluators ported from PR #1236.
+- `local_otlp_receiver.py`, `sources/otel_spans.py` — used only by the local-OTel ingestion path. See `references/local-otel-collection.md`.
+
+---
+
+## Workflow
+
+### Phase 1: Discover MCON and agent_name (skip if `--mcon` was provided)
+
+Call `get_agent_metadata` with no filters. The response lists agents with `agent_name`, `trace_table_mcon`, and `source_type`. Pick the MCON + `agent_name`:
+- If `--agent <name>` was given, match by `agent_name` exactly.
+- Else if exactly one agent is configured, use it.
+- Else ask the user which agent the conversations belong to (list `agent_name` options).
+
+Both conversations must live on the same MCON. If you suspect otherwise (e.g. labels suggest different envs), ask first.
+
+### Phase 2: Resolve each conversation → OTel trace_id + final completion
+
+For each conversation_id, call:
+
+```
+get_agent_conversation(
+  agent_name=<from Phase 1>,
+  trace_table_mcon=<from Phase 1>,
+  conversation_id=<conv_id>,
+  first=100,                         # the server caps `first` at 100
+  start_time="<conv_id timestamp - 1 min, ISO 8601>",
+  end_time="<conv_id timestamp + 1 hour, ISO 8601>",
+)
+```
+
+The conversation_id is a UUIDv7 — decode the timestamp from the first 48 bits (Unix epoch milliseconds) and use a tight window. This drastically cuts response size on big tables.
+
+**Pagination:** The server caps `first` at 100. Track `page_info.has_next_page`; paginate until exhausted **only if** you need the final completion text (see step 3 below). For trace-id selection alone (step 2), the first page is usually sufficient.
+
+From the response:
+
+1. **Collect candidate trace_ids.** From `edges[*].node.trace_id` plus `turn_errors[*].trace_id`. Build two sets: `all_trace_ids` and `error_trace_ids`.
+2. **Pick the main trace.** Apply these rules in order:
+   - If `--trace-ids a,b` was provided, use the matching value (no further filtering).
+   - Else, drop any `trace_id` in `error_trace_ids`. This is critical — failed retries can appear as full sub-runs with many spans once you paginate, so "most spans" alone is not enough.
+   - From the remaining trace_ids, pick the one with the most edges in the response. Tie-break by `min(start_time)` (earliest).
+   - If zero non-error trace_ids remain, abort with `"conversation {conv_id} contains only failed traces"` — don't fabricate a comparison.
+3. **Extract `final_output_text` from the picked trace.** This step is what forces full pagination when `page_info.has_next_page` is true.
+   - Filter `edges` to the chosen `trace_id`.
+   - Order by `start_time` ascending.
+   - Walk **from the end** to find the last edge whose `node.completions` is a non-empty JSON-encoded string.
+   - Parse `node.completions` as JSON — it's a stringified `[{is_end, is_start, message, position, role, tool_calls?}]` array (the conversation API serializes assistant turns as a list of message blocks).
+   - From that parsed array, find the last entry where `role == "assistant"` AND `message` is a non-empty string AND `message` itself is not just whitespace. (You don't need to inspect `tool_calls` — a message field with substantive text is the signal that the assistant produced a final answer rather than only emitting tool requests.) That `message` value is `final_output_text`.
+   - If no such entry exists (e.g. trace ended mid-tool-call), set `final_output_text = ""` — the LLM evals will be skipped and the report will note this.
+4. **Extract `tool_calls` (with args) for the picked trace.** v0.3 feeds these into the argument-diff evaluator.
+   - Iterate `edges` for the chosen `trace_id` in `start_time` order.
+   - For each edge, parse `node.completions` as JSON (same as step 3).
+   - Within each parsed assistant message, iterate its `tool_calls` array (may be empty or missing). Each tool_call has the LangChain/Bedrock shape `{"name": "<str>", "id": "<tool_use_id>", "arguments": "<JSON string>"}`.
+   - For each tool_call, parse `arguments` into a dict. Be defensive: `arguments` is a JSON string in the LangChain/Bedrock shape but may already be a `dict` in Anthropic-native or OpenAI Tool API shapes — `isinstance(arguments, dict) -> use as-is`, `isinstance(arguments, str) -> json.loads(arguments or "{}")`. Empty string and missing keys both mean `{}`.
+   - Accumulate ordered `[{"name": "<str>", "args": <parsed dict>, "id": "<tool_use_id>"}, ...]`. This is the `tool_calls` list for the normalized JSON.
+   - If a completion has a different shape (e.g. OpenAI `function.arguments`, raw Anthropic `tool_use.input` block), parse what you can and fall through with `args = {}` for any blob you can't decode. Don't fail the whole comparison on one weird message.
+
+Cache the conversation response and the picked `trace_id` for Phase 3 — don't refetch.
+
+### Phase 3: Fetch each trace's structural data (with fallback)
+
+For each selected `trace_id`, call:
+
+```
+get_agent_trace(
+  mcon=<from Phase 1>,
+  trace_id=<from Phase 2, dashless hex>,
+  trace_start_time=<conv_id timestamp - 1 min, ISO 8601>,
+  trace_end_time=<conv_id timestamp + 1 hour, ISO 8601>,
+)
+```
+
+The response is a flat span list with `node_name`, `parent_span_id`, `child_span_ids`, `start_time`, `end_time`, `duration` (ms), `total_tokens`, `prompt_tokens`, `completion_tokens`, `is_tool_call`, `has_prompts`, `has_completions`, `has_error`.
+
+**Failure modes and the conversation-edge fallback:**
+
+If `get_agent_trace` errors with `"Incomplete trace"` (or returns empty despite the conversation having edges for that trace_id), fall back to **reconstructing structural data from the conversation edges** you already have from Phase 2:
+
+- Filter the cached `edges` to ones with this `trace_id`, ordered by `start_time`.
+- `node_path` ← `[e.node.name for e in edges]`. This will be **shallower** than `get_agent_trace` would give you — only LLM and tool spans, no internal workflow/task spans. Note this in the chat report.
+- `tool_calls` ← **use the list you already built in Phase 2 step 4** (from assistant completion `tool_calls` blocks). Don't rebuild it from tool-execution spans here — those spans don't carry args.
+- `execution_time_seconds` ← `(max(end_time) - min(start_time))` across these edges, in seconds.
+- `llm_call_count` ← count where `e.node.prompts` is non-empty and `e.node.completions` is non-empty.
+- token counts ← sum of `total_tokens` / `prompt_tokens` / `completion_tokens` where present.
+- `has_errors` ← `True` if any edge had a non-null `status` indicating error, else `False`. (The conversation API doesn't expose `has_error` directly.)
+
+If `get_agent_trace` returns 404 / permission denied / retention expired, stop and report which trace failed. Don't silently treat one side as empty.
+
+**Normalized trace JSON shape** (write one file per trace under `/tmp/compare-trace/<short>.json`):
+
+```json
+{
+  "trace_id": "<hex>",
+  "conversation_id": "<the conv_id this came from>",
+  "label": "baseline|candidate|...",
+  "source": "trace_api" | "conversation_fallback",
+  "node_path": ["root_node", "child_a", "child_b", ...],
+  "tool_calls": [{"name": "<tool>", "args": {"k": "v", ...}, "id": "<tool_use_id>"}, ...],
+  "execution_time_seconds": 12.34,
+  "llm_call_count": 5,
+  "total_tokens": 1234,
+  "prompt_tokens": 900,
+  "completion_tokens": 334,
+  "has_errors": false,
+  "final_output_text": "<from Phase 2>"
+}
+```
+
+**Normalization rules when using `get_agent_trace`:**
+- `node_path`: sort spans by `start_time` ascending and take `node_name` for each. Skip spans where `node_name` is empty.
+- `tool_calls`: **use the list you built in Phase 2 step 4** (from assistant completion `tool_calls` blocks). The trace API's tool-call spans only carry names, not args; the conversation API is where args live, so we always read tool_calls from there regardless of whether structural data came from trace_api or conversation_fallback.
+- `execution_time_seconds`: `(max(end_time) - min(start_time))` in seconds across all spans.
+- `llm_call_count`: count spans where `has_prompts == true` and `has_completions == true`.
+- `total_tokens` / `prompt_tokens` / `completion_tokens`: sum across all spans (skip nulls).
+- `has_errors`: any span with `has_error == true`.
+- `final_output_text`: copied in from Phase 2.
+
+### Phase 4: Run LLM-based evaluators inline (only if both `final_output_text` fields are non-empty)
+
+#### 4a. Semantic diff
+
+Run this prompt yourself (Claude) with the two final outputs as inputs:
+
+```
+You are comparing two AI agent outputs for the same scenario.
+The BASELINE is the reference version. The CANDIDATE is the variant under evaluation.
+
+Focus on SUBSTANCE, not wording — two paragraphs saying the same thing in different
+words are "preserved."
+
+BASELINE:
+<paste final_output_text from trace A, truncated to ~3000 chars>
+
+CANDIDATE:
+<paste final_output_text from trace B, truncated to ~3000 chars>
+
+Respond with exactly this JSON structure (no other text):
+{
+  "verdict": "preserved" | "regression" | "improvement" | "mixed",
+  "similarity_score": 0.0-1.0,
+  "lost_findings": ["<exact quote from baseline that candidate dropped>", ...],
+  "added_findings": ["<exact quote from candidate that baseline lacked>", ...],
+  "explanation": "<1-2 sentence summary of the semantic diff>"
+}
+
+Rules:
+- "preserved" = same core findings, even if phrased differently.
+- "regression" = candidate lost important information.
+- "improvement" = candidate added valuable information.
+- "mixed" = some lost, some added.
+- similarity_score: 1.0 = semantically identical, 0.0 = completely different.
+- For lost_findings / added_findings, QUOTE the actual phrases (≤100 words each).
+  Do not paraphrase.
+```
+
+Save your JSON response into `/tmp/compare-trace/llm_semantic.json`.
+
+#### 4b. Entity overlap
+
+Run this extraction prompt twice (once per final output), with text input truncated to ~4000 chars:
+
+```
+Extract all concrete entities from the text below. Return a JSON object with these keys,
+each mapping to a list of strings. Use exact values from the text — do not paraphrase.
+
+Entity types:
+- table_names: fully qualified table/view names (e.g. "db.schema.table")
+- column_names: column or field names referenced
+- metric_values: numeric values with units (e.g. "45.2%", "1000 rows")
+- timestamps: dates, times, or relative time references
+- job_pipeline_names: ETL job, DAG, pipeline, model, or workflow names
+- pr_commit_refs: PR numbers or commit hashes
+- severity_status: status or severity keywords
+- monitoring_types: monitoring/anomaly type names
+
+Omit empty lists. Return ONLY valid JSON, no other text.
+
+Text:
+<paste final_output_text>
+```
+
+Take both extraction results and compute Jaccard overlap per entity type yourself, then assemble:
+
+```json
+{
+  "per_type_jaccard": {"table_names": 0.83, "column_names": 1.0, ...},
+  "shared":         {"table_names": ["analytics.orders"], ...},
+  "baseline_only":  {"table_names": ["staging.orders"], ...},
+  "candidate_only": {"table_names": ["analytics.orders_v2"], ...},
+  "overall_jaccard": 0.71,
+  "baseline_facts": {...full extraction...},
+  "candidate_facts": {...full extraction...}
+}
+```
+
+Lowercase + strip-trailing-punctuation each value before set comparison (normalize like `_normalize` in PR #1236's `fact_overlap.py`).
+
+Save into `/tmp/compare-trace/llm_entities.json`.
+
+#### 4c. Corpus narrative (optional, 2-3 sentences)
+
+A single short narrative summarising the overall verdict. Save to `/tmp/compare-trace/llm_narrative.txt`. The renderer surfaces this above the per-signal tabs.
+
+**Sanity check before rendering:** If `overall_jaccard` is ~0 and `execution_time_seconds` ratio is >5x, the two conversations are likely **not** the same scenario. Say that explicitly in the narrative — don't let the report read as a clean A/B when the inputs aren't.
+
+### Phase 5: Render the report
+
+```bash
+python3 ${CLAUDE_PLUGIN_ROOT}/skills/compare-trace/scripts/compare_traces.py \
+  --baseline /tmp/compare-trace/<short_a>.json \
+  --candidate /tmp/compare-trace/<short_b>.json \
+  --semantic /tmp/compare-trace/llm_semantic.json \
+  --entities /tmp/compare-trace/llm_entities.json \
+  --narrative /tmp/compare-trace/llm_narrative.txt \
+  --output /tmp/compare-trace/<short_a>_vs_<short_b>.html
+```
+
+The `--semantic`, `--entities`, and `--narrative` flags are all optional — omit them when Phase 4 was skipped.
+
+The script opens the HTML in the user's default browser (`open` on macOS, `xdg-open` on Linux). On failure, print the file path for manual opening.
+
+### Phase 6: Report back
+
+Print a compact summary to chat with:
+- The headline number for each signal (graph similarity, tool similarity, latency assessment, semantic verdict, entity overlap).
+- The report path.
+- For each conversation: which `trace_id` was picked, how many turn_errors traces were skipped, and whether structural data came from `get_agent_trace` or the conversation-edge fallback.
+- If the MC webapp URL helps, call `get_mc_webapp_url` (no args) to get the regionalized base URL and include it — but don't fabricate deep-link paths; the conversation-URL schema isn't a documented public contract.
+
+No walls of raw JSON.
+
+---
+
+## Known limitations (v0.3.1)
+
+| Limitation | Why | Plan |
+|---|---|---|
+| **Picks one trace per conversation.** Multi-trace conversations (retries, fan-outs, multi-turn) get the largest non-error trace; the others are reported as skipped. | One-pair comparison is the simplest mental model. | **v0.4 plan:** aggregate latency/tokens across all sub-traces; show retries in a separate tab. |
+| **`get_agent_trace` "Incomplete trace" forces a shallower comparison.** When we fall back to conversation edges, `node_path` only covers LLM + tool spans (no internal workflow/task spans). | The conversation API doesn't expose the framework's nested-span hierarchy. | **v0.4 plan:** report `source: "conversation_fallback"` more prominently in the HTML and add a "graph depth" note so users understand the lower node count. |
+| **Arg-diff matches calls by name + position only.** PR #1236's `_match_tools_by_proximity` algorithm: for each tool name shared between A and B, greedy-pair calls by closest positional index. Doesn't recover when a tool's arg shape changed *and* its name is the only one shared (e.g. `get_warehouses()` called 5x in A and 3x in B — the extra 2 in A are unmatched). | Position-based greedy matching is what the original framework did. | **v0.4 idea:** add a stable-ID fallback using `tool_use_id` if present. |
+| **No "structured fields" diff (the 6th evaluator in PR #1236).** | Trace outputs are free-form text, not named-field dicts. | Stays dropped — only meaningful when you control the agent's output schema, which we don't in trace-land. |
+
+---
+
+## Heuristics and edge cases
+
+- **Empty conversations.** If `get_agent_conversation` returns zero edges for either conv_id, abort with a clear message rather than producing a misleading 0/0 report. Common causes: wrong MCON, retention expired, wrong agent_name.
+- **All-failed conversations.** If every distinct `trace_id` in a conversation appears in `turn_errors`, abort — see Phase 2 rule 2 last bullet.
+- **Single-span traces.** Graph-path diff and tool-call diff still run, just return trivial results. Don't suppress.
+- **Token counts of 0.** If both traces show 0 total tokens (some agents don't report tokens), the latency evaluator's `total_tokens` row is suppressed automatically (rows where both sides are 0 are filtered).
+- **Very large traces (>500 spans).** Truncate `node_path` and `tool_calls` to 200 entries each in the report's "full sequences" detail blocks — the diffs themselves run on the full lists.
+- **Trace ID format quirks.** `get_agent_conversation` returns dashless 32-char hex `trace_id`s; pass those verbatim to `get_agent_trace`. ClickHouse-backed MCONs accept dashed UUIDs too, but BigQuery-backed ones reject dashes (`non-hexadecimal number found`).
+- **`get_agent_conversation` responses routinely exceed the tool-output cap.** A single page of 100 edges is typically multiple MB once prompts and completions are included. The harness will spill the JSON to a file on disk and hand you a path instead of inline content — parse the file. Don't try to pipe the response into `jq` or assume it's in-message. Plan for 2–5 MB per page on busy agents.
+- **`get_agent_conversation` does not support per-`trace_id` filtering.** When a conversation has multiple traces (main + retries from `turn_errors`), pagination returns interleaved edges across all of them. You'll fetch (and pay for) edges from error traces you'll never use. Filter client-side after the fetch; don't try to scope the API call.
+- **Transient 5xx from the upstream GraphQL.** `get_agent_conversation` and `get_agent_trace` occasionally surface a `502` from `monolith-frontend` mid-pagination. Retry once with ~2–3 s backoff before aborting; a single retry resolves the vast majority of these. If a second attempt fails, stop and surface the error — don't loop.
+- **Not actually an A/B?** Per Phase 4c, watch for `overall_jaccard ≈ 0` paired with a `>5x` exec-time ratio — those usually mean the user picked two unrelated runs by mistake. Call it out in the narrative; the report stat tiles alone won't.
+
+---
+
+## Acceptance — what "done" looks like for a single invocation
+
+You've succeeded when:
+1. Both `get_agent_conversation` calls returned edges; you selected a non-error main trace for each.
+2. Both traces have structural data — either from `get_agent_trace` or the conversation-edge fallback (and you noted which).
+3. The HTML report wrote to disk.
+4. The LLM evaluator sections rendered (or are clearly absent with a "skipped: no completion text" note, not silently empty).
+5. The browser opened the report (or you printed the file:// URL for manual opening).
+6. The chat reply lists each signal's headline number, the report path, the picked trace_id per conversation, and the source (trace_api / conversation_fallback) per side. If the inputs look like a mis-paired comparison (per the Phase 4c sanity check), the chat reply says so prominently.
diff --git a/skills/compare-trace/references/PR1236_MAPPING.md b/skills/compare-trace/references/PR1236_MAPPING.md
new file mode 100644
index 00000000..befb87f0
--- /dev/null
+++ b/skills/compare-trace/references/PR1236_MAPPING.md
@@ -0,0 +1,40 @@
+# Mapping from PR #1236 to the MC trace API
+
+PR: `monte-carlo-data/ai-agent#1236` ("Agents regression - TSA ready for now").
+
+The PR builds an A/B framework that **runs the agent** under two branches and saves
+`ScenarioOutput` snapshots locally, then runs 6 evaluators + an LLM summarizer + an
+HTML renderer over the snapshot pairs. This skill backports the comparison half —
+it operates on **already-captured traces** fetched via `get_agent_trace` and
+`get_agent_conversation`, not on a fresh agent run.
+
+## ScenarioOutput → normalized trace JSON
+
+| PR field | Source in the trace API | Notes |
+|---|---|---|
+| `final_output` (dict) | Last `completions` string from `get_agent_conversation` filtered to the trace | Free text in v0.1 (not a named-field dict). |
+| `node_path: list[str]` | `nodeName` of every span in `get_agent_trace`, sorted by `startTime` | Direct fit. |
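+| `tool_calls: list[{name,args}]` | Spans with `isToolCall == true`, take `nodeName` for `name`; `args` parsed from the `tool_calls` blocks of assistant completions in `get_agent_conversation` | `args` were left empty in v0.1; argument parsing shipped in v0.3.0 (see "Evaluator parity" below). |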
+| `execution_time_seconds` | `(max(endTime) - min(startTime))` across spans | Or sum root-span `duration / 1000`. |
+| `llm_call_count` | Count of spans with `hasPrompts && hasCompletions` | Closer match than the PR's "AI-typed message count". |
+| `total_tokens` | Sum of `totalTokens` across all spans | PR's runner left this at 0; we actually populate it. |
+| `status` / `error` | Any `hasError == true` span → `has_errors: true` | Coarser than the PR's per-scenario try/except, but the agent already ran. |
+
+## Evaluator parity
+
+| PR evaluator | Trace-API parity | Status |
+|---|---|---|
+| `graph_path_diff` | Jaccard + LCS on `node_path` | ✅ identical implementation |
+| `latency_diff` | Same 4 metrics (`execution_time_seconds`, `llm_call_count`, `total_tokens`, `tool_call_count`) | ✅ identical |
+| `tool_call_diff` (names) | Levenshtein on tool-name sequences | ✅ identical |
+| `tool_call_diff` (arguments) | Phase 2 walks `get_agent_conversation` and parses `tool_calls` blocks from each assistant completion (LangChain/Bedrock `arguments` JSON-string shape verified empirically; OpenAI / Anthropic shapes parsed best-effort). `_match_tools_by_proximity` + `_compare_args` ported verbatim from PR #1236. | ✅ shipped in v0.3.0 |
+| `semantic_diff` | LLM prompt over two free-text completions instead of named fields | ⚠️ adapted — Claude runs the prompt inline, no per-field scoring |
+| `fact_overlap` | LLM extraction over two free-text completions instead of named fields | ⚠️ adapted — same prompt, just unified text |
+| `structured_field_diff` | Requires a named-field output schema from the agent | ❌ dropped — not meaningful for arbitrary traces |
+
+## Things deliberately NOT backported
+
+- **`capture` + `run_scenario`** — the PR's runner uses `graph.astream` and `_ScenarioLogCapture`. We skip the whole capture half because the user is comparing existing traces.
+- **`AgentAdapter` protocol** — agent-specific, only useful when re-running.
+- **Per-scenario corpus reporting** — we compare a single pair, not N scenarios. The HTML strips down to one card.
+- **`get_fast_smart_llm` dependency** — the LLM evals run inline as Claude prompts (Phase 4 in SKILL.md), no Python LLM call.
diff --git a/skills/compare-trace/references/local-otel-collection.md b/skills/compare-trace/references/local-otel-collection.md
new file mode 100644
index 00000000..7d9a38a3
--- /dev/null
+++ b/skills/compare-trace/references/local-otel-collection.md
@@ -0,0 +1,237 @@
+# Local OTel collection
+
+Alternative ingestion source for the compare-trace skill. Instead of pulling
+two traces from Monte Carlo's stored agent conversations, collect them
+locally from any OTel-instrumented agent and feed them into the same
+comparator and HTML report.
+
+**When to reach for this:** A/B-testing a change (prompt, code, model) before
+it ships, so there's no production conversation to point at. Run the agent
+twice locally — once with the baseline, once with the candidate — capture
+spans, compare.
+
+The main workflow in `SKILL.md` (Phases 1–6) still applies from Phase 4
+onward; this doc replaces Phases 1–3 (MC conversation walking) with three
+local steps.
+
+---
+
+## Pipeline overview
+
+```
+your agent process  ──OTLP/HTTP──▶  local_otlp_receiver.py  ──▶  *.jsonl
+                                                                    │
+                                              sources/otel_spans.py ▼
+                                                          *.normalized.json
+                                                                    │
+                                                  compare_traces.py ▼
+                                                              report.html
+```
+
+Three scripts, all under `${CLAUDE_PLUGIN_ROOT}/skills/compare-trace/scripts/`:
+
+- `local_otlp_receiver.py` — receiver. Accepts OTLP/HTTP protobuf, writes raw
+  spans as JSON-lines.
+- `sources/otel_spans.py` — normalizer. Converts raw spans into the trace
+  shape the comparator consumes.
+- `compare_traces.py` — main driver (shared with the MC-conversation path).
+
+---
+
+## Step 1: Start the receiver
+
+The receiver requires `opentelemetry-proto` (a dependency of the
+`opentelemetry-exporter-otlp-proto-*` packages, not of `opentelemetry-sdk`
+itself, so any venv whose agent already exports over OTLP has it).
+
+```bash
+python3 skills/compare-trace/scripts/local_otlp_receiver.py \
+    --output /tmp/run-a.jsonl \
+    --port 4318
+```
+
+Stays in the foreground; send `SIGINT` (Ctrl+C) when the agent run completes
+to stop and flush. Each POST is appended to the JSONL, so several runs can
+accumulate in one file — but Step 3 keeps only the dominant `trace_id` per
+file, so use one file per run when you intend to compare them.
+
+If port 4318 is busy, either pass `--port <n>` or kill the stale process —
+`lsof -i :4318` shows the holder. (The script does not bind-retry; it
+exits on `EADDRINUSE`.)
+
+---
+
+## Step 2: Configure your agent's OTel exporter
+
+The receiver speaks OTLP/HTTP at `/v1/traces` on the bound port. Any agent
+using the OpenTelemetry SDK can point at it with one env var:
+
+```bash
+export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://127.0.0.1:4318/v1/traces
+```
+
+(The SDK appends `/v1/traces` automatically when you set
+`OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318` instead — either form
+works.)
+
+If your agent uses a wrapper that takes a base URL (e.g. ai-agent's
+`MC_OTEL_ENDPOINT`), set that to `http://127.0.0.1:4318` and let the wrapper
+append the path.
+
+**Force-flush before the process exits.** `BatchSpanProcessor` buffers — if
+the agent exits abruptly, the last few spans never reach the receiver. Call
+`tracer_provider.shutdown()` (or `force_flush()`) at the end of your run
+script. See the ai-agent example in the appendix.
+
+---
+
+## Step 3: Normalize each run
+
+```bash
+python3 skills/compare-trace/scripts/sources/otel_spans.py \
+    /tmp/run-a.jsonl \
+    --output /tmp/run-a.normalized.json
+```
+
+The normalizer reads the JSONL, picks the dominant `trace_id` (handles stray
+spans landing in the same file), and produces the dict shape the comparator
+expects:
+
+```json
+{
+  "trace_id": "...",
+  "node_path": ["initialization", "react_agent", ...],
+  "tool_calls": [{"name": "...", "args": {...}, "id": "..."}, ...],
+  "execution_time_seconds": ...,
+  "llm_call_count": ...,
+  "total_tokens": ...,
+  "tool_call_count": ...,
+  "final_output_text": "..."
+}
+```
+
+Run it once per JSONL.
+
+---
+
+## Step 4: Compare
+
+Hand the two normalized files to the main driver (same call as the
+MC-conversation path):
+
+```bash
+python3 skills/compare-trace/scripts/compare_traces.py \
+    --baseline /tmp/run-a.normalized.json \
+    --candidate /tmp/run-b.normalized.json \
+    --output /tmp/report.html
+```
+
+For the optional LLM-based evaluators (`semantic_diff`, `entity_overlap`),
+follow Phase 4 of `SKILL.md` — the inputs are the same.
+
+---
+
+## Dialect coverage
+
+The normalizer reads three families of attributes, all derived from
+conventions rather than ai-agent-specific code paths:
+
+- **OTel GenAI semantic conventions** —
+  `gen_ai.prompt.*`, `gen_ai.completion.*`, `gen_ai.usage.*`. Emitted by
+  Traceloop's `LangchainInstrumentor`, OpenInference, and the official
+  OpenTelemetry GenAI instrumentations.
+- **LangGraph node spans** — `<node>.task` for each node call, `<name>.workflow`
+  for the compiled-graph root, `<tool>.tool` for tool executions. Emitted by
+  Traceloop when LangGraph is in use.
+- **Tool-call attributes** — `gen_ai.{completion,prompt}.<i>.tool_calls.<j>.{name,arguments,id}`.
+  The normalizer dedupes by `id` because Traceloop's Bedrock instrumentor
+  only emits these under `prompt.*` on the *next* LLM call (where the call
+  shows up as part of the message history), not on the completion that
+  produced them. Other instrumentations emit them on completions; both
+  forms are merged.
+
+If your agent doesn't use LangGraph or Traceloop, you'll get partial
+results — typically `node_path` will be empty and `tool_calls` may be
+missing args. Pointing the normalizer at a different dialect is a contained
+edit (it's ~200 lines); add a second module under `scripts/sources/` keyed
+to whatever attribute conventions your stack uses, and feed its output into
+`compare_traces.py` the same way.
+
+---
+
+## Appendix: ai-agent integration example
+
+Concrete glue for driving ai-agent's `coverage_agent` locally with OTel
+pointed at the receiver. The pattern transfers to other ai-agent graphs
+(chat, tsa, performance) — swap the `invoke_*` import.
+
+**Where this code should live:** in ai-agent (e.g.
+`tests/scripts/coverage_repl_with_otel.py`), not in the skill. It's
+ai-agent-specific glue.
+
+```python
+import os, sys, asyncio
+
+# 1) Env bootstrap (mirrors ai-agent's .envrc dev profile).
+os.environ.setdefault("AWS_PROFILE", "dev")
+os.environ.setdefault("ENV", "local")
+os.environ.setdefault("AUTH_MODE", "none")
+os.environ.setdefault("MONOLITH_URL", "https://cli.dev.mcinfra.io")
+os.environ.setdefault("MC_USER_ID", "d930a36b-ee0b-4200-9f7b-fcc62cbbd645")
+os.environ.setdefault("LANGSMITH_TRACING", "false")
+os.environ["MC_OTEL_ENDPOINT"] = "http://127.0.0.1:4318"
+# IMPORTANT: do NOT set MCP_SERVER_URL. With it unset and no signing key
+# resolved, get_mcp_tools() falls through to load_mcp_tools_in_process,
+# which wraps the local mcp_server package and avoids the lambda-URL 403.
+
+# 2) OTel setup before importing the graph.
+from opentelemetry import trace as otel_trace
+from opentelemetry.instrumentation.langchain import LangchainInstrumentor
+from opentelemetry.sdk.trace import TracerProvider
+from ai_agent.shared.observability import setup_otel_tracing
+
+setup_otel_tracing(
+    agent_name="coverage_agent",
+    instrumentors=[LangchainInstrumentor()],
+)
+
+# 3) Optional: monkeypatch a system prompt for A/B testing. Works because
+# nodes/initialization.py reads coverage_system_prompt at run time, not
+# at module-load time. Won't work for prompts that get baked in at
+# graph-compile time — those need a real file edit (or git worktree).
+from ai_agent.coverage_agent import prompts as _p
+_p.coverage_system_prompt = "<your alternate prompt>"
+
+# 4) Invoke. tests_evals/.../coverage_agent/conftest.py provides
+# invoke_coverage_agent() which compiles the graph with MemorySaver and
+# returns a structured result.
+sys.path.insert(0, "/path/to/ai-agent")
+from tests_evals.ai_agent.coverage_agent.conftest import invoke_coverage_agent
+
+async def run():
+    result = await invoke_coverage_agent(
+        user_message="what is my coverage gap?",
+        sql_permission="ALLOW_SESSION",
+    )
+    print(result.output[:500])
+    # 5) Force-flush spans before exit.
+    provider = otel_trace.get_tracer_provider()
+    if isinstance(provider, TracerProvider):
+        provider.shutdown()
+
+asyncio.run(run())
+```
+
+**Known gotchas:**
+
+- `load_mcp_tools_in_process()` imports `mcp_server.beacon` which requires
+  `slowapi`. If it's missing from the ai-agent venv, MCP tool loading
+  silently returns `[]` (the loader has a bare `except`). Install it with
+  `uv pip install slowapi` (pulls in `limits` and `deprecated`).
+- The `tests_evals` coverage conftest compiles the graph at import time. If
+  you need to A/B-test changes that affect graph structure (not just prompt
+  text), use a git worktree per branch, not in-process monkeypatching.
+- The `MCP_SERVER_URL` unset trick depends on
+  `ai_agent.shared.mcp.tools.get_mcp_tools` checking `is_local_mcp_available()`
+  before falling back to the HTTP path. If that branch is ever removed,
+  local runs will need an explicit override.
diff --git a/skills/compare-trace/scripts/compare_traces.py b/skills/compare-trace/scripts/compare_traces.py
new file mode 100755
index 00000000..da7476be
--- /dev/null
+++ b/skills/compare-trace/scripts/compare_traces.py
@@ -0,0 +1,655 @@
+#!/usr/bin/env python3
+"""Driver for the compare-trace skill.
+
+Takes two normalized trace JSON files (baseline + candidate), runs the three
+deterministic evaluators, optionally folds in LLM-eval results that Claude
+ran inline, and writes a single-pair HTML report. Opens the report in the
+default browser unless ``--no-open`` is passed.
+
+Normalized trace JSON shape::
+
+    {
+      "trace_id": "<hex>",
+      "label": "baseline|candidate|...",
+      "node_path": ["node_a", "node_b", ...],
+      "tool_calls": [{"name": "<tool>", "args": {}}, ...],
+      "execution_time_seconds": 12.34,
+      "llm_call_count": 5,
+      "total_tokens": 1234,
+      "prompt_tokens": 900,
+      "completion_tokens": 334,
+      "has_errors": false,
+      "final_output_text": ""
+    }
+
+LLM-eval JSON shapes are documented in SKILL.md (Phase 4).
+"""
+
+from __future__ import annotations
+
+import argparse
+import html
+import json
+import platform
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(SCRIPT_DIR))
+
+from evaluators.graph_path_diff import compare_graph_paths  # noqa: E402
+from evaluators.latency_diff import compare_latency  # noqa: E402
+from evaluators.tool_call_diff import compare_tool_calls  # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# I/O
+# ---------------------------------------------------------------------------
+
+
+def _load_json(path: Path) -> dict[str, Any]:
+    """Parse the JSON document at ``path``, decoding it as UTF-8 regardless of locale."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _open_in_browser(path: Path) -> None:
+    """Best-effort: launch ``path`` in the default browser; opener failures are swallowed."""
+    # as_uri() percent-encodes spaces/specials and yields file:///C:/... on Windows.
+    url = path.resolve().as_uri()
+    # cmd's `start` takes a first quoted argument as the window title, hence the "".
+    openers = {"Darwin": ["open"], "Linux": ["xdg-open"], "Windows": ["cmd", "/c", "start", ""]}
+    try:
+        opener = openers.get(platform.system())
+        if opener is not None:
+            subprocess.run([*opener, url], check=False)
+    except OSError:
+        # Opener missing or not executable; caller prints the path instead.
+        pass
+
+
+# ---------------------------------------------------------------------------
+# Tab renderers (single-pair flavor, ported from PR #1236's html_renderer.py)
+# ---------------------------------------------------------------------------
+
+
+def _render_graph_tab(g) -> str:
+    """Render the graph-path tab: similarity scores, node-set diff table, full paths."""
+    # (opening label cell, label, node names) in display order; empty groups add no row.
+    categories = (
+        ('<td class="removed">', "Baseline only", g.baseline_only_nodes),
+        ('<td class="added">', "Candidate only", g.candidate_only_nodes),
+        ("<td>", "Shared", g.shared_nodes),
+    )
+    rows = "".join(
+        f'<tr>{cell}{label}</td><td>{html.escape(", ".join(nodes))}</td></tr>'
+        for cell, label, nodes in categories
+        if nodes
+    )
+
+    # The detail blocks list at most 200 nodes per side; headings report full lengths.
+    base_dump = html.escape(chr(10).join(g.baseline_path[:200]) or "(empty)")
+    cand_dump = html.escape(chr(10).join(g.candidate_path[:200]) or "(empty)")
+    return f"""
+        <div class="signal-summary">
+          <span>Jaccard (node set): <strong>{g.jaccard_similarity:.2f}</strong></span>
+          <span>Ordering (LCS): <strong>{g.ordering_similarity:.2f}</strong></span>
+          <span>Overall: <strong>{g.overall_similarity:.2f}</strong></span>
+        </div>
+        <table class="field-table"><thead><tr><th>Category</th><th>Nodes</th></tr></thead>
+          <tbody>{rows or "<tr><td colspan='2'>Identical paths</td></tr>"}</tbody></table>
+        <details style="margin-top:8px"><summary style="cursor:pointer;font-size:0.85rem;color:#666">Full paths</summary>
+          <div class="raw-columns">
+            <div class="raw-col"><h4>Baseline ({len(g.baseline_path)} nodes)</h4><pre>{base_dump}</pre></div>
+            <div class="raw-col"><h4>Candidate ({len(g.candidate_path)} nodes)</h4><pre>{cand_dump}</pre></div>
+          </div>
+        </details>"""
+
+
+def _render_latency_tab(lat) -> str:
+    """Render the latency tab: one table row per metric plus the overall assessment."""
+    row_chunks: list[str] = []
+    for metric in lat.metrics:
+        # A metric that is zero on both sides carries no signal; leave its row out.
+        if metric.baseline_value == 0 and metric.candidate_value == 0:
+            continue
+        if metric.is_regression:
+            row_css, badge = "field-changed", '<span class="badge-regressed">regressed</span>'
+        else:
+            row_css, badge = "field-unchanged", ""
+        ratio_str = "inf" if metric.ratio == float("inf") else f"{metric.ratio:.2f}x"
+        row_chunks.append(f"""
+          <tr class="{row_css}">
+            <td>{html.escape(metric.field_name)}</td>
+            <td>{metric.baseline_value:.1f}</td>
+            <td>{metric.candidate_value:.1f}</td>
+            <td>{ratio_str}</td>
+            <td>{badge}</td>
+          </tr>""")
+    rows = "".join(row_chunks)
+
+    assessment = lat.overall_assessment
+    assessment_css = {"regressed": "red", "improved": "green"}.get(assessment, "")
+    return f"""
+        <div class="signal-summary">
+          <span>Assessment: <strong class="{assessment_css}">{assessment.upper()}</strong></span>
+        </div>
+        <table class="field-table">
+          <thead><tr><th>Metric</th><th>Baseline</th><th>Candidate</th><th>Ratio</th><th></th></tr></thead>
+          <tbody>{rows or "<tr><td colspan='5'>No metrics</td></tr>"}</tbody>
+        </table>"""
+
+
+_VALUE_TRUNCATE = 80
+
+
+def _format_arg_value(value: Any) -> tuple[str, str]:
+    """Stringify an argument value for display; returns ``(shortened, complete)``.
+
+    Dicts and lists are serialized as key-sorted JSON, ``None`` and booleans use
+    their JSON spellings, and anything else goes through ``str``. The shortened
+    form is cut at ``_VALUE_TRUNCATE`` characters and given a trailing ellipsis.
+    """
+    if value is None:
+        text = "null"
+    elif isinstance(value, bool):
+        text = "true" if value else "false"
+    elif isinstance(value, (dict, list)):
+        text = json.dumps(value, sort_keys=True, default=str)
+    else:
+        text = str(value)
+    shortened = text if len(text) <= _VALUE_TRUNCATE else text[:_VALUE_TRUNCATE] + "…"
+    return shortened, text
+
+
+def _render_arg_diff_lines(ac: dict[str, Any]) -> str:
+    """Render one matched call's argument diff as an HTML fragment.
+
+    ``ac`` supplies ``added_keys`` / ``removed_keys`` / ``changed_keys`` plus the
+    parallel ``added_values`` / ``removed_values`` / ``changed_values`` maps;
+    each ``changed_values`` entry holds ``baseline`` and ``candidate``. Missing
+    or ``None`` entries (including a ``None`` pair) are treated as empty.
+    Values are shown truncated inline; if any rendered value was cut, a
+    collapsed ``<details>`` block carrying the untruncated values is appended.
+    All keys and values are HTML-escaped.
+    """
+    inline_lines: list[str] = []
+    full_lines: list[str] = []
+    any_truncated = False
+
+    def _fmt(value: Any) -> tuple[str, str]:
+        # Format each value once and note truncation as it is rendered, so the
+        # "show full values" toggle appears only when a displayed value was cut.
+        nonlocal any_truncated
+        trunc, full = _format_arg_value(value)
+        any_truncated = any_truncated or trunc != full
+        return trunc, full
+
+    # Added and removed keys share one row shape: (css class, sign, keys, values).
+    one_sided = (
+        ("added", "+", ac.get("added_keys"), ac.get("added_values")),
+        ("removed", "-", ac.get("removed_keys"), ac.get("removed_values")),
+    )
+    for css, sign, keys, values in one_sided:
+        for k in keys or []:
+            trunc, full = _fmt((values or {}).get(k, ""))
+            inline_lines.append(
+                f'<div class="{css}">{sign} <code>{html.escape(k)}</code>: '
+                f'<code>{html.escape(trunc)}</code></div>'
+            )
+            full_lines.append(f'{sign} {k}: {full}')
+
+    # Changed keys show baseline → candidate; a None pair renders as empty values.
+    changed_values = ac.get("changed_values") or {}
+    for k in ac.get("changed_keys") or []:
+        pair = changed_values.get(k) or {}
+        b_trunc, b_full = _fmt(pair.get("baseline", ""))
+        c_trunc, c_full = _fmt(pair.get("candidate", ""))
+        inline_lines.append(
+            f'<div>Δ <code>{html.escape(k)}</code>: '
+            f'<code class="removed">{html.escape(b_trunc)}</code> → '
+            f'<code class="added">{html.escape(c_trunc)}</code></div>'
+        )
+        full_lines.append(f'Δ {k}:\n  baseline:  {b_full}\n  candidate: {c_full}')
+
+    inline_html = "".join(inline_lines)
+    if any_truncated:
+        full_block = html.escape("\n".join(full_lines))
+        inline_html += (
+            '<details style="margin-top:4px"><summary '
+            'style="cursor:pointer;font-size:0.72rem;color:#666">▸ show full values</summary>'
+            f'<pre style="margin:4px 0;padding:8px;background:#f8f9fa;border-radius:4px;'
+            f'font-size:0.75rem;white-space:pre-wrap;word-break:break-word">{full_block}</pre>'
+            "</details>"
+        )
+    return inline_html
+
+
+def _render_tool_call_tab(t) -> str:
+    """Render the tool-call tab: sequence stats, tool-set diff, per-call argument diffs."""
+    # (label cell, tool names) in display order; empty groups add no row.
+    groups = (
+        ('<td class="added">+ Added tools</td>', t.added),
+        ('<td class="removed">- Removed tools</td>', t.removed),
+        ("<td>Shared tools</td>", t.shared),
+    )
+    rows = "".join(
+        f'<tr>{label_cell}<td>{html.escape(", ".join(tools))}</td></tr>'
+        for label_cell, tools in groups
+        if tools
+    )
+
+    arg_row_parts: list[str] = []
+    for change in t.argument_changes:
+        diff_html = _render_arg_diff_lines(change)
+        baseline_pos = change.get("position_baseline", "?")
+        candidate_pos = change.get("position_candidate", "?")
+        arg_row_parts.append(
+            f'<tr><td><code>{html.escape(change.get("tool_name", ""))}</code></td>'
+            '<td style="color:#666;font-size:0.78rem;white-space:nowrap">'
+            f'#{baseline_pos} → #{candidate_pos}</td>'
+            f'<td>{diff_html}</td></tr>'
+        )
+    arg_rows = "".join(arg_row_parts)
+    if arg_rows:
+        arg_section = (
+            '<h4 style="margin-top:14px;font-size:0.95rem">Argument changes (matched calls)</h4>'
+            '<table class="field-table">'
+            '<thead><tr><th>Tool</th><th>Positions</th><th>Diff</th></tr></thead>'
+            f'<tbody>{arg_rows}</tbody></table>'
+        )
+    else:
+        # No per-call diff rows were produced; show the explanatory banner instead.
+        arg_section = (
+            '<div style="margin-top:8px;padding:8px 12px;background:#f0fdf4;'
+            'border-radius:4px;font-size:0.8rem;color:#166534">'
+            "No argument-level changes for matched calls "
+            "(or tool_calls were captured without args)."
+            "</div>"
+        )
+
+    return f"""
+        <div class="signal-summary">
+          <span>Edit distance: <strong>{t.edit_distance}</strong></span>
+          <span>Similarity: <strong>{t.similarity:.2f}</strong></span>
+          <span>Baseline: {len(t.baseline_tools)} calls</span>
+          <span>Candidate: {len(t.candidate_tools)} calls</span>
+          <span>Arg-diff matches: <strong>{len(t.argument_changes)}</strong></span>
+        </div>
+        <table class="field-table"><thead><tr><th>Change</th><th>Tools</th></tr></thead>
+          <tbody>{rows or "<tr><td colspan='2'>Identical tool sequences</td></tr>"}</tbody></table>
+        {arg_section}
+        <details style="margin-top:8px"><summary style="cursor:pointer;font-size:0.85rem;color:#666">Full sequences</summary>
+          <div class="raw-columns">
+            <div class="raw-col"><h4>Baseline</h4><pre>{html.escape(chr(10).join(t.baseline_tools[:200]) or "(empty)")}</pre></div>
+            <div class="raw-col"><h4>Candidate</h4><pre>{html.escape(chr(10).join(t.candidate_tools[:200]) or "(empty)")}</pre></div>
+          </div>
+        </details>"""
+
+
+def _render_semantic_tab(s: dict | None, has_completions: bool) -> str:
+    """Render the semantic-diff tab from the LLM-eval result ``s``.
+
+    ``s is None`` produces a "Skipped" note worded by ``has_completions``.
+    """
+    if s is None:
+        msg = (
+            "Skipped — no final-completion text available for one or both traces. "
+            "Pass <code>--conversation-ids</code> when invoking the skill, or re-run "
+            "with the conversation IDs from the MC UI."
+        ) if not has_completions else (
+            "Skipped — Claude did not run the inline semantic diff for this comparison."
+        )
+        return f'<p style="color:#666;font-size:0.9rem">{msg}</p>'
+
+    # ``s`` is LLM-authored JSON: fields may be null or contain markup, so coerce
+    # before formatting and escape everything interpolated into the HTML.
+    verdict = str(s.get("verdict") or "unknown")
+    verdict_css = {
+        "regression": "red", "mixed": "orange", "improvement": "green", "preserved": "green",
+    }.get(verdict, "")
+    score = float(s.get("similarity_score") or 0.0)
+
+    lost = s.get("lost_findings") or []
+    added = s.get("added_findings") or []
+
+    def _bullets(items: list[str], css: str) -> str:
+        if not items:
+            return ""
+        shown = "".join(
+            f'<li class="{css}" style="margin:2px 0">{html.escape(str(i))}</li>' for i in items[:10]
+        )
+        more = "<li>…</li>" if len(items) > 10 else ""
+        return f'<ul style="margin:4px 0 0 16px;padding:0">{shown}{more}</ul>'
+
+    return f"""
+        <div class="signal-summary">
+          <span>Overall verdict: <strong class="{verdict_css}">{html.escape(verdict.upper())}</strong></span>
+          <span>Semantic similarity: <strong>{score:.2f}</strong></span>
+        </div>
+        <div style="background:#f8f9fa;padding:8px 12px;border-radius:4px;font-size:0.9rem;margin-bottom:12px">
+          {html.escape(str(s.get("explanation") or ""))}
+        </div>
+        <div style="display:grid;grid-template-columns:1fr 1fr;gap:16px">
+          <div><strong class="removed">Lost in candidate ({len(lost)})</strong>{_bullets(lost, "removed")}</div>
+          <div><strong class="added">Added in candidate ({len(added)})</strong>{_bullets(added, "added")}</div>
+        </div>"""
+
+
+def _render_entities_tab(f: dict | None, has_completions: bool) -> str:
+    """Render the "Entity Overlap" tab from the optional entity-overlap JSON ``f``.
+    ``None`` yields a "Skipped" note; null scores render as 0.00; entity text is escaped."""
+    if f is None:
+        if not has_completions:
+            msg = "Skipped — no final-completion text available for one or both traces."
+        else:
+            msg = "Skipped — Claude did not run the inline entity overlap for this comparison."
+        return f'<p style="color:#666;font-size:0.9rem">{msg}</p>'
+
+    per_type = f.get("per_type_jaccard", {}) or {}
+    shared = f.get("shared", {}) or {}
+    b_only = f.get("baseline_only", {}) or {}
+    c_only = f.get("candidate_only", {}) or {}
+
+    def _chips(items: list[str], css: str, limit: int = 5) -> str:
+        if not items:
+            return ""
+        tags = " ".join(
+            f'<code class="{css}">{html.escape(str(i)[:60])}</code>' for i in items[:limit]
+        )
+        suffix = f" +{len(items) - limit} more" if len(items) > limit else ""
+        return tags + suffix
+
+    rows = ""
+    for entity_type in sorted(per_type):
+        jaccard = float(per_type.get(entity_type) or 0.0)  # ``or`` guards a JSON null score
+        css = "field-changed" if jaccard < 1.0 else "field-unchanged"
+        details = ""
+        for label, chip_css, bucket in (
+            ("Shared", "entity-shared", shared),
+            ("Baseline only", "entity-removed", b_only),
+            ("Candidate only", "entity-added", c_only),
+        ):
+            items = bucket.get(entity_type, []) or []
+            if items:
+                details += f'<div><strong>{label} ({len(items)}):</strong> {_chips(items, chip_css)}</div>'
+        rows += f"""
+          <tr class="{css}">
+            <td>{html.escape(entity_type)}</td>
+            <td>{jaccard:.2f}</td>
+            <td>{details or "-"}</td>
+          </tr>"""
+
+    overall = float(f.get("overall_jaccard") or 0.0)  # null-safe, like the per-type scores
+    return f"""
+        <div class="signal-summary">
+          <span>Overall entity overlap: <strong>{overall:.2f}</strong></span>
+        </div>
+        <table class="field-table">
+          <thead><tr><th>Entity Type</th><th>Jaccard</th><th>Details</th></tr></thead>
+          <tbody>{rows or "<tr><td colspan='3'>No entities extracted</td></tr>"}</tbody>
+        </table>"""
+
+
+# ---------------------------------------------------------------------------
+# Top-level renderer
+# ---------------------------------------------------------------------------
+
+
+def render_html(
+    baseline: dict,
+    candidate: dict,
+    graph,
+    latency,
+    tools,
+    semantic: dict | None,
+    entities: dict | None,
+    narrative: str,
+    output_path: Path,
+) -> Path:
+    """Build the one-page HTML report, write it as UTF-8, and return ``output_path``.
+
+    ``graph``/``latency``/``tools`` are evaluator results; ``semantic``/``entities`` may be
+    ``None``, which omits their summary stat. Missing parent directories are created."""
+    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+    baseline_label = baseline.get("label") or "baseline"
+    candidate_label = candidate.get("label") or "candidate"
+    baseline_id = str(baseline.get("trace_id") or "")  # tolerate a null / non-string id
+    candidate_id = str(candidate.get("trace_id") or "")
+
+    has_completions = bool(
+        baseline.get("final_output_text") and candidate.get("final_output_text")
+    )
+
+    semantic_section = _render_semantic_tab(semantic, has_completions)
+    graph_section = _render_graph_tab(graph)
+    latency_section = _render_latency_tab(latency)
+    entities_section = _render_entities_tab(entities, has_completions)
+    tools_section = _render_tool_call_tab(tools)
+
+    n_regressed = sum(1 for m in latency.metrics if m.is_regression)
+    latency_color = "red" if n_regressed > 0 else "green"
+    avg_semantic = float(semantic.get("similarity_score") or 0.0) if semantic else 0.0  # null-safe
+    avg_entities = float(entities.get("overall_jaccard") or 0.0) if entities else 0.0  # null-safe
+
+    semantic_stat = (
+        f'<div class="stat"><div class="stat-value">{avg_semantic:.2f}</div>'
+        f'<div class="stat-label">Semantic Similarity</div></div>'
+        if semantic else ""
+    )
+    entities_stat = (
+        f'<div class="stat"><div class="stat-value">{avg_entities:.2f}</div>'
+        f'<div class="stat-label">Entity Overlap</div></div>'
+        if entities else ""
+    )
+
+    baseline_raw_pre = html.escape(
+        (baseline.get("final_output_text") or "(no completion text)")[:8000]  # cap raw text shown
+    )
+    candidate_raw_pre = html.escape(
+        (candidate.get("final_output_text") or "(no completion text)")[:8000]
+    )
+
+    body = f"""
+<h1>Trace Comparison</h1>
+<div class="meta">
+  Generated {timestamp} | <strong>{html.escape(baseline_label)}</strong>
+  (<code>{html.escape(baseline_id)}</code>)
+  vs <strong>{html.escape(candidate_label)}</strong>
+  (<code>{html.escape(candidate_id)}</code>)
+</div>
+
+<div class="corpus-summary">
+  <h2>Summary</h2>
+  <div class="stats">
+    <div class="stat">
+      <div class="stat-value">{graph.overall_similarity:.2f}</div>
+      <div class="stat-label">Graph Similarity</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value">{tools.similarity:.2f}</div>
+      <div class="stat-label">Tool Similarity</div>
+    </div>
+    <div class="stat">
+      <div class="stat-value {latency_color}">{n_regressed}</div>
+      <div class="stat-label">Latency Regressed</div>
+    </div>
+    {semantic_stat}
+    {entities_stat}
+  </div>
+  <div class="llm-narrative">{html.escape(narrative or "(no narrative provided)")}</div>
+</div>
+
+<div class="scenario-card expanded">
+  <div class="scenario-body">
+    <div class="tabs">
+      <div class="tab-buttons">
+        <button class="tab-btn active" onclick="switchTab(this, 'card', 'semantic')">Semantic Diff</button>
+        <button class="tab-btn" onclick="switchTab(this, 'card', 'graph')">Graph Path</button>
+        <button class="tab-btn" onclick="switchTab(this, 'card', 'latency')">Latency &amp; Tokens</button>
+        <button class="tab-btn" onclick="switchTab(this, 'card', 'entities')">Entity Overlap</button>
+        <button class="tab-btn" onclick="switchTab(this, 'card', 'tools')">Tool Calls</button>
+      </div>
+      <div class="tab-content" id="card-semantic">{semantic_section}</div>
+      <div class="tab-content" id="card-graph" style="display:none">{graph_section}</div>
+      <div class="tab-content" id="card-latency" style="display:none">{latency_section}</div>
+      <div class="tab-content" id="card-entities" style="display:none">{entities_section}</div>
+      <div class="tab-content" id="card-tools" style="display:none">{tools_section}</div>
+    </div>
+    <details class="raw-data">
+      <summary>Final completion text (both traces)</summary>
+      <div class="raw-columns">
+        <div class="raw-col"><h4>Baseline</h4><pre>{baseline_raw_pre}</pre></div>
+        <div class="raw-col"><h4>Candidate</h4><pre>{candidate_raw_pre}</pre></div>
+      </div>
+    </details>
+  </div>
+</div>
+"""
+
+    html_doc = _TEMPLATE.replace("{{BODY}}", body)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(html_doc, encoding="utf-8")  # template declares charset=utf-8
+    return output_path
+
+
+_TEMPLATE = """\
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>Monte Carlo Trace Comparison</title>
+<style>
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
+         background: #f5f5f5; color: #333; padding: 24px; max-width: 1200px; margin: 0 auto; }
+  h1 { font-size: 1.5rem; margin-bottom: 4px; }
+  .meta { color: #666; font-size: 0.85rem; margin-bottom: 20px; }
+  .meta code { background: #eef; padding: 1px 6px; border-radius: 4px; font-size: 0.78rem; }
+  .corpus-summary { background: #fff; border: 1px solid #ddd; border-radius: 8px;
+                    padding: 20px; margin-bottom: 24px; }
+  .corpus-summary h2 { font-size: 1.1rem; margin-bottom: 12px; }
+  .stats { display: flex; gap: 24px; margin-bottom: 16px; flex-wrap: wrap; }
+  .stat { text-align: center; min-width: 110px; }
+  .stat-value { font-size: 1.8rem; font-weight: 700; }
+  .stat-label { font-size: 0.7rem; color: #666; text-transform: uppercase; }
+  .stat-value.green { color: #16a34a; }
+  .stat-value.orange { color: #ea580c; }
+  .stat-value.red { color: #dc2626; }
+  .llm-narrative { background: #f8f9fa; padding: 12px; border-radius: 4px;
+                   font-size: 0.9rem; line-height: 1.5; white-space: pre-wrap; }
+  .scenario-card { background: #fff; border: 1px solid #ddd; border-radius: 8px;
+                   margin-bottom: 12px; overflow: hidden; }
+  .scenario-body { padding: 20px; }
+  .tabs { margin-bottom: 16px; }
+  .tab-buttons { display: flex; gap: 4px; margin-bottom: 12px; border-bottom: 2px solid #eee; }
+  .tab-btn { padding: 8px 16px; border: none; background: none; cursor: pointer;
+             font-size: 0.85rem; color: #666; border-bottom: 2px solid transparent;
+             margin-bottom: -2px; transition: all 0.2s; }
+  .tab-btn:hover { color: #333; }
+  .tab-btn.active { color: #2563eb; border-bottom-color: #2563eb; font-weight: 600; }
+  .tab-content { min-height: 80px; }
+  .signal-summary { display: flex; gap: 20px; padding: 8px 0 12px; font-size: 0.85rem;
+                    color: #555; flex-wrap: wrap; }
+  .signal-summary strong { color: #333; }
+  .field-table { width: 100%; border-collapse: collapse; font-size: 0.85rem; }
+  .field-table th { text-align: left; padding: 8px; border-bottom: 2px solid #ddd; font-weight: 600; }
+  .field-table td { padding: 8px; border-bottom: 1px solid #eee; vertical-align: top; }
+  .field-unchanged td { color: #999; }
+  .field-changed td { background: #fffbeb; }
+  .badge-regressed { background: #fee2e2; color: #991b1b; padding: 1px 6px;
+                     border-radius: 6px; font-size: 0.75rem; }
+  .added { color: #16a34a; }
+  .removed { color: #dc2626; }
+  code.entity-shared { background: #f0fdf4; color: #166534; padding: 1px 5px;
+                       border-radius: 4px; font-size: 0.8rem; margin: 1px; display: inline-block; }
+  code.entity-removed { background: #fef2f2; color: #991b1b; padding: 1px 5px;
+                        border-radius: 4px; font-size: 0.8rem; margin: 1px; display: inline-block; }
+  code.entity-added { background: #eff6ff; color: #1e40af; padding: 1px 5px;
+                      border-radius: 4px; font-size: 0.8rem; margin: 1px; display: inline-block; }
+  .green { color: #16a34a; }
+  .red { color: #dc2626; }
+  .orange { color: #ea580c; }
+  .raw-data { margin-top: 16px; }
+  .raw-data summary { cursor: pointer; font-size: 0.85rem; color: #666; padding: 8px 0; }
+  .raw-columns { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
+  .raw-col h4 { font-size: 0.8rem; margin-bottom: 8px; color: #666; }
+  .raw-col pre { background: #f8f9fa; padding: 12px; border-radius: 4px;
+                 font-size: 0.75rem; overflow-x: auto; max-height: 400px;
+                 overflow-y: auto; white-space: pre-wrap; word-break: break-word; }
+</style>
+<script>
+function switchTab(btn, cardId, tabName) {
+  const card = btn.closest('.scenario-card');
+  card.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
+  card.querySelectorAll('.tab-content').forEach(c => c.style.display = 'none');
+  btn.classList.add('active');
+  document.getElementById(cardId + '-' + tabName).style.display = 'block';
+}
+</script>
+</head>
+<body>
+{{BODY}}
+</body>
+</html>
+"""  # page shell; render_html fills the literal {{BODY}} marker via str.replace (not str.format)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point: load both traces, run the three diffs, and render the HTML report.
+    Optional --semantic/--entities/--narrative files are skipped when absent. Returns 0."""
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    parser.add_argument("--baseline", required=True, type=Path, help="Baseline trace JSON path")
+    parser.add_argument("--candidate", required=True, type=Path, help="Candidate trace JSON path")
+    parser.add_argument("--semantic", type=Path, help="Optional semantic-diff JSON")
+    parser.add_argument("--entities", type=Path, help="Optional entity-overlap JSON")
+    parser.add_argument("--narrative", type=Path, help="Optional plaintext corpus narrative")
+    parser.add_argument("--output", required=True, type=Path, help="HTML output path")
+    parser.add_argument("--no-open", action="store_true", help="Do not open the report in a browser")
+    args = parser.parse_args(argv)
+
+    baseline = _load_json(args.baseline)
+    candidate = _load_json(args.candidate)
+
+    graph = compare_graph_paths(
+        baseline.get("node_path", []) or [],
+        candidate.get("node_path", []) or [],
+    )
+    latency = compare_latency(baseline, candidate)
+    tools = compare_tool_calls(
+        baseline.get("tool_calls", []) or [],
+        candidate.get("tool_calls", []) or [],
+    )
+
+    semantic = _load_json(args.semantic) if args.semantic and args.semantic.exists() else None
+    entities = _load_json(args.entities) if args.entities and args.entities.exists() else None
+    narrative = ""
+    if args.narrative and args.narrative.exists():
+        narrative = args.narrative.read_text(encoding="utf-8").strip()  # not the locale default
+
+    output = render_html(
+        baseline=baseline,
+        candidate=candidate,
+        graph=graph,
+        latency=latency,
+        tools=tools,
+        semantic=semantic,
+        entities=entities,
+        narrative=narrative,
+        output_path=args.output,
+    )
+    print(f"Wrote report: {output}")
+    if not args.no_open:
+        _open_in_browser(output)
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the exit status
+    raise SystemExit(main())
diff --git a/skills/compare-trace/scripts/evaluators/__init__.py b/skills/compare-trace/scripts/evaluators/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/skills/compare-trace/scripts/evaluators/graph_path_diff.py b/skills/compare-trace/scripts/evaluators/graph_path_diff.py
new file mode 100644
index 00000000..90b16597
--- /dev/null
+++ b/skills/compare-trace/scripts/evaluators/graph_path_diff.py
@@ -0,0 +1,70 @@
+"""Graph path diff for two agent runs.
+
+Jaccard on visited node sets + LCS/max for ordering similarity. Neither side
+is ground truth. Ported from monte-carlo-data/ai-agent#1236.
+"""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from typing import Any
+
+
+@dataclass
+class GraphPathDiff:  # Result of comparing the node paths of two runs (built by compare_graph_paths).
+    baseline_path: list[str]  # baseline node names, in the order given
+    candidate_path: list[str]  # candidate node names, in the order given
+    baseline_only_nodes: list[str] = field(default_factory=list)  # sorted; visited only by baseline
+    candidate_only_nodes: list[str] = field(default_factory=list)  # sorted; visited only by candidate
+    shared_nodes: list[str] = field(default_factory=list)  # sorted; visited by both
+    jaccard_similarity: float = 1.0  # |shared| / |union| of the visited-node sets
+    ordering_similarity: float = 1.0  # LCS length / length of the longer path
+    overall_similarity: float = 1.0  # mean of the two scores above
+
+    def to_dict(self) -> dict[str, Any]:  # plain-dict copy of every field
+        return asdict(self)
+
+
+def _lcs_length(a: list[str], b: list[str]) -> int:
+    """Length of the longest common subsequence of ``a`` and ``b``.
+
+    Row-by-row dynamic programme that keeps only the previous row, so the
+    working memory is proportional to ``len(b)``.
+    """
+    if not a or not b:
+        return 0
+    row = [0] * (len(b) + 1)
+    for item_a in a:
+        above, row = row, [0] * (len(b) + 1)
+        for j, item_b in enumerate(b, start=1):
+            row[j] = above[j - 1] + 1 if item_a == item_b else max(above[j], row[j - 1])
+    return row[len(b)]
+
+
+def compare_graph_paths(
+    baseline_path: list[str],
+    candidate_path: list[str],
+) -> GraphPathDiff:
+    """Score how closely two node paths agree, treating neither as ground truth.
+
+    Jaccard is computed over the sets of visited nodes; ordering similarity is
+    the LCS length divided by the longer path's length. Two empty paths score
+    1.0 on both, and the overall score is the mean of the two.
+    """
+    nodes_b, nodes_c = set(baseline_path), set(candidate_path)
+    common = nodes_b & nodes_c
+    everything = nodes_b | nodes_c
+    longest = max(len(baseline_path), len(candidate_path))
+    jaccard = 1.0 if not everything else len(common) / len(everything)
+    ordering = 1.0 if not longest else _lcs_length(baseline_path, candidate_path) / longest
+
+    return GraphPathDiff(
+        baseline_path=baseline_path,
+        candidate_path=candidate_path,
+        baseline_only_nodes=sorted(nodes_b - nodes_c),
+        candidate_only_nodes=sorted(nodes_c - nodes_b),
+        shared_nodes=sorted(common),
+        jaccard_similarity=jaccard,
+        ordering_similarity=ordering,
+        overall_similarity=(jaccard + ordering) / 2.0,
+    )
diff --git a/skills/compare-trace/scripts/evaluators/latency_diff.py b/skills/compare-trace/scripts/evaluators/latency_diff.py
new file mode 100644
index 00000000..5e867a20
--- /dev/null
+++ b/skills/compare-trace/scripts/evaluators/latency_diff.py
@@ -0,0 +1,98 @@
+"""Latency and resource-usage diff for two agent runs.
+
+Compares execution time, LLM call count, total tokens, tool call count.
+Reports per-metric ratios; flags regressions above ``regression_threshold``.
+Ported from monte-carlo-data/ai-agent#1236.
+"""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from typing import Any
+
+
+@dataclass
+class MetricComparison:
+    """One metric compared across the two runs.
+
+    ``ratio`` is candidate / baseline and may be ``float("inf")``; ``to_dict``
+    swaps that for the string ``"inf"``.
+    """
+
+    field_name: str
+    baseline_value: float
+    candidate_value: float
+    ratio: float
+    is_regression: bool
+
+    def to_dict(self) -> dict[str, Any]:
+        return {**asdict(self), "ratio": "inf" if self.ratio == float("inf") else self.ratio}
+
+
+@dataclass
+class LatencyDiffResult:
+    """Per-metric comparisons plus a verdict derived from them.
+
+    ``regressions`` and ``overall_assessment`` are always recomputed from
+    ``metrics`` after construction, overriding any values passed in.
+    """
+
+    metrics: list[MetricComparison]
+    overall_assessment: str = "neutral"
+    regressions: list[str] = field(default_factory=list)
+
+    def __post_init__(self):
+        self.regressions = [m.field_name for m in self.metrics if m.is_regression]
+        fallback = "improved" if any(m.ratio < 1.0 for m in self.metrics) else "neutral"
+        self.overall_assessment = "regressed" if self.regressions else fallback
+
+    def to_dict(self) -> dict[str, Any]:
+        out: dict[str, Any] = {"metrics": [m.to_dict() for m in self.metrics]}
+        out.update(overall_assessment=self.overall_assessment, regressions=self.regressions)
+        return out
+
+
+_METRICS = [  # metric keys compare_latency evaluates, in this order
+    "execution_time_seconds",
+    "llm_call_count",
+    "total_tokens",
+    "tool_call_count",  # taken from len(tool_calls) rather than read from the trace dict
+]
+
+
+def _compute_ratio(baseline: float, candidate: float) -> float:
+    """Return candidate / baseline; a zero baseline gives 1.0 if candidate is 0, else inf."""
+    zero_case = 1.0 if candidate == 0 else float("inf")
+    return candidate / baseline if baseline != 0 else zero_case
+
+
+def compare_latency(
+    baseline: dict,
+    candidate: dict,
+    regression_threshold: float = 1.5,
+) -> LatencyDiffResult:
+    """Compare the resource metrics of two normalized trace dicts.
+
+    Every key in ``_METRICS`` is read from both traces (missing or falsy
+    values count as 0), except ``tool_call_count`` which is always
+    ``len(tool_calls)``. A metric is a regression when candidate / baseline
+    exceeds ``regression_threshold``.
+    """
+
+    def _value_of(trace: dict, metric: str) -> float:
+        if metric != "tool_call_count":
+            return float(trace.get(metric, 0) or 0)
+        return float(len(trace.get("tool_calls", []) or []))
+
+    def _compare(metric: str) -> MetricComparison:
+        before, after = _value_of(baseline, metric), _value_of(candidate, metric)
+        ratio = _compute_ratio(before, after)
+        return MetricComparison(
+            field_name=metric,
+            baseline_value=before,
+            candidate_value=after,
+            ratio=ratio,
+            is_regression=ratio > regression_threshold,
+        )
+
+    return LatencyDiffResult(metrics=[_compare(metric) for metric in _METRICS])
diff --git a/skills/compare-trace/scripts/evaluators/tool_call_diff.py b/skills/compare-trace/scripts/evaluators/tool_call_diff.py
new file mode 100644
index 00000000..33ddfb30
--- /dev/null
+++ b/skills/compare-trace/scripts/evaluators/tool_call_diff.py
@@ -0,0 +1,207 @@
+"""Tool-call sequence + argument diff for two agent runs.
+
+v0.3: compares ordered sequences of tool names (Levenshtein) AND argument
+dicts (top-level keys) for matched tool calls. Argument-level diff is enabled
+when callers pass tool_calls with populated ``args`` dicts.
+
+Sequence math and matching logic ported from monte-carlo-data/ai-agent#1236.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from dataclasses import asdict, dataclass, field
+from typing import Any
+
+
+@dataclass
+class ArgumentChange:
+    """One matched tool-call pair whose top-level arguments differ.
+
+    ``position_baseline`` / ``position_candidate`` are the indices into the
+    respective call sequences. The ``*_keys`` lists name the added, removed and
+    value-changed keys; the ``*_values`` dicts carry the corresponding values.
+    """
+
+    tool_name: str
+    position_baseline: int
+    position_candidate: int
+    added_keys: list[str] = field(default_factory=list)  # keys only in the candidate args
+    removed_keys: list[str] = field(default_factory=list)  # keys only in the baseline args
+    changed_keys: list[str] = field(default_factory=list)  # shared keys whose values differ
+    added_values: dict[str, Any] = field(default_factory=dict)  # key -> candidate value
+    removed_values: dict[str, Any] = field(default_factory=dict)  # key -> baseline value
+    changed_values: dict[str, dict[str, Any]] = field(default_factory=dict)  # key -> {"baseline", "candidate"}
+
+    def to_dict(self) -> dict[str, Any]:
+        return asdict(self)
+
+
+@dataclass
+class ToolCallDiff:  # Result of compare_tool_calls for one baseline/candidate pair.
+    baseline_tools: list[str]  # tool names in call order
+    candidate_tools: list[str]  # tool names in call order
+    added: list[str] = field(default_factory=list)  # sorted names only the candidate used
+    removed: list[str] = field(default_factory=list)  # sorted names only the baseline used
+    shared: list[str] = field(default_factory=list)  # sorted names both runs used
+    edit_distance: int = 0  # Levenshtein distance between the two name sequences
+    similarity: float = 1.0  # 1 - edit_distance / longer length; 1.0 when both are empty
+    argument_changes: list[dict[str, Any]] = field(default_factory=list)  # ArgumentChange.to_dict() rows
+
+    def to_dict(self) -> dict[str, Any]:
+        return asdict(self)
+
+
+def _levenshtein(a: list[str], b: list[str]) -> int:
+    """Edit distance (insert / delete / substitute, unit cost) between two sequences.
+
+    The inputs are swapped if needed so the inner loop runs over the shorter
+    one; only the previous and current DP rows are held in memory.
+    """
+    longer, shorter = (a, b) if len(a) >= len(b) else (b, a)
+    width = len(shorter) + 1
+    row = list(range(width))  # distances from the empty prefix of ``longer``
+    for i, left in enumerate(longer, start=1):
+        above, row = row, [i] + [0] * (width - 1)
+        for j, right in enumerate(shorter, start=1):
+            row[j] = min(
+                above[j] + 1,  # delete from ``longer``
+                row[j - 1] + 1,  # insert into ``longer``
+                above[j - 1] + (left != right),  # substitute, free on a match
+            )
+    return row[width - 1]
+
+
+def _match_tools_by_proximity(
+    baseline_calls: list[dict[str, Any]],
+    candidate_calls: list[dict[str, Any]],
+) -> list[tuple[int, int]]:
+    """Pair up same-named tool calls across the two runs by nearest index.
+
+    Calls are grouped by ``name`` (missing names group under ``""``). Within a
+    group, each baseline call - taken in sequence order - claims the
+    still-unclaimed candidate call whose index is closest to its own; on a tie
+    the earlier candidate wins. Baseline calls left over once the candidates
+    run out stay unmatched.
+
+    Returns ``(baseline_index, candidate_index)`` pairs sorted by baseline
+    index. The pairing is greedy, so it is not guaranteed to minimise the
+    total distance.
+    """
+
+    def _positions_by_name(calls: list[dict[str, Any]]) -> dict[str, list[int]]:
+        """Map each tool name to the ascending indices at which it is called."""
+        grouped: dict[str, list[int]] = defaultdict(list)
+        for position, call in enumerate(calls):
+            grouped[call.get("name", "")].append(position)
+        return grouped
+
+    in_baseline = _positions_by_name(baseline_calls)
+    in_candidate = _positions_by_name(candidate_calls)
+
+    pairs: list[tuple[int, int]] = []
+    for name, b_positions in in_baseline.items():
+        # Tool absent from the candidate run: nothing to pair.
+        if name not in in_candidate:
+            continue
+        # Copy so claimed indices can be removed without touching the grouping.
+        free = list(in_candidate[name])
+        for bi in b_positions:
+            # Every candidate call of this name has already been claimed.
+            if not free:
+                break
+            # min() keeps the first minimum, so ties go to the earlier candidate.
+            nearest = min(free, key=lambda ci: abs(bi - ci))
+            free.remove(nearest)
+            pairs.append((bi, nearest))
+
+    # Groups were visited name by name; put the pairs back in baseline order.
+    return sorted(pairs)
+
+
+def _compare_args(
+    tool_name: str,
+    baseline_idx: int,
+    candidate_idx: int,
+    baseline_args: dict[str, Any],
+    candidate_args: dict[str, Any],
+) -> ArgumentChange | None:
+    """Diff the argument dicts of one matched call pair, top-level keys only.
+
+    A key is *added* when only the candidate has it, *removed* when only the
+    baseline has it, and *changed* when both have it with values that compare
+    unequal. Nested structures are compared as a whole rather than walked.
+
+    Returns ``None`` when the two dicts agree, so callers can drop unchanged
+    pairs; otherwise an ``ArgumentChange`` with sorted key lists.
+    """
+    in_baseline, in_candidate = baseline_args.keys(), candidate_args.keys()
+    added = sorted(in_candidate - in_baseline)
+    removed = sorted(in_baseline - in_candidate)
+    changed = sorted(
+        key for key in in_baseline & in_candidate if baseline_args[key] != candidate_args[key]
+    )
+
+    if not (added or removed or changed):
+        return None
+
+    return ArgumentChange(
+        tool_name=tool_name,
+        position_baseline=baseline_idx,
+        position_candidate=candidate_idx,
+        added_keys=added,
+        removed_keys=removed,
+        changed_keys=changed,
+        added_values={key: candidate_args[key] for key in added},
+        removed_values={key: baseline_args[key] for key in removed},
+        changed_values={
+            key: {"baseline": baseline_args[key], "candidate": candidate_args[key]}
+            for key in changed
+        },
+    )
+
+
+def compare_tool_calls(
+    baseline_calls: list[dict[str, Any]],
+    candidate_calls: list[dict[str, Any]],
+) -> ToolCallDiff:
+    """Diff the tool usage of two runs: which tools, in what order, with what args.
+
+    Each call is a dict with a ``"name"`` and an optional ``"args"`` dict. The
+    name sequences are compared with Levenshtein distance, normalised by the
+    longer sequence into ``similarity`` (1.0 when neither run called a tool).
+    Same-named calls are then paired by position and every pair whose args
+    differ contributes one entry to ``argument_changes``.
+    """
+
+    def _args_of(call: dict[str, Any]) -> dict[str, Any]:
+        return call.get("args", {}) or {}
+
+    names_b = [call.get("name", "") for call in baseline_calls]
+    names_c = [call.get("name", "") for call in candidate_calls]
+    distance = _levenshtein(names_b, names_c)
+    longest = max(len(names_b), len(names_c))
+
+    # Lazy generator: each matched pair is diffed when the list below is built.
+    arg_diffs = (
+        _compare_args(
+            tool_name=baseline_calls[bi].get("name", ""),
+            baseline_idx=bi,
+            candidate_idx=ci,
+            baseline_args=_args_of(baseline_calls[bi]),
+            candidate_args=_args_of(candidate_calls[ci]),
+        )
+        for bi, ci in _match_tools_by_proximity(baseline_calls, candidate_calls)
+    )
+
+    unique_b, unique_c = set(names_b), set(names_c)
+    return ToolCallDiff(
+        baseline_tools=names_b,
+        candidate_tools=names_c,
+        added=sorted(unique_c - unique_b),
+        removed=sorted(unique_b - unique_c),
+        shared=sorted(unique_b & unique_c),
+        edit_distance=distance,
+        similarity=1.0 - (distance / longest) if longest > 0 else 1.0,
+        argument_changes=[diff.to_dict() for diff in arg_diffs if diff is not None],
+    )
diff --git a/skills/compare-trace/scripts/local_otlp_receiver.py b/skills/compare-trace/scripts/local_otlp_receiver.py
new file mode 100644
index 00000000..14845816
--- /dev/null
+++ b/skills/compare-trace/scripts/local_otlp_receiver.py
@@ -0,0 +1,208 @@
+"""Minimal local OTLP/HTTP trace receiver.
+
+Listens on ``POST /v1/traces`` (protobuf, optionally gzipped) and appends each
+received span as one JSON object per line to ``--output``. Spans are recorded
+in a flat, framework-neutral shape that downstream converters
+(see ``scripts/sources/otel_spans.py``) can normalize for the compare-trace
+skill.
+
+Usage::
+
+    python local_otlp_receiver.py --output run-a.jsonl --port 4318
+
+Send ``SIGINT`` / ``SIGTERM`` to stop. The script flushes the output file and
+exits ``0`` on graceful shutdown.
+
+Dependencies: ``opentelemetry-proto`` (installed alongside any
+``opentelemetry-exporter-otlp-proto-*`` package, or directly via pip).
+"""
+
+from __future__ import annotations
+
+import argparse
+import gzip
+import json
+import signal
+import sys
+import threading
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+from pathlib import Path
+from typing import Any
+
+from opentelemetry.proto.collector.trace.v1 import trace_service_pb2
+from opentelemetry.proto.common.v1 import common_pb2
+from opentelemetry.proto.trace.v1 import trace_pb2
+
+_SPAN_KIND_NAMES = {  # protobuf span-kind enum value -> short name stored under "kind"
+    trace_pb2.Span.SPAN_KIND_UNSPECIFIED: "UNSPECIFIED",
+    trace_pb2.Span.SPAN_KIND_INTERNAL: "INTERNAL",
+    trace_pb2.Span.SPAN_KIND_SERVER: "SERVER",
+    trace_pb2.Span.SPAN_KIND_CLIENT: "CLIENT",
+    trace_pb2.Span.SPAN_KIND_PRODUCER: "PRODUCER",
+    trace_pb2.Span.SPAN_KIND_CONSUMER: "CONSUMER",
+}
+
+_STATUS_CODE_NAMES = {  # protobuf status-code enum value -> short name stored under "status.code"
+    trace_pb2.Status.STATUS_CODE_UNSET: "UNSET",
+    trace_pb2.Status.STATUS_CODE_OK: "OK",
+    trace_pb2.Status.STATUS_CODE_ERROR: "ERROR",
+}
+
+
+def _anyvalue_to_python(value: common_pb2.AnyValue) -> Any:
+    """Convert an OTLP ``AnyValue`` into a JSON-friendly Python value.
+
+    Scalars pass through, bytes become a hex string, arrays and key/value
+    lists are converted recursively, and any other case yields ``None``.
+    """
+    kind = value.WhichOneof("value")
+    if kind in ("string_value", "bool_value", "int_value", "double_value"):
+        return getattr(value, kind)
+    if kind == "bytes_value":
+        return value.bytes_value.hex()
+    if kind == "array_value":
+        return list(map(_anyvalue_to_python, value.array_value.values))
+    if kind == "kvlist_value":
+        pairs = value.kvlist_value.values
+        return dict((pair.key, _anyvalue_to_python(pair.value)) for pair in pairs)
+    return None
+
+
+def _attrs_to_dict(attrs) -> dict[str, Any]:  # iterable of key/value pairs -> {key: python value}
+    return dict((kv.key, _anyvalue_to_python(kv.value)) for kv in attrs)
+
+
+def _span_to_dict(
+    span: trace_pb2.Span,
+    resource_attrs: dict[str, Any],
+    scope_name: str,
+) -> dict[str, Any]:
+    """Flatten one protobuf span into the JSON-serialisable record written per line.
+
+    IDs become hex strings (an empty ``parent_span_id`` becomes ``None``) and
+    ``duration_ms`` is derived from the nanosecond start / end timestamps.
+    """
+    events = []
+    for ev in span.events:
+        attrs = _attrs_to_dict(ev.attributes)
+        events.append({"name": ev.name, "time_unix_nano": int(ev.time_unix_nano), "attributes": attrs})
+    status = {"code": _STATUS_CODE_NAMES.get(span.status.code, "UNSET"), "message": span.status.message}
+    return {
+        "trace_id": span.trace_id.hex(),
+        "span_id": span.span_id.hex(),
+        "parent_span_id": span.parent_span_id.hex() if span.parent_span_id else None,
+        "name": span.name,
+        "kind": _SPAN_KIND_NAMES.get(span.kind, "UNSPECIFIED"),
+        "start_time_unix_nano": int(span.start_time_unix_nano),
+        "end_time_unix_nano": int(span.end_time_unix_nano),
+        "duration_ms": (span.end_time_unix_nano - span.start_time_unix_nano) / 1_000_000.0,
+        "status": status,
+        "attributes": _attrs_to_dict(span.attributes),
+        "events": events,
+        "resource": resource_attrs,
+        "scope": scope_name,
+    }
+
+
+class _Handler(BaseHTTPRequestHandler):
+    """OTLP/HTTP handler: appends every span posted to ``/v1/traces`` to a JSONL file."""
+    output_path: Path = Path("/dev/null")  # class-level state; ``main`` assigns all three before serving
+    file_lock: threading.Lock = threading.Lock()
+    span_count: int = 0
+
+    def log_message(self, fmt: str, *args: Any) -> None:  # noqa: D401
+        sys.stderr.write(f"[receiver] {self.address_string()} - {fmt % args}\n")
+
+    def _bad_request(self, reason: str) -> None:  # log ``reason`` and answer with an empty 400
+        sys.stderr.write(f"[receiver] {reason}\n")
+        self.send_response(400)
+        self.end_headers()
+
+    def do_POST(self) -> None:
+        """Decode one export request, append its spans to the output file, reply 200."""
+        if self.path != "/v1/traces":
+            self.send_response(404)
+            self.end_headers()
+            return
+
+        try:
+            length = int(self.headers.get("Content-Length", 0))
+        except ValueError:
+            return self._bad_request("invalid Content-Length header")
+        body = self.rfile.read(length) if length > 0 else b""  # read(<0) would block until EOF
+        if self.headers.get("Content-Encoding", "").lower() == "gzip":
+            try:
+                body = gzip.decompress(body)
+            except Exception as e:  # BadGzipFile, EOFError (truncated), zlib.error (corrupt)
+                return self._bad_request(f"gzip decompress failed: {e}")
+
+        req = trace_service_pb2.ExportTraceServiceRequest()
+        try:
+            req.ParseFromString(body)
+        except Exception as e:  # pragma: no cover
+            return self._bad_request(f"proto parse failed: {e}")
+
+        new_spans: list[str] = []
+        for rspans in req.resource_spans:
+            resource_attrs = _attrs_to_dict(rspans.resource.attributes)
+            for sspans in rspans.scope_spans:
+                scope_name = sspans.scope.name
+                for span in sspans.spans:
+                    new_spans.append(json.dumps(_span_to_dict(span, resource_attrs, scope_name)))
+
+        if new_spans:
+            with type(self).file_lock:  # serialise appends and the counter across request threads
+                with type(self).output_path.open("a") as fh:
+                    fh.write("\n".join(new_spans) + "\n")
+                type(self).span_count += len(new_spans)
+            sys.stderr.write(f"[receiver] wrote {len(new_spans)} spans (total {type(self).span_count})\n")
+
+        body_out = trace_service_pb2.ExportTraceServiceResponse().SerializeToString()
+        self.send_response(200)
+        self.send_header("Content-Type", "application/x-protobuf")
+        self.send_header("Content-Length", str(len(body_out)))
+        self.end_headers()
+        self.wfile.write(body_out)
+
+
+def _parse_args() -> argparse.Namespace:  # CLI flags: --output (required), --port, --host
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--output", required=True, help="JSONL output file for received spans")
+    parser.add_argument("--port", type=int, default=4318)
+    parser.add_argument("--host", default="127.0.0.1")
+    return parser.parse_args()
+
+
+def main() -> int:
+    """Serve OTLP/HTTP on --host/--port until SIGINT/SIGTERM, appending spans to --output.
+    Returns 0 once the server has shut down."""
+    args = _parse_args()
+    out_path = Path(args.output).resolve()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.touch()
+    _Handler.output_path = out_path
+    _Handler.file_lock = threading.Lock()
+    _Handler.span_count = 0
+
+    server = ThreadingHTTPServer((args.host, args.port), _Handler)
+    sys.stderr.write(f"[receiver] listening on http://{args.host}:{args.port}/v1/traces\n")
+    sys.stderr.write(f"[receiver] writing spans to {out_path}\n")
+
+    def _shutdown(signum: int, _frame: Any) -> None:
+        sys.stderr.write(f"[receiver] caught signal {signum}, shutting down\n")
+        # shutdown() waits for serve_forever() to exit, so run it off the serving (main) thread.
+        threading.Thread(target=server.shutdown, daemon=True).start()
+
+    signal.signal(signal.SIGINT, _shutdown)
+    signal.signal(signal.SIGTERM, _shutdown)
+
+    try:
+        server.serve_forever()
+    finally:
+        server.server_close()
+        sys.stderr.write(f"[receiver] stopped; received {_Handler.span_count} spans total\n")
+    return 0
+
+
+if __name__ == "__main__":  # run as a script: main()'s return value becomes the exit status
+    raise SystemExit(main())
diff --git a/skills/compare-trace/scripts/sources/__init__.py b/skills/compare-trace/scripts/sources/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/skills/compare-trace/scripts/sources/otel_spans.py b/skills/compare-trace/scripts/sources/otel_spans.py
new file mode 100644
index 00000000..254fe468
--- /dev/null
+++ b/skills/compare-trace/scripts/sources/otel_spans.py
@@ -0,0 +1,226 @@
+"""Normalize a JSONL stream of OTLP spans (as written by
+``local_otlp_receiver.py``) into the shape the compare-trace skill consumes.
+
+Span dialect: GenAI semantic conventions (``gen_ai.prompt.*``, ``gen_ai.completion.*``,
+``gen_ai.usage.*``) — these are emitted by the Traceloop / openllmetry
+``LangchainInstrumentor`` and are stable across LangChain releases.
+
+LangGraph node spans are detected by the ``*.task`` name suffix that the
+instrumentor uses for each node call. The workflow root is detected by the
+``*.workflow`` suffix and is also where we read the overall duration from.
+
+Tool-call extraction reads
+``gen_ai.{completion,prompt}.{i}.tool_calls.{j}.{name,arguments,id}``
+attributes when present. If they are absent — either because the run made no
+tool calls, or because the instrumentor omitted them — ``tool_calls`` is
+returned empty rather than guessed at.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+# Span-name suffix patterns; each one captures the text before the suffix.
+# ``<node>.task`` — LangGraph node spans (see the module docstring).
+_TASK_NAME_RE = re.compile(r"^(?P<node>.+)\.task$")
+# ``<name>.workflow`` — the workflow root span.
+_WORKFLOW_NAME_RE = re.compile(r"^(?P<name>.+)\.workflow$")
+# ``<tool>.tool`` — span names ending in ``.tool``.
+# NOTE(review): not referenced anywhere else in this module — confirm
+# whether it is still needed.
+_TOOL_SPAN_NAME_RE = re.compile(r"^(?P<tool>.+)\.tool$")
+
+# Attribute key patterns we care about for tool-call extraction.
+# Traceloop's Bedrock instrumentor only emits tool_calls under
+# ``gen_ai.prompt.*.tool_calls.*`` (where prior assistant tool calls echo
+# back as part of the message history); OpenAI / Anthropic native paths
+# also emit under ``gen_ai.completion.*.tool_calls.*``. Accept both.
+# Named groups: ``msg_idx`` (message index), ``tc_idx`` (tool-call index
+# within that message) and ``field`` (``name``, ``arguments`` or ``id``).
+_TOOL_CALL_RE = re.compile(
+    r"^gen_ai\.(?:completion|prompt)\."
+    r"(?P<msg_idx>\d+)\.tool_calls\.(?P<tc_idx>\d+)\.(?P<field>name|arguments|id)$"
+)
+
+
+def _parse_jsonl(path: Path) -> list[dict[str, Any]]:
+    """Load one JSON value per non-blank line of ``path``.
+
+    The file is always decoded as UTF-8 instead of the locale's default
+    encoding, so span content containing non-ASCII text parses the same way
+    on every platform.
+
+    Args:
+        path: JSONL file to read.
+
+    Returns:
+        The decoded values in file order; blank and whitespace-only lines
+        are skipped.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    with path.open(encoding="utf-8") as fh:
+        stripped_lines = (raw.strip() for raw in fh)
+        return [json.loads(line) for line in stripped_lines if line]
+
+
+def _extract_tool_calls_from_attrs(
+    attrs: dict[str, Any],
+) -> list[dict[str, Any]]:
+    """Pull tool_calls out of one LLM span's gen_ai.{completion,prompt}.* attrs.
+
+    Attribute keys are matched with ``_TOOL_CALL_RE`` and grouped per tool
+    call. The grouping key also records whether the attribute came from the
+    ``prompt`` or the ``completion`` namespace, so that
+    ``gen_ai.prompt.1.tool_calls.0.*`` and ``gen_ai.completion.1.tool_calls.0.*``
+    on the same span stay two separate calls instead of overwriting each
+    other's fields.
+
+    Returns calls in (msg_idx, tc_idx) order; when a prompt entry and a
+    completion entry share the same indices, the prompt entry comes first.
+    Callers are expected to dedupe across spans by tool-call ``id``: the
+    same call typically appears in the producing completion AND in every
+    subsequent prompt's message history.
+
+    ``arguments`` is a JSON-encoded string in nearly all cases; some
+    instrumentations emit a dict instead. Both are handled. Arguments that
+    are missing, empty, or not valid JSON become ``{}``.
+
+    Args:
+        attrs: The span's attribute mapping.
+
+    Returns:
+        A list of ``{"name": ..., "args": ..., "id": ...}`` dicts. ``name``
+        and ``id`` default to ``""`` when the attribute is absent.
+    """
+    buckets: dict[tuple[int, int, int], dict[str, Any]] = defaultdict(dict)
+    for key, value in attrs.items():
+        match = _TOOL_CALL_RE.match(key)
+        if not match:
+            continue
+        # The pattern only admits these two namespaces. Rank prompt history
+        # (0) ahead of the completion (1) for entries with equal indices.
+        source_rank = 0 if key.startswith("gen_ai.prompt.") else 1
+        slot = (int(match.group("msg_idx")), int(match.group("tc_idx")), source_rank)
+        buckets[slot][match.group("field")] = value
+
+    calls: list[dict[str, Any]] = []
+    for _slot, raw in sorted(buckets.items(), key=lambda item: item[0]):
+        name = raw.get("name") or ""
+        args_raw = raw.get("arguments", "")
+        if isinstance(args_raw, dict):
+            args = args_raw
+        elif isinstance(args_raw, str) and args_raw:
+            try:
+                args = json.loads(args_raw)
+            except json.JSONDecodeError:
+                # Unparseable arguments are dropped rather than guessed at.
+                args = {}
+        else:
+            args = {}
+        calls.append({"name": name, "args": args or {}, "id": raw.get("id", "")})
+    return calls
+
+
+def _coerce_token_count(value: Any) -> int:
+    """Convert a token-usage attribute to ``int``; ``0`` when it cannot be."""
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return 0
+
+
+def normalize(spans: list[dict[str, Any]]) -> dict[str, Any]:
+    """Convert raw OTLP spans into a compare-trace normalized trace dict.
+
+    Args:
+        spans: Span dicts as parsed from the JSONL input. The keys read here
+            are ``trace_id``, ``name``, ``start_time_unix_nano``,
+            ``end_time_unix_nano`` and ``attributes``.
+
+    Returns:
+        A dict with the keys ``trace_id``, ``node_path``, ``tool_calls``,
+        ``execution_time_seconds``, ``llm_call_count``, ``total_tokens``,
+        ``tool_call_count`` and ``final_output_text``. An empty ``spans``
+        list yields the same keys with empty / zero values.
+    """
+    if not spans:
+        return {
+            "trace_id": "",
+            "node_path": [],
+            "tool_calls": [],
+            "execution_time_seconds": 0.0,
+            "llm_call_count": 0,
+            "total_tokens": 0,
+            "tool_call_count": 0,
+            "final_output_text": "",
+        }
+
+    # Take the dominant trace_id (handles stray smoke spans landing in the
+    # same file). All spans in a single agent run share one trace_id.
+    by_trace: dict[str, list[dict[str, Any]]] = defaultdict(list)
+    for s in spans:
+        by_trace[s.get("trace_id", "")].append(s)
+    trace_id = max(by_trace, key=lambda k: len(by_trace[k]))
+    run_spans = by_trace[trace_id]
+
+    # LangGraph nodes — ordered by start time, suffix stripped.
+    node_spans = sorted(
+        (s for s in run_spans if _TASK_NAME_RE.match(s.get("name", ""))),
+        key=lambda s: s.get("start_time_unix_nano", 0),
+    )
+    node_path = [_TASK_NAME_RE.match(s["name"]).group("node") for s in node_spans]
+
+    # LLM calls: any span carrying gen_ai.prompt.* counts. ``or {}`` keeps a
+    # span whose attributes are null from crashing the scan.
+    llm_spans = [
+        s
+        for s in run_spans
+        if any(k.startswith("gen_ai.prompt.") for k in (s.get("attributes") or {}))
+    ]
+
+    total_tokens = 0
+    final_output_text = ""
+
+    # Dedupe tool calls by id across spans — the same call appears once in
+    # the producing completion and again in every subsequent prompt history.
+    tool_calls_by_id: dict[str, dict[str, Any]] = {}
+    # Calls without an id cannot be matched across spans directly. They are
+    # keyed by a (name, args) signature, and a signature is emitted only as
+    # many times as the largest number of occurrences seen within any single
+    # span. An echo of an earlier call in a later prompt history therefore
+    # adds nothing, while a call that genuinely repeats (and so shows up
+    # twice in one span's history) is still counted twice.
+    anon_tool_calls: list[dict[str, Any]] = []
+    anon_emitted: dict[str, int] = {}
+    # Order calls by the start_time of the FIRST span that mentioned them.
+    first_seen_ns: dict[str, int] = {}
+
+    for llm in sorted(llm_spans, key=lambda s: s.get("start_time_unix_nano", 0)):
+        attrs = llm.get("attributes") or {}
+        start_ns = llm.get("start_time_unix_nano", 0)
+
+        # Prefer the explicit total_tokens attr if present; else sum the parts.
+        # Every value is coerced on its own so that string-typed counts are
+        # added numerically and a malformed value counts as 0 instead of
+        # raising or being concatenated.
+        total_tokens += (
+            _coerce_token_count(attrs.get("llm.usage.total_tokens"))
+            or _coerce_token_count(attrs.get("gen_ai.usage.total_tokens"))
+            or (
+                _coerce_token_count(attrs.get("gen_ai.usage.prompt_tokens"))
+                + _coerce_token_count(attrs.get("gen_ai.usage.completion_tokens"))
+            )
+        )
+
+        anon_in_span: dict[str, int] = defaultdict(int)
+        for call in _extract_tool_calls_from_attrs(attrs):
+            tid = call.get("id") or ""
+            if tid:
+                if tid not in tool_calls_by_id:
+                    tool_calls_by_id[tid] = call
+                    first_seen_ns[tid] = start_ns
+                continue
+            signature = json.dumps(
+                [call.get("name"), call.get("args")], sort_keys=True, default=str
+            )
+            anon_in_span[signature] += 1
+            if anon_in_span[signature] > anon_emitted.get(signature, 0):
+                anon_tool_calls.append(call)
+                anon_emitted[signature] = anon_in_span[signature]
+
+        # Capture the LATEST assistant completion as final_output_text.
+        # Walk completion indices in order; the highest index is the
+        # canonical final answer for that LLM call. Only indices 0-19 are
+        # inspected.
+        for i in range(20):
+            content = attrs.get(f"gen_ai.completion.{i}.content")
+            role = attrs.get(f"gen_ai.completion.{i}.role")
+            if content and role == "assistant" and isinstance(content, str):
+                final_output_text = content.strip()
+
+    tool_calls = [
+        tool_calls_by_id[tid]
+        for tid in sorted(tool_calls_by_id, key=lambda t: first_seen_ns.get(t, 0))
+    ]
+    # Id-less calls follow the id'd ones, in the order they were first seen.
+    tool_calls.extend(anon_tool_calls)
+
+    # Workflow root span carries the wall-clock duration.
+    workflow_spans = [
+        s for s in run_spans if _WORKFLOW_NAME_RE.match(s.get("name", ""))
+    ]
+    if workflow_spans:
+        wf = workflow_spans[0]
+        execution_time_seconds = (
+            wf.get("end_time_unix_nano", 0) - wf.get("start_time_unix_nano", 0)
+        ) / 1_000_000_000.0
+    else:
+        # No workflow span: fall back to the envelope of all spans in the run.
+        ends = [s.get("end_time_unix_nano", 0) for s in run_spans]
+        starts = [s.get("start_time_unix_nano", 0) for s in run_spans]
+        execution_time_seconds = (max(ends) - min(starts)) / 1_000_000_000.0
+
+    return {
+        "trace_id": trace_id,
+        "node_path": node_path,
+        "tool_calls": tool_calls,
+        "execution_time_seconds": round(execution_time_seconds, 3),
+        "llm_call_count": len(llm_spans),
+        "total_tokens": total_tokens,
+        "tool_call_count": len(tool_calls),
+        "final_output_text": final_output_text,
+    }
+
+
+def main() -> int:
+    """Command-line entry point: normalize a span JSONL file to JSON.
+
+    Reads the positional ``input`` file, normalizes its spans, and writes
+    the result as 2-space-indented JSON to ``--output`` when given,
+    otherwise to stdout.
+
+    Returns:
+        ``0`` on completion.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("input", help="JSONL of OTLP spans from local_otlp_receiver.py")
+    parser.add_argument("--output", help="Write normalized JSON here (default: stdout)")
+    options = parser.parse_args()
+
+    rendered = json.dumps(normalize(_parse_jsonl(Path(options.input))), indent=2)
+    if not options.output:
+        print(rendered)
+        return 0
+    Path(options.output).write_text(rendered)
+    return 0
+
+
+# Script entry point: use main()'s return value as the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/skills/compare-trace/tests/fixtures/bedrock_dialect.jsonl b/skills/compare-trace/tests/fixtures/bedrock_dialect.jsonl
new file mode 100644
index 00000000..0ebf4517
--- /dev/null
+++ b/skills/compare-trace/tests/fixtures/bedrock_dialect.jsonl
@@ -0,0 +1,7 @@
+{"trace_id": "trace01", "span_id": "s01_wf",   "parent_span_id": null,     "name": "MyAgent.workflow",        "kind": "INTERNAL", "start_time_unix_nano": 1000000000, "end_time_unix_nano": 11000000000, "duration_ms": 10000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"traceloop.workflow.name": "MyAgent"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace01", "span_id": "s02_init", "parent_span_id": "s01_wf", "name": "initialization.task",      "kind": "INTERNAL", "start_time_unix_nano": 1500000000, "end_time_unix_nano": 2000000000,  "duration_ms": 500.0,    "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "initialization"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace01", "span_id": "s03_react","parent_span_id": "s01_wf", "name": "react_agent.task",         "kind": "INTERNAL", "start_time_unix_nano": 2000000000, "end_time_unix_nano": 10000000000, "duration_ms": 8000.0,   "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "react_agent"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace01", "span_id": "s04_chat1","parent_span_id": "s03_react","name": "ChatBedrock.chat",       "kind": "CLIENT",   "start_time_unix_nano": 3000000000, "end_time_unix_nano": 4000000000,  "duration_ms": 1000.0,   "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "AWS", "gen_ai.request.model": "claude-sonnet-4-6", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "what is my coverage gap?", "gen_ai.prompt.1.role": "system", "gen_ai.prompt.1.content": "You are a Coverage Advisor.", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "Let me check.", "gen_ai.usage.prompt_tokens": 100, "gen_ai.usage.completion_tokens": 20, "llm.usage.total_tokens": 120}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace01", "span_id": "s05_tool", "parent_span_id": "s03_react","name": "get_warehouses.tool",    "kind": "INTERNAL", "start_time_unix_nano": 4500000000, "end_time_unix_nano": 5000000000,  "duration_ms": 500.0,    "status": {"code": "UNSET", "message": ""}, "attributes": {"traceloop.span.kind": "tool"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace01", "span_id": "s06_chat2","parent_span_id": "s03_react","name": "ChatBedrock.chat",       "kind": "CLIENT",   "start_time_unix_nano": 5500000000, "end_time_unix_nano": 6500000000,  "duration_ms": 1000.0,   "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "AWS", "gen_ai.request.model": "claude-sonnet-4-6", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "what is my coverage gap?", "gen_ai.prompt.1.role": "system", "gen_ai.prompt.1.content": "You are a Coverage Advisor.", "gen_ai.prompt.2.role": "assistant", "gen_ai.prompt.2.content": "", "gen_ai.prompt.2.tool_calls.0.id": "tid_1", "gen_ai.prompt.2.tool_calls.0.name": "get_warehouses", "gen_ai.prompt.2.tool_calls.0.arguments": "{}", "gen_ai.prompt.3.role": "tool", "gen_ai.prompt.3.content": "[warehouses...]", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "You have 3 warehouses with use cases.", "gen_ai.usage.prompt_tokens": 200, "gen_ai.usage.completion_tokens": 30, "llm.usage.total_tokens": 230}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace01", "span_id": "s07_route","parent_span_id": "s01_wf", "name": "_route_after_react.task",  "kind": "INTERNAL", "start_time_unix_nano": 9500000000, "end_time_unix_nano": 9600000000,  "duration_ms": 100.0,    "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "_route_after_react"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"}
diff --git a/skills/compare-trace/tests/fixtures/completion_dialect.jsonl b/skills/compare-trace/tests/fixtures/completion_dialect.jsonl
new file mode 100644
index 00000000..7ee95ec7
--- /dev/null
+++ b/skills/compare-trace/tests/fixtures/completion_dialect.jsonl
@@ -0,0 +1,4 @@
+{"trace_id": "trace02", "span_id": "t01_wf",   "parent_span_id": null,     "name": "OtherAgent.workflow",     "kind": "INTERNAL", "start_time_unix_nano": 1000000000, "end_time_unix_nano": 6000000000, "duration_ms": 5000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"traceloop.workflow.name": "OtherAgent"}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace02", "span_id": "t02_node", "parent_span_id": "t01_wf", "name": "agent_node.task",          "kind": "INTERNAL", "start_time_unix_nano": 1500000000, "end_time_unix_nano": 5000000000, "duration_ms": 3500.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "agent_node"}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace02", "span_id": "t03_chat1","parent_span_id": "t02_node","name": "ChatOpenAI.chat",         "kind": "CLIENT",   "start_time_unix_nano": 2000000000, "end_time_unix_nano": 3000000000, "duration_ms": 1000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "openai", "gen_ai.request.model": "gpt-4o", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "do the thing", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "", "gen_ai.completion.0.tool_calls.0.id": "call_abc", "gen_ai.completion.0.tool_calls.0.name": "do_thing", "gen_ai.completion.0.tool_calls.0.arguments": "{\"target\":\"foo\",\"force\":true}", "gen_ai.usage.total_tokens": 75}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"}
+{"trace_id": "trace02", "span_id": "t04_chat2","parent_span_id": "t02_node","name": "ChatOpenAI.chat",         "kind": "CLIENT",   "start_time_unix_nano": 3500000000, "end_time_unix_nano": 4500000000, "duration_ms": 1000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "openai", "gen_ai.request.model": "gpt-4o", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "do the thing", "gen_ai.prompt.1.role": "assistant", "gen_ai.prompt.1.tool_calls.0.id": "call_abc", "gen_ai.prompt.1.tool_calls.0.name": "do_thing", "gen_ai.prompt.1.tool_calls.0.arguments": "{\"target\":\"foo\",\"force\":true}", "gen_ai.prompt.2.role": "tool", "gen_ai.prompt.2.content": "OK done", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "All done — the thing was processed.", "gen_ai.usage.total_tokens": 90}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"}
diff --git a/skills/compare-trace/tests/test_otel_spans.py b/skills/compare-trace/tests/test_otel_spans.py
new file mode 100644
index 00000000..63bbc354
--- /dev/null
+++ b/skills/compare-trace/tests/test_otel_spans.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+"""
+Smoke test for ``sources/otel_spans.py`` — feeds each fixture JSONL into the
+normalizer and asserts the resulting dict matches the expected shape.
+
+Two fixtures cover the two tool-call dialects the normalizer has to handle:
+
+- ``bedrock_dialect.jsonl`` — Traceloop's Bedrock instrumentor emits tool
+  calls only under ``gen_ai.prompt.*.tool_calls.*`` on the NEXT LLM call
+  (never on the completion that produced them). Exercises the
+  prompt-history extraction path.
+- ``completion_dialect.jsonl`` — OpenAI / Anthropic-native instrumentations
+  emit tool calls under ``gen_ai.completion.*.tool_calls.*``. Exercises
+  the completion-extraction path and the dedup-by-id behavior (the same
+  call also appears in the next prompt's history; the normalizer must
+  collapse to one).
+
+Run:
+    python3 skills/compare-trace/tests/test_otel_spans.py
+"""
+
+from __future__ import annotations
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+# All paths are derived from this file's location, so the tests do not
+# depend on the current working directory.
+TESTS_DIR = Path(__file__).parent
+SKILL_ROOT = TESTS_DIR.parent
+NORMALIZER = SKILL_ROOT / "scripts" / "sources" / "otel_spans.py"
+FIXTURES_DIR = TESTS_DIR / "fixtures"
+
+# Running totals, incremented by check() and reported by main().
+PASSED = 0
+FAILED = 0
+
+
+def run_normalizer(fixture: str) -> dict:
+    """Normalize one fixture by running the normalizer script in a child process.
+
+    Args:
+        fixture: File name inside ``FIXTURES_DIR``.
+
+    Returns:
+        The JSON document the script printed to stdout, parsed.
+
+    Raises:
+        subprocess.CalledProcessError: If the script exits non-zero.
+        subprocess.TimeoutExpired: If it runs longer than 30 seconds.
+    """
+    command = [sys.executable, str(NORMALIZER), str(FIXTURES_DIR / fixture)]
+    completed = subprocess.run(
+        command,
+        check=True,
+        timeout=30,
+        capture_output=True,
+        text=True,
+    )
+    return json.loads(completed.stdout)
+
+
+def check(label: str, condition: bool, hint: str = "") -> None:
+    """Record one check in the module counters and print its outcome.
+
+    A passing check bumps ``PASSED``. A failing check bumps ``FAILED``,
+    prints the label (plus ``hint`` when non-empty) and raises
+    ``AssertionError`` so pytest reports the enclosing test as failed.
+    """
+    global PASSED, FAILED
+    if not condition:
+        FAILED += 1
+        print(f"  FAIL  {label}" + (f" -- {hint}" if hint else ""))
+        raise AssertionError(label + (f" ({hint})" if hint else ""))
+    PASSED += 1
+    print(f"  PASS  {label}")
+
+
+def test_bedrock_dialect() -> None:
+    """Prompt-history dialect: the tool call is only visible under
+    ``gen_ai.prompt.*.tool_calls.*`` on the LLM span that follows it."""
+    print("test_bedrock_dialect:")
+    trace = run_normalizer("bedrock_dialect.jsonl")
+
+    check("trace_id picked", trace["trace_id"] == "trace01", trace["trace_id"])
+
+    expected_nodes = ["initialization", "react_agent", "_route_after_react"]
+    check(
+        "node_path in start-time order",
+        trace["node_path"] == expected_nodes,
+        f"{trace['node_path']!r}",
+    )
+
+    check("llm_call_count = 2", trace["llm_call_count"] == 2, f"{trace['llm_call_count']}")
+    check(
+        "total_tokens summed across LLM spans",
+        trace["total_tokens"] == 350,
+        f"expected 350 (120+230), got {trace['total_tokens']}",
+    )
+    check(
+        "execution_time_seconds from workflow root",
+        trace["execution_time_seconds"] == 10.0,
+        f"{trace['execution_time_seconds']}",
+    )
+    check("tool_call_count = 1", trace["tool_call_count"] == 1, f"{trace['tool_call_count']}")
+
+    # Exactly one call is expected, carrying the id from the prompt history.
+    calls = trace["tool_calls"]
+    check(
+        "tool call extracted from prompt.*.tool_calls.*",
+        len(calls) == 1
+        and calls[0]["name"] == "get_warehouses"
+        and calls[0]["id"] == "tid_1"
+        and calls[0]["args"] == {},
+        f"{calls!r}",
+    )
+
+    check(
+        "final_output_text from last LLM span's completion",
+        trace["final_output_text"] == "You have 3 warehouses with use cases.",
+        f"{trace['final_output_text']!r}",
+    )
+
+
+def test_completion_dialect() -> None:
+    """Completion dialect: the producing span carries the tool call under
+    ``completion.*`` and the next span echoes the SAME call under
+    ``prompt.*`` — the normalizer must report it once."""
+    print("test_completion_dialect:")
+    trace = run_normalizer("completion_dialect.jsonl")
+
+    check("trace_id picked", trace["trace_id"] == "trace02", trace["trace_id"])
+    check(
+        "node_path",
+        trace["node_path"] == ["agent_node"],
+        f"{trace['node_path']!r}",
+    )
+    check("llm_call_count = 2", trace["llm_call_count"] == 2, f"{trace['llm_call_count']}")
+    check(
+        "total_tokens uses gen_ai.usage.total_tokens",
+        trace["total_tokens"] == 165,
+        f"expected 165 (75+90), got {trace['total_tokens']}",
+    )
+    check(
+        "execution_time_seconds",
+        trace["execution_time_seconds"] == 5.0,
+        f"{trace['execution_time_seconds']}",
+    )
+    check(
+        "tool_calls deduped by id across completion + prompt-history",
+        trace["tool_call_count"] == 1,
+        f"expected 1 deduped call, got {trace['tool_call_count']}: {trace['tool_calls']}",
+    )
+
+    # The surviving entry must still carry the JSON-decoded arguments.
+    calls = trace["tool_calls"]
+    check(
+        "deduped tool call retains parsed args",
+        len(calls) == 1
+        and calls[0]["name"] == "do_thing"
+        and calls[0]["id"] == "call_abc"
+        and calls[0]["args"] == {"target": "foo", "force": True},
+        f"{calls!r}",
+    )
+
+    check(
+        "final_output_text from last LLM span",
+        trace["final_output_text"] == "All done — the thing was processed.",
+        f"{trace['final_output_text']!r}",
+    )
+
+
+def main() -> None:
+    """Standalone runner.
+
+    Runs every test function inside its own ``try`` so that one failing
+    test does not prevent the others from running, then prints the
+    PASSED/FAILED totals and exits with status 1 if any check failed.
+    Under pytest the ``test_*`` functions are collected directly and this
+    function is not involved.
+    """
+    for test_fn in (test_bedrock_dialect, test_completion_dialect):
+        try:
+            test_fn()
+        except AssertionError as failure:
+            # check() has already counted and printed the failing check.
+            print(f"  (test {test_fn.__name__} aborted: {failure})")
+
+    print()
+    print(f"PASSED: {PASSED}")
+    print(f"FAILED: {FAILED}")
+    if FAILED:
+        raise SystemExit(1)
+
+
+# Standalone entry point; main() itself exits with status 1 when a check failed.
+if __name__ == "__main__":
+    main()