diff --git a/.github/workflows/compare-trace-tests.yml b/.github/workflows/compare-trace-tests.yml new file mode 100644 index 00000000..019cc5e1 --- /dev/null +++ b/.github/workflows/compare-trace-tests.yml @@ -0,0 +1,30 @@ +name: Compare Trace Skill Tests + +on: + push: + branches: [main] + paths: + - 'skills/compare-trace/**' + - '.github/workflows/compare-trace-tests.yml' + pull_request: + paths: + - 'skills/compare-trace/**' + - '.github/workflows/compare-trace-tests.yml' + +permissions: + contents: read + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + + # test_otel_spans.py exercises sources/otel_spans.py via subprocess. + # Stdlib only — no pip install needed. + - name: Run otel_spans normalizer tests + run: python3 skills/compare-trace/tests/test_otel_spans.py diff --git a/.gitignore b/.gitignore index 34092a97..402f58ce 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,9 @@ temp/ __pycache__/ *.pyc .work/ + +# Local install artifacts from running plugins/codex/scripts/install.sh +# against this repo. Intended for downstream target repos, not this one. +.agents/ +.codex/ +plugins/mc-agent-toolkit/ diff --git a/README.md b/README.md index e0042465..b63a6fe6 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,12 @@ Skills are grouped by the job they help you do. Orchestrated workflows sequence | **Storage Cost Analysis** | Identifies storage waste (unread, zombie, dead-end tables); uses lineage to verify cleanup is safe and estimates savings. | [README](skills/storage-cost-analysis/README.md) | | **Performance Diagnosis** | Diagnoses slow pipelines and expensive queries across Airflow, dbt, Databricks, and other platforms. 
| [README](skills/performance-diagnosis/README.md) | +### Evaluate — compare agent runs + +| Skill | Description | Details | +|---|---|---| +| **Compare Trace** | A/B compares two existing agent traces (by ID) — graph path, latency/tokens, tool-call sequence, plus LLM-based semantic and entity-overlap diffs over the final answers. Emits an HTML report. | [README](skills/compare-trace/README.md) | + ### Setup — ingestion and connections | Skill | Description | Details | diff --git a/plugins/claude-code/evals/compare-trace/trigger-evals.json b/plugins/claude-code/evals/compare-trace/trigger-evals.json new file mode 100644 index 00000000..3e975edd --- /dev/null +++ b/plugins/claude-code/evals/compare-trace/trigger-evals.json @@ -0,0 +1,126 @@ +{ + "skill": "monte-carlo-compare-trace", + "description": "Trigger accuracy evals for the monte-carlo-compare-trace skill. Each case specifies whether the skill SHOULD or SHOULD NOT be triggered by the given prompt.", + "cases": [ + { + "id": "should-01", + "prompt": "Compare these two agent conversations: 019e8f2a-24ae-7880-8901-cbc79aca43ed and 019e9319-e88e-7080-bc78-2aff46543849", + "expected": "trigger", + "rationale": "Direct A/B compare with two conversation IDs -- core skill use case" + }, + { + "id": "should-02", + "prompt": "Diff these two agent runs and tell me what changed.", + "expected": "trigger", + "rationale": "Explicit 'diff two agent runs' phrasing -- listed trigger" + }, + { + "id": "should-03", + "prompt": "I tweaked the system prompt for the coverage agent and re-ran it on the same conversation. Did the change cause a regression? 
Here are the two IDs.", + "expected": "trigger", + "rationale": "Prompt change regression check between two runs -- explicit use case" + }, + { + "id": "should-04", + "prompt": "/compare-trace 019e8f2a-24ae-7880-8901-cbc79aca43ed 019e9319-e88e-7080-bc78-2aff46543849", + "expected": "trigger", + "rationale": "Explicit slash-command invocation -- skill's own command" + }, + { + "id": "should-05", + "prompt": "Here are two conversation IDs from the chat agent — show me how the tool sequences differ.", + "expected": "trigger", + "rationale": "Tool-sequence diff between two conversations -- tool_call_diff evaluator territory" + }, + { + "id": "should-06", + "prompt": "We swapped the agent model from claude-3-5 to claude-sonnet-4 and re-ran a fixed scenario. Compare the two runs.", + "expected": "trigger", + "rationale": "Model-swap A/B with two runs to compare -- explicit use case" + }, + { + "id": "should-07", + "prompt": "Compare the OTel traces from these two agent runs and produce a side-by-side report.", + "expected": "trigger", + "rationale": "Trace-level comparison with HTML-report intent -- matches skill output" + }, + { + "id": "should-08", + "prompt": "I have two agent traces I want to look at side-by-side. Conversation IDs are X and Y.", + "expected": "trigger", + "rationale": "Side-by-side trace comparison with both IDs supplied" + }, + { + "id": "should-09", + "prompt": "Show me the difference in graph path between baseline and candidate runs of the coverage agent.", + "expected": "trigger", + "rationale": "Graph-path diff between two runs -- graph_path_diff evaluator territory" + }, + { + "id": "should-10", + "prompt": "Did removing the get_use_cases tool change how the coverage agent handles 'what's my coverage gap?' 
Compare a before and after run.", + "expected": "trigger", + "rationale": "Tool-loadout change A/B between two runs -- explicit use case" + }, + { + "id": "should-11", + "prompt": "Compare the latency and token usage between these two agent conversations.", + "expected": "trigger", + "rationale": "Latency/token diff between two runs -- latency_diff evaluator territory" + }, + { + "id": "should-not-01", + "prompt": "What went wrong with this agent run? Here's the conversation ID.", + "expected": "no-trigger", + "rationale": "Single-trace troubleshooting -- not a comparison; routes to analyze-root-cause / incident-response" + }, + { + "id": "should-not-02", + "prompt": "Investigate why this trace failed. The conversation ID is 019e8f2a-24ae-7880-8901-cbc79aca43ed.", + "expected": "no-trigger", + "rationale": "Single-trace failure investigation -- not an A/B comparison" + }, + { + "id": "should-not-03", + "prompt": "Compare row counts between our staging and production orders tables.", + "expected": "no-trigger", + "rationale": "Cross-table data comparison -- routes to monitoring-advisor (comparison monitor); not agent A/B" + }, + { + "id": "should-not-04", + "prompt": "How does my chat agent perform overall? 
Show me aggregate metrics.", + "expected": "no-trigger", + "rationale": "Aggregate performance question with no two specific conversation IDs to compare" + }, + { + "id": "should-not-05", + "prompt": "Set up an evaluation monitor for my chat agent to track response quality over time.", + "expected": "no-trigger", + "rationale": "Agent eval monitor creation -- routes to monitoring-advisor" + }, + { + "id": "should-not-06", + "prompt": "Diff these two SQL queries and tell me which one is more efficient.", + "expected": "no-trigger", + "rationale": "SQL comparison -- wrong domain (not agent traces)" + }, + { + "id": "should-not-07", + "prompt": "Show me the trace for conversation 019e8f2a-24ae-7880-8901-cbc79aca43ed.", + "expected": "no-trigger", + "rationale": "Single-trace inspection -- not a comparison" + }, + { + "id": "should-not-08", + "prompt": "Help me build a prompt eval framework for my LangGraph agent.", + "expected": "no-trigger", + "rationale": "Generic eval-framework engineering -- not an A/B compare on existing runs" + }, + { + "id": "should-not-09", + "prompt": "Compare these two dbt models and tell me which one has more downstream tables.", + "expected": "no-trigger", + "rationale": "dbt model comparison -- wrong domain" + } + ] +} diff --git a/plugins/claude-code/skills/compare-trace b/plugins/claude-code/skills/compare-trace new file mode 120000 index 00000000..51b49682 --- /dev/null +++ b/plugins/claude-code/skills/compare-trace @@ -0,0 +1 @@ +../../../skills/compare-trace \ No newline at end of file diff --git a/plugins/codex/skills/compare-trace b/plugins/codex/skills/compare-trace new file mode 120000 index 00000000..51b49682 --- /dev/null +++ b/plugins/codex/skills/compare-trace @@ -0,0 +1 @@ +../../../skills/compare-trace \ No newline at end of file diff --git a/plugins/copilot/skills/compare-trace b/plugins/copilot/skills/compare-trace new file mode 120000 index 00000000..51b49682 --- /dev/null +++ b/plugins/copilot/skills/compare-trace @@ 
-0,0 +1 @@ +../../../skills/compare-trace \ No newline at end of file diff --git a/plugins/cursor/skills/compare-trace b/plugins/cursor/skills/compare-trace new file mode 120000 index 00000000..51b49682 --- /dev/null +++ b/plugins/cursor/skills/compare-trace @@ -0,0 +1 @@ +../../../skills/compare-trace \ No newline at end of file diff --git a/plugins/opencode/skills/compare-trace b/plugins/opencode/skills/compare-trace new file mode 120000 index 00000000..51b49682 --- /dev/null +++ b/plugins/opencode/skills/compare-trace @@ -0,0 +1 @@ +../../../skills/compare-trace \ No newline at end of file diff --git a/skills/README.md b/skills/README.md index fe27ed35..b082bcec 100644 --- a/skills/README.md +++ b/skills/README.md @@ -22,6 +22,7 @@ Skills are platform-agnostic instruction sets that tell an AI coding agent what | **[Tune Monitor](tune-monitor/)** | Analyzes a Monte Carlo metric monitor's alert history and recommends configuration changes to reduce noise — sensitivity, WHERE conditions, segment exclusions, schedule, and aggregation. | | **[Connection Auth Rules](connection-auth-rules/)** | Build a Connection Auth Rules configuration for a Monte Carlo connection type. Fetches live connector schemas and transform steps from the apollo-agent repo. | | **[Instrument Agent](instrument-agent/)** | Instruments a Python AI agent for Monte Carlo Agent Observability — detects AI libraries, installs the Monte Carlo OpenTelemetry SDK, sets up tracing, and verifies traces in Monte Carlo. Asks before editing. | +| **[Compare Trace](compare-trace/)** | A/B compares two Monte Carlo agent traces by ID — runs graph-path, latency/token, and tool-call diffs plus LLM-based semantic and entity-overlap evals over the final answers, and opens an HTML report. 
| ## Standalone Installation diff --git a/skills/compare-trace/README.md b/skills/compare-trace/README.md new file mode 100644 index 00000000..7ff7bfaf --- /dev/null +++ b/skills/compare-trace/README.md @@ -0,0 +1,44 @@ +# compare-trace + +A/B compare two Monte Carlo agent conversations by ID and produce an HTML report. + +Trace-driven backport of the [Agent A/B Evaluation Framework](https://github.com/monte-carlo-data/ai-agent/pull/1236) (PR #1236 in `ai-agent`). The original ran the agent itself against fixed scenarios; this skill operates on already-captured conversations fetched via the Monte Carlo MCP server. + +## Invocation + +``` +/compare-trace <conv_id_a> <conv_id_b> +``` + +Optional flags: `--mcon`, `--agent`, `--trace-ids a,b` (force specific OTel trace_ids when a conversation has multiple), `--labels A,B`, `--output path.html`. + +## ID model + +`conversation_id` is the user-facing identifier (per the OTel GenAI `gen_ai.conversation.id` semantic convention). It's stored as a span attribute, **not** as the OTel `trace_id`. One conversation can contain multiple OTel traces (retries, fan-outs, multi-turn). + +The skill resolves `conversation_id → trace_id` via `get_agent_conversation`. By default it picks the non-error trace with the most spans (= the "main" execution); override with `--trace-ids` to compare specific sub-traces. 
+ +## Signals + +| Signal | Type | Notes | +|---|---|---| +| Graph Path | deterministic | Jaccard on node sets + LCS/max ordering | +| Latency & Tokens | deterministic | Per-metric ratios; flag if candidate > 1.5x baseline | +| Tool Call Sequence + Args | deterministic | Levenshtein on tool-name sequences; matched calls also get a top-level arg-key diff (added / removed / changed) | +| Semantic Diff | LLM (inline) | Claude runs a prompt over both final-completion texts | +| Entity Overlap | LLM (inline) | Extracts 8 entity types, computes per-type Jaccard | + +The two LLM signals require non-empty `final_output_text` for both sides (pulled from the last completion span in each conversation). Without that, the report ships with only the 3 structural signals. + +## Files + +- `SKILL.md` — full workflow Claude follows +- `scripts/compare_traces.py` — driver that consumes normalized trace JSON + optional LLM-eval JSON and writes HTML +- `scripts/evaluators/{graph_path_diff,latency_diff,tool_call_diff}.py` — pure-Python evaluators ported from PR #1236 +- `references/PR1236_MAPPING.md` — fields-and-signals mapping from PR #1236 to the trace API + +## Known limitations (v0.3) + +- Picks one trace per conversation (the largest non-error one by edge count). Multi-trace conversations (retries, fan-outs) currently get their other traces dropped — pass `--trace-ids` to override. +- Arg-diff matches calls by name + nearest position (greedy). When a tool's count differs between A and B, the surplus calls go unmatched. v0.4 plan: stable-ID fallback using `tool_use_id` when present. +- No "structured fields" diff (the 6th evaluator in PR #1236) — only meaningful when you control the agent's output schema, which we don't from trace-land. 
diff --git a/skills/compare-trace/SKILL.md b/skills/compare-trace/SKILL.md new file mode 100644 index 00000000..5a9fced5 --- /dev/null +++ b/skills/compare-trace/SKILL.md @@ -0,0 +1,326 @@ +--- +name: compare-trace +description: Compare two Monte Carlo agent conversations side-by-side. Walks each conversation to its OTel trace, runs structural diffs (graph path, latency/tokens, tool-call sequence) plus LLM-based semantic and entity-overlap evals, and opens an HTML report. +when_to_use: | + Invoke when the user wants to A/B compare two AI agent runs by conversation ID — e.g. "compare these two agent conversations", "diff these two agent runs", "did my prompt change cause a regression", or `/compare-trace <conv_id_a> <conv_id_b>`. Useful for evaluating prompt changes, graph changes, model swaps, or tool-loadout changes by replaying a fixed scenario and comparing the resulting traces. + + Do NOT invoke for: + - Single-trace inspection or troubleshooting one agent run (use `analyze-root-cause` / `incident-response`). + - Comparing data tables, monitors, or alerts (different domain). + - Generic prompt evaluation without two existing conversation IDs to compare. +bucket: Evaluate +version: 0.4.0 +--- + +# Compare Trace + +A/B compare two existing Monte Carlo agent conversations. Walks each conversation to its OTel trace via MCP, runs deterministic structural evaluators in a helper script, runs LLM-based semantic and entity evaluators inline, and emits an HTML report. + +**Arguments:** $ARGUMENTS + +Parse the arguments: +- **conv_id_a** (required): first positional — the baseline conversation ID (UUIDv7 or whatever ID the MC UI exposes for the run/thread). +- **conv_id_b** (required): second positional — the candidate conversation ID. +- **`--mcon <mcon>`** (optional): trace-table MCON shared by both conversations. If omitted, discover via `get_agent_metadata`. +- **`--agent <name>`** (optional): agent name to disambiguate when multiple agents are configured. 
+- **`--trace-ids a,b`** (optional): if a conversation contains multiple traces (retries, fan-out, multi-turn), pass the specific OTel trace_ids to compare. If omitted, the skill picks the main trace from each conversation (see Phase 2 for the rule). +- **`--labels A,B`** (optional): display labels (default `baseline`, `candidate`). +- **`--output <path>`** (optional): output HTML path. Default `/tmp/compare-trace/_vs_.html`. + +> **Heritage:** This skill is the trace-driven backport of the [Agent A/B Evaluation Framework](https://github.com/monte-carlo-data/ai-agent/pull/1236) — same 5-signal idea, applied to already-captured traces rather than re-running the agent. + +> **ID model:** `conversation_id` is the user-facing identifier (`gen_ai.conversation.id` per the OTel GenAI semantic convention). It's stored as a span attribute, **not** as the OTel `trace_id`. A single conversation can contain multiple OTel traces (retries, parallel branches, multi-turn). This skill takes the conversation_id as primary input, walks to the trace, then compares. + +> **Field naming:** The MCP server returns **snake_case** in JSON responses (`trace_id`, `page_info`, `turn_errors`, `has_next_page`, `node_name`, `parent_span_id`, `is_tool_call`, `has_error`, `total_tokens`, `start_time`, `end_time`, `duration_seconds`). The schema *descriptions* sometimes show camelCase — trust the response, not the description. + +--- + +## Trace sources + +The comparator works on **normalized traces**. Two ingestion paths produce that shape: + +- **MC-stored agent conversations** — the default. Phases 1–3 below walk each conversation via MCP to its OTel trace and assemble the normalized dict. +- **Locally-collected OTel traces** — for A/B-testing changes (prompt, code, model) before they ship, with no production conversation to point at. The skill ships a local OTLP/HTTP receiver and a span-to-normalized converter. 
Run your agent twice, capture spans, normalize, then jump straight to Phase 4. See [`references/local-otel-collection.md`](references/local-otel-collection.md). + +Both paths produce the same normalized trace shape and feed the same Phase 4+ comparator and HTML report. + +--- + +## Setup + +**Prerequisites:** +- **`python3`** for the helper scripts (stdlib only for the default MC-conversation path). +- Monte Carlo MCP server (`monte-carlo-mcp`) configured and authenticated. +- *Local OTel path only:* `opentelemetry-proto` in the Python env that runs the receiver (already a transitive dep of `opentelemetry-sdk`). + +Helper scripts live under `${CLAUDE_PLUGIN_ROOT}/skills/compare-trace/scripts/`: +- `compare_traces.py` — main driver; takes two normalized trace JSON files (+ optional LLM-eval results JSON) and writes the HTML report. +- `evaluators/graph_path_diff.py`, `evaluators/latency_diff.py`, `evaluators/tool_call_diff.py` — deterministic evaluators ported from PR #1236. +- `local_otlp_receiver.py`, `sources/otel_spans.py` — used only by the local-OTel ingestion path. See `references/local-otel-collection.md`. + +--- + +## Workflow + +### Phase 1: Discover MCON and agent_name (skip if `--mcon` was provided) + +Call `get_agent_metadata` with no filters. The response lists agents with `agent_name`, `trace_table_mcon`, and `source_type`. Pick the MCON + `agent_name`: +- If `--agent ` was given, match by `agent_name` exactly. +- Else if exactly one agent is configured, use it. +- Else ask the user which agent the conversations belong to (list `agent_name` options). + +Both conversations must live on the same MCON. If you suspect otherwise (e.g. labels suggest different envs), ask first. 
+ +### Phase 2: Resolve each conversation → OTel trace_id + final completion + +For each conversation_id, call: + +``` +get_agent_conversation( + agent_name=, + trace_table_mcon=, + conversation_id=, + first=100, # the server caps `first` at 100 + start_time="", + end_time="", +) +``` + +The conversation_id is a UUIDv7 — decode the timestamp from the first 48 bits and use a tight window. This drastically cuts response size on big tables. + +**Pagination:** The server caps `first` at 100. Track `page_info.has_next_page`; paginate until exhausted **only if** you need the final completion text (see step 3 below). For trace-id selection alone (step 2), the first page is usually sufficient. + +From the response: + +1. **Collect candidate trace_ids.** From `edges[*].node.trace_id` plus `turn_errors[*].trace_id`. Build two sets: `all_trace_ids` and `error_trace_ids`. +2. **Pick the main trace.** Apply these rules in order: + - If `--trace-ids a,b` was provided, use the matching value (no further filtering). + - Else, drop any `trace_id` in `error_trace_ids`. This is critical — failed retries can appear as full sub-runs with many spans once you paginate, so "most spans" alone is not enough. + - From the remaining trace_ids, pick the one with the most edges in the response. Tie-break by `min(start_time)` (earliest). + - If zero non-error trace_ids remain, abort with `"conversation {conv_id} contains only failed traces"` — don't fabricate a comparison. +3. **Extract `final_output_text` from the picked trace.** This step is what forces full pagination when `page_info.has_next_page` is true. + - Filter `edges` to the chosen `trace_id`. + - Order by `start_time` ascending. + - Walk **from the end** to find the last edge whose `node.completions` is a non-empty JSON-encoded string. 
+ - Parse `node.completions` as JSON — it's a stringified `[{is_end, is_start, message, position, role, tool_calls?}]` array (the conversation API serializes assistant turns as a list of message blocks). + - From that parsed array, find the last entry where `role == "assistant"` AND `message` is a non-empty string AND `message` itself is not just whitespace. (You don't need to inspect `tool_calls` — a message field with substantive text is the signal that the assistant produced a final answer rather than only emitting tool requests.) That `message` value is `final_output_text`. + - If no such entry exists (e.g. trace ended mid-tool-call), set `final_output_text = ""` — the LLM evals will be skipped and the report will note this. +4. **Extract `tool_calls` (with args) for the picked trace.** v0.3 feeds these into the argument-diff evaluator. + - Iterate `edges` for the chosen `trace_id` in `start_time` order. + - For each edge, parse `node.completions` as JSON (same as step 3). + - Within each parsed assistant message, iterate its `tool_calls` array (may be empty or missing). Each tool_call has the LangChain/Bedrock shape `{"name": "", "id": "", "arguments": ""}`. + - For each tool_call, parse `arguments` into a dict. Be defensive: `arguments` is a JSON string in the LangChain/Bedrock shape but may already be a `dict` in Anthropic-native or OpenAI Tool API shapes — `isinstance(arguments, dict) -> use as-is`, `isinstance(arguments, str) -> json.loads(arguments or "{}")`. Empty string and missing keys both mean `{}`. + - Accumulate ordered `[{"name": "", "args": , "id": ""}, ...]`. This is the `tool_calls` list for the normalized JSON. + - If a completion has a different shape (e.g. OpenAI `function.arguments`, raw Anthropic `tool_use.input` block), parse what you can and fall through with `args = {}` for any blob you can't decode. Don't fail the whole comparison on one weird message. 
+ +Cache the conversation response and the picked `trace_id` for Phase 3 — don't refetch. + +### Phase 3: Fetch each trace's structural data (with fallback) + +For each selected `trace_id`, call: + +``` +get_agent_trace( + mcon=, + trace_id=, + trace_start_time=, + trace_end_time=<+1 hour, ISO 8601>, +) +``` + +The response is a flat span list with `node_name`, `parent_span_id`, `child_span_ids`, `start_time`, `end_time`, `duration` (ms), `total_tokens`, `prompt_tokens`, `completion_tokens`, `is_tool_call`, `has_prompts`, `has_completions`, `has_error`. + +**Failure modes and the conversation-edge fallback:** + +If `get_agent_trace` errors with `"Incomplete trace"` (or returns empty despite the conversation having edges for that trace_id), fall back to **reconstructing structural data from the conversation edges** you already have from Phase 2: + +- Filter the cached `edges` to ones with this `trace_id`, ordered by `start_time`. +- `node_path` ← `[e.node.name for e in edges]`. This will be **shallower** than `get_agent_trace` would give you — only LLM and tool spans, no internal workflow/task spans. Note this in the chat report. +- `tool_calls` ← **use the list you already built in Phase 2 step 4** (from assistant completion `tool_calls` blocks). Don't rebuild it from tool-execution spans here — those spans don't carry args. +- `execution_time_seconds` ← `(max(end_time) - min(start_time))` across these edges, in seconds. +- `llm_call_count` ← count where `e.node.prompts` is non-empty and `e.node.completions` is non-empty. +- token counts ← sum of `total_tokens` / `prompt_tokens` / `completion_tokens` where present. +- `has_errors` ← `True` if any edge had a non-null `status` indicating error, else `False`. (The conversation API doesn't expose `has_error` directly.) + +If `get_agent_trace` returns 404 / permission denied / retention expired, stop and report which trace failed. Don't silently treat one side as empty. 
+ +**Normalized trace JSON shape** (write one file per trace under `/tmp/compare-trace/.json`): + +```json +{ + "trace_id": "", + "conversation_id": "", + "label": "baseline|candidate|...", + "source": "trace_api" | "conversation_fallback", + "node_path": ["root_node", "child_a", "child_b", ...], + "tool_calls": [{"name": "", "args": {"k": "v", ...}, "id": ""}, ...], + "execution_time_seconds": 12.34, + "llm_call_count": 5, + "total_tokens": 1234, + "prompt_tokens": 900, + "completion_tokens": 334, + "has_errors": false, + "final_output_text": "" +} +``` + +**Normalization rules when using `get_agent_trace`:** +- `node_path`: sort spans by `start_time` ascending and take `node_name` for each. Skip spans where `node_name` is empty. +- `tool_calls`: **use the list you built in Phase 2 step 4** (from assistant completion `tool_calls` blocks). The trace API's tool-call spans only carry names, not args; the conversation API is where args live, so we always read tool_calls from there regardless of whether structural data came from trace_api or conversation_fallback. +- `execution_time_seconds`: `(max(end_time) - min(start_time))` in seconds across all spans. +- `llm_call_count`: count spans where `has_prompts == true` and `has_completions == true`. +- `total_tokens` / `prompt_tokens` / `completion_tokens`: sum across all spans (skip nulls). +- `has_errors`: any span with `has_error == true`. +- `final_output_text`: copied in from Phase 2. + +### Phase 4: Run LLM-based evaluators inline (only if both `final_output_text` fields are non-empty) + +#### 4a. Semantic diff + +Run this prompt yourself (Claude) with the two final outputs as inputs: + +``` +You are comparing two AI agent outputs for the same scenario. +The BASELINE is the reference version. The CANDIDATE is the variant under evaluation. + +Focus on SUBSTANCE, not wording — two paragraphs saying the same thing in different +words are "preserved." 
+ +BASELINE: + + +CANDIDATE: + + +Respond with exactly this JSON structure (no other text): +{ + "verdict": "preserved" | "regression" | "improvement" | "mixed", + "similarity_score": 0.0-1.0, + "lost_findings": ["", ...], + "added_findings": ["", ...], + "explanation": "<1-2 sentence summary of the semantic diff>" +} + +Rules: +- "preserved" = same core findings, even if phrased differently. +- "regression" = candidate lost important information. +- "improvement" = candidate added valuable information. +- "mixed" = some lost, some added. +- similarity_score: 1.0 = semantically identical, 0.0 = completely different. +- For lost_findings / added_findings, QUOTE the actual phrases (≤100 words each). + Do not paraphrase. +``` + +Save your JSON response into `/tmp/compare-trace/llm_semantic.json`. + +#### 4b. Entity overlap + +Run this extraction prompt twice (once per final output), with text input truncated to ~4000 chars: + +``` +Extract all concrete entities from the text below. Return a JSON object with these keys, +each mapping to a list of strings. Use exact values from the text — do not paraphrase. + +Entity types: +- table_names: fully qualified table/view names (e.g. "db.schema.table") +- column_names: column or field names referenced +- metric_values: numeric values with units (e.g. "45.2%", "1000 rows") +- timestamps: dates, times, or relative time references +- job_pipeline_names: ETL job, DAG, pipeline, model, or workflow names +- pr_commit_refs: PR numbers or commit hashes +- severity_status: status or severity keywords +- monitoring_types: monitoring/anomaly type names + +Omit empty lists. Return ONLY valid JSON, no other text. 
+ +Text: + +``` + +Take both extraction results and compute Jaccard overlap per entity type yourself, then assemble: + +```json +{ + "per_type_jaccard": {"table_names": 0.83, "column_names": 1.0, ...}, + "shared": {"table_names": ["analytics.orders"], ...}, + "baseline_only": {"table_names": ["staging.orders"], ...}, + "candidate_only": {"table_names": ["analytics.orders_v2"], ...}, + "overall_jaccard": 0.71, + "baseline_facts": {...full extraction...}, + "candidate_facts": {...full extraction...} +} +``` + +Lowercase + strip-trailing-punctuation each value before set comparison (normalize like `_normalize` in PR #1236's `fact_overlap.py`). + +Save into `/tmp/compare-trace/llm_entities.json`. + +#### 4c. Corpus narrative (optional, 2-3 sentences) + +A single short narrative summarising the overall verdict. Save to `/tmp/compare-trace/llm_narrative.txt`. The renderer surfaces this above the per-signal tabs. + +**Sanity check before rendering:** If `overall_jaccard` is ~0 and `execution_time_seconds` ratio is >5x, the two conversations are likely **not** the same scenario. Say that explicitly in the narrative — don't let the report read as a clean A/B when the inputs aren't. + +### Phase 5: Render the report + +```bash +python3 ${CLAUDE_PLUGIN_ROOT}/skills/compare-trace/scripts/compare_traces.py \ + --baseline /tmp/compare-trace/.json \ + --candidate /tmp/compare-trace/.json \ + --semantic /tmp/compare-trace/llm_semantic.json \ + --entities /tmp/compare-trace/llm_entities.json \ + --narrative /tmp/compare-trace/llm_narrative.txt \ + --output /tmp/compare-trace/_vs_.html +``` + +The `--semantic`, `--entities`, and `--narrative` flags are all optional — omit them when Phase 4 was skipped. + +The script opens the HTML in the user's default browser (`open` on macOS, `xdg-open` on Linux). On failure, print the file path for manual opening. 
+ +### Phase 6: Report back + +Print a compact summary to chat with: +- The headline number for each signal (graph similarity, tool similarity, latency assessment, semantic verdict, entity overlap). +- The report path. +- For each conversation: which `trace_id` was picked, how many turn_errors traces were skipped, and whether structural data came from `get_agent_trace` or the conversation-edge fallback. +- If the MC webapp URL helps, call `get_mc_webapp_url` (no args) to get the regionalized base URL and include it — but don't fabricate deep-link paths; the conversation-URL schema isn't a documented public contract. + +No walls of raw JSON. + +--- + +## Known limitations (v0.3.1) + +| Limitation | Why | Plan | +|---|---|---| +| **Picks one trace per conversation.** Multi-trace conversations (retries, fan-outs, multi-turn) get the largest non-error trace; the others are reported as skipped. | One-pair comparison is the simplest mental model. | **v0.4 plan:** aggregate latency/tokens across all sub-traces; show retries in a separate tab. | +| **`get_agent_trace` "Incomplete trace" forces a shallower comparison.** When we fall back to conversation edges, `node_path` only covers LLM + tool spans (no internal workflow/task spans). | The conversation API doesn't expose the framework's nested-span hierarchy. | **v0.4 plan:** report `source: "conversation_fallback"` more prominently in the HTML and add a "graph depth" note so users understand the lower node count. | +| **Arg-diff matches calls by name + position only.** PR #1236's `_match_tools_by_proximity` algorithm: for each tool name shared between A and B, greedy-pair calls by closest positional index. Doesn't recover when a tool's arg shape changed *and* its name is the only one shared (e.g. `get_warehouses()` called 5x in A and 3x in B — the extra 2 in A are unmatched). | Position-based greedy matching is what the original framework did. | **v0.4 idea:** add a stable-ID fallback using `tool_use_id` if present. 
| +| **No "structured fields" diff (the 6th evaluator in PR #1236).** | Trace outputs are free-form text, not named-field dicts. | Stays dropped — only meaningful when you control the agent's output schema, which we don't in trace-land. | + +--- + +## Heuristics and edge cases + +- **Empty conversations.** If `get_agent_conversation` returns zero edges for either conv_id, abort with a clear message rather than producing a misleading 0/0 report. Common causes: wrong MCON, retention expired, wrong agent_name. +- **All-failed conversations.** If every distinct `trace_id` in a conversation appears in `turn_errors`, abort — see Phase 2 rule 2 last bullet. +- **Single-span traces.** Graph-path diff and tool-call diff still run, just return trivial results. Don't suppress. +- **Token counts of 0.** If both traces show 0 total tokens (some agents don't report tokens), the latency evaluator's `total_tokens` row is suppressed automatically (rows where both sides are 0 are filtered). +- **Very large traces (>500 spans).** Truncate `node_path` and `tool_calls` to 200 entries each in the report's "full sequences" detail blocks — the diffs themselves run on the full lists. +- **Trace ID format quirks.** `get_agent_conversation` returns dashless 32-char hex `trace_id`s; pass those verbatim to `get_agent_trace`. ClickHouse-backed MCONs accept dashed UUIDs too, but BigQuery-backed ones reject dashes (`non-hexadecimal number found`). +- **`get_agent_conversation` responses routinely exceed the tool-output cap.** A single page of 100 edges is typically multiple MB once prompts and completions are included. The harness will spill the JSON to a file on disk and hand you a path instead of inline content — parse the file. Don't try to pipe the response into `jq` or assume it's in-message. Plan for 2–5 MB per page on busy agents. 
+- **`get_agent_conversation` does not support per-`trace_id` filtering.** When a conversation has multiple traces (main + retries from `turn_errors`), pagination returns interleaved edges across all of them. You'll fetch (and pay for) edges from error traces you'll never use. Filter client-side after the fetch; don't try to scope the API call. +- **Transient 5xx from the upstream GraphQL.** `get_agent_conversation` and `get_agent_trace` occasionally surface a `502` from `monolith-frontend` mid-pagination. Retry once with ~2–3 s backoff before aborting; a single retry resolves the vast majority of these. If a second attempt fails, stop and surface the error — don't loop. +- **Not actually an A/B?** Per Phase 4c, watch for `overall_jaccard ≈ 0` paired with a `>5x` exec-time ratio — those usually mean the user picked two unrelated runs by mistake. Call it out in the narrative; the report stat tiles alone won't. + +--- + +## Acceptance — what "done" looks like for a single invocation + +You've succeeded when: +1. Both `get_agent_conversation` calls returned edges; you selected a non-error main trace for each. +2. Both traces have structural data — either from `get_agent_trace` or the conversation-edge fallback (and you noted which). +3. The HTML report wrote to disk. +4. The LLM evaluator sections rendered (or are clearly absent with a "skipped: no completion text" note, not silently empty). +5. The browser opened the report (or you printed the file:// URL for manual opening). +6. The chat reply lists each signal's headline number, the report path, the picked trace_id per conversation, and the source (trace_api / conversation_fallback) per side. If the inputs look like a mis-paired comparison (per the Phase 4c sanity check), the chat reply says so prominently. 
diff --git a/skills/compare-trace/references/PR1236_MAPPING.md b/skills/compare-trace/references/PR1236_MAPPING.md new file mode 100644 index 00000000..befb87f0 --- /dev/null +++ b/skills/compare-trace/references/PR1236_MAPPING.md @@ -0,0 +1,40 @@ +# Mapping from PR #1236 to the MC trace API + +PR: `monte-carlo-data/ai-agent#1236` ("Agents regression - TSA ready for now"). + +The PR builds an A/B framework that **runs the agent** under two branches and saves +`ScenarioOutput` snapshots locally, then runs 6 evaluators + an LLM summarizer + an +HTML renderer over the snapshot pairs. This skill backports the comparison half — +it operates on **already-captured traces** fetched via `get_agent_trace` and +`get_agent_conversation`, not on a fresh agent run. + +## ScenarioOutput → normalized trace JSON + +| PR field | Source in the trace API | Notes | +|---|---|---| +| `final_output` (dict) | Last `completions` string from `get_agent_conversation` filtered to the trace | Free text in v0.1 (not a named-field dict). | +| `node_path: list[str]` | `nodeName` of every span in `get_agent_trace`, sorted by `startTime` | Direct fit. | +| `tool_calls: list[{name,args}]` | Spans with `isToolCall == true`, take `nodeName` for `name`; `args` left empty | v0.2 will parse args from completion `tool_calls` JSON. | +| `execution_time_seconds` | `(max(endTime) - min(startTime))` across spans | Or sum root-span `duration / 1000`. | +| `llm_call_count` | Count of spans with `hasPrompts && hasCompletions` | Closer match than the PR's "AI-typed message count". | +| `total_tokens` | Sum of `totalTokens` across all spans | PR's runner left this at 0; we actually populate it. | +| `status` / `error` | Any `hasError == true` span → `has_errors: true` | Coarser than the PR's per-scenario try/except, but the agent already ran. 
| + +## Evaluator parity + +| PR evaluator | Trace-API parity | Status | +|---|---|---| +| `graph_path_diff` | Jaccard + LCS on `node_path` | ✅ identical implementation | +| `latency_diff` | Same 4 metrics (`execution_time_seconds`, `llm_call_count`, `total_tokens`, `tool_call_count`) | ✅ identical | +| `tool_call_diff` (names) | Levenshtein on tool-name sequences | ✅ identical | +| `tool_call_diff` (arguments) | Phase 2 walks `get_agent_conversation` and parses `tool_calls` blocks from each assistant completion (LangChain/Bedrock `arguments` JSON-string shape verified empirically; OpenAI / Anthropic shapes parsed best-effort). `_match_tools_by_proximity` + `_compare_args` ported verbatim from PR #1236. | ✅ shipped in v0.3.0 | +| `semantic_diff` | LLM prompt over two free-text completions instead of named fields | ⚠️ adapted — Claude runs the prompt inline, no per-field scoring | +| `fact_overlap` | LLM extraction over two free-text completions instead of named fields | ⚠️ adapted — same prompt, just unified text | +| `structured_field_diff` | Requires a named-field output schema from the agent | ❌ dropped — not meaningful for arbitrary traces | + +## Things deliberately NOT backported + +- **`capture` + `run_scenario`** — the PR's runner uses `graph.astream` and `_ScenarioLogCapture`. We skip the whole capture half because the user is comparing existing traces. +- **`AgentAdapter` protocol** — agent-specific, only useful when re-running. +- **Per-scenario corpus reporting** — we compare a single pair, not N scenarios. The HTML strips down to one card. +- **`get_fast_smart_llm` dependency** — the LLM evals run inline as Claude prompts (Phase 4 in SKILL.md), no Python LLM call. 
diff --git a/skills/compare-trace/references/local-otel-collection.md b/skills/compare-trace/references/local-otel-collection.md new file mode 100644 index 00000000..7d9a38a3 --- /dev/null +++ b/skills/compare-trace/references/local-otel-collection.md @@ -0,0 +1,237 @@ +# Local OTel collection + +Alternative ingestion source for the compare-trace skill. Instead of pulling +two traces from Monte Carlo's stored agent conversations, collect them +locally from any OTel-instrumented agent and feed them into the same +comparator and HTML report. + +**When to reach for this:** A/B-testing a change (prompt, code, model) before +it ships, so there's no production conversation to point at. Run the agent +twice locally — once with the baseline, once with the candidate — capture +spans, compare. + +The main workflow in `SKILL.md` (Phases 1–6) still applies from Phase 4 +onward; this doc replaces Phases 1–3 (MC conversation walking) with three +local steps. + +--- + +## Pipeline overview + +``` +your agent process ──OTLP/HTTP──▶ local_otlp_receiver.py ──▶ *.jsonl + │ + sources/otel_spans.py ▼ + *.normalized.json + │ + compare_traces.py ▼ + report.html +``` + +Three scripts, all under `${CLAUDE_PLUGIN_ROOT}/skills/compare-trace/scripts/`: + +- `local_otlp_receiver.py` — receiver. Accepts OTLP/HTTP protobuf, writes raw + spans as JSON-lines. +- `sources/otel_spans.py` — normalizer. Converts raw spans into the trace + shape the comparator consumes. +- `compare_traces.py` — main driver (shared with the MC-conversation path). + +--- + +## Step 1: Start the receiver + +The receiver requires `opentelemetry-proto` (already a transitive dep of +`opentelemetry-sdk`, so any venv that runs an OTel-instrumented agent +already has it). + +```bash +python3 skills/compare-trace/scripts/local_otlp_receiver.py \ + --output /tmp/run-a.jsonl \ + --port 4318 +``` + +Stays in the foreground; send `SIGINT` (Ctrl+C) when the agent run completes +to stop and flush. 
Each POST is appended to the JSONL — appending multiple agent runs +to one file is fine if you want to accumulate; one-file-per-run is +cleaner for diffing. + +If port 4318 is busy, either pass `--port <other-port>` or kill the stale process — +`lsof -i :4318` shows the holder. (The script does not bind-retry; it +exits on `EADDRINUSE`.) + +--- + +## Step 2: Configure your agent's OTel exporter + +The receiver speaks OTLP/HTTP at `/v1/traces` on the bound port. Any agent +using the OpenTelemetry SDK can point at it with one env var: + +```bash +export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://127.0.0.1:4318/v1/traces +``` + +(The SDK appends `/v1/traces` automatically when you set +`OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318` instead — either form +works.) + +If your agent uses a wrapper that takes a base URL (e.g. ai-agent's +`MC_OTEL_ENDPOINT`), set that to `http://127.0.0.1:4318` and let the wrapper +append the path. + +**Force-flush before the process exits.** `BatchSpanProcessor` buffers — if +the agent exits abruptly, the last few spans never reach the receiver. Call +`tracer_provider.shutdown()` (or `force_flush()`) at the end of your run +script. See the ai-agent example in the appendix. + +--- + +## Step 3: Normalize each run + +```bash +python3 skills/compare-trace/scripts/sources/otel_spans.py \ + /tmp/run-a.jsonl \ + --output /tmp/run-a.normalized.json +``` + +The normalizer reads the JSONL, picks the dominant `trace_id` (handles stray +spans landing in the same file), and produces the dict shape the comparator +expects: + +```json +{ + "trace_id": "...", + "node_path": ["initialization", "react_agent", ...], + "tool_calls": [{"name": "...", "args": {...}, "id": "..."}, ...], + "execution_time_seconds": ..., + "llm_call_count": ..., + "total_tokens": ..., + "tool_call_count": ..., + "final_output_text": "..." +} +``` + +Run it once per JSONL.
+ +--- + +## Step 4: Compare + +Hand the two normalized files to the main driver (same call as the +MC-conversation path): + +```bash +python3 skills/compare-trace/scripts/compare_traces.py \ + --baseline /tmp/run-a.normalized.json \ + --candidate /tmp/run-b.normalized.json \ + --output /tmp/report.html +``` + +For the optional LLM-based evaluators (`semantic_diff`, `entity_overlap`), +follow Phase 4 of `SKILL.md` — the inputs are the same. + +--- + +## Dialect coverage + +The normalizer reads three families of attributes, all derived from +conventions rather than ai-agent-specific code paths: + +- **OTel GenAI semantic conventions** — + `gen_ai.prompt.*`, `gen_ai.completion.*`, `gen_ai.usage.*`. Emitted by + Traceloop's `LangchainInstrumentor`, OpenInference, and the official + OpenTelemetry GenAI instrumentations. +- **LangGraph node spans** — `<node>.task` for each node call, `<graph>.workflow` + for the compiled-graph root, `<tool>.tool` for tool executions. Emitted by + Traceloop when LangGraph is in use. +- **Tool-call attributes** — `gen_ai.{completion,prompt}.<i>.tool_calls.<j>.{name,arguments,id}`, + where `<i>` is the message index and `<j>` is the tool-call index within that message. + The normalizer dedupes by `id` because Traceloop's Bedrock instrumentor + only emits these under `prompt.*` on the *next* LLM call (where the call + shows up as part of the message history), not on the completion that + produced them. Other instrumentations emit them on completions; both + forms are merged. + +If your agent doesn't use LangGraph or Traceloop, you'll get partial +results — typically `node_path` will be empty and `tool_calls` may be +missing args. Pointing the normalizer at a different dialect is a contained +edit (it's ~200 lines); add a second module under `scripts/sources/` keyed +to whatever attribute conventions your stack uses, and feed its output into +`compare_traces.py` the same way. + +--- + +## Appendix: ai-agent integration example + +Concrete glue for driving ai-agent's `coverage_agent` locally with OTel +pointed at the receiver.
The pattern transfers to other ai-agent graphs +(chat, tsa, performance) — swap the `invoke_*` import. + +**Where this code should live:** in ai-agent (e.g. +`tests/scripts/coverage_repl_with_otel.py`), not in the skill. It's +ai-agent-specific glue. + +```python +import os, sys, asyncio + +# 1) Env bootstrap (mirrors ai-agent's .envrc dev profile). +os.environ.setdefault("AWS_PROFILE", "dev") +os.environ.setdefault("ENV", "local") +os.environ.setdefault("AUTH_MODE", "none") +os.environ.setdefault("MONOLITH_URL", "https://cli.dev.mcinfra.io") +os.environ.setdefault("MC_USER_ID", "d930a36b-ee0b-4200-9f7b-fcc62cbbd645") +os.environ.setdefault("LANGSMITH_TRACING", "false") +os.environ["MC_OTEL_ENDPOINT"] = "http://127.0.0.1:4318" +# IMPORTANT: do NOT set MCP_SERVER_URL. With it unset and no signing key +# resolved, get_mcp_tools() falls through to load_mcp_tools_in_process, +# which wraps the local mcp_server package and avoids the lambda-URL 403. + +# 2) OTel setup before importing the graph. +from opentelemetry import trace as otel_trace +from opentelemetry.instrumentation.langchain import LangchainInstrumentor +from opentelemetry.sdk.trace import TracerProvider +from ai_agent.shared.observability import setup_otel_tracing + +setup_otel_tracing( + agent_name="coverage_agent", + instrumentors=[LangchainInstrumentor()], +) + +# 3) Optional: monkeypatch a system prompt for A/B testing. Works because +# nodes/initialization.py reads coverage_system_prompt at run time, not +# at module-load time. Won't work for prompts that get baked in at +# graph-compile time — those need a real file edit (or git worktree). +from ai_agent.coverage_agent import prompts as _p +_p.coverage_system_prompt = "" + +# 4) Invoke. tests_evals/.../coverage_agent/conftest.py provides +# invoke_coverage_agent() which compiles the graph with MemorySaver and +# returns a structured result. 
+sys.path.insert(0, "/path/to/ai-agent") +from tests_evals.ai_agent.coverage_agent.conftest import invoke_coverage_agent + +async def run(): + result = await invoke_coverage_agent( + user_message="what is my coverage gap?", + sql_permission="ALLOW_SESSION", + ) + print(result.output[:500]) + # 5) Force-flush spans before exit. + provider = otel_trace.get_tracer_provider() + if isinstance(provider, TracerProvider): + provider.shutdown() + +asyncio.run(run()) +``` + +**Known gotchas:** + +- `load_mcp_tools_in_process()` imports `mcp_server.beacon` which requires + `slowapi`. If it's missing from the ai-agent venv, MCP tool loading + silently returns `[]` (the loader has a bare `except`). Install it with + `uv pip install slowapi` (pulls in `limits` and `deprecated`). +- The `tests_evals` coverage conftest compiles the graph at import time. If + you need to A/B-test changes that affect graph structure (not just prompt + text), use a git worktree per branch, not in-process monkeypatching. +- The `MCP_SERVER_URL` unset trick depends on + `ai_agent.shared.mcp.tools.get_mcp_tools` checking `is_local_mcp_available()` + before falling back to the HTTP path. If that branch is ever removed, + local runs will need an explicit override. diff --git a/skills/compare-trace/scripts/compare_traces.py b/skills/compare-trace/scripts/compare_traces.py new file mode 100755 index 00000000..da7476be --- /dev/null +++ b/skills/compare-trace/scripts/compare_traces.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python3 +"""Driver for the compare-trace skill. + +Takes two normalized trace JSON files (baseline + candidate), runs the three +deterministic evaluators, optionally folds in LLM-eval results that Claude +ran inline, and writes a single-pair HTML report. Opens the report in the +default browser unless ``--no-open`` is passed. 
+ +Normalized trace JSON shape:: + + { + "trace_id": "", + "label": "baseline|candidate|...", + "node_path": ["node_a", "node_b", ...], + "tool_calls": [{"name": "", "args": {}}, ...], + "execution_time_seconds": 12.34, + "llm_call_count": 5, + "total_tokens": 1234, + "prompt_tokens": 900, + "completion_tokens": 334, + "has_errors": false, + "final_output_text": "" + } + +LLM-eval JSON shapes are documented in SKILL.md (Phase 4). +""" + +from __future__ import annotations + +import argparse +import html +import json +import platform +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +SCRIPT_DIR = Path(__file__).resolve().parent +sys.path.insert(0, str(SCRIPT_DIR)) + +from evaluators.graph_path_diff import compare_graph_paths # noqa: E402 +from evaluators.latency_diff import compare_latency # noqa: E402 +from evaluators.tool_call_diff import compare_tool_calls # noqa: E402 + + +# --------------------------------------------------------------------------- +# I/O +# --------------------------------------------------------------------------- + + +def _load_json(path: Path) -> dict[str, Any]: + with path.open() as f: + return json.load(f) + + +def _open_in_browser(path: Path) -> None: + url = f"file://{path.resolve()}" + system = platform.system() + try: + if system == "Darwin": + subprocess.run(["open", url], check=False) + elif system == "Linux": + subprocess.run(["xdg-open", url], check=False) + elif system == "Windows": + subprocess.run(["start", url], shell=True, check=False) + except FileNotFoundError: + # Browser command not available; caller prints the path instead. 
+ pass + + +# --------------------------------------------------------------------------- +# Tab renderers (single-pair flavor, ported from PR #1236's html_renderer.py) +# --------------------------------------------------------------------------- + + +def _render_graph_tab(g) -> str: + rows = "" + if g.baseline_only_nodes: + rows += ( + f'Baseline only' + f'{html.escape(", ".join(g.baseline_only_nodes))}' + ) + if g.candidate_only_nodes: + rows += ( + f'Candidate only' + f'{html.escape(", ".join(g.candidate_only_nodes))}' + ) + if g.shared_nodes: + rows += ( + f'Shared' + f'{html.escape(", ".join(g.shared_nodes))}' + ) + return f""" +
+ Jaccard (node set): {g.jaccard_similarity:.2f} + Ordering (LCS): {g.ordering_similarity:.2f} + Overall: {g.overall_similarity:.2f} +
+ + {rows or ""}
CategoryNodes
Identical paths
+
Full paths +
+

Baseline ({len(g.baseline_path)} nodes)

{html.escape(chr(10).join(g.baseline_path[:200]) or "(empty)")}
+

Candidate ({len(g.candidate_path)} nodes)

{html.escape(chr(10).join(g.candidate_path[:200]) or "(empty)")}
+
+
""" + + +def _render_latency_tab(lat) -> str: + rows = "" + for m in lat.metrics: + if m.baseline_value == 0 and m.candidate_value == 0: + continue + css = "field-changed" if m.is_regression else "field-unchanged" + ratio_str = f"{m.ratio:.2f}x" if m.ratio != float("inf") else "inf" + badge = ( + 'regressed' + if m.is_regression + else "" + ) + rows += f""" + + {html.escape(m.field_name)} + {m.baseline_value:.1f} + {m.candidate_value:.1f} + {ratio_str} + {badge} + """ + + assessment_css = {"regressed": "red", "improved": "green"}.get( + lat.overall_assessment, "" + ) + return f""" +
+ Assessment: {lat.overall_assessment.upper()} +
+ + + {rows or ""} +
MetricBaselineCandidateRatio
No metrics
""" + + +_VALUE_TRUNCATE = 80 + + +def _format_arg_value(value: Any) -> tuple[str, str]: + """Return (truncated_repr, full_repr) for an arg value. + + Both reprs are JSON-encoded for dict/list values so the rendering is + stable across types. Truncated is 80 chars with an ellipsis suffix. + """ + if isinstance(value, (dict, list)): + full = json.dumps(value, sort_keys=True, default=str) + elif value is None: + full = "null" + elif isinstance(value, bool): + full = "true" if value else "false" + else: + full = str(value) + if len(full) <= _VALUE_TRUNCATE: + return full, full + return full[:_VALUE_TRUNCATE] + "…", full + + +def _render_arg_diff_lines(ac: dict[str, Any]) -> str: + """Build the per-row diff content (inline truncated values + expand toggle).""" + inline_lines: list[str] = [] + full_lines: list[str] = [] + + added_values = ac.get("added_values", {}) or {} + removed_values = ac.get("removed_values", {}) or {} + changed_values = ac.get("changed_values", {}) or {} + + for k in ac.get("added_keys", []) or []: + v = added_values.get(k, "") + trunc, full = _format_arg_value(v) + inline_lines.append( + f'
+ {html.escape(k)}: ' + f'{html.escape(trunc)}
' + ) + full_lines.append(f'+ {k}: {full}') + + for k in ac.get("removed_keys", []) or []: + v = removed_values.get(k, "") + trunc, full = _format_arg_value(v) + inline_lines.append( + f'
- {html.escape(k)}: ' + f'{html.escape(trunc)}
' + ) + full_lines.append(f'- {k}: {full}') + + for k in ac.get("changed_keys", []) or []: + pair = changed_values.get(k, {}) or {} + b_val = pair.get("baseline", "") + c_val = pair.get("candidate", "") + b_trunc, b_full = _format_arg_value(b_val) + c_trunc, c_full = _format_arg_value(c_val) + inline_lines.append( + f'
Δ {html.escape(k)}: ' + f'{html.escape(b_trunc)} → ' + f'{html.escape(c_trunc)}
' + ) + full_lines.append(f'Δ {k}:\n baseline: {b_full}\n candidate: {c_full}') + + inline_html = "".join(inline_lines) + any_truncated = any( + _format_arg_value(v)[0] != _format_arg_value(v)[1] + for v in list(added_values.values()) + list(removed_values.values()) + ) or any( + _format_arg_value(pair.get("baseline", ""))[0] + != _format_arg_value(pair.get("baseline", ""))[1] + or _format_arg_value(pair.get("candidate", ""))[0] + != _format_arg_value(pair.get("candidate", ""))[1] + for pair in changed_values.values() + ) + if any_truncated: + full_block = html.escape("\n".join(full_lines)) + inline_html += ( + '
▸ show full values' + f'
{full_block}
' + "
" + ) + return inline_html + + +def _render_tool_call_tab(t) -> str: + rows = "" + if t.added: + rows += ( + f'+ Added tools' + f'{html.escape(", ".join(t.added))}' + ) + if t.removed: + rows += ( + f'- Removed tools' + f'{html.escape(", ".join(t.removed))}' + ) + if t.shared: + rows += ( + f'Shared tools' + f'{html.escape(", ".join(t.shared))}' + ) + + arg_rows = "" + for ac in t.argument_changes: + diff_html = _render_arg_diff_lines(ac) + pos = f"#{ac.get('position_baseline', '?')} → #{ac.get('position_candidate', '?')}" + arg_rows += ( + f'{html.escape(ac.get("tool_name", ""))}' + f'{pos}' + f'{diff_html}' + ) + + arg_section = ( + '

Argument changes (matched calls)

' + '' + '' + f'{arg_rows}
ToolPositionsDiff
' + if arg_rows + else ( + '
' + "No argument-level changes for matched calls " + "(or tool_calls were captured without args)." + "
" + ) + ) + + return f""" +
+ Edit distance: {t.edit_distance} + Similarity: {t.similarity:.2f} + Baseline: {len(t.baseline_tools)} calls + Candidate: {len(t.candidate_tools)} calls + Arg-diff matches: {len(t.argument_changes)} +
+ + {rows or ""}
ChangeTools
Identical tool sequences
+ {arg_section} +
Full sequences +
+

Baseline

{html.escape(chr(10).join(t.baseline_tools[:200]) or "(empty)")}
+

Candidate

{html.escape(chr(10).join(t.candidate_tools[:200]) or "(empty)")}
+
+
""" + + +def _render_semantic_tab(s: dict | None, has_completions: bool) -> str: + if s is None: + msg = ( + "Skipped — no final-completion text available for one or both traces. " + "Pass --conversation-ids when invoking the skill, or re-run " + "with the conversation IDs from the MC UI." + ) if not has_completions else ( + "Skipped — Claude did not run the inline semantic diff for this comparison." + ) + return f'

{msg}

' + + verdict = s.get("verdict", "unknown") + verdict_css = { + "regression": "red", + "improvement": "green", + "preserved": "green", + "mixed": "orange", + }.get(verdict, "") + + lost = s.get("lost_findings") or [] + added = s.get("added_findings") or [] + + def _bullets(items: list[str], css: str) -> str: + if not items: + return "" + return ( + '
    ' + + "".join( + f'
  • {html.escape(str(i))}
  • ' + for i in items[:10] + ) + + ("
  • " if len(items) > 10 else "") + + "
" + ) + + return f""" +
+ Overall verdict: {verdict.upper()} + Semantic similarity: {float(s.get("similarity_score", 0.0)):.2f} +
+
+ {html.escape(s.get("explanation", "") or "")} +
+
+
Lost in candidate ({len(lost)}){_bullets(lost, "removed")}
+
Added in candidate ({len(added)}){_bullets(added, "added")}
+
""" + + +def _render_entities_tab(f: dict | None, has_completions: bool) -> str: + if f is None: + msg = ( + "Skipped — no final-completion text available for one or both traces." + ) if not has_completions else ( + "Skipped — Claude did not run the inline entity overlap for this comparison." + ) + return f'

{msg}

' + + per_type = f.get("per_type_jaccard", {}) or {} + shared = f.get("shared", {}) or {} + b_only = f.get("baseline_only", {}) or {} + c_only = f.get("candidate_only", {}) or {} + + def _chips(items: list[str], css: str, limit: int = 5) -> str: + if not items: + return "" + tags = " ".join( + f'{html.escape(str(i)[:60])}' for i in items[:limit] + ) + suffix = f" +{len(items) - limit} more" if len(items) > limit else "" + return tags + suffix + + rows = "" + for entity_type in sorted(per_type): + jaccard = float(per_type.get(entity_type, 0.0)) + css = "field-changed" if jaccard < 1.0 else "field-unchanged" + details = "" + s_items = shared.get(entity_type, []) or [] + b_items = b_only.get(entity_type, []) or [] + c_items = c_only.get(entity_type, []) or [] + if s_items: + details += f'
Shared ({len(s_items)}): {_chips(s_items, "entity-shared")}
' + if b_items: + details += f'
Baseline only ({len(b_items)}): {_chips(b_items, "entity-removed")}
' + if c_items: + details += f'
Candidate only ({len(c_items)}): {_chips(c_items, "entity-added")}
' + rows += f""" + + {html.escape(entity_type)} + {jaccard:.2f} + {details or "-"} + """ + + overall = float(f.get("overall_jaccard", 0.0)) + return f""" +
+ Overall entity overlap: {overall:.2f} +
+ + + {rows or ""} +
Entity TypeJaccardDetails
No entities extracted
""" + + +# --------------------------------------------------------------------------- +# Top-level renderer +# --------------------------------------------------------------------------- + + +def render_html( + baseline: dict, + candidate: dict, + graph, + latency, + tools, + semantic: dict | None, + entities: dict | None, + narrative: str, + output_path: Path, +) -> Path: + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") + baseline_label = baseline.get("label") or "baseline" + candidate_label = candidate.get("label") or "candidate" + baseline_id = baseline.get("trace_id", "") + candidate_id = candidate.get("trace_id", "") + + has_completions = bool( + baseline.get("final_output_text") and candidate.get("final_output_text") + ) + + semantic_section = _render_semantic_tab(semantic, has_completions) + graph_section = _render_graph_tab(graph) + latency_section = _render_latency_tab(latency) + entities_section = _render_entities_tab(entities, has_completions) + tools_section = _render_tool_call_tab(tools) + + n_regressed = sum(1 for m in latency.metrics if m.is_regression) + latency_color = "red" if n_regressed > 0 else "green" + avg_semantic = ( + float(semantic.get("similarity_score", 0.0)) if semantic else 0.0 + ) + avg_entities = ( + float(entities.get("overall_jaccard", 0.0)) if entities else 0.0 + ) + + semantic_stat = ( + f'
{avg_semantic:.2f}
' + f'
Semantic Similarity
' + if semantic else "" + ) + entities_stat = ( + f'
{avg_entities:.2f}
' + f'
Entity Overlap
' + if entities else "" + ) + + baseline_raw_pre = html.escape( + (baseline.get("final_output_text") or "(no completion text)")[:8000] + ) + candidate_raw_pre = html.escape( + (candidate.get("final_output_text") or "(no completion text)")[:8000] + ) + + body = f""" +

Trace Comparison

+
+ Generated {timestamp} | {html.escape(baseline_label)} + ({html.escape(baseline_id)}) + vs {html.escape(candidate_label)} + ({html.escape(candidate_id)}) +
+ +
+

Summary

+
+
+
{graph.overall_similarity:.2f}
+
Graph Similarity
+
+
+
{tools.similarity:.2f}
+
Tool Similarity
+
+
+
{n_regressed}
+
Latency Regressed
+
+ {semantic_stat} + {entities_stat} +
+
{html.escape(narrative or "(no narrative provided)")}
+
+ +
+
+
+
+ + + + + +
+
{semantic_section}
+ + + + +
+
+ Final completion text (both traces) +
+

Baseline

{baseline_raw_pre}
+

Candidate

{candidate_raw_pre}
+
+
+
+
+""" + + html_doc = _TEMPLATE.replace("{{BODY}}", body) + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(html_doc) + return output_path + + +_TEMPLATE = """\ + + + + +Monte Carlo Trace Comparison + + + + +{{BODY}} + + +""" + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument("--baseline", required=True, type=Path, help="Baseline trace JSON path") + parser.add_argument("--candidate", required=True, type=Path, help="Candidate trace JSON path") + parser.add_argument("--semantic", type=Path, help="Optional semantic-diff JSON") + parser.add_argument("--entities", type=Path, help="Optional entity-overlap JSON") + parser.add_argument("--narrative", type=Path, help="Optional plaintext corpus narrative") + parser.add_argument("--output", required=True, type=Path, help="HTML output path") + parser.add_argument("--no-open", action="store_true", help="Do not open the report in a browser") + args = parser.parse_args(argv) + + baseline = _load_json(args.baseline) + candidate = _load_json(args.candidate) + + graph = compare_graph_paths( + baseline.get("node_path", []) or [], + candidate.get("node_path", []) or [], + ) + latency = compare_latency(baseline, candidate) + tools = compare_tool_calls( + baseline.get("tool_calls", []) or [], + candidate.get("tool_calls", []) or [], + ) + + semantic = _load_json(args.semantic) if args.semantic and args.semantic.exists() else None + entities = _load_json(args.entities) if args.entities and args.entities.exists() else None + narrative = ( + args.narrative.read_text().strip() + if args.narrative and args.narrative.exists() + else "" + ) + + output = render_html( + baseline=baseline, + candidate=candidate, + graph=graph, + latency=latency, + 
tools=tools, + semantic=semantic, + entities=entities, + narrative=narrative, + output_path=args.output, + ) + print(f"Wrote report: {output}") + if not args.no_open: + _open_in_browser(output) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/compare-trace/scripts/evaluators/__init__.py b/skills/compare-trace/scripts/evaluators/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/skills/compare-trace/scripts/evaluators/graph_path_diff.py b/skills/compare-trace/scripts/evaluators/graph_path_diff.py new file mode 100644 index 00000000..90b16597 --- /dev/null +++ b/skills/compare-trace/scripts/evaluators/graph_path_diff.py @@ -0,0 +1,70 @@ +"""Graph path diff for two agent runs. + +Jaccard on visited node sets + LCS/max for ordering similarity. Neither side +is ground truth. Ported from monte-carlo-data/ai-agent#1236. +""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from typing import Any + + +@dataclass +class GraphPathDiff: + baseline_path: list[str] + candidate_path: list[str] + baseline_only_nodes: list[str] = field(default_factory=list) + candidate_only_nodes: list[str] = field(default_factory=list) + shared_nodes: list[str] = field(default_factory=list) + jaccard_similarity: float = 1.0 + ordering_similarity: float = 1.0 + overall_similarity: float = 1.0 + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +def _lcs_length(a: list[str], b: list[str]) -> int: + m, n = len(a), len(b) + if m == 0 or n == 0: + return 0 + prev = [0] * (n + 1) + curr = [0] * (n + 1) + for i in range(1, m + 1): + for j in range(1, n + 1): + if a[i - 1] == b[j - 1]: + curr[j] = prev[j - 1] + 1 + else: + curr[j] = max(prev[j], curr[j - 1]) + prev, curr = curr, [0] * (n + 1) + return prev[n] + + +def compare_graph_paths( + baseline_path: list[str], + candidate_path: list[str], +) -> GraphPathDiff: + baseline_set = set(baseline_path) + candidate_set = set(candidate_path) + + 
shared = baseline_set & candidate_set + baseline_only = baseline_set - candidate_set + candidate_only = candidate_set - baseline_set + union = baseline_set | candidate_set + + jaccard = len(shared) / len(union) if union else 1.0 + + max_len = max(len(baseline_path), len(candidate_path)) + ordering = _lcs_length(baseline_path, candidate_path) / max_len if max_len else 1.0 + + return GraphPathDiff( + baseline_path=baseline_path, + candidate_path=candidate_path, + baseline_only_nodes=sorted(baseline_only), + candidate_only_nodes=sorted(candidate_only), + shared_nodes=sorted(shared), + jaccard_similarity=jaccard, + ordering_similarity=ordering, + overall_similarity=(jaccard + ordering) / 2.0, + ) diff --git a/skills/compare-trace/scripts/evaluators/latency_diff.py b/skills/compare-trace/scripts/evaluators/latency_diff.py new file mode 100644 index 00000000..5e867a20 --- /dev/null +++ b/skills/compare-trace/scripts/evaluators/latency_diff.py @@ -0,0 +1,98 @@ +"""Latency and resource-usage diff for two agent runs. + +Compares execution time, LLM call count, total tokens, tool call count. +Reports per-metric ratios; flags regressions above ``regression_threshold``. +Ported from monte-carlo-data/ai-agent#1236. 
@dataclass
class MetricComparison:
    """One resource metric measured on both runs.

    ``ratio`` is ``candidate_value / baseline_value``. ``float("inf")`` marks
    a metric that was zero in the baseline but non-zero in the candidate.
    """

    field_name: str
    baseline_value: float
    candidate_value: float
    ratio: float
    is_regression: bool

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict; an infinite ratio is rendered as ``"inf"``."""
        return {
            "field_name": self.field_name,
            "baseline_value": self.baseline_value,
            "candidate_value": self.candidate_value,
            "ratio": self.ratio if self.ratio != float("inf") else "inf",
            "is_regression": self.is_regression,
        }


@dataclass
class LatencyDiffResult:
    """Per-metric comparisons plus a derived overall verdict.

    ``regressions`` and ``overall_assessment`` are always recomputed from
    ``metrics`` in ``__post_init__``; values passed to the constructor for
    those two fields are overwritten.
    """

    metrics: list[MetricComparison]
    overall_assessment: str = "neutral"
    regressions: list[str] = field(default_factory=list)

    def __post_init__(self) -> None:
        self.regressions = [m.field_name for m in self.metrics if m.is_regression]
        if self.regressions:
            # Any single regressed metric dominates the verdict.
            self.overall_assessment = "regressed"
        elif any(m.ratio < 1.0 for m in self.metrics):
            self.overall_assessment = "improved"
        else:
            self.overall_assessment = "neutral"

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict."""
        return {
            "metrics": [m.to_dict() for m in self.metrics],
            "overall_assessment": self.overall_assessment,
            "regressions": self.regressions,
        }


# Metrics compared, in report order.
_METRICS = [
    "execution_time_seconds",
    "llm_call_count",
    "total_tokens",
    "tool_call_count",
]


def _compute_ratio(baseline: float, candidate: float) -> float:
    """Return ``candidate / baseline`` with a defined result for a zero baseline.

    ``0 -> 0`` is treated as unchanged (``1.0``); ``0 -> non-zero`` is
    ``float("inf")``.
    """
    if baseline == 0:
        return 1.0 if candidate == 0 else float("inf")
    return candidate / baseline


def compare_latency(
    baseline: dict,
    candidate: dict,
    regression_threshold: float = 1.5,
) -> LatencyDiffResult:
    """Compare the resource metrics of two normalized trace dicts.

    ``baseline`` and ``candidate`` follow the normalized trace JSON shape.
    Missing or ``None`` metrics count as ``0``.

    ``tool_call_count`` is read from the snapshot when present and derived
    from ``len(tool_calls)`` only when it is absent (or ``None``).

    A metric is flagged as a regression when its candidate/baseline ratio
    is strictly greater than ``regression_threshold``.
    """

    def _extract(snapshot: dict, key: str) -> float:
        if key == "tool_call_count":
            explicit = snapshot.get(key)
            if explicit is not None:
                # Honour the recorded count, as the contract above promises.
                return float(explicit)
            return float(len(snapshot.get("tool_calls") or []))
        return float(snapshot.get(key, 0) or 0)

    comparisons: list[MetricComparison] = []
    for key in _METRICS:
        b = _extract(baseline, key)
        c = _extract(candidate, key)
        ratio = _compute_ratio(b, c)
        comparisons.append(
            MetricComparison(
                field_name=key,
                baseline_value=b,
                candidate_value=c,
                ratio=ratio,
                is_regression=ratio > regression_threshold,
            )
        )
    return LatencyDiffResult(metrics=comparisons)
+ """ + + tool_name: str + position_baseline: int + position_candidate: int + added_keys: list[str] = field(default_factory=list) + removed_keys: list[str] = field(default_factory=list) + changed_keys: list[str] = field(default_factory=list) + added_values: dict[str, Any] = field(default_factory=dict) + removed_values: dict[str, Any] = field(default_factory=dict) + changed_values: dict[str, dict[str, Any]] = field(default_factory=dict) + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +@dataclass +class ToolCallDiff: + baseline_tools: list[str] + candidate_tools: list[str] + added: list[str] = field(default_factory=list) + removed: list[str] = field(default_factory=list) + shared: list[str] = field(default_factory=list) + edit_distance: int = 0 + similarity: float = 1.0 + argument_changes: list[dict[str, Any]] = field(default_factory=list) + + def to_dict(self) -> dict[str, Any]: + return asdict(self) + + +def _levenshtein(a: list[str], b: list[str]) -> int: + m, n = len(a), len(b) + if m < n: + a, b = b, a + m, n = n, m + previous = list(range(n + 1)) + current = [0] * (n + 1) + for i in range(1, m + 1): + current[0] = i + for j in range(1, n + 1): + cost = 0 if a[i - 1] == b[j - 1] else 1 + current[j] = min( + previous[j] + 1, + current[j - 1] + 1, + previous[j - 1] + cost, + ) + previous, current = current, previous + return previous[n] + + +def _match_tools_by_proximity( + baseline_calls: list[dict[str, Any]], + candidate_calls: list[dict[str, Any]], +) -> list[tuple[int, int]]: + """Match tool calls between baseline and candidate by name and position. + + For each tool name that appears in both sequences, greedily pairs calls by + closest positional proximity (baseline index vs candidate index). Each + call is matched at most once. Returns ``(baseline_index, candidate_index)`` + pairs, sorted by baseline index. + + Ported verbatim from monte-carlo-data/ai-agent#1236. 
+ """ + baseline_by_name: dict[str, list[int]] = defaultdict(list) + candidate_by_name: dict[str, list[int]] = defaultdict(list) + + for i, call in enumerate(baseline_calls): + baseline_by_name[call.get("name", "")].append(i) + for j, call in enumerate(candidate_calls): + candidate_by_name[call.get("name", "")].append(j) + + matches: list[tuple[int, int]] = [] + for name in baseline_by_name: + if name not in candidate_by_name: + continue + b_indices = list(baseline_by_name[name]) + c_indices = list(candidate_by_name[name]) + + used_c: set[int] = set() + for bi in b_indices: + best_ci: int | None = None + best_dist: float = float("inf") + for ci in c_indices: + if ci in used_c: + continue + dist = abs(bi - ci) + if dist < best_dist: + best_dist = dist + best_ci = ci + if best_ci is not None: + matches.append((bi, best_ci)) + used_c.add(best_ci) + + matches.sort() + return matches + + +def _compare_args( + tool_name: str, + baseline_idx: int, + candidate_idx: int, + baseline_args: dict[str, Any], + candidate_args: dict[str, Any], +) -> ArgumentChange | None: + """Compare two arg dicts at the top-level key granularity. + + Returns an ``ArgumentChange`` if anything differs, else ``None`` (so the + caller can skip noise). Nested-dict diff is intentionally out of scope — + a key whose value changes (including a nested dict that changed inside) + surfaces as a ``changed_keys`` entry. + + Ported verbatim from monte-carlo-data/ai-agent#1236. 
+ """ + b_keys = set(baseline_args.keys()) + c_keys = set(candidate_args.keys()) + + added_keys = sorted(c_keys - b_keys) + removed_keys = sorted(b_keys - c_keys) + changed_keys = sorted(k for k in b_keys & c_keys if baseline_args[k] != candidate_args[k]) + + if not added_keys and not removed_keys and not changed_keys: + return None + + return ArgumentChange( + tool_name=tool_name, + position_baseline=baseline_idx, + position_candidate=candidate_idx, + added_keys=added_keys, + removed_keys=removed_keys, + changed_keys=changed_keys, + added_values={k: candidate_args[k] for k in added_keys}, + removed_values={k: baseline_args[k] for k in removed_keys}, + changed_values={ + k: {"baseline": baseline_args[k], "candidate": candidate_args[k]} + for k in changed_keys + }, + ) + + +def compare_tool_calls( + baseline_calls: list[dict[str, Any]], + candidate_calls: list[dict[str, Any]], +) -> ToolCallDiff: + """Compare two ordered tool call sequences from baseline and candidate runs. + + Each call is a dict with ``"name"`` (str) and ``"args"`` (dict). If ``args`` + is empty or missing for both sides, argument_changes will be empty too. 
+ """ + baseline_tools = [c.get("name", "") for c in baseline_calls] + candidate_tools = [c.get("name", "") for c in candidate_calls] + + baseline_set = set(baseline_tools) + candidate_set = set(candidate_tools) + + edit_dist = _levenshtein(baseline_tools, candidate_tools) + max_len = max(len(baseline_tools), len(candidate_tools)) + similarity = 1.0 - (edit_dist / max_len) if max_len > 0 else 1.0 + + matches = _match_tools_by_proximity(baseline_calls, candidate_calls) + argument_changes: list[dict[str, Any]] = [] + for bi, ci in matches: + b_call = baseline_calls[bi] + c_call = candidate_calls[ci] + change = _compare_args( + tool_name=b_call.get("name", ""), + baseline_idx=bi, + candidate_idx=ci, + baseline_args=b_call.get("args", {}) or {}, + candidate_args=c_call.get("args", {}) or {}, + ) + if change is not None: + argument_changes.append(change.to_dict()) + + return ToolCallDiff( + baseline_tools=baseline_tools, + candidate_tools=candidate_tools, + added=sorted(candidate_set - baseline_set), + removed=sorted(baseline_set - candidate_set), + shared=sorted(baseline_set & candidate_set), + edit_distance=edit_dist, + similarity=similarity, + argument_changes=argument_changes, + ) diff --git a/skills/compare-trace/scripts/local_otlp_receiver.py b/skills/compare-trace/scripts/local_otlp_receiver.py new file mode 100644 index 00000000..14845816 --- /dev/null +++ b/skills/compare-trace/scripts/local_otlp_receiver.py @@ -0,0 +1,208 @@ +"""Minimal local OTLP/HTTP trace receiver. + +Listens on ``POST /v1/traces`` (protobuf, optionally gzipped) and appends each +received span as one JSON object per line to ``--output``. Spans are recorded +in a flat, framework-neutral shape that downstream converters +(see ``scripts/sources/otel_spans.py``) can normalize for the compare-trace +skill. + +Usage:: + + python local_otlp_receiver.py --output run-a.jsonl --port 4318 + +Send ``SIGINT`` / ``SIGTERM`` to stop. 
# Protobuf enum value -> readable name used in the JSONL output.
_SPAN_KIND_NAMES = {
    trace_pb2.Span.SPAN_KIND_UNSPECIFIED: "UNSPECIFIED",
    trace_pb2.Span.SPAN_KIND_INTERNAL: "INTERNAL",
    trace_pb2.Span.SPAN_KIND_SERVER: "SERVER",
    trace_pb2.Span.SPAN_KIND_CLIENT: "CLIENT",
    trace_pb2.Span.SPAN_KIND_PRODUCER: "PRODUCER",
    trace_pb2.Span.SPAN_KIND_CONSUMER: "CONSUMER",
}

_STATUS_CODE_NAMES = {
    trace_pb2.Status.STATUS_CODE_UNSET: "UNSET",
    trace_pb2.Status.STATUS_CODE_OK: "OK",
    trace_pb2.Status.STATUS_CODE_ERROR: "ERROR",
}


def _anyvalue_to_python(value: common_pb2.AnyValue) -> Any:
    """Convert an OTLP ``AnyValue`` into a JSON-serializable Python value.

    Bytes are hex-encoded; arrays and key/value lists are converted
    recursively. An unset value yields ``None``.
    """
    kind = value.WhichOneof("value")
    if kind == "string_value":
        return value.string_value
    if kind == "bool_value":
        return value.bool_value
    if kind == "int_value":
        return value.int_value
    if kind == "double_value":
        return value.double_value
    if kind == "bytes_value":
        return value.bytes_value.hex()
    if kind == "array_value":
        return [_anyvalue_to_python(v) for v in value.array_value.values]
    if kind == "kvlist_value":
        return {kv.key: _anyvalue_to_python(kv.value) for kv in value.kvlist_value.values}
    return None


def _attrs_to_dict(attrs) -> dict[str, Any]:
    """Flatten an iterable of OTLP ``KeyValue`` entries into a dict."""
    return {kv.key: _anyvalue_to_python(kv.value) for kv in attrs}


def _span_to_dict(
    span: trace_pb2.Span,
    resource_attrs: dict[str, Any],
    scope_name: str,
) -> dict[str, Any]:
    """Render one span in the flat JSON shape written to the output file.

    IDs are hex strings; ``parent_span_id`` is ``None`` when the span has no
    parent. ``duration_ms`` is derived from the start/end timestamps.
    """
    return {
        "trace_id": span.trace_id.hex(),
        "span_id": span.span_id.hex(),
        "parent_span_id": span.parent_span_id.hex() if span.parent_span_id else None,
        "name": span.name,
        "kind": _SPAN_KIND_NAMES.get(span.kind, "UNSPECIFIED"),
        "start_time_unix_nano": int(span.start_time_unix_nano),
        "end_time_unix_nano": int(span.end_time_unix_nano),
        "duration_ms": (span.end_time_unix_nano - span.start_time_unix_nano) / 1_000_000.0,
        "status": {
            "code": _STATUS_CODE_NAMES.get(span.status.code, "UNSET"),
            "message": span.status.message,
        },
        "attributes": _attrs_to_dict(span.attributes),
        "events": [
            {
                "name": ev.name,
                "time_unix_nano": int(ev.time_unix_nano),
                "attributes": _attrs_to_dict(ev.attributes),
            }
            for ev in span.events
        ],
        "resource": resource_attrs,
        "scope": scope_name,
    }


class _Handler(BaseHTTPRequestHandler):
    """Handle ``POST /v1/traces`` and append received spans to a JSONL file.

    State lives on the class (not the instance) because the HTTP server
    creates a new handler object per request; ``main`` sets these attributes
    before the server starts.
    """

    output_path: Path = Path("/dev/null")
    file_lock: threading.Lock = threading.Lock()
    span_count: int = 0

    def log_message(self, fmt: str, *args: Any) -> None:  # noqa: D401
        sys.stderr.write(f"[receiver] {self.address_string()} - {fmt % args}\n")

    def _send_status(self, code: int) -> None:
        """Send a body-less response with the given status code."""
        self.send_response(code)
        self.end_headers()

    def do_POST(self) -> None:
        # Local import: only needed to name ``zlib.error`` below.
        import zlib

        if self.path != "/v1/traces":
            self._send_status(404)
            return

        try:
            length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            length = -1
        if length < 0:
            # A malformed or negative length must not reach rfile.read():
            # read(-1) would block until the client closes the connection.
            sys.stderr.write("[receiver] invalid Content-Length header\n")
            self._send_status(400)
            return

        body = self.rfile.read(length) if length else b""
        if self.headers.get("Content-Encoding", "").lower() == "gzip":
            try:
                body = gzip.decompress(body)
            except (OSError, EOFError, zlib.error) as e:
                # BadGzipFile is an OSError; truncated input raises EOFError
                # and a corrupt deflate stream raises zlib.error.
                sys.stderr.write(f"[receiver] gzip decompress failed: {e}\n")
                self._send_status(400)
                return

        req = trace_service_pb2.ExportTraceServiceRequest()
        try:
            req.ParseFromString(body)
        except Exception as e:  # pragma: no cover
            sys.stderr.write(f"[receiver] proto parse failed: {e}\n")
            self._send_status(400)
            return

        new_spans: list[str] = []
        for rspans in req.resource_spans:
            resource_attrs = _attrs_to_dict(rspans.resource.attributes)
            for sspans in rspans.scope_spans:
                scope_name = sspans.scope.name
                for span in sspans.spans:
                    new_spans.append(
                        json.dumps(_span_to_dict(span, resource_attrs, scope_name))
                    )

        if new_spans:
            cls = type(self)
            # The server is threaded: serialize appends and the counter update.
            with cls.file_lock:
                with cls.output_path.open("a", encoding="utf-8") as fh:
                    fh.write("\n".join(new_spans))
                    fh.write("\n")
                cls.span_count += len(new_spans)
                total = cls.span_count
            sys.stderr.write(f"[receiver] wrote {len(new_spans)} spans (total {total})\n")

        resp = trace_service_pb2.ExportTraceServiceResponse()
        body_out = resp.SerializeToString()
        self.send_response(200)
        self.send_header("Content-Type", "application/x-protobuf")
        self.send_header("Content-Length", str(len(body_out)))
        self.end_headers()
        self.wfile.write(body_out)


def _parse_args() -> argparse.Namespace:
    """Parse the receiver's command-line options."""
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--output", required=True, help="JSONL output file for received spans")
    p.add_argument("--port", type=int, default=4318)
    p.add_argument("--host", default="127.0.0.1")
    return p.parse_args()


def main() -> int:
    """Run the receiver until SIGINT/SIGTERM; return the process exit code."""
    args = _parse_args()
    out_path = Path(args.output).resolve()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.touch()
    _Handler.output_path = out_path
    _Handler.file_lock = threading.Lock()
    _Handler.span_count = 0

    server = ThreadingHTTPServer((args.host, args.port), _Handler)
    sys.stderr.write(f"[receiver] listening on http://{args.host}:{args.port}/v1/traces\n")
    sys.stderr.write(f"[receiver] writing spans to {out_path}\n")

    def _shutdown(signum: int, _frame: Any) -> None:
        sys.stderr.write(f"[receiver] caught signal {signum}, shutting down\n")
        # shutdown() blocks until serve_forever() returns, and the signal
        # handler runs on the thread that is inside serve_forever(), so it
        # has to be called from another thread to avoid a deadlock.
        threading.Thread(target=server.shutdown, daemon=True).start()

    signal.signal(signal.SIGINT, _shutdown)
    signal.signal(signal.SIGTERM, _shutdown)

    try:
        server.serve_forever()
    finally:
        server.server_close()
        sys.stderr.write(f"[receiver] stopped; received {_Handler.span_count} spans total\n")
    return 0
sys.exit(main()) diff --git a/skills/compare-trace/scripts/sources/__init__.py b/skills/compare-trace/scripts/sources/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/skills/compare-trace/scripts/sources/otel_spans.py b/skills/compare-trace/scripts/sources/otel_spans.py new file mode 100644 index 00000000..254fe468 --- /dev/null +++ b/skills/compare-trace/scripts/sources/otel_spans.py @@ -0,0 +1,226 @@ +"""Normalize a JSONL stream of OTLP spans (as written by +``local_otlp_receiver.py``) into the shape the compare-trace skill consumes. + +Span dialect: GenAI semantic conventions (``gen_ai.prompt.*``, ``gen_ai.completion.*``, +``gen_ai.usage.*``) — these are emitted by the Traceloop / openllmetry +``LangchainInstrumentor`` and are stable across LangChain releases. + +LangGraph node spans are detected by the ``*.task`` name suffix that the +instrumentor uses for each node call. The workflow root is detected by the +``*.workflow`` suffix and is also where we read the overall duration from. + +Tool-call extraction reads ``gen_ai.completion.{i}.tool_calls.{j}.{name,arguments,id}`` +attributes when present. If they are absent — either because the run made no +tool calls, or because the instrumentor omitted them — ``tool_calls`` is +returned empty rather than guessed-at. +""" + +from __future__ import annotations + +import argparse +import json +import re +import sys +from collections import defaultdict +from pathlib import Path +from typing import Any + +_TASK_NAME_RE = re.compile(r"^(?P.+)\.task$") +_WORKFLOW_NAME_RE = re.compile(r"^(?P.+)\.workflow$") +_TOOL_SPAN_NAME_RE = re.compile(r"^(?P.+)\.tool$") + +# Attribute key patterns we care about for tool-call extraction. +# Traceloop's Bedrock instrumentor only emits tool_calls under +# ``gen_ai.prompt.*.tool_calls.*`` (where prior assistant tool calls echo +# back as part of the message history); OpenAI / Anthropic native paths +# also emit under ``gen_ai.completion.*.tool_calls.*``. Accept both. 
+_TOOL_CALL_RE = re.compile( + r"^gen_ai\.(?:completion|prompt)\." + r"(?P\d+)\.tool_calls\.(?P\d+)\.(?Pname|arguments|id)$" +) + + +def _parse_jsonl(path: Path) -> list[dict[str, Any]]: + spans: list[dict[str, Any]] = [] + with path.open() as fh: + for line in fh: + line = line.strip() + if not line: + continue + spans.append(json.loads(line)) + return spans + + +def _extract_tool_calls_from_attrs( + attrs: dict[str, Any], +) -> list[dict[str, Any]]: + """Pull tool_calls out of one LLM span's gen_ai.{completion,prompt}.* attrs. + + Returns calls in (msg_idx, tc_idx) order. Callers are expected to + dedupe across spans by tool-call ``id``: the same call typically appears + in the producing completion AND in every subsequent prompt's message + history. + + ``arguments`` is a JSON-encoded string in nearly all cases; some + instrumentations emit a dict instead. Both are handled. + """ + buckets: dict[tuple[int, int], dict[str, Any]] = defaultdict(dict) + for key, value in attrs.items(): + match = _TOOL_CALL_RE.match(key) + if not match: + continue + msg_idx = int(match.group("msg_idx")) + tc_idx = int(match.group("tc_idx")) + buckets[(msg_idx, tc_idx)][match.group("field")] = value + + calls: list[dict[str, Any]] = [] + for (_, _), raw in sorted(buckets.items()): + name = raw.get("name") or "" + args_raw = raw.get("arguments", "") + if isinstance(args_raw, dict): + args = args_raw + elif isinstance(args_raw, str) and args_raw: + try: + args = json.loads(args_raw) + except json.JSONDecodeError: + args = {} + else: + args = {} + calls.append({"name": name, "args": args or {}, "id": raw.get("id", "")}) + return calls + + +def normalize(spans: list[dict[str, Any]]) -> dict[str, Any]: + """Convert raw OTLP spans into a compare-trace normalized trace dict.""" + if not spans: + return { + "trace_id": "", + "node_path": [], + "tool_calls": [], + "execution_time_seconds": 0.0, + "llm_call_count": 0, + "total_tokens": 0, + "tool_call_count": 0, + "final_output_text": "", + } + 
+ # Take the dominant trace_id (handles stray smoke spans landing in the + # same file). All spans in a single agent run share one trace_id. + by_trace: dict[str, list[dict[str, Any]]] = defaultdict(list) + for s in spans: + by_trace[s.get("trace_id", "")].append(s) + trace_id = max(by_trace, key=lambda k: len(by_trace[k])) + run_spans = by_trace[trace_id] + + # LangGraph nodes — ordered by start time, suffix stripped. + node_spans = sorted( + (s for s in run_spans if _TASK_NAME_RE.match(s.get("name", ""))), + key=lambda s: s.get("start_time_unix_nano", 0), + ) + node_path = [_TASK_NAME_RE.match(s["name"]).group("node") for s in node_spans] + + # LLM calls: any span carrying gen_ai.prompt.* counts. + llm_spans = [ + s + for s in run_spans + if any(k.startswith("gen_ai.prompt.") for k in s.get("attributes", {})) + ] + + total_tokens = 0 + final_output_text = "" + + # Dedupe tool calls by id across spans — the same call appears once in + # the producing completion and again in every subsequent prompt history. + tool_calls_by_id: dict[str, dict[str, Any]] = {} + # Anonymous calls (no id) keyed by (name, json-args) to avoid losing + # repeated tool invocations that legitimately differ from each other. + anon_tool_calls: list[dict[str, Any]] = [] + # Order calls by the start_time of the FIRST span that mentioned them. + first_seen_ns: dict[str, int] = {} + + for llm in sorted(llm_spans, key=lambda s: s.get("start_time_unix_nano", 0)): + attrs = llm.get("attributes", {}) + start_ns = llm.get("start_time_unix_nano", 0) + + # Prefer the explicit total_tokens attr if present; else sum the parts. 
+ total = ( + attrs.get("llm.usage.total_tokens") + or attrs.get("gen_ai.usage.total_tokens") + or ( + (attrs.get("gen_ai.usage.prompt_tokens") or 0) + + (attrs.get("gen_ai.usage.completion_tokens") or 0) + ) + ) + try: + total_tokens += int(total) + except (TypeError, ValueError): + pass + + for call in _extract_tool_calls_from_attrs(attrs): + tid = call.get("id") or "" + if tid: + if tid not in tool_calls_by_id: + tool_calls_by_id[tid] = call + first_seen_ns[tid] = start_ns + else: + anon_tool_calls.append(call) + + # Capture the LATEST assistant completion as final_output_text. + # Walk completion indices in order; the highest index is the + # canonical final answer for that LLM call. + for i in range(20): + content = attrs.get(f"gen_ai.completion.{i}.content") + role = attrs.get(f"gen_ai.completion.{i}.role") + if content and role == "assistant" and isinstance(content, str): + final_output_text = content.strip() + + tool_calls = [ + tool_calls_by_id[tid] + for tid in sorted(tool_calls_by_id, key=lambda t: first_seen_ns.get(t, 0)) + ] + tool_calls.extend(anon_tool_calls) + + # Workflow root span carries the wall-clock duration. 
+ workflow_spans = [ + s for s in run_spans if _WORKFLOW_NAME_RE.match(s.get("name", "")) + ] + if workflow_spans: + wf = workflow_spans[0] + execution_time_seconds = ( + wf.get("end_time_unix_nano", 0) - wf.get("start_time_unix_nano", 0) + ) / 1_000_000_000.0 + else: + ends = [s.get("end_time_unix_nano", 0) for s in run_spans] + starts = [s.get("start_time_unix_nano", 0) for s in run_spans] + execution_time_seconds = (max(ends) - min(starts)) / 1_000_000_000.0 + + return { + "trace_id": trace_id, + "node_path": node_path, + "tool_calls": tool_calls, + "execution_time_seconds": round(execution_time_seconds, 3), + "llm_call_count": len(llm_spans), + "total_tokens": total_tokens, + "tool_call_count": len(tool_calls), + "final_output_text": final_output_text, + } + + +def main() -> int: + p = argparse.ArgumentParser(description=__doc__) + p.add_argument("input", help="JSONL of OTLP spans from local_otlp_receiver.py") + p.add_argument("--output", help="Write normalized JSON here (default: stdout)") + args = p.parse_args() + + spans = _parse_jsonl(Path(args.input)) + normalized = normalize(spans) + out = json.dumps(normalized, indent=2) + if args.output: + Path(args.output).write_text(out) + else: + print(out) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skills/compare-trace/tests/fixtures/bedrock_dialect.jsonl b/skills/compare-trace/tests/fixtures/bedrock_dialect.jsonl new file mode 100644 index 00000000..0ebf4517 --- /dev/null +++ b/skills/compare-trace/tests/fixtures/bedrock_dialect.jsonl @@ -0,0 +1,7 @@ +{"trace_id": "trace01", "span_id": "s01_wf", "parent_span_id": null, "name": "MyAgent.workflow", "kind": "INTERNAL", "start_time_unix_nano": 1000000000, "end_time_unix_nano": 11000000000, "duration_ms": 10000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"traceloop.workflow.name": "MyAgent"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace01", "span_id": 
"s02_init", "parent_span_id": "s01_wf", "name": "initialization.task", "kind": "INTERNAL", "start_time_unix_nano": 1500000000, "end_time_unix_nano": 2000000000, "duration_ms": 500.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "initialization"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace01", "span_id": "s03_react","parent_span_id": "s01_wf", "name": "react_agent.task", "kind": "INTERNAL", "start_time_unix_nano": 2000000000, "end_time_unix_nano": 10000000000, "duration_ms": 8000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "react_agent"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace01", "span_id": "s04_chat1","parent_span_id": "s03_react","name": "ChatBedrock.chat", "kind": "CLIENT", "start_time_unix_nano": 3000000000, "end_time_unix_nano": 4000000000, "duration_ms": 1000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "AWS", "gen_ai.request.model": "claude-sonnet-4-6", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "what is my coverage gap?", "gen_ai.prompt.1.role": "system", "gen_ai.prompt.1.content": "You are a Coverage Advisor.", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "Let me check.", "gen_ai.usage.prompt_tokens": 100, "gen_ai.usage.completion_tokens": 20, "llm.usage.total_tokens": 120}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace01", "span_id": "s05_tool", "parent_span_id": "s03_react","name": "get_warehouses.tool", "kind": "INTERNAL", "start_time_unix_nano": 4500000000, "end_time_unix_nano": 5000000000, "duration_ms": 500.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"traceloop.span.kind": "tool"}, "events": [], "resource": {"service.name": 
"myagent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace01", "span_id": "s06_chat2","parent_span_id": "s03_react","name": "ChatBedrock.chat", "kind": "CLIENT", "start_time_unix_nano": 5500000000, "end_time_unix_nano": 6500000000, "duration_ms": 1000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "AWS", "gen_ai.request.model": "claude-sonnet-4-6", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "what is my coverage gap?", "gen_ai.prompt.1.role": "system", "gen_ai.prompt.1.content": "You are a Coverage Advisor.", "gen_ai.prompt.2.role": "assistant", "gen_ai.prompt.2.content": "", "gen_ai.prompt.2.tool_calls.0.id": "tid_1", "gen_ai.prompt.2.tool_calls.0.name": "get_warehouses", "gen_ai.prompt.2.tool_calls.0.arguments": "{}", "gen_ai.prompt.3.role": "tool", "gen_ai.prompt.3.content": "[warehouses...]", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "You have 3 warehouses with use cases.", "gen_ai.usage.prompt_tokens": 200, "gen_ai.usage.completion_tokens": 30, "llm.usage.total_tokens": 230}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace01", "span_id": "s07_route","parent_span_id": "s01_wf", "name": "_route_after_react.task", "kind": "INTERNAL", "start_time_unix_nano": 9500000000, "end_time_unix_nano": 9600000000, "duration_ms": 100.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "_route_after_react"}, "events": [], "resource": {"service.name": "myagent"}, "scope": "traceloop.langchain"} diff --git a/skills/compare-trace/tests/fixtures/completion_dialect.jsonl b/skills/compare-trace/tests/fixtures/completion_dialect.jsonl new file mode 100644 index 00000000..7ee95ec7 --- /dev/null +++ b/skills/compare-trace/tests/fixtures/completion_dialect.jsonl @@ -0,0 +1,4 @@ +{"trace_id": "trace02", "span_id": "t01_wf", "parent_span_id": null, "name": "OtherAgent.workflow", "kind": 
"INTERNAL", "start_time_unix_nano": 1000000000, "end_time_unix_nano": 6000000000, "duration_ms": 5000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"traceloop.workflow.name": "OtherAgent"}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace02", "span_id": "t02_node", "parent_span_id": "t01_wf", "name": "agent_node.task", "kind": "INTERNAL", "start_time_unix_nano": 1500000000, "end_time_unix_nano": 5000000000, "duration_ms": 3500.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"montecarlo.association_properties.langgraph_node": "agent_node"}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace02", "span_id": "t03_chat1","parent_span_id": "t02_node","name": "ChatOpenAI.chat", "kind": "CLIENT", "start_time_unix_nano": 2000000000, "end_time_unix_nano": 3000000000, "duration_ms": 1000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "openai", "gen_ai.request.model": "gpt-4o", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "do the thing", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "", "gen_ai.completion.0.tool_calls.0.id": "call_abc", "gen_ai.completion.0.tool_calls.0.name": "do_thing", "gen_ai.completion.0.tool_calls.0.arguments": "{\"target\":\"foo\",\"force\":true}", "gen_ai.usage.total_tokens": 75}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"} +{"trace_id": "trace02", "span_id": "t04_chat2","parent_span_id": "t02_node","name": "ChatOpenAI.chat", "kind": "CLIENT", "start_time_unix_nano": 3500000000, "end_time_unix_nano": 4500000000, "duration_ms": 1000.0, "status": {"code": "UNSET", "message": ""}, "attributes": {"gen_ai.system": "openai", "gen_ai.request.model": "gpt-4o", "gen_ai.prompt.0.role": "user", "gen_ai.prompt.0.content": "do the thing", "gen_ai.prompt.1.role": "assistant", 
"gen_ai.prompt.1.tool_calls.0.id": "call_abc", "gen_ai.prompt.1.tool_calls.0.name": "do_thing", "gen_ai.prompt.1.tool_calls.0.arguments": "{\"target\":\"foo\",\"force\":true}", "gen_ai.prompt.2.role": "tool", "gen_ai.prompt.2.content": "OK done", "gen_ai.completion.0.role": "assistant", "gen_ai.completion.0.content": "All done — the thing was processed.", "gen_ai.usage.total_tokens": 90}, "events": [], "resource": {"service.name": "otheragent"}, "scope": "traceloop.langchain"} diff --git a/skills/compare-trace/tests/test_otel_spans.py b/skills/compare-trace/tests/test_otel_spans.py new file mode 100644 index 00000000..63bbc354 --- /dev/null +++ b/skills/compare-trace/tests/test_otel_spans.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +""" +Smoke test for ``sources/otel_spans.py`` — feeds each fixture JSONL into the +normalizer and asserts the resulting dict matches the expected shape. + +Two fixtures cover the two tool-call dialects the normalizer has to handle: + +- ``bedrock_dialect.jsonl`` — Traceloop's Bedrock instrumentor emits tool + calls only under ``gen_ai.prompt.*.tool_calls.*`` on the NEXT LLM call + (never on the completion that produced them). Exercises the + prompt-history extraction path. +- ``completion_dialect.jsonl`` — OpenAI / Anthropic-native instrumentations + emit tool calls under ``gen_ai.completion.*.tool_calls.*``. Exercises + the completion-extraction path and the dedup-by-id behavior (the same + call also appears in the next prompt's history; the normalizer must + collapse to one). 
# Run:
#     python3 skills/compare-trace/tests/test_otel_spans.py

import json
import subprocess
import sys
from pathlib import Path

# Only the helpers are exported by a star-import. The test_* functions stay
# reachable by attribute access, but are not re-exported into other modules
# where pytest would collect (and re-run) them a second time.
__all__ = ["run_normalizer", "check", "main"]

TESTS_DIR = Path(__file__).parent
SKILL_ROOT = TESTS_DIR.parent
NORMALIZER = SKILL_ROOT / "scripts" / "sources" / "otel_spans.py"
FIXTURES_DIR = TESTS_DIR / "fixtures"

# Running totals maintained by check() and reported by main().
PASSED = 0
FAILED = 0


def run_normalizer(fixture: str) -> dict:
    """Run sources/otel_spans.py against a fixture and return its parsed JSON.

    Args:
        fixture: File name of a fixture inside ``FIXTURES_DIR``.

    Returns:
        The object decoded from the normalizer's stdout.

    Raises:
        subprocess.CalledProcessError: The normalizer exited non-zero. Its
            captured stderr is echoed to this process's stderr first.
        subprocess.TimeoutExpired: The normalizer ran longer than 30 seconds.
        json.JSONDecodeError: The normalizer's stdout was not valid JSON.
    """
    try:
        result = subprocess.run(
            [sys.executable, str(NORMALIZER), str(FIXTURES_DIR / fixture)],
            capture_output=True,
            text=True,
            check=True,
            timeout=30,
        )
    except subprocess.CalledProcessError as err:
        # The exception message only carries the exit status; without this the
        # captured traceback of the normalizer never reaches the CI log.
        if err.stderr:
            print(err.stderr, file=sys.stderr, end="")
        raise
    return json.loads(result.stdout)


def check(label: str, condition: bool, hint: str = "") -> None:
    """Record one check; raise on failure so pytest sees per-test failures.

    Args:
        label: Short description printed next to PASS / FAIL.
        condition: Outcome of the check.
        hint: Optional detail appended to the FAIL line and to the exception
            message (typically the actual value that was observed).

    Raises:
        AssertionError: ``condition`` is falsy.
    """
    global PASSED, FAILED
    if condition:
        PASSED += 1
        print(f" PASS {label}")
        return
    FAILED += 1
    print(f" FAIL {label}" + (f" -- {hint}" if hint else ""))
    raise AssertionError(label + (f" ({hint})" if hint else ""))


def test_bedrock_dialect() -> None:
    """Bedrock: tool_calls live under gen_ai.prompt.*.tool_calls.* in later spans."""
    print("test_bedrock_dialect:")
    out = run_normalizer("bedrock_dialect.jsonl")

    check("trace_id picked", out["trace_id"] == "trace01", out["trace_id"])
    check(
        "node_path in start-time order",
        out["node_path"] == ["initialization", "react_agent", "_route_after_react"],
        repr(out["node_path"]),
    )
    check("llm_call_count = 2", out["llm_call_count"] == 2, str(out["llm_call_count"]))
    check(
        "total_tokens summed across LLM spans",
        out["total_tokens"] == 350,
        f"expected 350 (120+230), got {out['total_tokens']}",
    )
    check(
        "execution_time_seconds from workflow root",
        out["execution_time_seconds"] == 10.0,
        str(out["execution_time_seconds"]),
    )
    check("tool_call_count = 1", out["tool_call_count"] == 1, str(out["tool_call_count"]))
    check(
        "tool call extracted from prompt.*.tool_calls.*",
        len(out["tool_calls"]) == 1
        and out["tool_calls"][0]["name"] == "get_warehouses"
        and out["tool_calls"][0]["id"] == "tid_1"
        and out["tool_calls"][0]["args"] == {},
        repr(out["tool_calls"]),
    )
    check(
        "final_output_text from last LLM span's completion",
        out["final_output_text"] == "You have 3 warehouses with use cases.",
        repr(out["final_output_text"]),
    )


def test_completion_dialect() -> None:
    """OpenAI/Anthropic: tool_calls under completion.* on the producing span;
    the SAME call echoes back under prompt.* on the next span — must dedup."""
    print("test_completion_dialect:")
    out = run_normalizer("completion_dialect.jsonl")

    check("trace_id picked", out["trace_id"] == "trace02", out["trace_id"])
    check(
        "node_path",
        out["node_path"] == ["agent_node"],
        repr(out["node_path"]),
    )
    check("llm_call_count = 2", out["llm_call_count"] == 2, str(out["llm_call_count"]))
    check(
        "total_tokens uses gen_ai.usage.total_tokens",
        out["total_tokens"] == 165,
        f"expected 165 (75+90), got {out['total_tokens']}",
    )
    check(
        "execution_time_seconds",
        out["execution_time_seconds"] == 5.0,
        str(out["execution_time_seconds"]),
    )
    check(
        "tool_calls deduped by id across completion + prompt-history",
        out["tool_call_count"] == 1,
        f"expected 1 deduped call, got {out['tool_call_count']}: {out['tool_calls']}",
    )
    check(
        "deduped tool call retains parsed args",
        len(out["tool_calls"]) == 1
        and out["tool_calls"][0]["name"] == "do_thing"
        and out["tool_calls"][0]["id"] == "call_abc"
        and out["tool_calls"][0]["args"] == {"target": "foo", "force": True},
        repr(out["tool_calls"]),
    )
    check(
        "final_output_text from last LLM span",
        out["final_output_text"] == "All done — the thing was processed.",
        repr(out["final_output_text"]),
    )


def main() -> None:
    """Standalone runner — invokes each test with its own try/except so we
    get a single PASSED/FAILED summary instead of stopping at the first fail.
    pytest invokes ``test_*`` directly and gets per-test failures via the
    AssertionError raised in ``check()``.

    Any other exception escaping a test (normalizer crash or timeout,
    non-JSON output, a key missing from the result) is counted as one failure
    so the summary is still printed. Exits with status 1 if anything failed.
    """
    global FAILED
    for fn in [test_bedrock_dialect, test_completion_dialect]:
        try:
            fn()
        except AssertionError as e:
            # check() has already counted and printed the failing assertion.
            print(f" (test {fn.__name__} aborted: {e})")
        except Exception as e:
            # Top-level runner boundary: no check() recorded this one, so
            # count it here and move on to the next test.
            FAILED += 1
            print(f" ERROR {fn.__name__}: {type(e).__name__}: {e}")
        print()
    print(f"PASSED: {PASSED}")
    print(f"FAILED: {FAILED}")
    if FAILED:
        sys.exit(1)


if __name__ == "__main__":
    main()