Skip to content

Commit 0e026a1

Browse files
committed
feat: [US-007] - [Add transcript discovery abstraction for non-Claude artifacts]
1 parent 5a4c07d commit 0e026a1

File tree

7 files changed

+125
-16
lines changed

7 files changed

+125
-16
lines changed

.beads/issues.jsonl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,12 +128,13 @@
128128
{"id":"CodeContextBench-pdk","title":"Fix instruction contamination + re-extract all token metrics","status":"closed","priority":1,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-07T16:04:36.872665156Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T16:11:05.467083794Z","closed_at":"2026-02-07T16:11:05.467083794Z","close_reason":"Fixed 6 contaminated files (RepoQA template, 5 LargeRepo CLAUDE.md); re-extracted 197 task_metrics.json fixing 139 inflated costs (8.7K -\u003e 50 actual total). Created scripts/reextract_all_metrics.py."}
129129
{"id":"CodeContextBench-pss","title":"US-006a: Scaffold 3 architectural understanding tasks (Tier A)","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T23:04:39.305991853Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T23:10:55.114709246Z","closed_at":"2026-02-15T23:10:55.114709246Z","close_reason":"US-006a complete: 3 Tier A architectural understanding tasks scaffolded"}
130130
{"id":"CodeContextBench-q27","title":"Add canary guardrails + subscription enforcement to _common.sh","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-08T03:36:11.761732866Z","created_by":"LoCoBench Bot","updated_at":"2026-02-08T03:37:32.353683884Z","closed_at":"2026-02-08T03:37:32.353683884Z","close_reason":"Added enforce_subscription_mode, validate_canary_result, run_canary_then_batch, check_token_health; removed API key branch from setup_multi_accounts"}
131-
{"id":"CodeContextBench-qtf","title":"US-007 Add transcript discovery abstraction for non-Claude artifacts","status":"open","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:11.290586345Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:33:11.290586345Z"}
131+
{"id":"CodeContextBench-qtf","title":"US-007 Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:11.290586345Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T04:00:12.261541882Z","closed_at":"2026-02-17T04:00:12.261541882Z","close_reason":"done"}
132132
{"id":"CodeContextBench-r71","title":"CrossRepo: all runs invalid due to verifier path bug","description":"All 8 CrossRepo runs (4 tasks × 2 configs) crashed because test.sh referenced /task/tests/expected_changes.json instead of /tests/expected_changes.json. Verifier is now fixed locally but all existing runs predate the fix. Agents produced meaningful output (261-line patch, 224-line analysis, 497-line reasoning). All 4 tasks need reruns.","status":"closed","priority":1,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-06T22:03:15.909834308Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T18:39:18.97810564Z","closed_at":"2026-02-07T18:39:18.97810564Z","close_reason":"CrossRepo all 3 configs rerun complete: baseline avg=0.571, SG_base avg=0.587, SG_full avg=0.387"}
133133
{"id":"CodeContextBench-rch","title":"US-019: Scaffold enterprise multi-team and conflicting-docs tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T15:00:15.881711075Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T15:08:20.111033768Z","closed_at":"2026-02-15T15:08:20.111033768Z","close_reason":"US-019 complete: 3 enterprise tasks scaffolded"}
134134
{"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench 
Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"}]}
135135
{"id":"CodeContextBench-rf3","title":"US-002: Fix protonmail Docker environment","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-11T23:31:45.49023811Z","created_by":"LoCoBench Bot","updated_at":"2026-02-11T23:39:17.748141376Z","closed_at":"2026-02-11T23:39:17.748141376Z","close_reason":"Fixed protonmail Docker Node.js v16→v18 in local + cached Dockerfiles"}
136136
{"id":"CodeContextBench-rxg","title":"Rerun 7 LoCoBench SG_base zero-token gap-fill tasks","description":"7 LoCoBench tasks in locobench_gapfill_opus_20260209_010036/sourcegraph_base have zero tokens (auth failure). Tasks: c_api_graphql_expert_079 (arch+cross_file), rust_microservice_expert_008, csharp_warehouse_expert_012 (2), python_streaming_expert_085, python_desktop_expert. Current SG_base mean=0.504 (18 valid) but MANIFEST shows 0.363 including errored. Fix errored classification is done but these need actual reruns for complete data.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-10T11:28:20.889991278Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T19:31:57.593499773Z","closed_at":"2026-02-15T19:31:57.593499773Z","close_reason":"SG_base config dropped from official runs"}
137+
{"id":"CodeContextBench-s00t","title":"US-007 - Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:57:12.383536394Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:57:29.113635367Z","closed_at":"2026-02-17T03:57:29.113635367Z","close_reason":"duplicate"}
137138
{"id":"CodeContextBench-si6","title":"US-003: Retrieval-to-outcome correlation analysis","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T01:06:44.928313476Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:34:05.037762777Z","closed_at":"2026-02-16T01:34:05.037762777Z","close_reason":"Done - compute_retrieval_outcome_correlation() and compute_mcp_value_scores() already on main. Spearman correlation with scatter plot output."}
138139
{"id":"CodeContextBench-szi","title":"Fix judge JSON parsing: strip markdown code fences","status":"closed","priority":0,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-07T19:44:44.132260075Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T20:03:44.608413235Z","closed_at":"2026-02-07T20:03:44.608413235Z","close_reason":"Added code fence stripping to CodeReview test.sh (3 tasks) and RepoQA test.sh (10 tasks + template). Agents that wrap review.json or solution.json in markdown fences now get parsed correctly."}
139140
{"id":"CodeContextBench-szv","title":"US-002: Create inv-deep-002 Istio control plane deep causal chain task","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:14:17.279712198Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:27:43.319743852Z","closed_at":"2026-02-16T15:27:43.319743852Z","close_reason":"US-002 complete: inv-deep-002 Istio deep causal chain task created and committed"}

ralph-multi-harness/prd.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@
106106
"python3 -m py_compile scripts/ccb_metrics/discovery.py scripts/ccb_metrics/judge_context.py scripts/ccb_metrics/extractors.py succeeds"
107107
],
108108
"priority": 7,
109-
"passes": false,
109+
"passes": true,
110110
"notes": ""
111111
},
112112
{

ralph-multi-harness/progress.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
- Harness registry entries should be keyed by stable harness IDs (`codex`, `cursor`, `gemini`, `copilot`, `openhands`) and constrain `allowed_mcp_modes` to exactly `none` and `sourcegraph_full` for this rollout.
99
- Non-Claude harness runners should still source `configs/_common.sh` for validation/parallel helpers, but must not depend on Claude OAuth refresh flows.
1010
- In sandboxed environments, `runs/staging` may resolve to an external symlink target; use a writable `--category` override when dry-running scaffolds locally.
11+
- In `scripts/ccb_metrics`, resolve transcript artifacts through a shared candidate list (not hardcoded `agent/claude-code.txt`) so non-Claude harness outputs are discoverable.
1112

1213
## Progress
1314

@@ -83,3 +84,16 @@
8384
- Useful context (e.g., "the evaluation panel is in component X")
8485
- `runs/staging` is a symlink in this repo setup; local dry runs may need a different `--category` to avoid permission issues when the symlink target is outside writable roots.
8586
---
87+
88+
## 2026-02-17 03:59:26 UTC - US-007
89+
- Implemented multi-harness transcript discovery abstraction with shared candidate-path resolution and wired it into run discovery + judge context generation.
90+
- Updated transcript-driven extractors to recover gracefully when `agent/claude-code.txt` is missing by resolving harness-appropriate alternatives from the task directory.
91+
- Files changed: `scripts/ccb_metrics/transcript_paths.py`, `scripts/ccb_metrics/discovery.py`, `scripts/ccb_metrics/judge_context.py`, `scripts/ccb_metrics/extractors.py`, `ralph-multi-harness/prd.json`, `ralph-multi-harness/progress.txt`
92+
- **Learnings for future iterations:**
93+
- Patterns discovered (e.g., "this codebase uses X for Y")
94+
- Keep transcript filename fallbacks centralized (`scripts/ccb_metrics/transcript_paths.py`) so discovery, judge context, and extractors stay consistent as harness artifacts evolve.
95+
- Gotchas encountered (e.g., "don't forget to update Z when changing W")
96+
- `scripts/ccb_metrics/extractors.py` cannot import from `discovery.py` due to the dependency direction, so shared transcript-path utilities should live in a separate helper module.
97+
- Useful context (e.g., "the evaluation panel is in component X")
98+
- `extract_run_config` now reads init metadata from whichever transcript candidate resolves first, improving MCP-mode inference for non-Claude harness runs.
99+
---

scripts/ccb_metrics/discovery.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616

1717
from .models import TaskMetrics, RunMetrics
1818
from .task_selection import normalize_benchmark_name
19+
from .transcript_paths import (
20+
TRANSCRIPT_CANDIDATE_RELATIVE_PATHS,
21+
resolve_task_transcript_path as _resolve_task_transcript_path,
22+
)
1923
from .extractors import (
2024
extract_task_from_result_json,
2125
extract_task_tokens_from_transcript,
@@ -41,6 +45,14 @@
4145
)
4246

4347

48+
def resolve_task_transcript_path(task_dir: Path) -> Path:
49+
"""Resolve transcript path by trying multiple harness artifact names."""
50+
# Keep helper in this module so discovery users can share one resolver.
51+
# This list intentionally covers non-Claude harness artifact variants.
52+
_ = TRANSCRIPT_CANDIDATE_RELATIVE_PATHS
53+
return _resolve_task_transcript_path(task_dir)
54+
55+
4456
def _infer_benchmark(run_name: str) -> str:
4557
"""Infer benchmark name from run directory name.
4658
@@ -151,7 +163,7 @@ def _process_task_dir(
151163
# Token data: prefer transcript (actual API usage) over result.json
152164
# (result.json n_input_tokens can include cumulative MCP result tokens,
153165
# inflating counts by 100x for MCP-enabled runs)
154-
transcript_path = task_dir / "agent" / "claude-code.txt"
166+
transcript_path = resolve_task_transcript_path(task_dir)
155167
tokens = extract_task_tokens_from_transcript(transcript_path)
156168
if tokens.get("input_tokens") is not None:
157169
tm.input_tokens = tokens["input_tokens"]
@@ -189,7 +201,7 @@ def _process_task_dir(
189201

190202
# Tool usage — prefer trajectory, fall back to transcript
191203
trajectory_path = task_dir / "agent" / "trajectory.json"
192-
transcript_path = task_dir / "agent" / "claude-code.txt"
204+
transcript_path = resolve_task_transcript_path(task_dir)
193205
tool_usage = extract_tool_usage_from_trajectory(trajectory_path)
194206
if tool_usage.get("tool_calls_total") is None:
195207
tool_usage = extract_tool_usage_from_transcript(transcript_path)
@@ -351,7 +363,7 @@ def discover_runs(runs_dir: str | Path) -> list[RunMetrics]:
351363
_transcript_for_config = None
352364
for _td in sorted(batch_dir.iterdir()):
353365
if _is_task_dir(_td):
354-
_candidate = _td / "agent" / "claude-code.txt"
366+
_candidate = resolve_task_transcript_path(_td)
355367
if _candidate.is_file():
356368
_transcript_for_config = _candidate
357369
break

scripts/ccb_metrics/extractors.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from typing import Optional
1515

1616
from .models import TaskMetrics
17+
from .transcript_paths import infer_task_dir_from_transcript_path, resolve_task_transcript_path
1718

1819

1920
def _parse_iso(ts: Optional[str]) -> Optional[datetime]:
@@ -36,6 +37,20 @@ def _seconds_between(start: Optional[str], end: Optional[str]) -> Optional[float
3637
return (e - s).total_seconds()
3738

3839

40+
def _resolve_existing_transcript_path(transcript_path: str | Path) -> Path:
41+
"""Resolve transcript path with task-level fallback candidates."""
42+
path = Path(transcript_path)
43+
if path.is_file():
44+
return path
45+
46+
task_dir = infer_task_dir_from_transcript_path(path)
47+
if task_dir is not None:
48+
candidate = resolve_task_transcript_path(task_dir)
49+
if candidate.is_file():
50+
return candidate
51+
return path
52+
53+
3954
def extract_task_from_result_json(
4055
result_json_path: str | Path,
4156
benchmark: str = "",
@@ -174,7 +189,7 @@ def extract_task_tokens_from_transcript(
174189
"cache_read_input_tokens": None,
175190
"total_cost_usd": None,
176191
}
177-
path = Path(claude_code_txt_path)
192+
path = _resolve_existing_transcript_path(claude_code_txt_path)
178193
if not path.is_file():
179194
return empty
180195

@@ -346,7 +361,7 @@ def extract_tool_usage_from_transcript(
346361
tool_calls_by_name (Counter dict), mcp_ratio (mcp/total).
347362
Returns None-valued dict if file is missing or unparseable.
348363
"""
349-
path = Path(claude_code_txt_path)
364+
path = _resolve_existing_transcript_path(claude_code_txt_path)
350365
if not path.is_file():
351366
return _empty_tool_usage()
352367

@@ -436,7 +451,7 @@ def extract_run_config(
436451

437452
# --- Extract from claude-code.txt init line ---
438453
if transcript_path is not None:
439-
tp = Path(transcript_path)
454+
tp = _resolve_existing_transcript_path(transcript_path)
440455
if tp.is_file():
441456
try:
442457
for line in tp.open():
@@ -625,7 +640,7 @@ def extract_search_patterns_from_transcript(
625640
Returns:
626641
Dict with same schema as extract_search_patterns_from_trajectory.
627642
"""
628-
path = Path(claude_code_txt_path)
643+
path = _resolve_existing_transcript_path(claude_code_txt_path)
629644
if not path.is_file():
630645
return _empty_search_patterns()
631646

@@ -755,7 +770,7 @@ def extract_code_changes_from_transcript(
755770
Returns:
756771
Dict with same schema as extract_code_changes_from_trajectory.
757772
"""
758-
path = Path(claude_code_txt_path)
773+
path = _resolve_existing_transcript_path(claude_code_txt_path)
759774
if not path.is_file():
760775
return _empty_code_changes()
761776

@@ -1095,7 +1110,7 @@ def extract_conversation_analysis_from_transcript(
10951110
"backtrack_count": None,
10961111
"context_window_peak_pct": None,
10971112
}
1098-
path = Path(claude_code_txt_path)
1113+
path = _resolve_existing_transcript_path(claude_code_txt_path)
10991114
if not path.is_file():
11001115
return empty
11011116

scripts/ccb_metrics/judge_context.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@
1616
from pathlib import Path
1717
from typing import Optional
1818

19-
from .discovery import discover_runs, _extract_task_id, _is_batch_dir, _is_task_dir
19+
from .discovery import (
20+
discover_runs,
21+
_extract_task_id,
22+
_is_batch_dir,
23+
_is_task_dir,
24+
resolve_task_transcript_path,
25+
)
2026
from .extractors import (
2127
extract_tool_usage_from_trajectory,
2228
extract_tool_usage_from_transcript,
@@ -111,7 +117,7 @@ def _extract_agent_output(task_dir: Path) -> Optional[str]:
111117
return text
112118

113119
# Fallback: last assistant message text from transcript
114-
transcript_path = task_dir / "agent" / "claude-code.txt"
120+
transcript_path = resolve_task_transcript_path(task_dir)
115121
if not transcript_path.is_file():
116122
return None
117123

@@ -146,7 +152,7 @@ def _extract_agent_output(task_dir: Path) -> Optional[str]:
146152
def _extract_tool_usage_summary(task_dir: Path) -> Optional[dict]:
147153
"""Extract tool usage summary: total calls, MCP calls, top 5 tools."""
148154
trajectory_path = task_dir / "agent" / "trajectory.json"
149-
transcript_path = task_dir / "agent" / "claude-code.txt"
155+
transcript_path = resolve_task_transcript_path(task_dir)
150156

151157
usage = extract_tool_usage_from_trajectory(trajectory_path)
152158
if usage.get("tool_calls_total") is None:
@@ -173,7 +179,7 @@ def _extract_code_changes(task_dir: Path) -> Optional[list[dict]]:
173179
Returns a list of {file, action} dicts, or None if not available.
174180
"""
175181
trajectory_path = task_dir / "agent" / "trajectory.json"
176-
transcript_path = task_dir / "agent" / "claude-code.txt"
182+
transcript_path = resolve_task_transcript_path(task_dir)
177183

178184
changes: list[dict] = []
179185

@@ -506,7 +512,7 @@ def generate_judge_contexts(
506512
ground_truth = _read_ground_truth(benchmarks_dir, benchmark, task_id)
507513

508514
# Extract transcript summary
509-
transcript_path = task_dir / "agent" / "claude-code.txt"
515+
transcript_path = resolve_task_transcript_path(task_dir)
510516
transcript_summary = _extract_transcript_summary(transcript_path)
511517

512518
# Extract agent output

0 commit comments

Comments
 (0)