|
128 | 128 | {"id":"CodeContextBench-pdk","title":"Fix instruction contamination + re-extract all token metrics","status":"closed","priority":1,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-07T16:04:36.872665156Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T16:11:05.467083794Z","closed_at":"2026-02-07T16:11:05.467083794Z","close_reason":"Fixed 6 contaminated files (RepoQA template, 5 LargeRepo CLAUDE.md); re-extracted 197 task_metrics.json fixing 139 inflated costs (8.7K -\u003e 50 actual total). Created scripts/reextract_all_metrics.py."} |
129 | 129 | {"id":"CodeContextBench-pss","title":"US-006a: Scaffold 3 architectural understanding tasks (Tier A)","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T23:04:39.305991853Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T23:10:55.114709246Z","closed_at":"2026-02-15T23:10:55.114709246Z","close_reason":"US-006a complete: 3 Tier A architectural understanding tasks scaffolded"} |
130 | 130 | {"id":"CodeContextBench-q27","title":"Add canary guardrails + subscription enforcement to _common.sh","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-08T03:36:11.761732866Z","created_by":"LoCoBench Bot","updated_at":"2026-02-08T03:37:32.353683884Z","closed_at":"2026-02-08T03:37:32.353683884Z","close_reason":"Added enforce_subscription_mode, validate_canary_result, run_canary_then_batch, check_token_health; removed API key branch from setup_multi_accounts"} |
131 | | -{"id":"CodeContextBench-qtf","title":"US-007 Add transcript discovery abstraction for non-Claude artifacts","status":"open","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:11.290586345Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:33:11.290586345Z"} |
| 131 | +{"id":"CodeContextBench-qtf","title":"US-007 Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:33:11.290586345Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T04:00:12.261541882Z","closed_at":"2026-02-17T04:00:12.261541882Z","close_reason":"done"} |
132 | 132 | {"id":"CodeContextBench-r71","title":"CrossRepo: all runs invalid due to verifier path bug","description":"All 8 CrossRepo runs (4 tasks × 2 configs) crashed because test.sh referenced /task/tests/expected_changes.json instead of /tests/expected_changes.json. Verifier is now fixed locally but all existing runs predate the fix. Agents produced meaningful output (261-line patch, 224-line analysis, 497-line reasoning). All 4 tasks need reruns.","status":"closed","priority":1,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-06T22:03:15.909834308Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T18:39:18.97810564Z","closed_at":"2026-02-07T18:39:18.97810564Z","close_reason":"CrossRepo all 3 configs rerun complete: baseline avg=0.571, SG_base avg=0.587, SG_full avg=0.387"} |
133 | 133 | {"id":"CodeContextBench-rch","title":"US-019: Scaffold enterprise multi-team and conflicting-docs tasks","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-15T15:00:15.881711075Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T15:08:20.111033768Z","closed_at":"2026-02-15T15:08:20.111033768Z","close_reason":"US-019 complete: 3 enterprise tasks scaffolded"} |
134 | 134 | {"id":"CodeContextBench-rej","title":"Generate aggregate CCB evaluation report (updated: 12 benchmarks, no LoCoBench)","description":"After all benchmark runs complete and MANIFEST is clean, generate the aggregate evaluation report using python3 scripts/generate_report.py. Should cover all 13 benchmarks with 3-config comparison (baseline vs SG_base vs SG_full), MCP impact analysis, and per-benchmark breakdowns.","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-06T14:50:31.544649793Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:39:42.151139942Z","closed_at":"2026-02-16T01:39:42.151139942Z","close_reason":"Stale - aggregate report design predates enterprise task expansion and IR pipeline work. Needs redesign.","dependencies":[{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-aot","type":"blocks","created_at":"2026-02-06T14:50:47.565065613Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-kph","type":"blocks","created_at":"2026-02-06T14:50:47.632620141Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-yk3","type":"blocks","created_at":"2026-02-06T14:50:47.689660185Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-9r9","type":"blocks","created_at":"2026-02-06T14:50:47.744576933Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-05n","type":"blocks","created_at":"2026-02-06T14:50:47.799295655Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-99h","type":"blocks","created_at":"2026-02-06T14:50:47.854278452Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-dfp","type":"blocks","created_at":"2026-02-06T14:50:47.909843823Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-8t7","type":"blocks","created_at":"2026-02-08T01:13:11.737288558Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-nj3","type":"blocks","created_at":"2026-02-08T02:54:34.482946883Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-36d","type":"blocks","created_at":"2026-02-08T02:54:34.538759931Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-1up","type":"blocks","created_at":"2026-02-08T02:54:34.594203831Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-0y8","type":"blocks","created_at":"2026-02-08T02:54:35.06846015Z","created_by":"LoCoBench Bot"},{"issue_id":"CodeContextBench-rej","depends_on_id":"CodeContextBench-24z","type":"blocks","created_at":"2026-02-08T02:54:35.124147115Z","created_by":"LoCoBench Bot"}]} |
135 | 135 | {"id":"CodeContextBench-rf3","title":"US-002: Fix protonmail Docker environment","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-11T23:31:45.49023811Z","created_by":"LoCoBench Bot","updated_at":"2026-02-11T23:39:17.748141376Z","closed_at":"2026-02-11T23:39:17.748141376Z","close_reason":"Fixed protonmail Docker Node.js v16→v18 in local + cached Dockerfiles"} |
136 | 136 | {"id":"CodeContextBench-rxg","title":"Rerun 7 LoCoBench SG_base zero-token gap-fill tasks","description":"7 LoCoBench tasks in locobench_gapfill_opus_20260209_010036/sourcegraph_base have zero tokens (auth failure). Tasks: c_api_graphql_expert_079 (arch+cross_file), rust_microservice_expert_008, csharp_warehouse_expert_012 (2), python_streaming_expert_085, python_desktop_expert. Current SG_base mean=0.504 (18 valid) but MANIFEST shows 0.363 including errored. Fix errored classification is done but these need actual reruns for complete data.","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-10T11:28:20.889991278Z","created_by":"LoCoBench Bot","updated_at":"2026-02-15T19:31:57.593499773Z","closed_at":"2026-02-15T19:31:57.593499773Z","close_reason":"SG_base config dropped from official runs"} |
| 137 | +{"id":"CodeContextBench-s00t","title":"US-007 - Add transcript discovery abstraction for non-Claude artifacts","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-17T03:57:12.383536394Z","created_by":"LoCoBench Bot","updated_at":"2026-02-17T03:57:29.113635367Z","closed_at":"2026-02-17T03:57:29.113635367Z","close_reason":"duplicate"} |
137 | 138 | {"id":"CodeContextBench-si6","title":"US-003: Retrieval-to-outcome correlation analysis","status":"closed","priority":1,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T01:06:44.928313476Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T01:34:05.037762777Z","closed_at":"2026-02-16T01:34:05.037762777Z","close_reason":"Done - compute_retrieval_outcome_correlation() and compute_mcp_value_scores() already on main. Spearman correlation with scatter plot output."} |
138 | 139 | {"id":"CodeContextBench-szi","title":"Fix judge JSON parsing: strip markdown code fences","status":"closed","priority":0,"issue_type":"bug","owner":"locobench@anthropic.com","created_at":"2026-02-07T19:44:44.132260075Z","created_by":"LoCoBench Bot","updated_at":"2026-02-07T20:03:44.608413235Z","closed_at":"2026-02-07T20:03:44.608413235Z","close_reason":"Added code fence stripping to CodeReview test.sh (3 tasks) and RepoQA test.sh (10 tasks + template). Agents that wrap review.json or solution.json in markdown fences now get parsed correctly."} |
139 | 140 | {"id":"CodeContextBench-szv","title":"US-002: Create inv-deep-002 Istio control plane deep causal chain task","status":"closed","priority":2,"issue_type":"task","owner":"locobench@anthropic.com","created_at":"2026-02-16T15:14:17.279712198Z","created_by":"LoCoBench Bot","updated_at":"2026-02-16T15:27:43.319743852Z","closed_at":"2026-02-16T15:27:43.319743852Z","close_reason":"US-002 complete: inv-deep-002 Istio deep causal chain task created and committed"} |
|
0 commit comments