Skip to content

Commit d5a8d29

Browse files
sjarmak and claude committed
feat: oracle pipeline — OAuth auth, dual-verification, composite scoring, calibration subsets, cross-validation, promotion
Implements all 6 Ralph oracle pipeline stories (US-001 through US-010): - US-001: OAuth token auth (CLAUDE_CODE_OAUTH_TOKEN first, ANTHROPIC_API_KEY fallback) - US-002: Dual-retrieval verification (local FS + Sourcegraph per oracle file) - US-003: Composite go/no-go scoring (weighted recall/precision/chain/symbol) - US-004: ContextBench calibration subsets (--phase test/verify, CCB-weighted) - US-009: Cross-validation --agent-suffix, high-divergence list, default report path - US-010: New promote_agent_oracles.py with dual verification + CV F1 gates Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 520cc1c commit d5a8d29

File tree

6 files changed

+662
-28
lines changed

6 files changed

+662
-28
lines changed

docs/ops/SCRIPT_INDEX.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
193193
- `scripts/list_gemini_models.py` - Utility script for list gemini models.
194194
- `scripts/mirror_largerepo_expansion.sh` - Utility script for mirror largerepo expansion.
195195
- `scripts/plan_variance_runs.py` - Utility script for plan variance runs.
196+
- `scripts/promote_agent_oracles.py` - Utility script for promote agent oracles.
196197
- `scripts/push_base_images_ghcr.sh` - Utility script for push base images ghcr.
197198
- `scripts/regenerate_artifact_dockerfiles.py` - Utility script for regenerate artifact dockerfiles.
198199
- `scripts/rehost_sweap_images.py` - Utility script for rehost sweap images.

scripts/context_retrieval_agent.py

Lines changed: 103 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
the circularity concern is empirically defused.
1414
1515
Environment variables:
16-
ANTHROPIC_API_KEY Required. Claude API key.
16+
CLAUDE_CODE_OAUTH_TOKEN Preferred. OAuth token for subscription billing.
17+
ANTHROPIC_API_KEY Fallback. Claude API key.
1718
SOURCEGRAPH_ACCESS_TOKEN Required for deepsearch/hybrid backends.
1819
SOURCEGRAPH_URL SG instance (default: https://sourcegraph.sourcegraph.com)
1920
CCB_REPO_CACHE Repo clone cache dir (default: ~/.cache/ccb_repos)
@@ -66,7 +67,7 @@
6667
# Constants
6768
# ---------------------------------------------------------------------------
6869

69-
DEFAULT_MODEL = "claude-sonnet-4-6" # Good balance of cost and capability
70+
DEFAULT_MODEL = "claude-opus-4-6" # Strongest model for oracle generation
7071
MAX_TOKENS = 16384
7172
MAX_TOOL_CALLS = 40
7273
TOOL_TIMEOUT_SEC = 30
@@ -1294,6 +1295,81 @@ def _extract_json_from_messages(messages: List[Dict]) -> Dict[str, Any]:
12941295
return {"files": [], "text": "Agent did not produce valid JSON output."}
12951296

12961297

1298+
# ---------------------------------------------------------------------------
1299+
# Dual-Retrieval Verification
1300+
# ---------------------------------------------------------------------------
1301+
1302+
1303+
def verify_dual_retrieval(
    oracle: Dict[str, Any],
    repo_paths: Dict[str, Path],
    sg_client: Optional["SourcegraphClient"] = None,
) -> Dict[str, Any]:
    """Verify each oracle file is discoverable via local FS and Sourcegraph.

    Args:
        oracle: Oracle dict; reads its ``"files"`` list of entries with
            ``"repo"`` and ``"path"`` keys.
        repo_paths: Mapping of repo name -> local clone directory.
        sg_client: Optional Sourcegraph client; SG verification is skipped
            when it is None or has no token.

    Returns:
        Dict with ``"files"`` (per-file verification flags) and ``"summary"``
        (counts of dual / local-only / sg-only / unverified files).
        Files that fail either check are flagged but NOT removed from the oracle.
    """
    file_entries = oracle.get("files", [])
    verification = []

    for entry in file_entries:
        repo = entry.get("repo", "")
        path = entry.get("path", "")

        # --- Local verification: file exists on disk in a cached clone ---
        local_ok = False
        if repo_paths:
            # Check the entry's own repo clone first (the documented
            # behavior), then fall back to every other cached repo dir.
            # The original loop iterated in arbitrary dict order, so the
            # "exact repo match first" comment was not actually honored.
            candidate_dirs = []
            if repo in repo_paths:
                candidate_dirs.append(repo_paths[repo])
            candidate_dirs.extend(
                rdir for rname, rdir in repo_paths.items() if rname != repo
            )
            local_ok = any((rdir / path).is_file() for rdir in candidate_dirs)

        # --- Sourcegraph verification: keyword search returns results ---
        sg_ok = False
        if sg_client and sg_client.token:
            try:
                # Anchored file-path query so we match the exact path only.
                sg_query = f"file:^{re.escape(path)}$ count:1"
                if repo:
                    sg_query = f"repo:{repo} {sg_query}"
                result = sg_client.keyword_search(sg_query, max_results=1)
                # bool(...) so a falsy result ("" / None) stores False, not
                # the falsy value itself — these flags are serialized to JSON.
                sg_ok = bool(
                    result
                    and "No results found" not in result
                    and "error" not in result.lower()
                )
            except Exception as e:
                log.debug("SG verify failed for %s:%s: %s", repo, path, e)

        verification.append({
            "repo": repo,
            "path": path,
            "local_verified": local_ok,
            "sg_verified": sg_ok,
        })

    # Summary stats
    n_total = len(verification)
    n_dual = sum(1 for v in verification if v["local_verified"] and v["sg_verified"])
    n_local_only = sum(1 for v in verification if v["local_verified"] and not v["sg_verified"])
    n_sg_only = sum(1 for v in verification if not v["local_verified"] and v["sg_verified"])
    n_unverified = sum(1 for v in verification if not v["local_verified"] and not v["sg_verified"])

    summary = {
        "n_total": n_total,
        "n_dual_verified": n_dual,
        "n_local_only": n_local_only,
        "n_sg_only": n_sg_only,
        "n_unverified": n_unverified,
    }

    log.info(
        "  Verification: %d/%d dual, %d local-only, %d sg-only, %d unverified",
        n_dual, n_total, n_local_only, n_sg_only, n_unverified,
    )

    return {"files": verification, "summary": summary}
1371+
1372+
12971373
# ---------------------------------------------------------------------------
12981374
# Output
12991375
# ---------------------------------------------------------------------------
@@ -1394,6 +1470,10 @@ def main() -> int:
13941470
"--missing-only", action="store_true",
13951471
help="Only process tasks that have NO ground truth at all (no oracle_answer.json, no ground_truth.json)",
13961472
)
1473+
parser.add_argument(
1474+
"--no-verify", action="store_true",
1475+
help="Skip dual-retrieval verification pass",
1476+
)
13971477
parser.add_argument(
13981478
"--dry-run", action="store_true",
13991479
help="Show tasks without running agent",
@@ -1413,9 +1493,16 @@ def main() -> int:
14131493
log.error("anthropic package not installed. pip install anthropic")
14141494
return 1
14151495

1416-
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
1496+
# OAuth token preferred (subscription billing), API key fallback
1497+
api_key = os.environ.get("CLAUDE_CODE_OAUTH_TOKEN", "")
1498+
if api_key:
1499+
log.info("Using OAuth token (CLAUDE_CODE_OAUTH_TOKEN)")
1500+
else:
1501+
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
1502+
if api_key:
1503+
log.info("Using API key (ANTHROPIC_API_KEY)")
14171504
if not api_key and not args.dry_run:
1418-
log.error("ANTHROPIC_API_KEY not set")
1505+
log.error("Set CLAUDE_CODE_OAUTH_TOKEN or ANTHROPIC_API_KEY")
14191506
return 1
14201507

14211508
# Discover tasks
@@ -1557,6 +1644,18 @@ def main() -> int:
15571644
else:
15581645
output_path = str(out_dir / f"{task_dir.name}_gt_agent.json")
15591646

1647+
# Dual-retrieval verification (unless --no-verify)
1648+
if not args.no_verify:
1649+
vr = verify_dual_retrieval(oracle, repo_paths, sg_client=sg)
1650+
metadata["dual_verification"] = vr["summary"]
1651+
# Annotate oracle file entries with verification flags
1652+
for v_entry in vr["files"]:
1653+
for f_entry in oracle.get("files", []):
1654+
if f_entry.get("path") == v_entry["path"]:
1655+
f_entry["local_verified"] = v_entry["local_verified"]
1656+
f_entry["sg_verified"] = v_entry["sg_verified"]
1657+
break
1658+
15601659
out_file = write_oracle(task_dir, oracle, metadata, output_path)
15611660

15621661
n_files = len(oracle.get("files", []))

scripts/cross_validate_oracles.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ def find_project_root() -> Path:
179179
def discover_comparison_pairs(
180180
suite: str = "",
181181
agent_dir: str = "",
182+
agent_suffix: str = "_agent",
182183
) -> List[Dict[str, Any]]:
183184
"""Find all tasks with both an existing oracle and an agent-generated oracle.
184185
@@ -223,15 +224,15 @@ def discover_comparison_pairs(
223224
if agent_dir:
224225
ext_dir = Path(agent_dir) / s
225226
for pattern in [
226-
f"{task_dir.name}_oracle_agent.json",
227-
f"{task_dir.name}_gt_agent.json",
227+
f"{task_dir.name}_oracle{agent_suffix}.json",
228+
f"{task_dir.name}_gt{agent_suffix}.json",
228229
]:
229230
p = ext_dir / pattern
230231
if p.exists():
231232
agent = p
232233
break
233234
else:
234-
for name in ["oracle_answer_agent.json", "ground_truth_agent.json"]:
235+
for name in [f"oracle_answer{agent_suffix}.json", f"ground_truth{agent_suffix}.json"]:
235236
p = tests / name
236237
if p.exists():
237238
agent = p
@@ -381,9 +382,14 @@ def main() -> int:
381382
"--agent-dir", type=str, default="",
382383
help="External directory with agent oracles (alternative to in-place)",
383384
)
385+
parser.add_argument(
386+
"--agent-suffix", type=str, default="_agent",
387+
help="Suffix for agent oracle filenames (default: '_agent'). "
388+
"E.g., '_agent' finds oracle_answer_agent.json and ground_truth_agent.json",
389+
)
384390
parser.add_argument(
385391
"--report", type=str, default="",
386-
help="Output report JSON path",
392+
help="Output report JSON path (default: results/cross_validation/summary.json)",
387393
)
388394
parser.add_argument(
389395
"--verbose", action="store_true",
@@ -397,6 +403,7 @@ def main() -> int:
397403

398404
pairs = discover_comparison_pairs(
399405
suite=args.suite, agent_dir=args.agent_dir,
406+
agent_suffix=args.agent_suffix,
400407
)
401408

402409
if not pairs:
@@ -472,6 +479,10 @@ def main() -> int:
472479
total_agent_only = sum(len(r["agent_only"]) for r in per_task)
473480
total_matched = sum(r["n_matched"] for r in per_task)
474481

482+
# High-divergence tasks (F1 < 0.5)
483+
high_divergence = [r for r in per_task if r["f1"] < 0.5]
484+
high_divergence.sort(key=lambda r: r["f1"])
485+
475486
report = {
476487
"summary": {
477488
"total_tasks": len(per_task),
@@ -480,12 +491,14 @@ def main() -> int:
480491
"mean_file_precision": round(mean_precision, 4),
481492
"cohens_kappa": round(kappa, 4),
482493
"kappa_interpretation": _interpret_kappa(kappa),
494+
"agent_suffix": args.agent_suffix,
483495
},
484496
"divergence": {
485497
"total_matched_files": total_matched,
486498
"total_oracle_only_files": total_oracle_only,
487499
"total_agent_only_files": total_agent_only,
488500
},
501+
"high_divergence": high_divergence,
489502
"per_suite": suite_summary,
490503
"per_task": per_task,
491504
}
@@ -506,14 +519,24 @@ def main() -> int:
506519
print(f"\nPer-suite:")
507520
for s, m in sorted(suite_summary.items()):
508521
print(f" {s}: n={m['n']}, F1={m['mean_f1']:.4f} [{m['min_f1']:.4f}-{m['max_f1']:.4f}]")
522+
523+
if high_divergence:
524+
print(f"\nHigh-divergence tasks (F1 < 0.5): {len(high_divergence)}")
525+
for r in high_divergence[:10]:
526+
print(f" {r['task']}: F1={r['f1']:.4f} (oracle={r['n_oracle']}, agent={r['n_agent']})")
527+
if len(high_divergence) > 10:
528+
print(f" ... and {len(high_divergence) - 10} more")
509529
print(f"{'=' * 60}")
510530

511-
# Write report
512-
if args.report:
513-
out = Path(args.report)
514-
out.parent.mkdir(parents=True, exist_ok=True)
515-
out.write_text(json.dumps(report, indent=2) + "\n")
516-
log.info("Report written: %s", out)
531+
# Write report (default: results/cross_validation/summary.json)
532+
report_path = args.report
533+
if not report_path:
534+
root = find_project_root()
535+
report_path = str(root / "results" / "cross_validation" / "summary.json")
536+
out = Path(report_path)
537+
out.parent.mkdir(parents=True, exist_ok=True)
538+
out.write_text(json.dumps(report, indent=2) + "\n")
539+
log.info("Report written: %s", out)
517540

518541
return 0
519542

0 commit comments

Comments
 (0)