Skip to content

Commit d5a8d29

Browse files
sjarmak and claude committed
feat: oracle pipeline — OAuth auth, dual-verification, composite scoring, calibration subsets, cross-validation, promotion
Implements all 6 Ralph oracle pipeline stories (US-001 through US-010): - US-001: OAuth token auth (CLAUDE_CODE_OAUTH_TOKEN first, ANTHROPIC_API_KEY fallback) - US-002: Dual-retrieval verification (local FS + Sourcegraph per oracle file) - US-003: Composite go/no-go scoring (weighted recall/precision/chain/symbol) - US-004: ContextBench calibration subsets (--phase test/verify, CCB-weighted) - US-009: Cross-validation --agent-suffix, high-divergence list, default report path - US-010: New promote_agent_oracles.py with dual verification + CV F1 gates Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 520cc1c commit d5a8d29

File tree

6 files changed

+662
-28
lines changed

6 files changed

+662
-28
lines changed

docs/ops/SCRIPT_INDEX.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
193193
- `scripts/list_gemini_models.py` - Utility script for list gemini models.
194194
- `scripts/mirror_largerepo_expansion.sh` - Utility script for mirror largerepo expansion.
195195
- `scripts/plan_variance_runs.py` - Utility script for plan variance runs.
196+
- `scripts/promote_agent_oracles.py` - Utility script for promote agent oracles.
196197
- `scripts/push_base_images_ghcr.sh` - Utility script for push base images ghcr.
197198
- `scripts/regenerate_artifact_dockerfiles.py` - Utility script for regenerate artifact dockerfiles.
198199
- `scripts/rehost_sweap_images.py` - Utility script for rehost sweap images.

scripts/context_retrieval_agent.py

Lines changed: 103 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
the circularity concern is empirically defused.
1414
1515
Environment variables:
16-
ANTHROPIC_API_KEY Required. Claude API key.
16+
CLAUDE_CODE_OAUTH_TOKEN Preferred. OAuth token for subscription billing.
17+
ANTHROPIC_API_KEY Fallback. Claude API key.
1718
SOURCEGRAPH_ACCESS_TOKEN Required for deepsearch/hybrid backends.
1819
SOURCEGRAPH_URL SG instance (default: https://sourcegraph.sourcegraph.com)
1920
CCB_REPO_CACHE Repo clone cache dir (default: ~/.cache/ccb_repos)
@@ -66,7 +67,7 @@
6667
# Constants
6768
# ---------------------------------------------------------------------------
6869

69-
DEFAULT_MODEL = "claude-sonnet-4-6" # Good balance of cost and capability
70+
DEFAULT_MODEL = "claude-opus-4-6" # Strongest model for oracle generation
7071
MAX_TOKENS = 16384
7172
MAX_TOOL_CALLS = 40
7273
TOOL_TIMEOUT_SEC = 30
@@ -1294,6 +1295,81 @@ def _extract_json_from_messages(messages: List[Dict]) -> Dict[str, Any]:
12941295
return {"files": [], "text": "Agent did not produce valid JSON output."}
12951296

12961297

1298+
# ---------------------------------------------------------------------------
1299+
# Dual-Retrieval Verification
1300+
# ---------------------------------------------------------------------------
1301+
1302+
1303+
def verify_dual_retrieval(
    oracle: Dict[str, Any],
    repo_paths: Dict[str, Path],
    sg_client: Optional["SourcegraphClient"] = None,
) -> Dict[str, Any]:
    """Verify each oracle file is discoverable via local FS and Sourcegraph.

    Args:
        oracle: Oracle dict; reads its ``"files"`` list of entries with
            ``"repo"`` and ``"path"`` keys.
        repo_paths: Mapping of repo name -> local clone directory.
        sg_client: Optional Sourcegraph client; SG verification is skipped
            when it is None or has no token.

    Returns:
        Dict with ``"files"`` (per-file verification flags) and ``"summary"``
        (counts of dual / local-only / sg-only / unverified files).
        Files that fail either check are flagged but NOT removed from the oracle.
    """
    file_entries = oracle.get("files", [])
    verification = []

    for entry in file_entries:
        repo = entry.get("repo", "")
        path = entry.get("path", "")

        # --- Local verification: file exists on disk in a cached clone ---
        local_ok = False
        if repo_paths:
            # Check the entry's own repo clone first (the documented
            # behavior), then fall back to every other cached repo dir.
            # The original loop iterated in arbitrary dict order, so the
            # "exact repo match first" comment was not actually honored.
            candidate_dirs = []
            if repo in repo_paths:
                candidate_dirs.append(repo_paths[repo])
            candidate_dirs.extend(
                rdir for rname, rdir in repo_paths.items() if rname != repo
            )
            local_ok = any((rdir / path).is_file() for rdir in candidate_dirs)

        # --- Sourcegraph verification: keyword search returns results ---
        sg_ok = False
        if sg_client and sg_client.token:
            try:
                # Anchored file-path query so we match the exact path only.
                sg_query = f"file:^{re.escape(path)}$ count:1"
                if repo:
                    sg_query = f"repo:{repo} {sg_query}"
                result = sg_client.keyword_search(sg_query, max_results=1)
                # bool(...) so a falsy result ("" / None) stores False, not
                # the falsy value itself — these flags are serialized to JSON.
                sg_ok = bool(
                    result
                    and "No results found" not in result
                    and "error" not in result.lower()
                )
            except Exception as e:
                log.debug("SG verify failed for %s:%s: %s", repo, path, e)

        verification.append({
            "repo": repo,
            "path": path,
            "local_verified": local_ok,
            "sg_verified": sg_ok,
        })

    # Summary stats
    n_total = len(verification)
    n_dual = sum(1 for v in verification if v["local_verified"] and v["sg_verified"])
    n_local_only = sum(1 for v in verification if v["local_verified"] and not v["sg_verified"])
    n_sg_only = sum(1 for v in verification if not v["local_verified"] and v["sg_verified"])
    n_unverified = sum(1 for v in verification if not v["local_verified"] and not v["sg_verified"])

    summary = {
        "n_total": n_total,
        "n_dual_verified": n_dual,
        "n_local_only": n_local_only,
        "n_sg_only": n_sg_only,
        "n_unverified": n_unverified,
    }

    log.info(
        "  Verification: %d/%d dual, %d local-only, %d sg-only, %d unverified",
        n_dual, n_total, n_local_only, n_sg_only, n_unverified,
    )

    return {"files": verification, "summary": summary}
1371+
1372+
12971373
# ---------------------------------------------------------------------------
12981374
# Output
12991375
# ---------------------------------------------------------------------------
@@ -1394,6 +1470,10 @@ def main() -> int:
13941470
"--missing-only", action="store_true",
13951471
help="Only process tasks that have NO ground truth at all (no oracle_answer.json, no ground_truth.json)",
13961472
)
1473+
parser.add_argument(
1474+
"--no-verify", action="store_true",
1475+
help="Skip dual-retrieval verification pass",
1476+
)
13971477
parser.add_argument(
13981478
"--dry-run", action="store_true",
13991479
help="Show tasks without running agent",
@@ -1413,9 +1493,16 @@ def main() -> int:
14131493
log.error("anthropic package not installed. pip install anthropic")
14141494
return 1
14151495

1416-
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
1496+
# OAuth token preferred (subscription billing), API key fallback
1497+
api_key = os.environ.get("CLAUDE_CODE_OAUTH_TOKEN", "")
1498+
if api_key:
1499+
log.info("Using OAuth token (CLAUDE_CODE_OAUTH_TOKEN)")
1500+
else:
1501+
api_key = os.environ.get("ANTHROPIC_API_KEY", "")
1502+
if api_key:
1503+
log.info("Using API key (ANTHROPIC_API_KEY)")
14171504
if not api_key and not args.dry_run:
1418-
log.error("ANTHROPIC_API_KEY not set")
1505+
log.error("Set CLAUDE_CODE_OAUTH_TOKEN or ANTHROPIC_API_KEY")
14191506
return 1
14201507

14211508
# Discover tasks
@@ -1557,6 +1644,18 @@ def main() -> int:
15571644
else:
15581645
output_path = str(out_dir / f"{task_dir.name}_gt_agent.json")
15591646

1647+
# Dual-retrieval verification (unless --no-verify)
1648+
if not args.no_verify:
1649+
vr = verify_dual_retrieval(oracle, repo_paths, sg_client=sg)
1650+
metadata["dual_verification"] = vr["summary"]
1651+
# Annotate oracle file entries with verification flags
1652+
for v_entry in vr["files"]:
1653+
for f_entry in oracle.get("files", []):
1654+
if f_entry.get("path") == v_entry["path"]:
1655+
f_entry["local_verified"] = v_entry["local_verified"]
1656+
f_entry["sg_verified"] = v_entry["sg_verified"]
1657+
break
1658+
15601659
out_file = write_oracle(task_dir, oracle, metadata, output_path)
15611660

15621661
n_files = len(oracle.get("files", []))

scripts/cross_validate_oracles.py

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ def find_project_root() -> Path:
179179
def discover_comparison_pairs(
180180
suite: str = "",
181181
agent_dir: str = "",
182+
agent_suffix: str = "_agent",
182183
) -> List[Dict[str, Any]]:
183184
"""Find all tasks with both an existing oracle and an agent-generated oracle.
184185
@@ -223,15 +224,15 @@ def discover_comparison_pairs(
223224
if agent_dir:
224225
ext_dir = Path(agent_dir) / s
225226
for pattern in [
226-
f"{task_dir.name}_oracle_agent.json",
227-
f"{task_dir.name}_gt_agent.json",
227+
f"{task_dir.name}_oracle{agent_suffix}.json",
228+
f"{task_dir.name}_gt{agent_suffix}.json",
228229
]:
229230
p = ext_dir / pattern
230231
if p.exists():
231232
agent = p
232233
break
233234
else:
234-
for name in ["oracle_answer_agent.json", "ground_truth_agent.json"]:
235+
for name in [f"oracle_answer{agent_suffix}.json", f"ground_truth{agent_suffix}.json"]:
235236
p = tests / name
236237
if p.exists():
237238
agent = p
@@ -381,9 +382,14 @@ def main() -> int:
381382
"--agent-dir", type=str, default="",
382383
help="External directory with agent oracles (alternative to in-place)",
383384
)
385+
parser.add_argument(
386+
"--agent-suffix", type=str, default="_agent",
387+
help="Suffix for agent oracle filenames (default: '_agent'). "
388+
"E.g., '_agent' finds oracle_answer_agent.json and ground_truth_agent.json",
389+
)
384390
parser.add_argument(
385391
"--report", type=str, default="",
386-
help="Output report JSON path",
392+
help="Output report JSON path (default: results/cross_validation/summary.json)",
387393
)
388394
parser.add_argument(
389395
"--verbose", action="store_true",
@@ -397,6 +403,7 @@ def main() -> int:
397403

398404
pairs = discover_comparison_pairs(
399405
suite=args.suite, agent_dir=args.agent_dir,
406+
agent_suffix=args.agent_suffix,
400407
)
401408

402409
if not pairs:
@@ -472,6 +479,10 @@ def main() -> int:
472479
total_agent_only = sum(len(r["agent_only"]) for r in per_task)
473480
total_matched = sum(r["n_matched"] for r in per_task)
474481

482+
# High-divergence tasks (F1 < 0.5)
483+
high_divergence = [r for r in per_task if r["f1"] < 0.5]
484+
high_divergence.sort(key=lambda r: r["f1"])
485+
475486
report = {
476487
"summary": {
477488
"total_tasks": len(per_task),
@@ -480,12 +491,14 @@ def main() -> int:
480491
"mean_file_precision": round(mean_precision, 4),
481492
"cohens_kappa": round(kappa, 4),
482493
"kappa_interpretation": _interpret_kappa(kappa),
494+
"agent_suffix": args.agent_suffix,
483495
},
484496
"divergence": {
485497
"total_matched_files": total_matched,
486498
"total_oracle_only_files": total_oracle_only,
487499
"total_agent_only_files": total_agent_only,
488500
},
501+
"high_divergence": high_divergence,
489502
"per_suite": suite_summary,
490503
"per_task": per_task,
491504
}
@@ -506,14 +519,24 @@ def main() -> int:
506519
print(f"\nPer-suite:")
507520
for s, m in sorted(suite_summary.items()):
508521
print(f" {s}: n={m['n']}, F1={m['mean_f1']:.4f} [{m['min_f1']:.4f}-{m['max_f1']:.4f}]")
522+
523+
if high_divergence:
524+
print(f"\nHigh-divergence tasks (F1 < 0.5): {len(high_divergence)}")
525+
for r in high_divergence[:10]:
526+
print(f" {r['task']}: F1={r['f1']:.4f} (oracle={r['n_oracle']}, agent={r['n_agent']})")
527+
if len(high_divergence) > 10:
528+
print(f" ... and {len(high_divergence) - 10} more")
509529
print(f"{'=' * 60}")
510530

511-
# Write report
512-
if args.report:
513-
out = Path(args.report)
514-
out.parent.mkdir(parents=True, exist_ok=True)
515-
out.write_text(json.dumps(report, indent=2) + "\n")
516-
log.info("Report written: %s", out)
531+
# Write report (default: results/cross_validation/summary.json)
532+
report_path = args.report
533+
if not report_path:
534+
root = find_project_root()
535+
report_path = str(root / "results" / "cross_validation" / "summary.json")
536+
out = Path(report_path)
537+
out.parent.mkdir(parents=True, exist_ok=True)
538+
out.write_text(json.dumps(report, indent=2) + "\n")
539+
log.info("Report written: %s", out)
517540

518541
return 0
519542

0 commit comments

Comments
 (0)