Skip to content

Commit 38f8e33

Browse files
abrichr and claude
authored
fix: remove stale health-gate args and add done-gate passthrough in core4_eval.py (#111)
core4_eval.py was passing --transport-error-threshold, --health-samples, --health-min-success, and --health-sample-delay to run_dc_eval.py, but run_dc_eval.py does not define those arguments (they came from uncommitted Codex changes). This commit also adds --done-gate passthrough to match PR #110.
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent efc108f commit 38f8e33

2 files changed

Lines changed: 14 additions & 13 deletions

File tree

.beads/issues.jsonl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,5 +13,5 @@
1313
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
1414
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","notes":"wright repo (OpenAdaptAI/wright) scaffolding underway. Herald + consilium repos transferred to OpenAdaptAI org. Wright will be the orchestration layer for eval pipeline.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T00:08:08.422633-05:00"}
1515
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
16-
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-01: GPU grant applications reviewed and rewritten (11 files). Writing done, blocked on eval results (DC signal on harder tasks). Detailed status tracked in openadapt-internal (private repo).","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-02T17:11:02.757913-05:00"}
16+
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"2026-03-06: Core4 Trial 1 launched with --controller --done-gate --max-steps 30 (first ever run with both features). Prior 7 trials showed DC=14% vs ZS=18% — no lift. Root causes: (1) --controller was NEVER used, (2) no done-gate existed. PRs merged this session: #107 (readiness), #109 (core4 lane), #110 (done-gate). Results will be in benchmark_results/repeat_core4_trial1_20260306_154155/","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-03-06T15:42:14.015601-05:00"}
1717
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}

scripts/core4_eval.py

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -85,14 +85,6 @@ def cmd_run(args: argparse.Namespace) -> int:
8585
args.agent,
8686
"--vm-user",
8787
args.vm_user,
88-
"--transport-error-threshold",
89-
str(args.transport_error_threshold),
90-
"--health-samples",
91-
str(args.health_samples),
92-
"--health-min-success",
93-
str(args.health_min_success),
94-
"--health-sample-delay",
95-
str(args.health_sample_delay),
9688
]
9789
if args.vm_ip:
9890
cmd.extend(["--vm-ip", args.vm_ip])
@@ -112,6 +104,16 @@ def cmd_run(args: argparse.Namespace) -> int:
112104
str(args.max_replans),
113105
]
114106
)
107+
if args.done_gate:
108+
cmd.extend(
109+
[
110+
"--done-gate",
111+
"--done-gate-max-overrides",
112+
str(args.done_gate_max_overrides),
113+
"--done-gate-threshold",
114+
str(args.done_gate_threshold),
115+
]
116+
)
115117

116118
print(f"\n=== Trial {t} -> {out} ===")
117119
rc = _run_cmd(cmd, cwd=repo_root, dry_run=args.dry_run)
@@ -184,15 +186,14 @@ def build_parser() -> argparse.ArgumentParser:
184186
run.add_argument("--vm-ip", default=None)
185187
run.add_argument("--vm-user", default="azureuser")
186188
run.add_argument("--start-from", type=int, default=0)
187-
run.add_argument("--transport-error-threshold", type=int, default=8)
188-
run.add_argument("--health-samples", type=int, default=3)
189-
run.add_argument("--health-min-success", type=int, default=2)
190-
run.add_argument("--health-sample-delay", type=float, default=1.5)
191189
run.add_argument("--zs-only", action="store_true")
192190
run.add_argument("--dc-only", action="store_true")
193191
run.add_argument("--controller", action="store_true")
194192
run.add_argument("--max-retries", type=int, default=2)
195193
run.add_argument("--max-replans", type=int, default=2)
194+
run.add_argument("--done-gate", action="store_true")
195+
run.add_argument("--done-gate-max-overrides", type=int, default=3)
196+
run.add_argument("--done-gate-threshold", type=float, default=1.0)
196197
run.add_argument("--continue-on-fail", action="store_true")
197198
run.add_argument("--dry-run", action="store_true")
198199
run.set_defaults(func=cmd_run)

0 commit comments

Comments
 (0)