Skip to content

Commit 9f8ffe8

Browse files
abrichr and claude committed
fix: prevent heuristic/verifier drift and surface partial steps in goal verification
Three issues addressed:

1. Heuristic/verifier step drift: The agent's keyword-based _advance_plan_steps() heuristic and the DemoController's VLM verifier operated on independent state, allowing them to disagree on which step was current. Fix: add an _external_step_control flag to the agent that the DemoController sets at init, making _advance_plan_steps() a no-op when the controller manages step progression via VLM verification.

2. partially_verified invisible to goal verification: When steps were marked partially_verified, the final goal verification pass had no visibility into which steps had partial completions. Fix: _verify_goal() now builds a step verification summary and augments the goal text with it when noteworthy statuses (partially_verified, failed) exist.

3. Missing integration tests: Added TestHeuristicVerifierSync (4 tests) and TestGoalVerificationContext (5 tests) that verify the heuristic is properly disabled under controller management, step advancement is driven by VLM verification, and partial/failed step context reaches goal verification. Also added 2 agent-level tests for _external_step_control behavior.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d498bca commit 9f8ffe8

4 files changed

Lines changed: 405 additions & 2 deletions

File tree

openadapt_evals/agents/claude_computer_use_agent.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,12 @@ def __init__(
315315
self._goal: str = ""
316316
self._consecutive_done_overrides: int = 0
317317

318+
# When True, _advance_plan_steps() is a no-op. The DemoController
319+
# sets this flag so that step progression is driven exclusively by
320+
# VLM verification, preventing drift between the agent's keyword
321+
# heuristic and the controller's verifier.
322+
self._external_step_control: bool = False
323+
318324
if self._parsed_demo:
319325
self._goal = self._parsed_demo["goal"]
320326
self._trajectory = self._parsed_demo["trajectory"]
@@ -727,9 +733,16 @@ def _advance_plan_steps(self, action: BenchmarkAction) -> None:
727733
based on superficial text matches (e.g., typing "Year" matching
728734
both the header step and the data entry step).
729735
736+
When ``_external_step_control`` is True (set by :class:`DemoController`),
737+
this method is a no-op because step progression is managed by VLM
738+
verification in the controller.
739+
730740
Args:
731741
action: The BenchmarkAction being returned to the runner.
732742
"""
743+
if self._external_step_control:
744+
return
745+
733746
if not self._plan_steps:
734747
return
735748

openadapt_evals/demo_controller.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,13 @@ def __init__(
155155
# Parse the demo into a structured plan
156156
self.plan_state = self._parse_demo(demo_text)
157157

158+
# Disable the agent's internal heuristic step advancement so that
159+
# step progression is driven exclusively by VLM verification here
160+
# in the controller. This prevents drift between the agent's
161+
# keyword-based heuristic and the controller's verifier.
162+
if hasattr(agent, "_external_step_control"):
163+
agent._external_step_control = True
164+
158165
logger.info(
159166
"DemoController initialized: goal=%r, %d plan steps, %d trajectory steps",
160167
self.plan_state.goal[:80],
@@ -742,7 +749,9 @@ def _verify_step(
742749
def _verify_goal(self, observation: BenchmarkObservation) -> bool:
743750
"""Verify whether the overall goal has been achieved.
744751
745-
Delegates to :func:`plan_verify.verify_goal_completion`.
752+
Delegates to :func:`plan_verify.verify_goal_completion`, augmenting
753+
the goal text with a summary of per-step verification outcomes so
754+
that the VLM knows which steps were only ``partially_verified``.
746755
747756
Args:
748757
observation: Current observation with screenshot.
@@ -755,14 +764,59 @@ def _verify_goal(self, observation: BenchmarkObservation) -> bool:
755764
logger.warning("No screenshot for goal verification; assuming not done")
756765
return False
757766

767+
# Build step verification summary so goal verifier is aware of
768+
# partial completions and failures.
769+
step_summary = self._build_step_verification_summary()
770+
augmented_goal = self.plan_state.goal
771+
if step_summary:
772+
augmented_goal = (
773+
f"{self.plan_state.goal}\n\n"
774+
f"STEP VERIFICATION SUMMARY (for context):\n{step_summary}"
775+
)
776+
758777
result = verify_goal_completion(
759778
screenshot_bytes,
760-
self.plan_state.goal,
779+
augmented_goal,
761780
model=self.verify_model,
762781
provider=self.verify_provider,
763782
)
764783
return result.effectively_verified
765784

785+
def _build_step_verification_summary(self) -> str:
786+
"""Build a concise summary of per-step verification outcomes.
787+
788+
Returns:
789+
A multi-line string summarising each step's verification status
790+
and any partial-verification explanations, or an empty string
791+
if there is nothing noteworthy to report.
792+
"""
793+
lines: list[str] = []
794+
has_noteworthy = False
795+
796+
for step in self.plan_state.steps:
797+
vr = step.verification_result
798+
if vr is None:
799+
status_text = step.status
800+
else:
801+
status_text = vr.status
802+
if vr.status == "partially_verified":
803+
has_noteworthy = True
804+
805+
line = f" Step {step.step_num} ({step.action[:60]}): {status_text}"
806+
if vr and vr.status == "partially_verified":
807+
line += f" -- {vr.explanation[:120]}"
808+
elif step.status == "failed":
809+
has_noteworthy = True
810+
if vr:
811+
line += f" -- {vr.explanation[:120]}"
812+
lines.append(line)
813+
814+
if not has_noteworthy:
815+
# All steps fully verified or done; no extra context needed
816+
return ""
817+
818+
return "\n".join(lines)
819+
766820
# ------------------------------------------------------------------
767821
# Helpers
768822
# ------------------------------------------------------------------

tests/test_claude_computer_use_agent.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,3 +1084,37 @@ def test_drift_scenario_from_live_eval(self, agent_with_multilevel_demo):
10841084
assert agent._plan_steps[2]["status"] == "pending"
10851085
assert agent._plan_steps[3]["status"] == "pending"
10861086
assert agent._plan_steps[4]["status"] == "pending"
1087+
1088+
def test_external_step_control_suppresses_heuristic(
    self, agent_with_multilevel_demo
):
    """External step control turns ``_advance_plan_steps`` into a no-op.

    With ``_external_step_control`` enabled (the flag DemoController sets),
    an action that the keyword heuristic would otherwise treat as
    completing step 1 must leave every plan step status untouched.
    """
    controlled = agent_with_multilevel_demo

    def status_of(index):
        # Read the status at call time so each check sees current state.
        return controlled._plan_steps[index]["status"]

    # Precondition: the first plan step is the active one.
    assert status_of(0) == "in_progress"

    # Hand step progression over to the external controller.
    controlled._external_step_control = True

    # Typing "Year" would normally advance step 1 -> step 2.
    typed_text = "Year"
    claude_payload = {"action": "type", "text": typed_text}
    controlled._advance_plan_steps(
        BenchmarkAction(
            type="type",
            text=typed_text,
            raw_action={"claude_action": claude_payload},
        )
    )

    # The heuristic was suppressed, so statuses are unchanged.
    assert status_of(0) == "in_progress"
    assert status_of(1) == "pending"
1113+
def test_external_step_control_default_false(
    self, agent_with_multilevel_demo
):
    """A freshly built agent has ``_external_step_control`` set to False.

    This keeps the agent's own step heuristic active when it runs
    standalone, i.e. without a DemoController enabling the flag.
    """
    flag = agent_with_multilevel_demo._external_step_control
    # Identity check: the default must be the literal False, not merely falsy.
    assert flag is False

0 commit comments

Comments
 (0)