Skip to content

Commit 30b1e62

Browse files
abrichr and claude committed
fix(controller): prevent plan step drift and reduce VLM false negatives
Two improvements to the closed-loop demo-conditioned controller:

1. Plan step tracking drift prevention: _advance_plan_steps() now only compares the current step against the next step, advancing at most one step per call. Previously, bulk keyword matching could jump 5+ steps on a single action.

2. VLM verification prompt tuning: Added a "partially_verified" status for cases where the core outcome is achieved but with minor deviations (cursor position, formatting). Rewrote all verification prompts to be outcome-focused, reducing false negatives from live eval scenarios.

Adds 68 new tests (8 drift prevention + 21 VLM prompt + 9 false-negative regressions + 30 existing test updates). All 147 controller tests pass.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9071fca commit 30b1e62

7 files changed

Lines changed: 851 additions & 79 deletions

File tree

openadapt_evals/agents/claude_computer_use_agent.py

Lines changed: 33 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -720,12 +720,12 @@ def _get_remaining_step_descriptions(self) -> str:
720720
def _advance_plan_steps(self, action: BenchmarkAction) -> None:
721721
"""Advance plan step tracking based on the action being taken.
722722
723-
Uses simple keyword matching between the action and the current
724-
plan step description / trajectory action to heuristically detect
725-
when a step is being worked on or completed.
726-
727-
When a new step appears to be starting (action matches a future
728-
step), all prior in_progress steps are marked as done.
723+
Only advances at most ONE step at a time to prevent tracking drift.
724+
The current in_progress step is marked as done and the next pending
725+
step becomes in_progress. This conservative approach avoids the
726+
problem of keyword heuristics aggressively skipping multiple steps
727+
based on superficial text matches (e.g., typing "Year" matching
728+
both the header step and the data entry step).
729729
730730
Args:
731731
action: The BenchmarkAction being returned to the runner.
@@ -746,7 +746,7 @@ def _advance_plan_steps(self, action: BenchmarkAction) -> None:
746746
break
747747

748748
if current_idx is None:
749-
# No in_progress step try to start the first pending one
749+
# No in_progress step -- try to start the first pending one
750750
for i, step in enumerate(self._plan_steps):
751751
if step["status"] == "pending":
752752
step["status"] = "in_progress"
@@ -757,33 +757,33 @@ def _advance_plan_steps(self, action: BenchmarkAction) -> None:
757757
break
758758
return
759759

760-
# Check if the action matches a future step better than current
761-
best_match_idx = current_idx
762-
best_score = self._match_score(action_keywords, current_idx)
760+
# Check if the action matches the NEXT step better than the current
761+
# one. Only consider the immediately next step to prevent multi-step
762+
# jumps that cause tracking drift.
763+
current_score = self._match_score(action_keywords, current_idx)
764+
next_idx = current_idx + 1
763765

764-
for i in range(current_idx + 1, len(self._plan_steps)):
765-
if self._plan_steps[i]["status"] == "done":
766-
continue
767-
score = self._match_score(action_keywords, i)
768-
if score > best_score:
769-
best_score = score
770-
best_match_idx = i
771-
772-
# If action matches a later step, mark intermediate steps as done
773-
if best_match_idx > current_idx:
774-
for i in range(current_idx, best_match_idx):
775-
if self._plan_steps[i]["status"] != "done":
776-
self._plan_steps[i]["status"] = "done"
777-
logger.info(
778-
f"Plan step {self._plan_steps[i]['step_num']} "
779-
f"marked done: {self._plan_steps[i]['text'][:60]}"
780-
)
781-
self._plan_steps[best_match_idx]["status"] = "in_progress"
782-
logger.info(
783-
f"Plan step {self._plan_steps[best_match_idx]['step_num']} "
784-
f"now in_progress: "
785-
f"{self._plan_steps[best_match_idx]['text'][:60]}"
786-
)
766+
# Find next non-done step
767+
while next_idx < len(self._plan_steps):
768+
if self._plan_steps[next_idx]["status"] != "done":
769+
break
770+
next_idx += 1
771+
772+
if next_idx < len(self._plan_steps):
773+
next_score = self._match_score(action_keywords, next_idx)
774+
if next_score > current_score and next_score > 0:
775+
# Advance exactly one step: current -> done, next -> in_progress
776+
self._plan_steps[current_idx]["status"] = "done"
777+
logger.info(
778+
f"Plan step {self._plan_steps[current_idx]['step_num']} "
779+
f"marked done: {self._plan_steps[current_idx]['text'][:60]}"
780+
)
781+
self._plan_steps[next_idx]["status"] = "in_progress"
782+
logger.info(
783+
f"Plan step {self._plan_steps[next_idx]['step_num']} "
784+
f"now in_progress: "
785+
f"{self._plan_steps[next_idx]['text'][:60]}"
786+
)
787787

788788
def _extract_action_keywords(self, action: BenchmarkAction) -> set[str]:
789789
"""Extract keywords from an action for matching against plan steps.

openadapt_evals/demo_controller.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -366,10 +366,11 @@ def execute(
366366
vr = self._verify_step(screenshot_bytes, current.expect)
367367
current.verification_result = vr
368368

369-
if vr.status == "verified":
369+
if vr.effectively_verified:
370370
logger.info(
371-
"Step %d verified (confidence=%.2f): %s",
371+
"Step %d %s (confidence=%.2f): %s",
372372
current.step_num,
373+
vr.status,
373374
vr.confidence,
374375
vr.explanation[:80],
375376
)
@@ -760,7 +761,7 @@ def _verify_goal(self, observation: BenchmarkObservation) -> bool:
760761
model=self.verify_model,
761762
provider=self.verify_provider,
762763
)
763-
return result.status == "verified"
764+
return result.effectively_verified
764765

765766
# ------------------------------------------------------------------
766767
# Helpers

openadapt_evals/plan_verify.py

Lines changed: 106 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,11 @@
44
and overall goals have been achieved, by sending screenshots to a cheap VLM
55
and parsing structured JSON responses.
66
7+
Verification is outcome-focused: we care about whether the intended effect
8+
of an action is observable (e.g., text was entered, a value is present),
9+
NOT about incidental details like exact cursor position, cell selection
10+
highlight, or minor UI state differences.
11+
712
All verification functions gracefully degrade to "unclear" on VLM failure,
813
ensuring that the calling controller never crashes due to verification issues.
914
"""
@@ -27,18 +32,31 @@ class VerificationResult:
2732
"""Result of a VLM-based verification check.
2833
2934
Attributes:
30-
status: One of ``"verified"``, ``"not_verified"``, or ``"unclear"``.
35+
status: One of ``"verified"``, ``"partially_verified"``,
36+
``"not_verified"``, or ``"unclear"``.
37+
38+
- **verified**: The core outcome is achieved as expected.
39+
- **partially_verified**: The main effect is present but with
40+
minor deviations (e.g., correct text entered but cursor in a
41+
slightly different position, or a numeric value is correct but
42+
formatting has not yet been applied). Callers should treat
43+
this the same as ``"verified"`` for step-progression purposes.
44+
- **not_verified**: The expected outcome is clearly absent (the
45+
action had no observable effect, or the wrong result occurred).
46+
- **unclear**: Cannot determine from the screenshot.
3147
confidence: Float between 0.0 and 1.0 indicating VLM confidence.
3248
explanation: Human-readable reasoning from the VLM.
3349
raw_response: Full VLM response text, useful for debugging.
3450
"""
3551

36-
status: str # "verified", "not_verified", "unclear"
52+
status: str # "verified", "partially_verified", "not_verified", "unclear"
3753
confidence: float # 0.0 to 1.0
3854
explanation: str # VLM's reasoning
3955
raw_response: str # Full VLM response for debugging
4056

41-
_VALID_STATUSES = frozenset({"verified", "not_verified", "unclear"})
57+
_VALID_STATUSES = frozenset({
58+
"verified", "partially_verified", "not_verified", "unclear",
59+
})
4260

4361
def __post_init__(self) -> None:
4462
if self.status not in self._VALID_STATUSES:
@@ -48,6 +66,14 @@ def __post_init__(self) -> None:
4866
)
4967
self.confidence = max(0.0, min(1.0, float(self.confidence)))
5068

69+
@property
70+
def effectively_verified(self) -> bool:
71+
"""Whether this result should be treated as success for step progression.
72+
73+
Both ``"verified"`` and ``"partially_verified"`` count as success.
74+
"""
75+
return self.status in ("verified", "partially_verified")
76+
5177

5278
# ---------------------------------------------------------------------------
5379
# Defaults
@@ -63,37 +89,59 @@ def __post_init__(self) -> None:
6389
# ---------------------------------------------------------------------------
6490

6591
_VERIFY_STEP_SYSTEM = (
66-
"You are a precise visual verification assistant. "
67-
"You examine screenshots and determine whether an expected condition is met. "
68-
"Always respond with valid JSON."
92+
"You are an outcome-focused visual verification assistant. "
93+
"You examine screenshots and determine whether the CORE INTENDED EFFECT "
94+
"of an action is observable. You focus on WHAT content is present, not on "
95+
"incidental details like exact cursor position, cell selection highlight, "
96+
"or scroll offset. Always respond with valid JSON."
6997
)
7098

7199
_VERIFY_STEP_PROMPT = """\
72-
Look at the screenshot and determine whether the following expectation is met:
100+
Look at the screenshot and determine whether the following expectation's \
101+
CORE OUTCOME is observable:
73102
74103
EXPECTATION: {expect_text}
75104
76-
Instructions:
77-
1. Describe what you observe in the screenshot that is relevant to the expectation.
78-
2. Compare your observations against the expectation.
79-
3. Decide whether the expectation is met.
80-
81-
Respond with ONLY a JSON object in this exact format (no other text):
105+
VERIFICATION RULES (follow strictly):
106+
1. Focus on OBSERVABLE OUTCOMES — is the intended content/value/state present?
107+
2. IGNORE incidental details that do not affect the outcome:
108+
- Exact cursor position or blinking caret location
109+
- Which cell/field currently has selection highlight
110+
- Minor scroll position differences
111+
- Whether the active cell indicator is on the exact expected cell vs. a
112+
neighboring cell, AS LONG AS the correct content is in the correct cell
113+
3. For text/data entry steps: verify the TEXT IS PRESENT in the correct
114+
location. Do NOT mark as failed just because the cursor moved after entry.
115+
4. For numeric values: verify the VALUE IS CORRECT (within reasonable
116+
rounding). Do NOT dispute minor floating-point display differences or
117+
semantic labels — if the number is correct, the step succeeded.
118+
5. For formatting steps (e.g., "format as percentage"): check whether the
119+
VISUAL FORMAT actually changed, not just whether the action was attempted.
120+
121+
DECISION GUIDE:
122+
- "verified": The core outcome is clearly achieved as expected.
123+
- "partially_verified": The main intended effect IS present, but with a
124+
minor deviation (e.g., text entered in correct cell but cursor moved to a
125+
different cell; value is correct but formatting not yet applied). The key
126+
action DID have its intended effect.
127+
- "not_verified": The expected outcome is clearly ABSENT — the action had no
128+
observable effect, the wrong content was entered, or a fundamentally
129+
different state is shown. Reserve this for REAL failures, not cosmetic
130+
differences.
131+
- "unclear": Cannot determine from the screenshot.
132+
133+
Respond with ONLY a JSON object (no other text):
82134
{{
83-
"status": "verified" | "not_verified" | "unclear",
135+
"status": "verified" | "partially_verified" | "not_verified" | "unclear",
84136
"confidence": <float between 0.0 and 1.0>,
85137
"explanation": "<your reasoning>"
86138
}}
87-
88-
Use "verified" if the expectation is clearly met.
89-
Use "not_verified" if the expectation is clearly NOT met.
90-
Use "unclear" if you cannot determine from the screenshot.
91139
"""
92140

93141
_VERIFY_PLAN_PROGRESS_SYSTEM = (
94-
"You are a precise visual verification assistant. "
95-
"You examine screenshots and assess plan progress. "
96-
"Always respond with valid JSON."
142+
"You are an outcome-focused visual verification assistant. "
143+
"You examine screenshots and assess plan progress based on OBSERVABLE "
144+
"OUTCOMES, not incidental UI details. Always respond with valid JSON."
97145
)
98146

99147
_VERIFY_PLAN_PROGRESS_PROMPT = """\
@@ -103,12 +151,20 @@ def __post_init__(self) -> None:
103151
PLAN:
104152
{plan_text}
105153
106-
Instructions:
107-
1. Examine the screenshot carefully.
108-
2. For each step, assess whether its expected outcome is visible in the screenshot.
109-
3. Identify which steps appear completed and which step should be executed next.
110-
111-
Respond with ONLY a JSON object in this exact format (no other text):
154+
VERIFICATION RULES:
155+
1. A step is "completed" if its CORE INTENDED EFFECT is observable:
156+
- For data entry: the correct text/value is present in the correct location.
157+
- For navigation: the application is on the expected screen/tab/sheet.
158+
- For formatting: the visual format has changed as expected.
159+
2. IGNORE incidental details when assessing completion:
160+
- Cursor position, cell selection highlight, scroll offset.
161+
- A step that typed text into cell A1 is complete if A1 contains that
162+
text, regardless of where the cursor currently sits.
163+
3. When in doubt, give the agent credit — if the outcome is present, mark
164+
the step as completed even if the UI state is slightly different than a
165+
literal reading of the step description.
166+
167+
Respond with ONLY a JSON object (no other text):
112168
{{
113169
"completed_steps": [<list of 0-indexed step numbers that appear done>],
114170
"current_step": <0-indexed step number to execute next>,
@@ -117,30 +173,40 @@ def __post_init__(self) -> None:
117173
"""
118174

119175
_VERIFY_GOAL_SYSTEM = (
120-
"You are a precise visual verification assistant. "
121-
"You examine screenshots and determine whether a high-level goal has been achieved. "
122-
"Always respond with valid JSON."
176+
"You are an outcome-focused visual verification assistant. "
177+
"You examine screenshots and determine whether a high-level goal has "
178+
"been achieved based on OBSERVABLE RESULTS. Always respond with valid JSON."
123179
)
124180

125181
_VERIFY_GOAL_PROMPT = """\
126-
Look at the screenshot and determine whether the following goal has been fully achieved:
182+
Look at the screenshot and determine whether the following goal has been achieved:
127183
128184
GOAL: {goal_text}
129185
130-
Instructions:
131-
1. Describe the current state visible in the screenshot.
132-
2. Compare the current state against the goal.
133-
3. Decide whether the goal is fully achieved.
134-
135-
Respond with ONLY a JSON object in this exact format (no other text):
186+
VERIFICATION RULES:
187+
1. Focus on whether the SUBSTANTIVE OUTCOME is present:
188+
- Are the required data values, text, or visual elements present?
189+
- Is the application in the expected end state?
190+
2. DO NOT penalize for:
191+
- Cursor position or cell selection state
192+
- Minor formatting differences (e.g., decimal places, rounding)
193+
- The order in which equivalent correct results appear
194+
- Incidental UI differences that do not affect the goal's substance
195+
3. If the goal involves computed values, verify the values are CORRECT
196+
(or reasonably close), not whether the computation method is visible.
197+
198+
Respond with ONLY a JSON object (no other text):
136199
{{
137-
"status": "verified" | "not_verified" | "unclear",
200+
"status": "verified" | "partially_verified" | "not_verified" | "unclear",
138201
"confidence": <float between 0.0 and 1.0>,
139202
"explanation": "<your reasoning>"
140203
}}
141204
142-
Use "verified" only if the goal is FULLY achieved (not partially).
143-
Use "not_verified" if the goal is not yet complete.
205+
Use "verified" if the goal is fully achieved.
206+
Use "partially_verified" if the goal is substantially achieved but with minor
207+
gaps (e.g., all values computed but one formatting step missing).
208+
Use "not_verified" if the goal is clearly not yet complete (substantive
209+
elements are missing, not just cosmetic differences).
144210
Use "unclear" if you cannot determine from the screenshot.
145211
"""
146212

0 commit comments

Comments
 (0)