Skip to content

Commit 7896051

Browse files
abrichr and claude authored
feat: migrate annotation pipeline from openadapt-ml to openadapt-evals (#64)
* feat: migrate annotation pipeline from openadapt-ml to openadapt-evals

  Move annotation data classes, prompts, and utilities into openadapt_evals.annotation and consolidate three separate VLM call implementations into a shared openadapt_evals.vlm module.

  - New openadapt_evals/vlm.py: unified vlm_call() supporting consilium council, OpenAI, and Anthropic; extract_json() for LLM output parsing; image_bytes_from_path() helper
  - New openadapt_evals/annotation.py: AnnotatedStep/AnnotatedDemo data classes, ANNOTATION_SYSTEM_PROMPT/ANNOTATION_STEP_PROMPT constants, parse_annotation_response(), validate_annotations(), format_annotated_demo()
  - Updated scripts/record_waa_demos.py cmd_annotate_waa() to import from openadapt_evals instead of openadapt_ml
  - Updated scripts/refine_demo.py to use shared vlm_call/extract_json, refactored message builders to prompt+images interface
  - Updated scripts/convert_recording_to_demo.py to use shared vlm_call
  - 16 new tests in tests/test_annotation.py, all existing tests pass

  Closes #59

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: remove unused import and hoist model resolution in convert_recording_to_demo

  - Remove unused `import os` from openadapt_evals/vlm.py
  - Move `resolved_model` computation before the for-loop in convert_vlm() so it's computed once instead of redundantly inside each step's try block

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: add timeouts, fix temperature regression, remove dead api_key param

  - vlm.py: add timeout=120s to OpenAI/Anthropic SDK clients to prevent indefinite hangs (old code had explicit timeouts via requests)
  - vlm.py: pass system prompt separately to consilium council_query() instead of concatenating into user prompt
  - refine_demo.py: explicitly pass temperature=1.0 to vlm_call() in holistic and per-step review to match old behavior (vlm_call defaults to 0.1 which would be an unintended behavioral change)
  - refine_demo.py: remove dead api_key parameter from run_holistic_review, run_per_step_review, refine_recording, and main() — vlm_call() reads API keys from environment via the SDK

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 754b3ee commit 7896051

7 files changed

Lines changed: 772 additions & 343 deletions

File tree

openadapt_evals/annotation.py

Lines changed: 254 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,254 @@
1+
"""Annotation data classes, prompts, and utilities for WAA demo annotation.
2+
3+
Migrated from ``openadapt_ml.experiments.demo_prompt.annotate`` so that the
4+
eval workflow (record -> annotate -> demo -> agent -> evaluate) does not
5+
require a cross-repo dependency on ``openadapt-ml`` for annotation.
6+
7+
What stays in openadapt-ml:
8+
- ``coalesce_steps()`` — depends on Episode schema, used by training
9+
- ``annotate_episode()`` — for local capture annotation (not WAA)
10+
- ``render_click_marker()`` — used by ``annotate_episode()``
11+
- CLI commands for local capture annotation
12+
- ``format_demo.py`` — used by multiple openadapt-ml modules
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import json
18+
import logging
19+
import re
20+
from dataclasses import asdict, dataclass
21+
from pathlib import Path
22+
from typing import Any
23+
24+
logger = logging.getLogger(__name__)
25+
26+
SCHEMA_VERSION = "0.1"
27+
28+
29+
# ---------------------------------------------------------------------------
30+
# Data model
31+
# ---------------------------------------------------------------------------
32+
33+
34+
@dataclass
class AnnotatedStep:
    """A single annotated step in a demo trace.

    The text fields ``observation``, ``intent``, ``action``,
    ``result_observation`` and ``expected_result`` carry the same names as
    the JSON keys requested from the VLM in ``ANNOTATION_STEP_PROMPT``.
    Field order is part of the constructor signature and of the serialized
    layout produced by ``dataclasses.asdict``.
    """

    # Position of the step in the demo; ``format_annotated_demo`` displays it
    # as ``step_index + 1``.
    step_index: int
    # NOTE(review): presumably milliseconds; None when no timestamp is
    # available — confirm the reference point against the recorder.
    timestamp_ms: int | None
    # What is visible on screen before the action.
    observation: str
    # Why the user performs the action.
    intent: str
    # Semantic description of the interaction; ``validate_annotations`` warns
    # when it still contains a raw ``CLICK(0.x, 0.y)`` call.
    action: str
    # NOTE(review): presumably the recorded action text that fills the
    # ``{action_raw}`` placeholder of the step prompt — confirm with caller.
    action_raw: str
    # NOTE(review): presumably pixel coordinates of the interaction, None
    # when the action has no position — confirm element order and meaning.
    action_px: list[int] | None
    # What actually changed after the action.
    result_observation: str
    # What the screen should look like after the action.
    expected_result: str
47+
48+
49+
@dataclass
class AnnotatedDemo:
    """A fully annotated demo trace, produced by VLM annotation.

    Bundles task-level metadata with the ordered list of annotated steps and
    provides JSON round-tripping via ``to_json`` / ``save`` / ``load``.
    """

    schema_version: str
    task_id: str | None
    instruction: str
    source: str  # "recorded"
    annotator: dict[str, str]
    recording_meta: dict[str, Any]
    steps: list[AnnotatedStep]

    def to_json(self, indent: int = 2) -> str:
        """Return the demo serialized as a JSON string.

        Args:
            indent: Indentation width passed to ``json.dumps``.

        Returns:
            JSON text of the dataclass converted with ``asdict``; values that
            are not JSON-serializable are converted with ``str``.
        """
        payload = asdict(self)
        return json.dumps(payload, indent=indent, default=str)

    def save(self, path: str | Path) -> None:
        """Write the demo as UTF-8 JSON to *path*, creating parent directories.

        Args:
            path: Destination file path.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(self.to_json(), encoding="utf-8")
        logger.info(f"Saved annotated demo to {target}")

    @classmethod
    def load(cls, path: str | Path) -> AnnotatedDemo:
        """Read a demo from a UTF-8 JSON file.

        Args:
            path: File containing a JSON object with a ``steps`` list and the
                remaining dataclass fields as keys.

        Returns:
            The reconstructed ``AnnotatedDemo`` with each step converted to an
            ``AnnotatedStep``.

        Raises:
            KeyError: If the JSON object has no ``steps`` key.
            TypeError: If keys do not match the dataclass fields.
        """
        raw = json.loads(Path(path).read_text(encoding="utf-8"))
        # Steps are stored as plain dicts and must be rebuilt separately.
        step_records = raw.pop("steps")
        return cls(
            steps=[AnnotatedStep(**record) for record in step_records],
            **raw,
        )
76+
77+
78+
# ---------------------------------------------------------------------------
79+
# Prompts
80+
# ---------------------------------------------------------------------------
81+
82+
ANNOTATION_SYSTEM_PROMPT = """\
83+
You are annotating a human GUI demonstration for a task automation system.
84+
Your annotations will be used to guide an AI agent performing the same task on a different screen.
85+
Be precise about UI element names, labels, and visual landmarks.
86+
Always respond with valid JSON only — no markdown, no extra text."""
87+
88+
ANNOTATION_STEP_PROMPT = """\
89+
Task: {instruction}
90+
Step {step_num} of {total_steps}.
91+
92+
The user performed: {action_raw}
93+
A red marker on the BEFORE image shows where the user clicked/interacted.
94+
95+
{previous_context}
96+
Describe this step:
97+
98+
- OBSERVATION: Describe what is visible on the BEFORE image. Include:
99+
- Application/window name
100+
- Current panel, page, or dialog
101+
- 3-6 key visible UI elements with relative positions
102+
103+
- INTENT: Why is the user performing this action? (1 sentence)
104+
105+
- ACTION: Describe which element was interacted with. Name the element by its visible label/text, not by coordinates. Reference the red marker to identify the target.
106+
107+
- RESULT: Describe what actually changed between the BEFORE and AFTER images.{no_after_note}
108+
109+
- EXPECTED_RESULT: What should the screen look like after this action?
110+
111+
Respond with valid JSON only:
112+
{{"observation": "...", "intent": "...", "action": "...", "result_observation": "...", "expected_result": "..."}}"""
113+
114+
115+
# ---------------------------------------------------------------------------
116+
# Parsing
117+
# ---------------------------------------------------------------------------
118+
119+
120+
def _as_json_object(candidate: str) -> dict[str, str] | None:
    """Decode *candidate* as JSON and return it only if it is a JSON object."""
    try:
        decoded = json.loads(candidate)
    except json.JSONDecodeError:
        return None
    # Valid JSON that is not an object (list, string, number, null) cannot
    # provide annotation fields, so treat it like a parse failure.
    return decoded if isinstance(decoded, dict) else None


def parse_annotation_response(response: str) -> dict[str, str]:
    """Parse VLM JSON response, tolerant of minor formatting issues.

    Parsing is attempted in this order:
    1. The whole response, after removing markdown code-fence lines when the
       response starts with a fence.
    2. The substring from the first ``{`` to the last ``}``.

    A candidate is accepted only if it decodes to a JSON object, so the
    return value is always a dict.

    Args:
        response: Raw text returned by the VLM.

    Returns:
        The decoded JSON object. If nothing can be decoded, a dict with the
        five annotation keys where ``observation`` holds the first 200
        characters of the cleaned response and the other values are empty.
    """
    text = response.strip()

    # Strip markdown code fences if present
    if text.startswith("```"):
        text = "\n".join(
            line for line in text.split("\n") if not line.strip().startswith("```")
        ).strip()

    parsed = _as_json_object(text)
    if parsed is not None:
        return parsed

    # Try to extract a JSON object embedded in surrounding text
    start = text.find("{")
    end = text.rfind("}") + 1
    if start >= 0 and end > start:
        parsed = _as_json_object(text[start:end])
        if parsed is not None:
            return parsed

    logger.warning(f"Failed to parse VLM response as JSON: {text[:200]}")
    return {
        "observation": text[:200] if text else "",
        "intent": "",
        "action": "",
        "result_observation": "",
        "expected_result": "",
    }
151+
152+
153+
# ---------------------------------------------------------------------------
154+
# Validation
155+
# ---------------------------------------------------------------------------
156+
157+
158+
def validate_annotations(demo: AnnotatedDemo) -> list[str]:
    """Check annotation quality. Returns list of warnings.

    Checks performed for every step:
    - ``observation``, ``intent`` and ``action`` are non-empty
    - at least one of ``result_observation`` / ``expected_result`` is set
    - ``action`` does not contain a raw ``CLICK(0.x, 0.y)`` call (it should
      be a semantic description)
    - for Windows recordings, ``action`` and ``observation`` do not use
      macOS-specific terms

    Args:
        demo: Annotated demo whose ``steps`` and ``recording_meta`` are read.

    Returns:
        Warning strings, each prefixed with ``"Step <step_index>"``. Empty
        when no issue is found.
    """
    warnings: list[str] = []

    # The platform belongs to the recording, not the step: resolve it once.
    # ``or ""`` covers both a missing key and an explicit None value.
    platform = str((demo.recording_meta or {}).get("platform") or "").lower()
    # "darwin" contains "win"; exclude it so macOS recordings are not
    # mistaken for Windows ones.
    is_windows = "win" in platform and "darwin" not in platform

    # Whole-word patterns so that e.g. "docked" or "Docker" do not count as
    # the macOS "dock". Terms ending in "+" only need a leading boundary.
    mac_patterns = {
        term: re.compile(
            r"\b" + re.escape(term) + (r"\b" if term.isalpha() else "")
        )
        for term in ("finder", "dock", "spotlight", "cmd+", "command+")
    }

    for step in demo.steps:
        prefix = f"Step {step.step_index}"

        if not step.observation:
            warnings.append(f"{prefix}: empty observation")
        if not step.intent:
            warnings.append(f"{prefix}: empty intent")
        if not step.action:
            warnings.append(f"{prefix}: empty action")
        if not step.result_observation and not step.expected_result:
            warnings.append(f"{prefix}: no result_observation or expected_result")

        # Check if action still contains raw coordinates
        if re.search(r"CLICK\(\s*0\.\d+\s*,\s*0\.\d+\s*\)", step.action):
            warnings.append(f"{prefix}: action contains raw coordinates: {step.action}")

        # Check for platform mismatches (Windows recording described with macOS terms)
        if is_windows:
            described = step.action.lower() + " " + step.observation.lower()
            for term, pattern in mac_patterns.items():
                if pattern.search(described):
                    warnings.append(
                        f"{prefix}: macOS term '{term}' in Windows recording"
                    )

    return warnings
197+
198+
199+
# ---------------------------------------------------------------------------
200+
# Formatting for prompt injection
201+
# ---------------------------------------------------------------------------
202+
203+
204+
def format_annotated_demo(demo: AnnotatedDemo, compact: bool = True) -> str:
    """Render an AnnotatedDemo as plain text for prompt injection.

    Each step is printed with a 1-based number followed by bracketed
    ``Screen``, ``Action`` and (when available) ``Result`` lines. The compact
    form shortens the screen description to its first sentence; the full form
    prints the whole observation and adds an ``Intent`` line.

    Args:
        demo: AnnotatedDemo to format.
        compact: If True, use compact format for agent prompt.

    Returns:
        Formatted demo string.
    """
    out = ["DEMONSTRATION:", f"Goal: {demo.instruction}", ""]

    for step in demo.steps:
        out.append(f"Step {step.step_index + 1}:")

        if not compact:
            out.extend(
                (
                    f" [Screen: {step.observation}]",
                    f" [Intent: {step.intent}]",
                )
            )
        else:
            # Only the first sentence of the observation keeps the prompt short
            summary = _first_sentence(step.observation)
            out.append(f" [Screen: {summary}]")

        out.append(f" [Action: {step.action}]")

        # The grounded result wins; the expected result is only a fallback
        if step.result_observation:
            outcome = step.result_observation
        else:
            outcome = step.expected_result
        if outcome:
            out.append(f" [Result: {outcome}]")

        out.append("")

    return "\n".join(out)
244+
245+
246+
def _first_sentence(text: str) -> str:
247+
"""Extract first sentence from text."""
248+
if not text:
249+
return ""
250+
# Split on period followed by space or end
251+
for i, ch in enumerate(text):
252+
if ch == "." and (i + 1 >= len(text) or text[i + 1] == " "):
253+
return text[: i + 1]
254+
return text

0 commit comments

Comments
 (0)