Skip to content

Commit 505eb05

Browse files
abrichr and claude authored
fix: browser capture end-to-end pipeline (#15)
* fix: browser capture end-to-end pipeline

  Three bugs prevented browser events from being captured and parsed:

  1. background.js only relayed DOM_EVENT messages but the content script sends USER_EVENT — events were silently dropped.
  2. background.js handleSetMode only read message.payload?.mode but the recorder sends flat {mode: "record"} — mode was never set to "record" so the content script never attached record listeners.
  3. The BrowserEventType enum used "browser.click" prefix format but the content script sends raw DOM event names ("click", "keydown", etc.). This was an artificial convention introduced during the port from legacy OpenAdapt that was never tested end-to-end. Legacy used raw names throughout.

  Changes:

  - background.js: add USER_EVENT relay, fix SET_MODE format handling
  - browser_events.py: change enum values to raw DOM names matching the content script and legacy OpenAdapt, add BrowserMouseMoveEvent
  - capture.py: add _parse_element_ref() and rewrite _convert_browser_event() to handle actual content-script message format including the recorder's {"message": <raw>} wrapper, add browser_events() and browser_event_count to CaptureSession
  - cli.py: add --browser-events flag to record, show browser event breakdown in info command
  - tests: add 15 e2e tests covering both DB roundtrip and raw content-script format parsing

  Verified with live recording: 84/84 events captured and parsed from Chrome extension on Hacker News.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: clean up stale docstring and unused import

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: address review feedback

  - Replace bare except with debug logging in _convert_browser_event
  - Move lazy imports to module level (BoundingBox, ElementState, etc.)
  - Remove unused imports (pytest, Recording) from test file
  - Update test class names to reflect structure tested, not removed format
  - Fix stale docstring in _parse_element_ref

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: remove unused BrowserEventType import from tests

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d4b651f commit 505eb05

5 files changed

Lines changed: 852 additions & 11 deletions

File tree

chrome_extension/background.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ function handleServerMessage(data) {
137137
* Handle SET_MODE message from server
138138
*/
139139
function handleSetMode(message) {
140-
const newMode = message.payload?.mode || 'idle';
140+
const newMode = message.mode || message.payload?.mode || 'idle';
141141
currentMode = newMode;
142142
console.log('[OpenAdapt] Mode set to:', currentMode);
143143

@@ -231,6 +231,7 @@ chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
231231
const tabId = sender.tab?.id;
232232

233233
switch (message.type) {
234+
case 'USER_EVENT':
234235
case 'DOM_EVENT':
235236
// Add tab ID and relay to server
236237
message.tabId = tabId;

openadapt_capture/browser_events.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,22 +18,27 @@
1818

1919

2020
class BrowserEventType(str, Enum):
21-
"""Browser event type identifiers."""
21+
"""Browser event type identifiers.
22+
23+
Values match the raw DOM event names sent by the Chrome extension
24+
content script (e.g. "click", "keydown"), consistent with legacy OpenAdapt.
25+
"""
2226

2327
# User interaction events
24-
CLICK = "browser.click"
25-
KEYDOWN = "browser.keydown"
26-
KEYUP = "browser.keyup"
27-
SCROLL = "browser.scroll"
28-
INPUT = "browser.input"
29-
FOCUS = "browser.focus"
30-
BLUR = "browser.blur"
28+
CLICK = "click"
29+
KEYDOWN = "keydown"
30+
KEYUP = "keyup"
31+
SCROLL = "scroll"
32+
INPUT = "input"
33+
MOUSEMOVE = "mousemove"
34+
FOCUS = "focus"
35+
BLUR = "blur"
3136

3237
# Navigation events
33-
NAVIGATE = "browser.navigate"
38+
NAVIGATE = "navigate"
3439

3540
# Unknown/generic
36-
UNKNOWN = "browser.unknown"
41+
UNKNOWN = "unknown"
3742

3843

3944
class NavigationType(str, Enum):
@@ -242,6 +247,21 @@ class BrowserNavigationEvent(BaseBrowserEvent):
242247
# =============================================================================
243248

244249

250+
class BrowserMouseMoveEvent(BaseBrowserEvent):
    """Mouse move event in browser.

    Holds the pointer coordinates reported with a ``mousemove`` event and an
    optional reference to the target element. Viewport coordinates have no
    declared default; screen coordinates default to 0 and the element
    reference defaults to ``None``.
    """

    # Fixed to the MOUSEMOVE member so the model identifies itself as a
    # mouse-move event.
    type: Literal[BrowserEventType.MOUSEMOVE] = BrowserEventType.MOUSEMOVE

    # Coordinates
    # No default is declared for the viewport pair.
    client_x: float = Field(description="Viewport X")
    client_y: float = Field(description="Viewport Y")
    # Screen coordinates fall back to 0 when not supplied.
    screen_x: float = Field(default=0, description="Screen X")
    screen_y: float = Field(default=0, description="Screen Y")

    # Target element (optional; None when no element reference is supplied)
    element: SemanticElementRef | None = Field(default=None)
263+
264+
245265
class BrowserFocusEvent(BaseBrowserEvent):
246266
"""Element focus/blur event in browser."""
247267

@@ -292,5 +312,6 @@ class DOMSnapshot(BaseModel):
292312
| BrowserScrollEvent
293313
| BrowserInputEvent
294314
| BrowserNavigationEvent
315+
| BrowserMouseMoveEvent
295316
| BrowserFocusEvent
296317
)

openadapt_capture/capture.py

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,20 @@
99
from pathlib import Path
1010
from typing import TYPE_CHECKING, Iterator
1111

12+
from openadapt_capture.browser_events import (
13+
BoundingBox,
14+
BrowserClickEvent,
15+
BrowserEventType,
16+
BrowserFocusEvent,
17+
BrowserInputEvent,
18+
BrowserKeyEvent,
19+
BrowserMouseMoveEvent,
20+
BrowserNavigationEvent,
21+
BrowserScrollEvent,
22+
ElementState,
23+
NavigationType,
24+
SemanticElementRef,
25+
)
1226
from openadapt_capture.events import (
1327
ActionEvent as PydanticActionEvent,
1428
)
@@ -27,6 +41,8 @@
2741
if TYPE_CHECKING:
2842
from PIL import Image
2943

44+
from openadapt_capture.browser_events import BrowserEvent
45+
3046

3147
def _convert_action_event(db_event) -> PydanticActionEvent | None:
3248
"""Convert a SQLAlchemy ActionEvent to a Pydantic event.
@@ -99,6 +115,182 @@ def _convert_action_event(db_event) -> PydanticActionEvent | None:
99115
return None
100116

101117

118+
def _parse_element_ref(raw: dict | None) -> SemanticElementRef | None:
119+
"""Parse a raw element dict into a SemanticElementRef.
120+
121+
Handles field name variations between the content-script format
122+
(e.g. ``dataId``, ``tagName``, ``classList``) and snake_case alternatives.
123+
"""
124+
if not raw or not isinstance(raw, dict):
125+
return None
126+
127+
bbox_raw = raw.get("bbox", {})
128+
bbox = BoundingBox(
129+
x=bbox_raw.get("x", 0),
130+
y=bbox_raw.get("y", 0),
131+
width=bbox_raw.get("width", 0),
132+
height=bbox_raw.get("height", 0),
133+
)
134+
135+
state_raw = raw.get("state", {})
136+
state = ElementState(
137+
enabled=state_raw.get("enabled", True),
138+
focused=state_raw.get("focused", False),
139+
visible=state_raw.get("visible", True),
140+
checked=state_raw.get("checked"),
141+
selected=state_raw.get("selected"),
142+
expanded=state_raw.get("expanded"),
143+
value=state_raw.get("value"),
144+
) if isinstance(state_raw, dict) else ElementState()
145+
146+
return SemanticElementRef(
147+
role=raw.get("role") or "",
148+
name=raw.get("name") or "",
149+
bbox=bbox,
150+
xpath=raw.get("xpath") or raw.get("dataId") or "",
151+
css_selector=raw.get("cssSelector") or raw.get("css_selector") or "",
152+
state=state,
153+
tag_name=raw.get("tagName") or raw.get("tag_name") or "",
154+
id=raw.get("id"),
155+
class_list=raw.get("classList") or raw.get("class_list") or [],
156+
)
157+
158+
159+
def _convert_browser_event(db_event) -> "BrowserEvent | None":
160+
"""Convert a SQLAlchemy BrowserEvent to a typed Pydantic browser event.
161+
162+
The DB stores browser events as JSON in the `message` field. The recorder
163+
wraps each raw WebSocket message as ``{"message": <raw_event>}``.
164+
165+
Handles both flat (content-script) and payload-wrapped message formats.
166+
167+
Args:
168+
db_event: SQLAlchemy BrowserEvent instance.
169+
170+
Returns:
171+
Typed browser event or None if parsing fails.
172+
"""
173+
msg = db_event.message
174+
if not isinstance(msg, dict):
175+
return None
176+
177+
# Unwrap the recorder's {"message": <raw>} wrapper
178+
inner = msg.get("message", msg)
179+
if not isinstance(inner, dict):
180+
return None
181+
182+
# Support both flat (content-script) and payload-wrapped (browser_bridge) formats
183+
payload = inner.get("payload", inner)
184+
185+
raw_type = payload.get("eventType", inner.get("eventType", ""))
186+
try:
187+
event_type = BrowserEventType(raw_type)
188+
except ValueError:
189+
return None
190+
191+
timestamp = db_event.timestamp or 0
192+
url = payload.get("url", inner.get("url", ""))
193+
tab_id = inner.get("tabId", payload.get("tab_id", 0))
194+
195+
try:
196+
if event_type == BrowserEventType.CLICK:
197+
elem = _parse_element_ref(payload.get("element"))
198+
if elem is None:
199+
return None
200+
return BrowserClickEvent(
201+
timestamp=timestamp,
202+
url=url,
203+
tab_id=tab_id,
204+
client_x=payload.get("clientX", 0),
205+
client_y=payload.get("clientY", 0),
206+
page_x=payload.get("pageX", payload.get("clientX", 0)),
207+
page_y=payload.get("pageY", payload.get("clientY", 0)),
208+
button=payload.get("button", 0),
209+
click_count=payload.get("clickCount", 1),
210+
element=elem,
211+
)
212+
elif event_type in (BrowserEventType.KEYDOWN, BrowserEventType.KEYUP):
213+
element = _parse_element_ref(payload.get("element"))
214+
return BrowserKeyEvent(
215+
timestamp=timestamp,
216+
type=event_type,
217+
url=url,
218+
tab_id=tab_id,
219+
key=payload.get("key", ""),
220+
code=payload.get("code", ""),
221+
key_code=payload.get("keyCode", 0),
222+
shift_key=payload.get("shiftKey", False),
223+
ctrl_key=payload.get("ctrlKey", False),
224+
alt_key=payload.get("altKey", False),
225+
meta_key=payload.get("metaKey", False),
226+
element=element,
227+
)
228+
elif event_type == BrowserEventType.SCROLL:
229+
return BrowserScrollEvent(
230+
timestamp=timestamp,
231+
url=url,
232+
tab_id=tab_id,
233+
scroll_x=payload.get("scrollX", 0),
234+
scroll_y=payload.get("scrollY", 0),
235+
delta_x=payload.get("deltaX", payload.get("scrollDeltaX", 0)),
236+
delta_y=payload.get("deltaY", payload.get("scrollDeltaY", 0)),
237+
)
238+
elif event_type == BrowserEventType.INPUT:
239+
elem = _parse_element_ref(payload.get("element"))
240+
if elem is None:
241+
return None
242+
return BrowserInputEvent(
243+
timestamp=timestamp,
244+
url=url,
245+
tab_id=tab_id,
246+
input_type=payload.get("inputType", ""),
247+
data=payload.get("data"),
248+
value=payload.get("value", ""),
249+
element=elem,
250+
)
251+
elif event_type == BrowserEventType.NAVIGATE:
252+
nav_type = payload.get("navigationType", "link")
253+
valid = [e.value for e in NavigationType]
254+
return BrowserNavigationEvent(
255+
timestamp=timestamp,
256+
url=url,
257+
tab_id=tab_id,
258+
previous_url=payload.get("previousUrl", ""),
259+
navigation_type=(
260+
NavigationType(nav_type)
261+
if nav_type in valid
262+
else NavigationType.LINK
263+
),
264+
)
265+
elif event_type == BrowserEventType.MOUSEMOVE:
266+
element = _parse_element_ref(payload.get("element"))
267+
return BrowserMouseMoveEvent(
268+
timestamp=timestamp,
269+
url=url,
270+
tab_id=tab_id,
271+
client_x=payload.get("clientX", 0),
272+
client_y=payload.get("clientY", 0),
273+
screen_x=payload.get("screenX", 0),
274+
screen_y=payload.get("screenY", 0),
275+
element=element,
276+
)
277+
elif event_type in (BrowserEventType.FOCUS, BrowserEventType.BLUR):
278+
elem = _parse_element_ref(payload.get("element"))
279+
if elem is None:
280+
return None
281+
return BrowserFocusEvent(
282+
timestamp=timestamp,
283+
type=event_type,
284+
url=url,
285+
tab_id=tab_id,
286+
element=elem,
287+
)
288+
except Exception as e:
289+
import logging
290+
logging.getLogger(__name__).debug("Failed to parse browser event: %s", e)
291+
return None
292+
293+
102294
@dataclass
103295
class Action:
104296
"""A processed action event with associated screenshot.
@@ -385,6 +577,27 @@ def actions(self, include_moves: bool = False) -> Iterator[Action]:
385577
continue
386578
yield Action(event=event, _capture=self)
387579

580+
def browser_events(self) -> list["BrowserEvent"]:
    """Return the stored browser events as typed Pydantic models.

    Every entry of the recording's ``browser_events`` collection is passed
    through ``_convert_browser_event``; entries it cannot convert (it
    returns ``None`` for them) are left out of the result.

    Returns:
        List of typed browser events, in the order the recording yields
        them. NOTE(review): presumably timestamp order — confirm against
        the recording's relationship definition.
    """
    converted = map(_convert_browser_event, self._recording.browser_events)
    return [event for event in converted if event is not None]
595+
596+
@property
def browser_event_count(self) -> int:
    """Number of browser events stored in this capture.

    Counts the raw stored entries, so it can be larger than the length of
    ``browser_events()``, which omits entries that fail to parse.
    """
    stored = self._recording.browser_events
    return len(stored)
600+
388601
def get_frame_at(self, timestamp: float, tolerance: float = 0.5) -> "Image" | None:
389602
"""Get the screen frame closest to a timestamp.
390603

openadapt_capture/cli.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def record(
1717
video: bool = True,
1818
audio: bool = False,
1919
images: bool = False,
20+
browser_events: bool = False,
2021
send_profile: bool = False,
2122
) -> None:
2223
"""Record GUI interactions.
@@ -27,6 +28,9 @@ def record(
2728
video: Capture video (default: True).
2829
audio: Capture audio (default: False).
2930
images: Save screenshots as PNGs (default: False).
31+
browser_events: Capture browser DOM events via Chrome extension (default: False).
32+
Requires the openadapt-capture Chrome extension to be installed and
33+
connects via WebSocket on localhost:8765.
3034
send_profile: Send profiling data via wormhole after recording (default: False).
3135
"""
3236
import time
@@ -36,6 +40,9 @@ def record(
3640
output_dir = str(Path(output_dir).resolve())
3741

3842
print(f"Recording to: {output_dir}")
43+
if browser_events:
44+
print("Browser event capture enabled (WebSocket on localhost:8765)")
45+
print("Make sure the openadapt-capture Chrome extension is installed.")
3946
print("Press Ctrl+C or type stop sequence to stop recording...")
4047
print()
4148

@@ -45,6 +52,7 @@ def record(
4552
capture_video=video,
4653
capture_audio=audio,
4754
capture_images=images,
55+
capture_browser_events=browser_events,
4856
send_profile=send_profile,
4957
) as recorder:
5058
recorder.wait_for_ready()
@@ -129,6 +137,7 @@ def info(capture_dir: str) -> None:
129137
# Count events
130138
actions = list(capture.actions())
131139
print(f"Actions: {len(actions)}")
140+
print(f"Browser events: {capture.browser_event_count}")
132141

133142
# Event type breakdown
134143
from collections import Counter
@@ -138,6 +147,14 @@ def info(capture_dir: str) -> None:
138147
for event_type, count in types.most_common():
139148
print(f" {event_type}: {count}")
140149

150+
# Browser event breakdown
151+
if capture.browser_event_count > 0:
152+
browser_events = capture.browser_events()
153+
btypes = Counter(type(e).__name__ for e in browser_events)
154+
print("Browser event types:")
155+
for btype, count in btypes.most_common():
156+
print(f" {btype}: {count}")
157+
141158

142159
def transcribe(
143160
capture_dir: str,

0 commit comments

Comments
 (0)