From 71ffceafa2ad85c84d40a164d0f08411f0fc30f0 Mon Sep 17 00:00:00 2001 From: grohith327 Date: Sat, 13 Jun 2026 20:46:23 -0700 Subject: [PATCH 1/2] Delete docs --- .../2026-05-16-window-workspace-manager.md | 933 ------------------ 1 file changed, 933 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-16-window-workspace-manager.md diff --git a/docs/superpowers/plans/2026-05-16-window-workspace-manager.md b/docs/superpowers/plans/2026-05-16-window-workspace-manager.md deleted file mode 100644 index 4464066..0000000 --- a/docs/superpowers/plans/2026-05-16-window-workspace-manager.md +++ /dev/null @@ -1,933 +0,0 @@ -# Window Workspace Manager Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add macOS app and window management MCP tools so agents can focus apps, inspect windows, move/resize/place windows, tile workspaces, minimize windows, hide apps, quit apps, and identify the frontmost app. - -**Architecture:** Add a dedicated `tools/window.py` Python wrapper module backed by a single Swift Accessibility helper at `tools/scripts/window-manager.swift`. Python owns validation, MCP-facing names, JSON normalization, and subprocess error handling; Swift owns macOS Accessibility, display geometry, and window mutation. - -**Tech Stack:** Python 3.13, FastMCP, pytest, Swift, AppKit, ApplicationServices Accessibility APIs, CoreGraphics display/window APIs. - ---- - -## File Structure - -- Create `tools/window.py` - - Public Python functions used by `server.py`. - - Shared `_run_window_manager(action, payload)` helper that invokes `swift tools/scripts/window-manager.swift `. - - Small validators for dimensions, display index, tile layout, and app/window target parameters. 
- - JSON passthrough for state-returning operations and `"Error: ..."` strings for failures, matching existing modules such as `tools/files.py` and `tools/clipboard.py`. - -- Create `tools/scripts/window-manager.swift` - - One command-line helper with subcommands: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app`. - - Uses `NSWorkspace.shared.frontmostApplication`, `NSRunningApplication`, `AXUIElement`, `AXUIElementCopyAttributeValue`, `AXUIElementSetAttributeValue`, `CGWindowListCopyWindowInfo`, and `NSScreen.screens`. - - Returns JSON on stdout for all successful operations. - - Writes human-readable errors to stderr and exits non-zero for failures. - -- Modify `tools/__init__.py` - - No public exports are currently declared, so this likely remains unchanged unless the file has future explicit imports when execution starts. - -- Modify `server.py` - - Import `window`. - - Register MCP tools: - - `get_frontmost_app` - - `list_windows` - - `focus_window` - - `move_window` - - `resize_window` - - `center_window` - - `tile_windows` - - `minimize` - - `hide_app` - - `quit_app` - - Keep `open_app` in `tools/app.py`; do not move it during this feature. - -- Create `tests/test_window.py` - - Unit tests monkeypatching `tools.window.subprocess.run`. - - Tests cover argument serialization, validation, non-zero subprocess failures, JSON normalization, and all public wrappers. - -- Modify `README.md` - - Add the Window & Workspace feature to the feature list. - - Add macOS permissions note: Accessibility is required for window focus, movement, resize, minimize, hide, and quit operations; Screen Recording improves `list_windows` titles on modern macOS. - - Add manual smoke tests for display-aware placement. - -- Modify `skills/altic-studio/SKILL.md` - - Add the new window/workspace tools to the shareable skill so installed agents know they exist. 
- ---- - -## Behavior Contract - -### Targeting - -All window operations target a visible, non-desktop app window using this precedence: - -1. `window_id` when provided. -2. `app_name` plus `window_index`, where `window_index` is 1-based among that app's windows sorted front-to-back. -3. Frontmost app's frontmost window when no target is provided. - -Use case-insensitive substring matching for `app_name` against localized app name, bundle identifier, and process name. If multiple apps match, return an error listing the matching apps so the caller can retry with a more specific name. - -### Coordinate System - -MCP tools accept and return AppKit/global display coordinates. `x` and `y` are top-left coordinates. `width` and `height` are window size in points. Swift converts to `AXValue` `CGPoint` and `CGSize` for `kAXPositionAttribute` and `kAXSizeAttribute`. - -### Display-Aware Placement - -`display_index` is optional and 1-based. When omitted, use the display with the largest intersection with the target window. If no window is available, use the main display. Safe placement uses `NSScreen.visibleFrame`, not full frame, so windows avoid the Dock and menu bar. 
- -### Tool Return Shapes - -`get_frontmost_app` returns: - -```json -{ - "action": "get_frontmost_app", - "app": { - "name": "Safari", - "bundle_id": "com.apple.Safari", - "pid": 12345, - "is_active": true - } -} -``` - -`list_windows` returns: - -```json -{ - "action": "list_windows", - "windows": [ - { - "window_id": 101, - "app_name": "Safari", - "bundle_id": "com.apple.Safari", - "pid": 12345, - "title": "Example Page", - "x": 40, - "y": 80, - "width": 1200, - "height": 800, - "display_index": 1, - "is_minimized": false, - "is_frontmost_app": true - } - ], - "count": 1 -} -``` - -Mutation tools return: - -```json -{ - "action": "move_window", - "window": { - "window_id": 101, - "app_name": "Safari", - "x": 100, - "y": 120, - "width": 1000, - "height": 700, - "display_index": 1 - } -} -``` - ---- - -## Task 1: Python Window Wrapper Tests - -**Files:** -- Create: `tests/test_window.py` -- Create later: `tools/window.py` - -- [ ] **Step 1: Write failing wrapper tests** - -Create `tests/test_window.py` with these tests: - -```python -import json -import subprocess -from pathlib import Path - -import pytest - -from tools import window - - -def read_json(value: str): - assert not value.startswith("Error:"), value - return json.loads(value) - - -def completed(args, stdout='{"action":"ok"}', stderr="", returncode=0): - return subprocess.CompletedProcess( - args=args, - returncode=returncode, - stdout=stdout, - stderr=stderr, - ) - - -def test_get_frontmost_app_invokes_swift_helper(monkeypatch): - seen = {} - payload = { - "action": "get_frontmost_app", - "app": { - "name": "Finder", - "bundle_id": "com.apple.finder", - "pid": 42, - "is_active": True, - }, - } - - def fake_run(args, **kwargs): - seen["args"] = args - seen["kwargs"] = kwargs - return completed(args, stdout=json.dumps(payload)) - - monkeypatch.setattr(window.subprocess, "run", fake_run) - - assert read_json(window.get_frontmost_app()) == payload - assert seen["args"][0] == "swift" - assert 
Path(seen["args"][1]).name == "window-manager.swift" - assert seen["args"][2] == "get_frontmost_app" - assert json.loads(seen["args"][3]) == {} - assert seen["kwargs"]["timeout"] == 10 - - -def test_move_window_serializes_target_and_coordinates(monkeypatch): - seen = {} - - def fake_run(args, **kwargs): - seen["payload"] = json.loads(args[3]) - return completed(args, stdout='{"action":"move_window"}') - - monkeypatch.setattr(window.subprocess, "run", fake_run) - - payload = read_json( - window.move_window( - x=100, - y=140, - app_name="Safari", - window_index=2, - display_index=1, - ) - ) - - assert payload["action"] == "move_window" - assert seen["payload"] == { - "app_name": "Safari", - "window_index": 2, - "x": 100, - "y": 140, - "display_index": 1, - } - - -def test_resize_window_rejects_non_positive_dimensions(monkeypatch): - def fail_run(*args, **kwargs): - raise AssertionError("subprocess should not run for invalid dimensions") - - monkeypatch.setattr(window.subprocess, "run", fail_run) - - assert window.resize_window(width=0, height=500).startswith("Error:") - assert window.resize_window(width=500, height=-1).startswith("Error:") - - -def test_center_window_serializes_size_when_provided(monkeypatch): - seen = {} - - def fake_run(args, **kwargs): - seen["payload"] = json.loads(args[3]) - return completed(args, stdout='{"action":"center_window"}') - - monkeypatch.setattr(window.subprocess, "run", fake_run) - - read_json( - window.center_window( - width=900, - height=700, - app_name="Terminal", - display_index=2, - ) - ) - - assert seen["payload"] == { - "app_name": "Terminal", - "width": 900, - "height": 700, - "display_index": 2, - } - - -def test_tile_windows_validates_layout(monkeypatch): - def fail_run(*args, **kwargs): - raise AssertionError("subprocess should not run for invalid layout") - - monkeypatch.setattr(window.subprocess, "run", fail_run) - - result = window.tile_windows(layout="spiral", app_names=["Safari", "Terminal"]) - - assert 
result.startswith("Error:") - assert "layout must be one of" in result - - -def test_tile_windows_serializes_apps_and_padding(monkeypatch): - seen = {} - - def fake_run(args, **kwargs): - seen["payload"] = json.loads(args[3]) - return completed(args, stdout='{"action":"tile_windows","count":2}') - - monkeypatch.setattr(window.subprocess, "run", fake_run) - - payload = read_json( - window.tile_windows( - layout="columns", - app_names=["Safari", "Terminal"], - display_index=1, - padding=12, - ) - ) - - assert payload == {"action": "tile_windows", "count": 2} - assert seen["payload"] == { - "layout": "columns", - "app_names": ["Safari", "Terminal"], - "display_index": 1, - "padding": 12, - } - - -def test_focus_minimize_hide_and_quit_serialize_targets(monkeypatch): - calls = [] - - def fake_run(args, **kwargs): - calls.append((args[2], json.loads(args[3]))) - return completed(args, stdout=json.dumps({"action": args[2]})) - - monkeypatch.setattr(window.subprocess, "run", fake_run) - - assert read_json(window.focus_window(app_name="Safari"))["action"] == "focus_window" - assert read_json(window.minimize(app_name="Safari", window_index=1))["action"] == "minimize" - assert read_json(window.hide_app("Safari"))["action"] == "hide_app" - assert read_json(window.quit_app("Safari"))["action"] == "quit_app" - - assert calls == [ - ("focus_window", {"app_name": "Safari"}), - ("minimize", {"app_name": "Safari", "window_index": 1}), - ("hide_app", {"app_name": "Safari"}), - ("quit_app", {"app_name": "Safari"}), - ] - - -def test_subprocess_failure_returns_error(monkeypatch): - def fake_run(args, **kwargs): - return completed(args, stdout="", stderr="accessibility permission denied", returncode=1) - - monkeypatch.setattr(window.subprocess, "run", fake_run) - - assert window.get_frontmost_app() == "Error: accessibility permission denied" - - -def test_invalid_json_from_helper_returns_error(monkeypatch): - def fake_run(args, **kwargs): - return completed(args, stdout="not-json") - - 
monkeypatch.setattr(window.subprocess, "run", fake_run) - - assert window.get_frontmost_app().startswith("Error: invalid window manager response:") -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: - -```bash -uv run pytest tests/test_window.py -v -``` - -Expected: import failure or attribute failure because `tools/window.py` does not exist yet. - ---- - -## Task 2: Python Window Wrapper Implementation - -**Files:** -- Create: `tools/window.py` -- Test: `tests/test_window.py` - -- [ ] **Step 1: Implement Python wrapper** - -Create `tools/window.py` with: - -```python -import json -import subprocess -from typing import Any - -from .constants import SCRIPTS_PREFIX - - -VALID_TILE_LAYOUTS = {"columns", "rows", "grid"} - - -def _json(payload: dict[str, Any]) -> str: - return json.dumps(payload, indent=2, sort_keys=True) - - -def _error(message: str) -> str: - return f"Error: {message}" - - -def _positive_int(name: str, value: int | None) -> str | None: - if value is not None and value <= 0: - return f"{name} must be greater than 0" - return None - - -def _target_payload( - app_name: str = "", - window_id: int | None = None, - window_index: int | None = None, - display_index: int | None = None, -) -> dict[str, Any]: - payload: dict[str, Any] = {} - if app_name.strip(): - payload["app_name"] = app_name.strip() - if window_id is not None: - payload["window_id"] = window_id - if window_index is not None: - payload["window_index"] = window_index - if display_index is not None: - payload["display_index"] = display_index - return payload - - -def _run_window_manager(action: str, payload: dict[str, Any] | None = None) -> str: - script_path = SCRIPTS_PREFIX / "window-manager.swift" - args = ["swift", str(script_path), action, json.dumps(payload or {})] - try: - result = subprocess.run( - args, - capture_output=True, - text=True, - timeout=10, - ) - if result.returncode != 0: - return _error(result.stderr.strip() or f"window manager action failed: {action}") - - 
stdout = result.stdout.strip() - if not stdout: - return _error(f"empty window manager response for action: {action}") - - try: - parsed = json.loads(stdout) - except json.JSONDecodeError: - return _error(f"invalid window manager response: {stdout}") - return _json(parsed) - except Exception as exc: - return _error(f"failed to run window manager action {action}: {exc}") - - -def get_frontmost_app() -> str: - return _run_window_manager("get_frontmost_app") - - -def list_windows(app_name: str = "", include_minimized: bool = False) -> str: - payload: dict[str, Any] = {"include_minimized": include_minimized} - if app_name.strip(): - payload["app_name"] = app_name.strip() - return _run_window_manager("list_windows", payload) - - -def focus_window( - app_name: str = "", - window_id: int | None = None, - window_index: int | None = None, -) -> str: - return _run_window_manager( - "focus_window", - _target_payload(app_name=app_name, window_id=window_id, window_index=window_index), - ) - - -def move_window( - x: int, - y: int, - app_name: str = "", - window_id: int | None = None, - window_index: int | None = None, - display_index: int | None = None, -) -> str: - for name, value in (("window_id", window_id), ("window_index", window_index), ("display_index", display_index)): - error = _positive_int(name, value) - if error: - return _error(error) - payload = _target_payload(app_name, window_id, window_index, display_index) - payload.update({"x": x, "y": y}) - return _run_window_manager("move_window", payload) - - -def resize_window( - width: int, - height: int, - app_name: str = "", - window_id: int | None = None, - window_index: int | None = None, - display_index: int | None = None, -) -> str: - for name, value in (("width", width), ("height", height), ("window_id", window_id), ("window_index", window_index), ("display_index", display_index)): - error = _positive_int(name, value) - if error: - return _error(error) - payload = _target_payload(app_name, window_id, window_index, 
display_index) - payload.update({"width": width, "height": height}) - return _run_window_manager("resize_window", payload) - - -def center_window( - app_name: str = "", - window_id: int | None = None, - window_index: int | None = None, - display_index: int | None = None, - width: int | None = None, - height: int | None = None, -) -> str: - for name, value in (("width", width), ("height", height), ("window_id", window_id), ("window_index", window_index), ("display_index", display_index)): - error = _positive_int(name, value) - if error: - return _error(error) - payload = _target_payload(app_name, window_id, window_index, display_index) - if width is not None: - payload["width"] = width - if height is not None: - payload["height"] = height - return _run_window_manager("center_window", payload) - - -def tile_windows( - layout: str = "columns", - app_names: list[str] | None = None, - display_index: int | None = None, - padding: int = 8, -) -> str: - if layout not in VALID_TILE_LAYOUTS: - return _error(f"layout must be one of: {', '.join(sorted(VALID_TILE_LAYOUTS))}") - if padding < 0: - return _error("padding must be greater than or equal to 0") - error = _positive_int("display_index", display_index) - if error: - return _error(error) - payload: dict[str, Any] = {"layout": layout, "padding": padding} - if app_names: - payload["app_names"] = [name.strip() for name in app_names if name.strip()] - if display_index is not None: - payload["display_index"] = display_index - return _run_window_manager("tile_windows", payload) - - -def minimize( - app_name: str = "", - window_id: int | None = None, - window_index: int | None = None, -) -> str: - return _run_window_manager( - "minimize", - _target_payload(app_name=app_name, window_id=window_id, window_index=window_index), - ) - - -def hide_app(app_name: str) -> str: - if not app_name.strip(): - return _error("app_name cannot be empty") - return _run_window_manager("hide_app", {"app_name": app_name.strip()}) - - -def 
quit_app(app_name: str) -> str: - if not app_name.strip(): - return _error("app_name cannot be empty") - return _run_window_manager("quit_app", {"app_name": app_name.strip()}) -``` - -- [ ] **Step 2: Run wrapper tests** - -Run: - -```bash -uv run pytest tests/test_window.py -v -``` - -Expected: all tests in `tests/test_window.py` pass, except failures caused by missing Swift script only if a test did not monkeypatch subprocess correctly. - -- [ ] **Step 3: Commit wrapper** - -Run: - -```bash -git add tools/window.py tests/test_window.py -git commit -m "feat: add window manager python wrappers" -``` - ---- - -## Task 3: Swift Window Manager Helper - -**Files:** -- Create: `tools/scripts/window-manager.swift` -- Test manually with `swift tools/scripts/window-manager.swift get_frontmost_app '{}'` - -- [ ] **Step 1: Implement command-line parsing** - -The script must accept exactly two arguments after the script path: - -```text -window-manager.swift -``` - -Implementation requirements: - -- Decode payload with `JSONSerialization.jsonObject`. -- Store payload as `[String: Any]`. -- Dispatch on `action`. -- On success, print one compact JSON object to stdout. -- On failure, print the error message to stderr and exit 1. - -- [ ] **Step 2: Implement app/window discovery** - -Implement these Swift helper types and functions: - -```swift -struct ManagedApp { - let name: String - let bundleID: String - let pid: pid_t - let application: NSRunningApplication -} - -struct ManagedWindow { - let windowID: Int - let app: ManagedApp - let title: String - let frame: CGRect - let axWindow: AXUIElement? - let isMinimized: Bool -} -``` - -Required behavior: - -- `frontmostApp()` reads `NSWorkspace.shared.frontmostApplication`. -- `runningApps(matching:)` searches `NSWorkspace.shared.runningApplications`. -- App matching checks localized name, bundle identifier, and executable URL last path component using case-insensitive substring matching. 
-- `axWindows(for:)` creates `AXUIElementCreateApplication(pid)` and reads `kAXWindowsAttribute`. -- `cgWindows()` uses `CGWindowListCopyWindowInfo([.optionOnScreenOnly, .excludeDesktopElements], kCGNullWindowID)` to enrich windows with IDs and titles. -- `managedWindows(appName:includeMinimized:)` combines AX windows and CG windows by matching PID plus nearest equal frame. If a CG window ID cannot be matched, use `0` for `window_id` and still allow AX-only operations. - -- [ ] **Step 3: Implement display helpers** - -Required helpers: - -- `screens()` returns `NSScreen.screens` in their current order. -- `screenForDisplayIndex(_:)` accepts 1-based indexes and returns an error for out-of-range values. -- `screenForWindow(_:)` picks the visible frame with the largest intersection with the window frame. -- `clampedFrame(rect:screen:)` keeps width and height at least `120x80` and within `screen.visibleFrame` when possible. -- Every placement tool uses `visibleFrame`. - -- [ ] **Step 4: Implement read operations** - -Implement: - -- `get_frontmost_app` -- `list_windows` - -Success output must match the behavior contract above. Include `display_index` for each listed window by comparing the window frame to `NSScreen.visibleFrame`. - -- [ ] **Step 5: Implement app operations** - -Implement: - -- `hide_app`: resolve one app and call `application.hide()`. -- `quit_app`: resolve one app and call `application.terminate()`. - -Return JSON with `action`, `app_name`, `bundle_id`, and `pid`. - -- [ ] **Step 6: Implement window operations** - -Implement: - -- `focus_window`: resolve target, call `application.activate(options: [.activateAllWindows, .activateIgnoringOtherApps])`, then set `kAXMainAttribute` and `kAXFocusedAttribute` to `true` on the target AX window. -- `minimize`: set `kAXMinimizedAttribute` to `true`. -- `move_window`: set `kAXPositionAttribute`. -- `resize_window`: set `kAXSizeAttribute`. 
-- `center_window`: optionally resize first, then calculate centered origin inside selected display visible frame. -- `tile_windows`: resolve requested apps or use visible windows from the frontmost display; calculate frames for: - - `columns`: equal-width columns. - - `rows`: equal-height rows. - - `grid`: `ceil(sqrt(count))` columns and `ceil(count / columns)` rows. - -Return JSON with the final window frame or list of final frames after each mutation. - -- [ ] **Step 7: Run Swift smoke checks** - -Run: - -```bash -swift tools/scripts/window-manager.swift get_frontmost_app '{}' -swift tools/scripts/window-manager.swift list_windows '{"include_minimized":false}' -swift tools/scripts/window-manager.swift center_window '{"app_name":"Finder"}' -``` - -Expected: - -- The first command prints valid JSON with the current frontmost app. -- The second command prints valid JSON with a `windows` array. -- The third command either centers Finder's frontmost window or prints a clear Accessibility permission error. - -- [ ] **Step 8: Commit Swift helper** - -Run: - -```bash -git add tools/scripts/window-manager.swift -git commit -m "feat: add swift window manager helper" -``` - ---- - -## Task 4: MCP Tool Registration - -**Files:** -- Modify: `server.py` -- Test: `tests/test_window.py` - -- [ ] **Step 1: Import the window module** - -Modify the `from tools import (...)` block in `server.py` to include `window`. - -- [ ] **Step 2: Register MCP tools near `open_app`** - -Add tool functions in `server.py` after `open_app`: - -```python -@mcp.tool() -async def get_frontmost_app() -> str: - """ - Get the currently frontmost macOS application. - - Returns: - JSON string with app name, bundle id, pid, and active state. - """ - return window.get_frontmost_app() - - -@mcp.tool() -async def list_windows( - app_name: str = Field(default=""), - include_minimized: bool = Field(default=False), -) -> str: - """ - List manageable macOS windows. 
- - Args: - app_name: Optional app name, bundle id, or process name filter - include_minimized: Include minimized windows when available - - Returns: - JSON string with window ids, app metadata, titles, frames, and display indexes. - """ - return window.list_windows(app_name, include_minimized) - - -@mcp.tool() -async def focus_window( - app_name: str = Field(default=""), - window_id: int | None = Field(default=None), - window_index: int | None = Field(default=None), -) -> str: - """ - Focus a macOS window by window id, app name, or frontmost fallback. - - Args: - app_name: Optional app name, bundle id, or process name - window_id: Optional CoreGraphics window id - window_index: Optional 1-based index among the app's windows - - Returns: - JSON string with focused window metadata, or an error message. - """ - return window.focus_window(app_name, window_id, window_index) -``` - -Also add analogous wrappers for `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, and `quit_app` using the Python function signatures from Task 2. Use `Field` constraints for public numeric bounds: - -- `window_id`: `ge=1` -- `window_index`: `ge=1` -- `display_index`: `ge=1` -- `width`: `ge=1` -- `height`: `ge=1` -- `padding`: `ge=0`, `le=100` - -- [ ] **Step 3: Add registration smoke test** - -Append this test to `tests/test_window.py`: - -```python -def test_server_exposes_window_tools(): - import server - - tool_names = {tool.name for tool in server.mcp._tool_manager._tools.values()} - - assert { - "get_frontmost_app", - "list_windows", - "focus_window", - "move_window", - "resize_window", - "center_window", - "tile_windows", - "minimize", - "hide_app", - "quit_app", - }.issubset(tool_names) -``` - -- [ ] **Step 4: Run tests** - -Run: - -```bash -uv run pytest tests/test_window.py -v -``` - -Expected: all tests pass. 
- -- [ ] **Step 5: Commit MCP registration** - -Run: - -```bash -git add server.py tests/test_window.py -git commit -m "feat: expose window manager mcp tools" -``` - ---- - -## Task 5: Documentation and Skill Manifest - -**Files:** -- Modify: `README.md` -- Modify: `skills/altic-studio/SKILL.md` - -- [ ] **Step 1: Update README feature list** - -Add a bullet under `## Features`: - -```markdown -- 🪟 **Window & Workspace** - List/focus apps and windows, move/resize/center/tile windows, minimize windows, hide apps, and quit apps -``` - -- [ ] **Step 2: Update permissions** - -Under permissions, add or expand: - -```markdown -- ✅ **Accessibility** - Required for screen glow, system controls, and window management tools such as focus_window, move_window, resize_window, center_window, tile_windows, minimize, hide_app, and quit_app -- ✅ **Screen Recording** - Required for screenshot capture tools and improves window title/id discovery for list_windows on recent macOS versions -``` - -- [ ] **Step 3: Add manual smoke tests** - -Add a section: - -```markdown -## Manual Smoke Tests For Window Tools - -- Call `get_frontmost_app` while Finder or Safari is active. -- Call `list_windows` and confirm visible app windows include frame and display metadata. -- Open two apps, then call `tile_windows` with `layout="columns"` and their app names. -- Call `center_window` with an app name and confirm the frontmost window is centered inside the visible display area. -- Call `move_window` and `resize_window` with a test app window, then call `list_windows` to confirm the new frame. -- Call `minimize` on a test app window and confirm it minimizes. -- Call `hide_app` on a non-critical app and confirm the app is hidden. -- Call `quit_app` only on a disposable test app. -``` - -- [ ] **Step 4: Update `skills/altic-studio/SKILL.md`** - -Add a concise tool group for window management. Use the exact tool names registered in `server.py`. 
- -- [ ] **Step 5: Commit docs** - -Run: - -```bash -git add README.md skills/altic-studio/SKILL.md -git commit -m "docs: document window workspace tools" -``` - ---- - -## Task 6: Full Verification - -**Files:** -- All files touched by previous tasks - -- [ ] **Step 1: Run unit tests** - -Run: - -```bash -uv run pytest -v -``` - -Expected: all existing file/clipboard tests plus new window wrapper tests pass. - -- [ ] **Step 2: Run Swift syntax check** - -Run: - -```bash -swift tools/scripts/window-manager.swift get_frontmost_app '{}' -``` - -Expected: valid JSON or a clear macOS permission error. A Swift compile error is a failure. - -- [ ] **Step 3: Run MCP server import check** - -Run: - -```bash -uv run python - <<'PY' -import server -print(server.mcp.name) -PY -``` - -Expected: prints `Altic-MCP` with no import errors. - -- [ ] **Step 4: Final status check** - -Run: - -```bash -git status --short -``` - -Expected: no uncommitted changes unless the executor intentionally keeps the branch unstaged for review. - ---- - -## Open Decisions - -- Add `list_windows` even though the feature list did not explicitly request it. This is necessary for reliable targeting and aligns with the competitor comparison. -- Keep `open_app` in `tools/app.py`; this plan adds complementary app operations to `tools/window.py` because they share target resolution with windows. -- Use Swift instead of AppleScript for window placement because the feature needs display-aware geometry, CG window ids, Accessibility window attributes, and more reliable multi-display behavior. - -## Self-Review - -- Spec coverage: the plan covers `move_window`, `resize_window`, `tile_windows`, `center_window`, `focus_window`, `minimize`, `hide_app`, `quit_app`, `get_frontmost_app`, and display-aware placement. It also adds `list_windows` to make targeting practical. -- Placeholder scan: no task depends on undefined future work; each task names exact files, commands, and expected outcomes. 
-- Type consistency: public Python signatures, server wrappers, JSON payload keys, and tests use the same names: `app_name`, `window_id`, `window_index`, `display_index`, `x`, `y`, `width`, `height`, `layout`, `app_names`, and `padding`. From 4c9bad7346cddc158520ffd0a8a109c14e7309e9 Mon Sep 17 00:00:00 2001 From: grohith327 Date: Sat, 13 Jun 2026 21:25:45 -0700 Subject: [PATCH 2/2] add text extraction from screen --- README.md | 13 +- server.py | 35 ++ skills/altic-studio/SKILL.md | 49 ++- skills/altic-studio/scripts/README.md | 1 + .../scripts/extract-screen-text.swift | 405 ++++++++++++++++++ tests/test_screenshot.py | 158 +++++++ tools/screenshot.py | 87 ++++ tools/scripts/extract-screen-text.swift | 405 ++++++++++++++++++ 8 files changed, 1148 insertions(+), 5 deletions(-) create mode 100644 skills/altic-studio/scripts/extract-screen-text.swift create mode 100644 tests/test_screenshot.py create mode 100644 tools/scripts/extract-screen-text.swift diff --git a/README.md b/README.md index 8c8ca19..27e5655 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - 🪟 **Window & Workspace** - List/focus apps and windows, move/resize/center/tile windows, minimize windows, hide apps, and quit apps - 🌐 **Safari** - Control tabs, navigate, execute JavaScript - 🌍 **Chrome (CDP)** - Open sessions, navigate, click/type, extract data, screenshots -- 📸 **Screen Capture** - Capture the active display and share image output with the model +- 📸 **Screen Capture & OCR** - Capture the active display, share image output, and extract visible text with local Vision OCR - 🖥️ **System** - Open apps, adjust brightness/volume, visual effects ## Available Skills @@ -31,9 +31,10 @@ This repo currently includes one shareable skill: - Calendar: `create-calendar-event.applescript`, `list-all-calendar-events-for-day.applescript` - Safari: open/close/switch/navigate/reload/history/page-info scripts - System: `open-application.applescript`, brightness + volume scripts -- Screenshot: 
`capture-screenshot.applescript` +- Screenshot: `capture-screenshot.applescript`, `capture-active-screen.swift`, `extract-screen-text.swift` - Files/Finder MCP: `find_files`, `list_directory`, `get_file_info`, `copy_file`, `copy_directory`, `move_file`, `rename_file`, `trash_file`, `reveal_in_finder`, `get_finder_selection` - Clipboard MCP: `get_clipboard_text`, `set_clipboard_text`, `clear_clipboard`, `get_clipboard_files`, `set_clipboard_files`, `save_clipboard_image`, `set_clipboard_image` +- Screen OCR MCP: `extract_screen_text` - Window/Workspace MCP: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app` - Clipboard script: `clipboard.swift` - Window script: `window-manager.swift` @@ -115,7 +116,13 @@ Replace `/FULL/PATH/TO/altic-mcp` with your actual path (e.g., `/Users/johndoe/D - ✅ **Automation** - Allow Claude to control apps (Messages, Notes, Safari) - ✅ **Finder Automation** - For Finder selection, reveal, and Trash file tools - ✅ **Accessibility** - Required for screen glow, system controls, and window management tools such as focus_window, move_window, resize_window, center_window, tile_windows, minimize, hide_app, and quit_app -- ✅ **Screen Recording** - Required for screenshot capture tools and improves window title/id discovery for list_windows on recent macOS versions +- ✅ **Screen Recording** - Required for screenshot capture and `extract_screen_text`; also improves window title/id discovery for list_windows on recent macOS versions + +`extract_screen_text` uses Apple Foundation Models with Vision-backed OCRTool as +the primary text extraction path on macOS 27 when Apple Intelligence is +available, then falls back to deterministic local Vision OCR on older or +unsupported runtimes. Optional visual understanding modes do not add Core AI +model assets as a default dependency. Clipboard text operations normally do not require extra permissions. 
Clipboard file and image operations use macOS pasteboard APIs and may prompt for security diff --git a/server.py b/server.py index 2d98536..dee1a7c 100644 --- a/server.py +++ b/server.py @@ -1056,6 +1056,41 @@ async def capture_active_screen( return screenshot.capture_active_screen(output_path) +@mcp.tool() +async def extract_screen_text( + output_path: str = Field(default=""), + recognition_level: str = Field(default="accurate"), + languages: str = Field(default=""), + include_boxes: bool = Field(default=True), + max_chars: int = Field(default=20000, ge=1, le=200000), + visual_understanding: str = Field(default="none"), +) -> str: + """ + Capture the display containing the frontmost app and extract visible text + using local Vision OCR. Requires macOS Screen Recording permission. + + Args: + output_path: Optional file path for the captured PNG used for OCR + recognition_level: OCR mode, either "accurate" or "fast" + languages: Optional comma-separated recognition language identifiers + include_boxes: Include recognized text bounding boxes in the JSON output + max_chars: Maximum characters to return in the combined text field + visual_understanding: Optional macOS 27 extension mode: "none", "summary", or "ui_map" + + Returns: + JSON string with OCR text, line metadata, screenshot path, and optional + visual understanding metadata; or an error message. + """ + return screenshot.extract_screen_text( + output_path, + recognition_level, + languages, + include_boxes, + max_chars, + visual_understanding, + ) + + @mcp.tool() async def add_screen_glow() -> str: """ diff --git a/skills/altic-studio/SKILL.md b/skills/altic-studio/SKILL.md index 36d8fd7..767dd51 100644 --- a/skills/altic-studio/SKILL.md +++ b/skills/altic-studio/SKILL.md @@ -13,9 +13,10 @@ license: Apache-2.0 3. MCP file mode for safe Finder and filesystem operations 4. MCP clipboard mode for text, file, and image pasteboard operations 5. MCP window/workspace mode for arranging macOS apps and windows +6. 
MCP screen OCR mode for extracting visible text from the active display It also includes Swift utility scripts for active-display screenshots, clipboard -file/image operations, and window/workspace management on macOS. +file/image operations, screen OCR, and window/workspace management on macOS. ## Mode A: AppleScript (macOS apps) @@ -61,6 +62,7 @@ The full Altic automation surface is exposed as scripts under `skills/altic-stud - `turn-down-volume.applescript` - args: `[amount_0_to_100]` - `capture-screenshot.applescript` - args: `[output_path] [full|interactive|window]` - `capture-active-screen.swift` - args: `<output_path>` (captures full display containing frontmost app) +- `extract-screen-text.swift` - args: `<output_path> [accurate|fast] [languages_csv] [include_boxes] [none|summary|ui_map]` - `clipboard.swift` - subcommands: `get-files`, `set-files `, `save-image `, `set-image ` - `window-manager.swift` - subcommands: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app` @@ -70,6 +72,12 @@ Swift command template (for active-display screenshots): swift "skills/altic-studio/scripts/capture-active-screen.swift" "/tmp/active-screen.png" ``` +Swift command template (for active-display OCR): + +```bash +swift "skills/altic-studio/scripts/extract-screen-text.swift" "/tmp/screen-text.png" "accurate" "" "true" "none" +``` + Swift command template (for window management): ```bash @@ -90,6 +98,7 @@ Use MCP tools for deterministic Chrome automation: - `chrome_close_session` - `chrome_list_sessions` - `capture_active_screen` +- `extract_screen_text` Execution pattern: @@ -98,7 +107,8 @@ Execution pattern: 3. Interact with click and type actions. 4. Verify state with extraction. 5. Capture screenshots on checkpoints or failures. -6. Close session. +6. Use `extract_screen_text` when visible page/app text is needed from the active display rather than DOM extraction. +7. Close session.
## Mode C: File Finder and File Operations (MCP) @@ -197,6 +207,34 @@ Window workflow rules: - If a window mutation fails with an Accessibility error, tell the user to grant Accessibility permission to the host app running the MCP server. +## Mode F: Screen OCR (MCP) + +Use MCP screen OCR when the user asks to read visible text from the current app, +inspect text in a screenshot-like view, or extract text from a non-browser app. +This tool returns JSON with combined text, OCR line metadata, the screenshot +path used for recognition, and optional visual understanding metadata. + +Available tools: + +- `extract_screen_text` - args: `[output_path] [recognition_level=accurate|fast] [languages] [include_boxes] [max_chars] [visual_understanding=none|summary|ui_map]` + +Screen OCR workflow rules: + +- Prefer `chrome_extract` for browser DOM text when a Chrome CDP session is + already available; use `extract_screen_text` for rendered text, canvas text, + screenshots, PDFs, images, and non-browser apps. +- Use `recognition_level="accurate"` by default; use `"fast"` only when speed + matters more than precision. +- Set `languages` to a comma-separated list such as `en-US,fr-FR` only when the + expected language is known. +- Keep `visual_understanding="none"` unless the user asks for higher-level + image or UI interpretation. On macOS 27 with Apple Intelligence available, + `extract_screen_text` uses Apple Foundation Models plus Vision-backed OCRTool + as the primary extraction path; on older or unsupported runtimes it falls back + to deterministic Vision OCR. +- If OCR fails with a permission error, tell the user to grant Screen Recording + permission to the host app running the MCP server. + ## Operational Rules - Validate date/time format before running reminder/calendar scripts. @@ -211,6 +249,9 @@ Window workflow rules: confirmation. - For window mutations, verify with `list_windows` when the user needs confirmation. 
+- Use `extract_screen_text` instead of manual screenshot inspection when the + task depends on visible text in an app or page that is not accessible through + Chrome DOM extraction. ## Permissions Checklist @@ -220,6 +261,10 @@ Window workflow rules: - Automation permission for app control - Accessibility permission for system controls and window management - Screen Recording permission for screenshots and improved window discovery +- Screen Recording permission for `extract_screen_text`; FoundationModels + primary extraction and optional visual understanding modes require macOS 27, + Apple Intelligence availability, and a FoundationModels/OCRTool-capable + SDK/runtime - Safari setting: Allow JavaScript from Apple Events - Google Chrome installed for CDP tools - Full Disk Access for reading Messages database diff --git a/skills/altic-studio/scripts/README.md b/skills/altic-studio/scripts/README.md index bf42305..b74b38a 100644 --- a/skills/altic-studio/scripts/README.md +++ b/skills/altic-studio/scripts/README.md @@ -19,6 +19,7 @@ osascript "skills/altic-studio/scripts/create-calendar-event.applescript" "Team osascript "skills/altic-studio/scripts/navigate-safari.applescript" "https://example.com" osascript "skills/altic-studio/scripts/capture-screenshot.applescript" "/tmp/screen.png" "full" swift "skills/altic-studio/scripts/capture-active-screen.swift" "/tmp/active-screen.png" +swift "skills/altic-studio/scripts/extract-screen-text.swift" "/tmp/screen-text.png" "accurate" "" "true" "none" swift "skills/altic-studio/scripts/clipboard.swift" get-files swift "skills/altic-studio/scripts/clipboard.swift" set-files "/Users/example/Desktop/report.pdf" swift "skills/altic-studio/scripts/clipboard.swift" save-image "/tmp/clipboard.png" diff --git a/skills/altic-studio/scripts/extract-screen-text.swift b/skills/altic-studio/scripts/extract-screen-text.swift new file mode 100644 index 0000000..2284990 --- /dev/null +++ 
b/skills/altic-studio/scripts/extract-screen-text.swift @@ -0,0 +1,405 @@ +#!/usr/bin/env swift + +import AppKit +import Foundation +import ScreenCaptureKit +import Vision + +#if canImport(FoundationModels) && canImport(_Vision_FoundationModels) +import FoundationModels +import _Vision_FoundationModels +#endif + +func area(_ rect: CGRect) -> CGFloat { + max(0, rect.width) * max(0, rect.height) +} + +func fail(_ message: String, code: Int32 = 1) -> Never { + fputs("\(message)\n", stderr) + exit(code) +} + +func displayForFrontmostApp(content: SCShareableContent) -> SCDisplay? { + guard let app = NSWorkspace.shared.frontmostApplication else { + return nil + } + + let targetPID = app.processIdentifier + let appWindows = content.windows.filter { window in + window.owningApplication?.processID == targetPID + } + + guard + let frontWindow = appWindows.max(by: { lhs, rhs in + area(lhs.frame) < area(rhs.frame) + }) + else { + return nil + } + + let targetRect = frontWindow.frame + return content.displays.max(by: { lhs, rhs in + area(lhs.frame.intersection(targetRect)) < area(rhs.frame.intersection(targetRect)) + }) +} + +func captureDisplay(to outputPath: String, display: SCDisplay) async throws -> CGImage { + let filter = SCContentFilter(display: display, excludingWindows: []) + let config = SCStreamConfiguration() + + let image = try await SCScreenshotManager.captureImage( + contentFilter: filter, + configuration: config + ) + + let bitmap = NSBitmapImageRep(cgImage: image) + guard let pngData = bitmap.representation(using: .png, properties: [:]) else { + throw NSError( + domain: "altic-mcp.extract-screen-text", + code: 2, + userInfo: [NSLocalizedDescriptionKey: "Could not encode screenshot as PNG."] + ) + } + + let outputURL = URL(fileURLWithPath: outputPath) + try FileManager.default.createDirectory( + at: outputURL.deletingLastPathComponent(), + withIntermediateDirectories: true + ) + try pngData.write(to: outputURL) + + return image +} + +func recognizedLines( + in 
image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool +) throws -> [[String: Any]] { + let request = VNRecognizeTextRequest() + request.recognitionLevel = recognitionLevel == "fast" ? .fast : .accurate + request.usesLanguageCorrection = true + if !languages.isEmpty { + request.recognitionLanguages = languages + } + + let handler = VNImageRequestHandler(cgImage: image, options: [:]) + try handler.perform([request]) + + let observations = request.results ?? [] + let imageWidth = Double(image.width) + let imageHeight = Double(image.height) + + let entries = observations.compactMap { observation -> (line: [String: Any], x: Double, y: Double)? in + guard let candidate = observation.topCandidates(1).first else { + return nil + } + + let box = observation.boundingBox + let frame = [ + "x": Double(box.minX) * imageWidth, + "y": (1.0 - Double(box.maxY)) * imageHeight, + "width": Double(box.width) * imageWidth, + "height": Double(box.height) * imageHeight, + ] + var line: [String: Any] = [ + "text": candidate.string, + "confidence": Double(candidate.confidence), + ] + + if includeBoxes { + line["frame"] = frame + } + + return (line, frame["x"] ?? 0, frame["y"] ?? 
0) + } + + // Build reading order in two steps: sort by top edge, then start a new visual row whenever an + // entry sits more than 4 px below the first entry of the current row, and order each row + // left-to-right. A pairwise "within 4 px" test inside the sort predicate is not transitive, so it + // is not the strict weak ordering sorted(by:) requires and the resulting order is unspecified. + var rows: [[(line: [String: Any], x: Double, y: Double)]] = [] + for entry in entries.sorted(by: { $0.y < $1.y }) { + if let anchor = rows.last?.first, entry.y - anchor.y <= 4 { + rows[rows.count - 1].append(entry) + } else { + rows.append([entry]) + } + } + + return rows.flatMap { row in + row.sorted { $0.x < $1.x }.map { $0.line } + } +} + +struct ScreenTextExtraction { + let engine: String + let text: String + let visualUnderstanding: Any +} + +func linesFromText(_ text: String) -> [[String: Any]] { + text + .split(separator: "\n", omittingEmptySubsequences: false) + .map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty } + .map { line in + [ + "text": line, + "confidence": 1.0, + ] + } +} + +func visualUnderstandingUnavailablePayload(mode: String, reason: String) -> Any { + if mode == "none" { + return NSNull() + } + + return [ + "available": false, + "mode": mode, + "reason": reason, + ] +} + +#if canImport(FoundationModels) && canImport(_Vision_FoundationModels) +@available(macOS 27.0, *) +func foundationModelsTextPrompt(image: CGImage) -> Prompt { + return Prompt { + """ + Use OCRTool on the attached image labeled "screen". Return only the exact visible text \ + from the screen, preserving line breaks where practical. Do not summarize or add commentary. + """ + Attachment(image) + .label("screen") + } +} + +@available(macOS 27.0, *) +func foundationModelsVisualPrompt(for mode: String, image: CGImage, extractedText: String) -> Prompt { + let instruction: String + switch mode { + case "summary": + instruction = """ + Use the attached image labeled "screen" and the OCR text below to return a concise summary \ + of what the screen is showing. Preserve important labels, warnings, numbers, and button text. + + OCR text: + \(extractedText) + """ + case "ui_map": + instruction = """ + Use the attached image labeled "screen" and the OCR text below to describe the visible UI \ + structure as compact JSON with sections, controls, and important labels. Keep the response \ + short and machine-readable.
+ + OCR text: + \(extractedText) + """ + default: + instruction = extractedText + } + + return Prompt { + instruction + Attachment(image) + .label("screen") + } +} + +@available(macOS 27.0, *) +func extractWithFoundationModels(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? { + let model = SystemLanguageModel.default + guard model.isAvailable else { + return nil + } + + let session = LanguageModelSession( + model: model, + tools: [ + OCRTool(), + ], + instructions: """ + You are a local screen text extraction engine. Prefer OCRTool whenever text \ + is needed from the attached image. Return concise, faithful output. + """ + ) + + let textResponse = try await session.respond(to: foundationModelsTextPrompt(image: image)) + let extractedText = textResponse.content.trimmingCharacters(in: .whitespacesAndNewlines) + guard !extractedText.isEmpty else { + return nil + } + + let visualUnderstanding: Any + if mode == "none" { + visualUnderstanding = NSNull() + } else { + let visualResponse = try await session.respond( + to: foundationModelsVisualPrompt(for: mode, image: image, extractedText: extractedText) + ) + let visualContent = visualResponse.content.trimmingCharacters(in: .whitespacesAndNewlines) + visualUnderstanding = [ + "available": true, + "engine": "foundation_models", + "mode": mode, + "content": visualContent, + ] + } + + return ScreenTextExtraction( + engine: "foundation_models", + text: extractedText, + visualUnderstanding: visualUnderstanding + ) +} +#endif + +func extractWithFoundationModelsIfAvailable(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? 
{ + #if canImport(FoundationModels) && canImport(_Vision_FoundationModels) + if #available(macOS 27.0, *) { + return try await extractWithFoundationModels(in: image, mode: mode) + } + #endif + + return nil +} + +func visionExtraction( + in image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool, + visualUnderstanding: String +) throws -> (ScreenTextExtraction, [[String: Any]]) { + let lines = try recognizedLines( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes + ) + let text = lines.compactMap { $0["text"] as? String }.joined(separator: "\n") + let extraction = ScreenTextExtraction( + engine: "vision", + text: text, + visualUnderstanding: visualUnderstandingUnavailablePayload( + mode: visualUnderstanding, + reason: "requires macOS 27 runtime, Apple Intelligence availability, and FoundationModels SDK" + ) + ) + return (extraction, lines) +} + +func lineMetadata( + for extraction: ScreenTextExtraction, + image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool +) -> [[String: Any]] { + if extraction.engine == "foundation_models" { + do { + return try recognizedLines( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes + ) + } catch { + return linesFromText(extraction.text) + } + } + + return linesFromText(extraction.text) +} + +func topLevelVisualUnderstanding(for extraction: ScreenTextExtraction, mode: String) -> Any { + if extraction.engine == "foundation_models" { + return extraction.visualUnderstanding + } + if mode == "none" { + return NSNull() + } + return extraction.visualUnderstanding +} + +func primaryExtraction( + in image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool, + visualUnderstanding: String +) async throws -> (ScreenTextExtraction, [[String: Any]]) { + if let foundationExtraction = try await extractWithFoundationModelsIfAvailable(in: image, mode: 
visualUnderstanding) { + let lines = lineMetadata( + for: foundationExtraction, + image: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes + ) + return (foundationExtraction, lines.isEmpty ? linesFromText(foundationExtraction.text) : lines) + } + + return try visionExtraction( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes, + visualUnderstanding: visualUnderstanding + ) +} + +// output_path (args[1]) is the only required argument; every later argument falls back to a default. +let args = CommandLine.arguments +guard args.count >= 2 else { + fail("Usage: extract-screen-text.swift <output_path> [accurate|fast] [languages_csv] [include_boxes] [none|summary|ui_map]") +} + +let outputPath = args[1] +let recognitionLevel = args.count >= 3 ? args[2].lowercased() : "accurate" +let languages = args.count >= 4 + ? args[3].split(separator: ",").map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }.filter { !$0.isEmpty } + : [] +let includeBoxes = args.count >= 5 ? (args[4].lowercased() != "false") : true +let visualUnderstanding = args.count >= 6 ? args[5].lowercased() : "none" + +guard ["accurate", "fast"].contains(recognitionLevel) else { + fail("recognition_level must be one of: accurate, fast") +} +guard ["none", "summary", "ui_map"].contains(visualUnderstanding) else { + fail("visual_understanding must be one of: none, summary, ui_map") +} + +do { + let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true) + let display = displayForFrontmostApp(content: content) ??
content.displays.first + guard let display else { + fail("Could not determine a display to capture.") + } + + let image = try await captureDisplay(to: outputPath, display: display) + let (extraction, lines) = try await primaryExtraction( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes, + visualUnderstanding: visualUnderstanding + ) + + let payload: [String: Any] = [ + "action": "extract_screen_text", + "engine": extraction.engine, + "source": "active_screen", + "screenshot_path": outputPath, + "image_size": [ + "width": image.width, + "height": image.height, + ], + "recognition_level": recognitionLevel, + "text": extraction.text, + "length_chars": extraction.text.count, + "truncated": false, + "lines": lines, + "visual_understanding": topLevelVisualUnderstanding(for: extraction, mode: visualUnderstanding), + ] + + let data = try JSONSerialization.data(withJSONObject: payload, options: [.prettyPrinted, .sortedKeys]) + guard let json = String(data: data, encoding: .utf8) else { + fail("Could not encode OCR payload as UTF-8.") + } + print(json) +} catch { + fail(error.localizedDescription) +} diff --git a/tests/test_screenshot.py b/tests/test_screenshot.py new file mode 100644 index 0000000..6ddf969 --- /dev/null +++ b/tests/test_screenshot.py @@ -0,0 +1,158 @@ +import json +import subprocess +from pathlib import Path + +from tools import screenshot + + +def read_json(value: str): + assert not value.startswith("Error:"), value + return json.loads(value) + + +def completed(args, stdout='{"action":"ok"}', stderr="", returncode=0): + return subprocess.CompletedProcess( + args=args, + returncode=returncode, + stdout=stdout, + stderr=stderr, + ) + + +def screen_text_payload(text: str = "Hello screen"): + return { + "action": "extract_screen_text", + "source": "active_screen", + "screenshot_path": "/tmp/screen-text.png", + "image_size": {"width": 1200, "height": 800}, + "recognition_level": "accurate", + "text": text, 
+ "length_chars": len(text), + "truncated": False, + "lines": [ + { + "text": "Hello screen", + "confidence": 0.98, + "frame": {"x": 10, "y": 20, "width": 200, "height": 30}, + } + ], + "visual_understanding": None, + } + + +def test_extract_screen_text_invokes_swift_helper(monkeypatch): + seen = {} + + def fake_run(args, **kwargs): + seen["args"] = args + seen["kwargs"] = kwargs + return completed(args, stdout=json.dumps(screen_text_payload())) + + monkeypatch.setattr(screenshot.subprocess, "run", fake_run) + + payload = read_json( + screenshot.extract_screen_text( + output_path="/tmp/out.png", + recognition_level="fast", + languages="en-US,fr-FR", + include_boxes=False, + visual_understanding="summary", + ) + ) + + assert payload["action"] == "extract_screen_text" + assert seen["args"][0] == "swift" + assert Path(seen["args"][1]).name == "extract-screen-text.swift" + assert seen["args"][2:] == [ + "/tmp/out.png", + "fast", + "en-US,fr-FR", + "false", + "summary", + ] + assert seen["kwargs"]["timeout"] == 90 + + +def test_extract_screen_text_uses_default_temp_screenshot_path(monkeypatch): + seen = {} + + def fake_run(args, **kwargs): + seen["target"] = args[2] + return completed(args, stdout=json.dumps(screen_text_payload())) + + monkeypatch.setattr(screenshot.subprocess, "run", fake_run) + + read_json(screenshot.extract_screen_text()) + + target = Path(seen["target"]) + assert target.parent == Path("/tmp/altic-mcp-screenshots") + assert target.name.startswith("screen-text-") + assert target.suffix == ".png" + + +def test_extract_screen_text_rejects_invalid_recognition_level(monkeypatch): + def fail_run(*args, **kwargs): + raise AssertionError("subprocess should not run for invalid recognition level") + + monkeypatch.setattr(screenshot.subprocess, "run", fail_run) + + result = screenshot.extract_screen_text(recognition_level="balanced") + + assert result.startswith("Error:") + assert "recognition_level must be one of" in result + + +def 
test_extract_screen_text_rejects_invalid_visual_understanding(monkeypatch): + def fail_run(*args, **kwargs): + raise AssertionError("subprocess should not run for invalid visual understanding") + + monkeypatch.setattr(screenshot.subprocess, "run", fail_run) + + result = screenshot.extract_screen_text(visual_understanding="describe_everything") + + assert result.startswith("Error:") + assert "visual_understanding must be one of" in result + + +def test_extract_screen_text_invalid_swift_json_returns_error(monkeypatch): + def fake_run(args, **kwargs): + return completed(args, stdout="not-json") + + monkeypatch.setattr(screenshot.subprocess, "run", fake_run) + + result = screenshot.extract_screen_text() + + assert result.startswith("Error: invalid screen text response:") + + +def test_extract_screen_text_truncates_long_text(monkeypatch): + long_text = "abcdefghijklmnopqrstuvwxyz" + + def fake_run(args, **kwargs): + return completed(args, stdout=json.dumps(screen_text_payload(long_text))) + + monkeypatch.setattr(screenshot.subprocess, "run", fake_run) + + payload = read_json(screenshot.extract_screen_text(max_chars=10)) + + assert payload["text"] == "abcdefghij" + assert payload["length_chars"] == len(long_text) + assert payload["truncated"] is True + + +def test_server_exposes_extract_screen_text_tool(): + import server + + tool_names = set(server.mcp._tool_manager._tools) + + assert "extract_screen_text" in tool_names + + +def test_swift_helper_contains_macos_27_foundation_models_primary_path(): + script = Path("tools/scripts/extract-screen-text.swift").read_text(encoding="utf-8") + + assert "LanguageModelSession" in script + assert "SystemLanguageModel.default" in script + assert "OCRTool()" in script + assert "Attachment(image)" in script + assert "FoundationModels image-input OCRTool integration is gated" not in script diff --git a/tools/screenshot.py b/tools/screenshot.py index 6d422f7..892901c 100644 --- a/tools/screenshot.py +++ b/tools/screenshot.py @@ -1,3 
+1,4 @@ +import json import subprocess import tempfile import time @@ -8,6 +9,18 @@ from .constants import SCRIPTS_PREFIX +def _json(payload: dict) -> str: + return json.dumps(payload, indent=2, sort_keys=True) + + +def _error(message: str) -> str: + return f"Error: {message}" + + +def _screen_text_script() -> str: + return str(SCRIPTS_PREFIX / "extract-screen-text.swift") + + def capture_active_screen(output_path: str = "") -> str | list[object]: script_path = SCRIPTS_PREFIX / "capture-active-screen.swift" @@ -40,3 +53,77 @@ def capture_active_screen(output_path: str = "") -> str | list[object]: return [f"Captured active screen: {saved_path}", Image(path=saved_path)] except Exception as e: return f"Error: Failed to capture active screen: {str(e)}" + + +def extract_screen_text( + output_path: str = "", + recognition_level: str = "accurate", + languages: str = "", + include_boxes: bool = True, + max_chars: int = 20000, + visual_understanding: str = "none", +) -> str: + """ + Capture the active display and extract screen text with local Vision OCR. 
+ """ + valid_levels = {"accurate", "fast"} + valid_visual_modes = {"none", "summary", "ui_map"} + + recognition_level = recognition_level.strip().lower() + visual_understanding = visual_understanding.strip().lower() + + if recognition_level not in valid_levels: + return _error( + "recognition_level must be one of: accurate, fast" + ) + if visual_understanding not in valid_visual_modes: + return _error( + "visual_understanding must be one of: none, summary, ui_map" + ) + + try: + max_chars = max(1, min(max_chars, 200000)) + target_path = output_path.strip() + if not target_path: + timestamp = int(time.time()) + shots_dir = Path("/tmp") / "altic-mcp-screenshots" + shots_dir.mkdir(parents=True, exist_ok=True) + target_path = str(shots_dir / f"screen-text-{timestamp}.png") + + target = Path(target_path).expanduser() + target.parent.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + [ + "swift", + _screen_text_script(), + str(target), + recognition_level, + languages, + str(include_boxes).lower(), + visual_understanding, + ], + capture_output=True, + text=True, + timeout=90, + ) + + if result.returncode != 0: + return _error( + result.stderr.strip() or "unable to extract screen text" + ) + + try: + payload = json.loads(result.stdout or "{}") + except json.JSONDecodeError as exc: + return _error(f"invalid screen text response: {exc}") + + text = str(payload.get("text", "")) + truncated_text = text[:max_chars] + payload["text"] = truncated_text + payload["length_chars"] = len(text) + payload["truncated"] = len(text) > len(truncated_text) + + return _json(payload) + except Exception as exc: + return _error(f"failed to extract screen text: {exc}") diff --git a/tools/scripts/extract-screen-text.swift b/tools/scripts/extract-screen-text.swift new file mode 100644 index 0000000..2284990 --- /dev/null +++ b/tools/scripts/extract-screen-text.swift @@ -0,0 +1,405 @@ +#!/usr/bin/env swift + +import AppKit +import Foundation +import ScreenCaptureKit +import Vision + 
+#if canImport(FoundationModels) && canImport(_Vision_FoundationModels) +import FoundationModels +import _Vision_FoundationModels +#endif + +func area(_ rect: CGRect) -> CGFloat { + max(0, rect.width) * max(0, rect.height) +} + +func fail(_ message: String, code: Int32 = 1) -> Never { + fputs("\(message)\n", stderr) + exit(code) +} + +func displayForFrontmostApp(content: SCShareableContent) -> SCDisplay? { + guard let app = NSWorkspace.shared.frontmostApplication else { + return nil + } + + let targetPID = app.processIdentifier + let appWindows = content.windows.filter { window in + window.owningApplication?.processID == targetPID + } + + guard + let frontWindow = appWindows.max(by: { lhs, rhs in + area(lhs.frame) < area(rhs.frame) + }) + else { + return nil + } + + let targetRect = frontWindow.frame + return content.displays.max(by: { lhs, rhs in + area(lhs.frame.intersection(targetRect)) < area(rhs.frame.intersection(targetRect)) + }) +} + +func captureDisplay(to outputPath: String, display: SCDisplay) async throws -> CGImage { + let filter = SCContentFilter(display: display, excludingWindows: []) + let config = SCStreamConfiguration() + + let image = try await SCScreenshotManager.captureImage( + contentFilter: filter, + configuration: config + ) + + let bitmap = NSBitmapImageRep(cgImage: image) + guard let pngData = bitmap.representation(using: .png, properties: [:]) else { + throw NSError( + domain: "altic-mcp.extract-screen-text", + code: 2, + userInfo: [NSLocalizedDescriptionKey: "Could not encode screenshot as PNG."] + ) + } + + let outputURL = URL(fileURLWithPath: outputPath) + try FileManager.default.createDirectory( + at: outputURL.deletingLastPathComponent(), + withIntermediateDirectories: true + ) + try pngData.write(to: outputURL) + + return image +} + +func recognizedLines( + in image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool +) throws -> [[String: Any]] { + let request = VNRecognizeTextRequest() + 
request.recognitionLevel = recognitionLevel == "fast" ? .fast : .accurate + request.usesLanguageCorrection = true + if !languages.isEmpty { + request.recognitionLanguages = languages + } + + let handler = VNImageRequestHandler(cgImage: image, options: [:]) + try handler.perform([request]) + + let observations = request.results ?? [] + let imageWidth = Double(image.width) + let imageHeight = Double(image.height) + + let entries = observations.compactMap { observation -> (line: [String: Any], x: Double, y: Double)? in + guard let candidate = observation.topCandidates(1).first else { + return nil + } + + let box = observation.boundingBox + let frame = [ + "x": Double(box.minX) * imageWidth, + "y": (1.0 - Double(box.maxY)) * imageHeight, + "width": Double(box.width) * imageWidth, + "height": Double(box.height) * imageHeight, + ] + var line: [String: Any] = [ + "text": candidate.string, + "confidence": Double(candidate.confidence), + ] + + if includeBoxes { + line["frame"] = frame + } + + return (line, frame["x"] ?? 0, frame["y"] ?? 
0) + } + + // Build reading order in two steps: sort by top edge, then start a new visual row whenever an + // entry sits more than 4 px below the first entry of the current row, and order each row + // left-to-right. A pairwise "within 4 px" test inside the sort predicate is not transitive, so it + // is not the strict weak ordering sorted(by:) requires and the resulting order is unspecified. + var rows: [[(line: [String: Any], x: Double, y: Double)]] = [] + for entry in entries.sorted(by: { $0.y < $1.y }) { + if let anchor = rows.last?.first, entry.y - anchor.y <= 4 { + rows[rows.count - 1].append(entry) + } else { + rows.append([entry]) + } + } + + return rows.flatMap { row in + row.sorted { $0.x < $1.x }.map { $0.line } + } +} + +struct ScreenTextExtraction { + let engine: String + let text: String + let visualUnderstanding: Any +} + +func linesFromText(_ text: String) -> [[String: Any]] { + text + .split(separator: "\n", omittingEmptySubsequences: false) + .map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) } + .filter { !$0.isEmpty } + .map { line in + [ + "text": line, + "confidence": 1.0, + ] + } +} + +func visualUnderstandingUnavailablePayload(mode: String, reason: String) -> Any { + if mode == "none" { + return NSNull() + } + + return [ + "available": false, + "mode": mode, + "reason": reason, + ] +} + +#if canImport(FoundationModels) && canImport(_Vision_FoundationModels) +@available(macOS 27.0, *) +func foundationModelsTextPrompt(image: CGImage) -> Prompt { + return Prompt { + """ + Use OCRTool on the attached image labeled "screen". Return only the exact visible text \ + from the screen, preserving line breaks where practical. Do not summarize or add commentary. + """ + Attachment(image) + .label("screen") + } +} + +@available(macOS 27.0, *) +func foundationModelsVisualPrompt(for mode: String, image: CGImage, extractedText: String) -> Prompt { + let instruction: String + switch mode { + case "summary": + instruction = """ + Use the attached image labeled "screen" and the OCR text below to return a concise summary \ + of what the screen is showing. Preserve important labels, warnings, numbers, and button text. + + OCR text: + \(extractedText) + """ + case "ui_map": + instruction = """ + Use the attached image labeled "screen" and the OCR text below to describe the visible UI \ + structure as compact JSON with sections, controls, and important labels. Keep the response \ + short and machine-readable.
+ + OCR text: + \(extractedText) + """ + default: + instruction = extractedText + } + + return Prompt { + instruction + Attachment(image) + .label("screen") + } +} + +@available(macOS 27.0, *) +func extractWithFoundationModels(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? { + let model = SystemLanguageModel.default + guard model.isAvailable else { + return nil + } + + let session = LanguageModelSession( + model: model, + tools: [ + OCRTool(), + ], + instructions: """ + You are a local screen text extraction engine. Prefer OCRTool whenever text \ + is needed from the attached image. Return concise, faithful output. + """ + ) + + let textResponse = try await session.respond(to: foundationModelsTextPrompt(image: image)) + let extractedText = textResponse.content.trimmingCharacters(in: .whitespacesAndNewlines) + guard !extractedText.isEmpty else { + return nil + } + + let visualUnderstanding: Any + if mode == "none" { + visualUnderstanding = NSNull() + } else { + let visualResponse = try await session.respond( + to: foundationModelsVisualPrompt(for: mode, image: image, extractedText: extractedText) + ) + let visualContent = visualResponse.content.trimmingCharacters(in: .whitespacesAndNewlines) + visualUnderstanding = [ + "available": true, + "engine": "foundation_models", + "mode": mode, + "content": visualContent, + ] + } + + return ScreenTextExtraction( + engine: "foundation_models", + text: extractedText, + visualUnderstanding: visualUnderstanding + ) +} +#endif + +func extractWithFoundationModelsIfAvailable(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? 
{ + #if canImport(FoundationModels) && canImport(_Vision_FoundationModels) + if #available(macOS 27.0, *) { + return try await extractWithFoundationModels(in: image, mode: mode) + } + #endif + + return nil +} + +func visionExtraction( + in image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool, + visualUnderstanding: String +) throws -> (ScreenTextExtraction, [[String: Any]]) { + let lines = try recognizedLines( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes + ) + let text = lines.compactMap { $0["text"] as? String }.joined(separator: "\n") + let extraction = ScreenTextExtraction( + engine: "vision", + text: text, + visualUnderstanding: visualUnderstandingUnavailablePayload( + mode: visualUnderstanding, + reason: "requires macOS 27 runtime, Apple Intelligence availability, and FoundationModels SDK" + ) + ) + return (extraction, lines) +} + +func lineMetadata( + for extraction: ScreenTextExtraction, + image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool +) -> [[String: Any]] { + if extraction.engine == "foundation_models" { + do { + return try recognizedLines( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes + ) + } catch { + return linesFromText(extraction.text) + } + } + + return linesFromText(extraction.text) +} + +func topLevelVisualUnderstanding(for extraction: ScreenTextExtraction, mode: String) -> Any { + if extraction.engine == "foundation_models" { + return extraction.visualUnderstanding + } + if mode == "none" { + return NSNull() + } + return extraction.visualUnderstanding +} + +func primaryExtraction( + in image: CGImage, + recognitionLevel: String, + languages: [String], + includeBoxes: Bool, + visualUnderstanding: String +) async throws -> (ScreenTextExtraction, [[String: Any]]) { + if let foundationExtraction = try await extractWithFoundationModelsIfAvailable(in: image, mode: 
visualUnderstanding) { + let lines = lineMetadata( + for: foundationExtraction, + image: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes + ) + return (foundationExtraction, lines.isEmpty ? linesFromText(foundationExtraction.text) : lines) + } + + return try visionExtraction( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes, + visualUnderstanding: visualUnderstanding + ) +} + +let args = CommandLine.arguments +guard args.count >= 2 else { + fail("Usage: extract-screen-text.swift [accurate|fast] [languages_csv] [include_boxes] [none|summary|ui_map]") +} + +let outputPath = args[1] +let recognitionLevel = args.count >= 3 ? args[2].lowercased() : "accurate" +let languages = args.count >= 4 + ? args[3].split(separator: ",").map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }.filter { !$0.isEmpty } + : [] +let includeBoxes = args.count >= 5 ? (args[4].lowercased() != "false") : true +let visualUnderstanding = args.count >= 6 ? args[5].lowercased() : "none" + +guard ["accurate", "fast"].contains(recognitionLevel) else { + fail("recognition_level must be one of: accurate, fast") +} +guard ["none", "summary", "ui_map"].contains(visualUnderstanding) else { + fail("visual_understanding must be one of: none, summary, ui_map") +} + +do { + let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true) + let display = displayForFrontmostApp(content: content) ?? 
content.displays.first + guard let display else { + fail("Could not determine a display to capture.") + } + + let image = try await captureDisplay(to: outputPath, display: display) + let (extraction, lines) = try await primaryExtraction( + in: image, + recognitionLevel: recognitionLevel, + languages: languages, + includeBoxes: includeBoxes, + visualUnderstanding: visualUnderstanding + ) + + let payload: [String: Any] = [ + "action": "extract_screen_text", + "engine": extraction.engine, + "source": "active_screen", + "screenshot_path": outputPath, + "image_size": [ + "width": image.width, + "height": image.height, + ], + "recognition_level": recognitionLevel, + "text": extraction.text, + "length_chars": extraction.text.count, + "truncated": false, + "lines": lines, + "visual_understanding": topLevelVisualUnderstanding(for: extraction, mode: visualUnderstanding), + ] + + let data = try JSONSerialization.data(withJSONObject: payload, options: [.prettyPrinted, .sortedKeys]) + guard let json = String(data: data, encoding: .utf8) else { + fail("Could not encode OCR payload as UTF-8.") + } + print(json) +} catch { + fail(error.localizedDescription) +}