From 71ffceafa2ad85c84d40a164d0f08411f0fc30f0 Mon Sep 17 00:00:00 2001
From: grohith327 <grohith327@gmail.com>
Date: Sat, 13 Jun 2026 20:46:23 -0700
Subject: [PATCH 1/2] Delete docs

---
 .../2026-05-16-window-workspace-manager.md    | 933 ------------------
 1 file changed, 933 deletions(-)
 delete mode 100644 docs/superpowers/plans/2026-05-16-window-workspace-manager.md
diff --git a/docs/superpowers/plans/2026-05-16-window-workspace-manager.md b/docs/superpowers/plans/2026-05-16-window-workspace-manager.md
deleted file mode 100644
index 4464066..0000000
--- a/docs/superpowers/plans/2026-05-16-window-workspace-manager.md
+++ /dev/null
@@ -1,933 +0,0 @@
-# Window Workspace Manager Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Add macOS app and window management MCP tools so agents can focus apps, inspect windows, move/resize/place windows, tile workspaces, minimize windows, hide apps, quit apps, and identify the frontmost app.
-
-**Architecture:** Add a dedicated `tools/window.py` Python wrapper module backed by a single Swift Accessibility helper at `tools/scripts/window-manager.swift`. Python owns validation, MCP-facing names, JSON normalization, and subprocess error handling; Swift owns macOS Accessibility, display geometry, and window mutation.
-
-**Tech Stack:** Python 3.13, FastMCP, pytest, Swift, AppKit, ApplicationServices Accessibility APIs, CoreGraphics display/window APIs.
-
----
-
-## File Structure
-
-- Create `tools/window.py`
-  - Public Python functions used by `server.py`.
-  - Shared `_run_window_manager(action, payload)` helper that invokes `swift tools/scripts/window-manager.swift <action> <payload-json>`.
-  - Small validators for dimensions, display index, tile layout, and app/window target parameters.
-  - JSON passthrough for state-returning operations and `"Error: ..."` strings for failures, matching existing modules such as `tools/files.py` and `tools/clipboard.py`.
-
-- Create `tools/scripts/window-manager.swift`
-  - One command-line helper with subcommands: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app`.
-  - Uses `NSWorkspace.shared.frontmostApplication`, `NSRunningApplication`, `AXUIElement`, `AXUIElementCopyAttributeValue`, `AXUIElementSetAttributeValue`, `CGWindowListCopyWindowInfo`, and `NSScreen.screens`.
-  - Returns JSON on stdout for all successful operations.
-  - Writes human-readable errors to stderr and exits non-zero for failures.
-
-- Modify `tools/__init__.py`
-  - No public exports are currently declared, so this likely remains unchanged unless the file has future explicit imports when execution starts.
-
-- Modify `server.py`
-  - Import `window`.
-  - Register MCP tools:
-    - `get_frontmost_app`
-    - `list_windows`
-    - `focus_window`
-    - `move_window`
-    - `resize_window`
-    - `center_window`
-    - `tile_windows`
-    - `minimize`
-    - `hide_app`
-    - `quit_app`
-  - Keep `open_app` in `tools/app.py`; do not move it during this feature.
-
-- Create `tests/test_window.py`
-  - Unit tests monkeypatching `tools.window.subprocess.run`.
-  - Tests cover argument serialization, validation, non-zero subprocess failures, JSON normalization, and all public wrappers.
-
-- Modify `README.md`
-  - Add the Window & Workspace feature to the feature list.
-  - Add macOS permissions note: Accessibility is required for window focus, movement, resize, minimize, hide, and quit operations; Screen Recording improves `list_windows` titles on modern macOS.
-  - Add manual smoke tests for display-aware placement.
-
-- Modify `skills/altic-studio/SKILL.md`
-  - Add the new window/workspace tools to the shareable skill so installed agents know they exist.
-
----
-
-## Behavior Contract
-
-### Targeting
-
-All window operations target a visible, non-desktop app window using this precedence:
-
-1. `window_id` when provided.
-2. `app_name` plus `window_index`, where `window_index` is 1-based among that app's windows sorted front-to-back.
-3. Frontmost app's frontmost window when no target is provided.
-
-Use case-insensitive substring matching for `app_name` against localized app name, bundle identifier, and process name. If multiple apps match, return an error listing the matching apps so the caller can retry with a more specific name.
-
-### Coordinate System
-
-MCP tools accept and return AppKit/global display coordinates. `x` and `y` are top-left coordinates. `width` and `height` are window size in points. Swift converts to `AXValue` `CGPoint` and `CGSize` for `kAXPositionAttribute` and `kAXSizeAttribute`.
-
-### Display-Aware Placement
-
-`display_index` is optional and 1-based. When omitted, use the display with the largest intersection with the target window. If no window is available, use the main display. Safe placement uses `NSScreen.visibleFrame`, not full frame, so windows avoid the Dock and menu bar.
-
-### Tool Return Shapes
-
-`get_frontmost_app` returns:
-
-```json
-{
-  "action": "get_frontmost_app",
-  "app": {
-    "name": "Safari",
-    "bundle_id": "com.apple.Safari",
-    "pid": 12345,
-    "is_active": true
-  }
-}
-```
-
-`list_windows` returns:
-
-```json
-{
-  "action": "list_windows",
-  "windows": [
-    {
-      "window_id": 101,
-      "app_name": "Safari",
-      "bundle_id": "com.apple.Safari",
-      "pid": 12345,
-      "title": "Example Page",
-      "x": 40,
-      "y": 80,
-      "width": 1200,
-      "height": 800,
-      "display_index": 1,
-      "is_minimized": false,
-      "is_frontmost_app": true
-    }
-  ],
-  "count": 1
-}
-```
-
-Mutation tools return:
-
-```json
-{
-  "action": "move_window",
-  "window": {
-    "window_id": 101,
-    "app_name": "Safari",
-    "x": 100,
-    "y": 120,
-    "width": 1000,
-    "height": 700,
-    "display_index": 1
-  }
-}
-```
-
----
-
-## Task 1: Python Window Wrapper Tests
-
-**Files:**
-- Create: `tests/test_window.py`
-- Create later: `tools/window.py`
-
-- [ ] **Step 1: Write failing wrapper tests**
-
-Create `tests/test_window.py` with these tests:
-
-```python
-import json
-import subprocess
-from pathlib import Path
-
-import pytest
-
-from tools import window
-
-
-def read_json(value: str):
-    assert not value.startswith("Error:"), value
-    return json.loads(value)
-
-
-def completed(args, stdout='{"action":"ok"}', stderr="", returncode=0):
-    return subprocess.CompletedProcess(
-        args=args,
-        returncode=returncode,
-        stdout=stdout,
-        stderr=stderr,
-    )
-
-
-def test_get_frontmost_app_invokes_swift_helper(monkeypatch):
-    seen = {}
-    payload = {
-        "action": "get_frontmost_app",
-        "app": {
-            "name": "Finder",
-            "bundle_id": "com.apple.finder",
-            "pid": 42,
-            "is_active": True,
-        },
-    }
-
-    def fake_run(args, **kwargs):
-        seen["args"] = args
-        seen["kwargs"] = kwargs
-        return completed(args, stdout=json.dumps(payload))
-
-    monkeypatch.setattr(window.subprocess, "run", fake_run)
-
-    assert read_json(window.get_frontmost_app()) == payload
-    assert seen["args"][0] == "swift"
-    assert Path(seen["args"][1]).name == "window-manager.swift"
-    assert seen["args"][2] == "get_frontmost_app"
-    assert json.loads(seen["args"][3]) == {}
-    assert seen["kwargs"]["timeout"] == 10
-
-
-def test_move_window_serializes_target_and_coordinates(monkeypatch):
-    seen = {}
-
-    def fake_run(args, **kwargs):
-        seen["payload"] = json.loads(args[3])
-        return completed(args, stdout='{"action":"move_window"}')
-
-    monkeypatch.setattr(window.subprocess, "run", fake_run)
-
-    payload = read_json(
-        window.move_window(
-            x=100,
-            y=140,
-            app_name="Safari",
-            window_index=2,
-            display_index=1,
-        )
-    )
-
-    assert payload["action"] == "move_window"
-    assert seen["payload"] == {
-        "app_name": "Safari",
-        "window_index": 2,
-        "x": 100,
-        "y": 140,
-        "display_index": 1,
-    }
-
-
-def test_resize_window_rejects_non_positive_dimensions(monkeypatch):
-    def fail_run(*args, **kwargs):
-        raise AssertionError("subprocess should not run for invalid dimensions")
-
-    monkeypatch.setattr(window.subprocess, "run", fail_run)
-
-    assert window.resize_window(width=0, height=500).startswith("Error:")
-    assert window.resize_window(width=500, height=-1).startswith("Error:")
-
-
-def test_center_window_serializes_size_when_provided(monkeypatch):
-    seen = {}
-
-    def fake_run(args, **kwargs):
-        seen["payload"] = json.loads(args[3])
-        return completed(args, stdout='{"action":"center_window"}')
-
-    monkeypatch.setattr(window.subprocess, "run", fake_run)
-
-    read_json(
-        window.center_window(
-            width=900,
-            height=700,
-            app_name="Terminal",
-            display_index=2,
-        )
-    )
-
-    assert seen["payload"] == {
-        "app_name": "Terminal",
-        "width": 900,
-        "height": 700,
-        "display_index": 2,
-    }
-
-
-def test_tile_windows_validates_layout(monkeypatch):
-    def fail_run(*args, **kwargs):
-        raise AssertionError("subprocess should not run for invalid layout")
-
-    monkeypatch.setattr(window.subprocess, "run", fail_run)
-
-    result = window.tile_windows(layout="spiral", app_names=["Safari", "Terminal"])
-
-    assert result.startswith("Error:")
-    assert "layout must be one of" in result
-
-
-def test_tile_windows_serializes_apps_and_padding(monkeypatch):
-    seen = {}
-
-    def fake_run(args, **kwargs):
-        seen["payload"] = json.loads(args[3])
-        return completed(args, stdout='{"action":"tile_windows","count":2}')
-
-    monkeypatch.setattr(window.subprocess, "run", fake_run)
-
-    payload = read_json(
-        window.tile_windows(
-            layout="columns",
-            app_names=["Safari", "Terminal"],
-            display_index=1,
-            padding=12,
-        )
-    )
-
-    assert payload == {"action": "tile_windows", "count": 2}
-    assert seen["payload"] == {
-        "layout": "columns",
-        "app_names": ["Safari", "Terminal"],
-        "display_index": 1,
-        "padding": 12,
-    }
-
-
-def test_focus_minimize_hide_and_quit_serialize_targets(monkeypatch):
-    calls = []
-
-    def fake_run(args, **kwargs):
-        calls.append((args[2], json.loads(args[3])))
-        return completed(args, stdout=json.dumps({"action": args[2]}))
-
-    monkeypatch.setattr(window.subprocess, "run", fake_run)
-
-    assert read_json(window.focus_window(app_name="Safari"))["action"] == "focus_window"
-    assert read_json(window.minimize(app_name="Safari", window_index=1))["action"] == "minimize"
-    assert read_json(window.hide_app("Safari"))["action"] == "hide_app"
-    assert read_json(window.quit_app("Safari"))["action"] == "quit_app"
-
-    assert calls == [
-        ("focus_window", {"app_name": "Safari"}),
-        ("minimize", {"app_name": "Safari", "window_index": 1}),
-        ("hide_app", {"app_name": "Safari"}),
-        ("quit_app", {"app_name": "Safari"}),
-    ]
-
-
-def test_subprocess_failure_returns_error(monkeypatch):
-    def fake_run(args, **kwargs):
-        return completed(args, stdout="", stderr="accessibility permission denied", returncode=1)
-
-    monkeypatch.setattr(window.subprocess, "run", fake_run)
-
-    assert window.get_frontmost_app() == "Error: accessibility permission denied"
-
-
-def test_invalid_json_from_helper_returns_error(monkeypatch):
-    def fake_run(args, **kwargs):
-        return completed(args, stdout="not-json")
-
-    monkeypatch.setattr(window.subprocess, "run", fake_run)
-
-    assert window.get_frontmost_app().startswith("Error: invalid window manager response:")
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-Run:
-
-```bash
-uv run pytest tests/test_window.py -v
-```
-
-Expected: import failure or attribute failure because `tools/window.py` does not exist yet.
-
----
-
-## Task 2: Python Window Wrapper Implementation
-
-**Files:**
-- Create: `tools/window.py`
-- Test: `tests/test_window.py`
-
-- [ ] **Step 1: Implement Python wrapper**
-
-Create `tools/window.py` with:
-
-```python
-import json
-import subprocess
-from typing import Any
-
-from .constants import SCRIPTS_PREFIX
-
-
-VALID_TILE_LAYOUTS = {"columns", "rows", "grid"}
-
-
-def _json(payload: dict[str, Any]) -> str:
-    return json.dumps(payload, indent=2, sort_keys=True)
-
-
-def _error(message: str) -> str:
-    return f"Error: {message}"
-
-
-def _positive_int(name: str, value: int | None) -> str | None:
-    if value is not None and value <= 0:
-        return f"{name} must be greater than 0"
-    return None
-
-
-def _target_payload(
-    app_name: str = "",
-    window_id: int | None = None,
-    window_index: int | None = None,
-    display_index: int | None = None,
-) -> dict[str, Any]:
-    payload: dict[str, Any] = {}
-    if app_name.strip():
-        payload["app_name"] = app_name.strip()
-    if window_id is not None:
-        payload["window_id"] = window_id
-    if window_index is not None:
-        payload["window_index"] = window_index
-    if display_index is not None:
-        payload["display_index"] = display_index
-    return payload
-
-
-def _run_window_manager(action: str, payload: dict[str, Any] | None = None) -> str:
-    script_path = SCRIPTS_PREFIX / "window-manager.swift"
-    args = ["swift", str(script_path), action, json.dumps(payload or {})]
-    try:
-        result = subprocess.run(
-            args,
-            capture_output=True,
-            text=True,
-            timeout=10,
-        )
-        if result.returncode != 0:
-            return _error(result.stderr.strip() or f"window manager action failed: {action}")
-
-        stdout = result.stdout.strip()
-        if not stdout:
-            return _error(f"empty window manager response for action: {action}")
-
-        try:
-            parsed = json.loads(stdout)
-        except json.JSONDecodeError:
-            return _error(f"invalid window manager response: {stdout}")
-        return _json(parsed)
-    except Exception as exc:
-        return _error(f"failed to run window manager action {action}: {exc}")
-
-
-def get_frontmost_app() -> str:
-    return _run_window_manager("get_frontmost_app")
-
-
-def list_windows(app_name: str = "", include_minimized: bool = False) -> str:
-    payload: dict[str, Any] = {"include_minimized": include_minimized}
-    if app_name.strip():
-        payload["app_name"] = app_name.strip()
-    return _run_window_manager("list_windows", payload)
-
-
-def focus_window(
-    app_name: str = "",
-    window_id: int | None = None,
-    window_index: int | None = None,
-) -> str:
-    return _run_window_manager(
-        "focus_window",
-        _target_payload(app_name=app_name, window_id=window_id, window_index=window_index),
-    )
-
-
-def move_window(
-    x: int,
-    y: int,
-    app_name: str = "",
-    window_id: int | None = None,
-    window_index: int | None = None,
-    display_index: int | None = None,
-) -> str:
-    for name, value in (("window_id", window_id), ("window_index", window_index), ("display_index", display_index)):
-        error = _positive_int(name, value)
-        if error:
-            return _error(error)
-    payload = _target_payload(app_name, window_id, window_index, display_index)
-    payload.update({"x": x, "y": y})
-    return _run_window_manager("move_window", payload)
-
-
-def resize_window(
-    width: int,
-    height: int,
-    app_name: str = "",
-    window_id: int | None = None,
-    window_index: int | None = None,
-    display_index: int | None = None,
-) -> str:
-    for name, value in (("width", width), ("height", height), ("window_id", window_id), ("window_index", window_index), ("display_index", display_index)):
-        error = _positive_int(name, value)
-        if error:
-            return _error(error)
-    payload = _target_payload(app_name, window_id, window_index, display_index)
-    payload.update({"width": width, "height": height})
-    return _run_window_manager("resize_window", payload)
-
-
-def center_window(
-    app_name: str = "",
-    window_id: int | None = None,
-    window_index: int | None = None,
-    display_index: int | None = None,
-    width: int | None = None,
-    height: int | None = None,
-) -> str:
-    for name, value in (("width", width), ("height", height), ("window_id", window_id), ("window_index", window_index), ("display_index", display_index)):
-        error = _positive_int(name, value)
-        if error:
-            return _error(error)
-    payload = _target_payload(app_name, window_id, window_index, display_index)
-    if width is not None:
-        payload["width"] = width
-    if height is not None:
-        payload["height"] = height
-    return _run_window_manager("center_window", payload)
-
-
-def tile_windows(
-    layout: str = "columns",
-    app_names: list[str] | None = None,
-    display_index: int | None = None,
-    padding: int = 8,
-) -> str:
-    if layout not in VALID_TILE_LAYOUTS:
-        return _error(f"layout must be one of: {', '.join(sorted(VALID_TILE_LAYOUTS))}")
-    if padding < 0:
-        return _error("padding must be greater than or equal to 0")
-    error = _positive_int("display_index", display_index)
-    if error:
-        return _error(error)
-    payload: dict[str, Any] = {"layout": layout, "padding": padding}
-    if app_names:
-        payload["app_names"] = [name.strip() for name in app_names if name.strip()]
-    if display_index is not None:
-        payload["display_index"] = display_index
-    return _run_window_manager("tile_windows", payload)
-
-
-def minimize(
-    app_name: str = "",
-    window_id: int | None = None,
-    window_index: int | None = None,
-) -> str:
-    return _run_window_manager(
-        "minimize",
-        _target_payload(app_name=app_name, window_id=window_id, window_index=window_index),
-    )
-
-
-def hide_app(app_name: str) -> str:
-    if not app_name.strip():
-        return _error("app_name cannot be empty")
-    return _run_window_manager("hide_app", {"app_name": app_name.strip()})
-
-
-def quit_app(app_name: str) -> str:
-    if not app_name.strip():
-        return _error("app_name cannot be empty")
-    return _run_window_manager("quit_app", {"app_name": app_name.strip()})
-```
-
-- [ ] **Step 2: Run wrapper tests**
-
-Run:
-
-```bash
-uv run pytest tests/test_window.py -v
-```
-
-Expected: all tests in `tests/test_window.py` pass, except failures caused by missing Swift script only if a test did not monkeypatch subprocess correctly.
-
-- [ ] **Step 3: Commit wrapper**
-
-Run:
-
-```bash
-git add tools/window.py tests/test_window.py
-git commit -m "feat: add window manager python wrappers"
-```
-
----
-
-## Task 3: Swift Window Manager Helper
-
-**Files:**
-- Create: `tools/scripts/window-manager.swift`
-- Test manually with `swift tools/scripts/window-manager.swift get_frontmost_app '{}'`
-
-- [ ] **Step 1: Implement command-line parsing**
-
-The script must accept exactly two arguments after the script path:
-
-```text
-window-manager.swift <action> <payload-json>
-```
-
-Implementation requirements:
-
-- Decode payload with `JSONSerialization.jsonObject`.
-- Store payload as `[String: Any]`.
-- Dispatch on `action`.
-- On success, print one compact JSON object to stdout.
-- On failure, print the error message to stderr and exit 1.
-
-- [ ] **Step 2: Implement app/window discovery**
-
-Implement these Swift helper types and functions:
-
-```swift
-struct ManagedApp {
-    let name: String
-    let bundleID: String
-    let pid: pid_t
-    let application: NSRunningApplication
-}
-
-struct ManagedWindow {
-    let windowID: Int
-    let app: ManagedApp
-    let title: String
-    let frame: CGRect
-    let axWindow: AXUIElement?
-    let isMinimized: Bool
-}
-```
-
-Required behavior:
-
-- `frontmostApp()` reads `NSWorkspace.shared.frontmostApplication`.
-- `runningApps(matching:)` searches `NSWorkspace.shared.runningApplications`.
-- App matching checks localized name, bundle identifier, and executable URL last path component using case-insensitive substring matching.
-- `axWindows(for:)` creates `AXUIElementCreateApplication(pid)` and reads `kAXWindowsAttribute`.
-- `cgWindows()` uses `CGWindowListCopyWindowInfo([.optionOnScreenOnly, .excludeDesktopElements], kCGNullWindowID)` to enrich windows with IDs and titles.
-- `managedWindows(appName:includeMinimized:)` combines AX windows and CG windows by matching PID plus nearest equal frame. If a CG window ID cannot be matched, use `0` for `window_id` and still allow AX-only operations.
-
-- [ ] **Step 3: Implement display helpers**
-
-Required helpers:
-
-- `screens()` returns `NSScreen.screens` in their current order.
-- `screenForDisplayIndex(_:)` accepts 1-based indexes and returns an error for out-of-range values.
-- `screenForWindow(_:)` picks the visible frame with the largest intersection with the window frame.
-- `clampedFrame(rect:screen:)` keeps width and height at least `120x80` and within `screen.visibleFrame` when possible.
-- Every placement tool uses `visibleFrame`.
-
-- [ ] **Step 4: Implement read operations**
-
-Implement:
-
-- `get_frontmost_app`
-- `list_windows`
-
-Success output must match the behavior contract above. Include `display_index` for each listed window by comparing the window frame to `NSScreen.visibleFrame`.
-
-- [ ] **Step 5: Implement app operations**
-
-Implement:
-
-- `hide_app`: resolve one app and call `application.hide()`.
-- `quit_app`: resolve one app and call `application.terminate()`.
-
-Return JSON with `action`, `app_name`, `bundle_id`, and `pid`.
-
-- [ ] **Step 6: Implement window operations**
-
-Implement:
-
-- `focus_window`: resolve target, call `application.activate(options: [.activateAllWindows, .activateIgnoringOtherApps])`, then set `kAXMainAttribute` and `kAXFocusedAttribute` to `true` on the target AX window.
-- `minimize`: set `kAXMinimizedAttribute` to `true`.
-- `move_window`: set `kAXPositionAttribute`.
-- `resize_window`: set `kAXSizeAttribute`.
-- `center_window`: optionally resize first, then calculate centered origin inside selected display visible frame.
-- `tile_windows`: resolve requested apps or use visible windows from the frontmost display; calculate frames for:
-  - `columns`: equal-width columns.
-  - `rows`: equal-height rows.
-  - `grid`: `ceil(sqrt(count))` columns and `ceil(count / columns)` rows.
-
-Return JSON with the final window frame or list of final frames after each mutation.
-
-- [ ] **Step 7: Run Swift smoke checks**
-
-Run:
-
-```bash
-swift tools/scripts/window-manager.swift get_frontmost_app '{}'
-swift tools/scripts/window-manager.swift list_windows '{"include_minimized":false}'
-swift tools/scripts/window-manager.swift center_window '{"app_name":"Finder"}'
-```
-
-Expected:
-
-- The first command prints valid JSON with the current frontmost app.
-- The second command prints valid JSON with a `windows` array.
-- The third command either centers Finder's frontmost window or prints a clear Accessibility permission error.
-
-- [ ] **Step 8: Commit Swift helper**
-
-Run:
-
-```bash
-git add tools/scripts/window-manager.swift
-git commit -m "feat: add swift window manager helper"
-```
-
----
-
-## Task 4: MCP Tool Registration
-
-**Files:**
-- Modify: `server.py`
-- Test: `tests/test_window.py`
-
-- [ ] **Step 1: Import the window module**
-
-Modify the `from tools import (...)` block in `server.py` to include `window`.
-
-- [ ] **Step 2: Register MCP tools near `open_app`**
-
-Add tool functions in `server.py` after `open_app`:
-
-```python
-@mcp.tool()
-async def get_frontmost_app() -> str:
-    """
-    Get the currently frontmost macOS application.
-
-    Returns:
-        JSON string with app name, bundle id, pid, and active state.
-    """
-    return window.get_frontmost_app()
-
-
-@mcp.tool()
-async def list_windows(
-    app_name: str = Field(default=""),
-    include_minimized: bool = Field(default=False),
-) -> str:
-    """
-    List manageable macOS windows.
-
-    Args:
-        app_name: Optional app name, bundle id, or process name filter
-        include_minimized: Include minimized windows when available
-
-    Returns:
-        JSON string with window ids, app metadata, titles, frames, and display indexes.
-    """
-    return window.list_windows(app_name, include_minimized)
-
-
-@mcp.tool()
-async def focus_window(
-    app_name: str = Field(default=""),
-    window_id: int | None = Field(default=None),
-    window_index: int | None = Field(default=None),
-) -> str:
-    """
-    Focus a macOS window by window id, app name, or frontmost fallback.
-
-    Args:
-        app_name: Optional app name, bundle id, or process name
-        window_id: Optional CoreGraphics window id
-        window_index: Optional 1-based index among the app's windows
-
-    Returns:
-        JSON string with focused window metadata, or an error message.
-    """
-    return window.focus_window(app_name, window_id, window_index)
-```
-
-Also add analogous wrappers for `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, and `quit_app` using the Python function signatures from Task 2. Use `Field` constraints for public numeric bounds:
-
-- `window_id`: `ge=1`
-- `window_index`: `ge=1`
-- `display_index`: `ge=1`
-- `width`: `ge=1`
-- `height`: `ge=1`
-- `padding`: `ge=0`, `le=100`
-
-- [ ] **Step 3: Add registration smoke test**
-
-Append this test to `tests/test_window.py`:
-
-```python
-def test_server_exposes_window_tools():
-    import server
-
-    tool_names = {tool.name for tool in server.mcp._tool_manager._tools.values()}
-
-    assert {
-        "get_frontmost_app",
-        "list_windows",
-        "focus_window",
-        "move_window",
-        "resize_window",
-        "center_window",
-        "tile_windows",
-        "minimize",
-        "hide_app",
-        "quit_app",
-    }.issubset(tool_names)
-```
-
-- [ ] **Step 4: Run tests**
-
-Run:
-
-```bash
-uv run pytest tests/test_window.py -v
-```
-
-Expected: all tests pass.
-
-- [ ] **Step 5: Commit MCP registration**
-
-Run:
-
-```bash
-git add server.py tests/test_window.py
-git commit -m "feat: expose window manager mcp tools"
-```
-
----
-
-## Task 5: Documentation and Skill Manifest
-
-**Files:**
-- Modify: `README.md`
-- Modify: `skills/altic-studio/SKILL.md`
-
-- [ ] **Step 1: Update README feature list**
-
-Add a bullet under `## Features`:
-
-```markdown
-- 🪟 **Window & Workspace** - List/focus apps and windows, move/resize/center/tile windows, minimize windows, hide apps, and quit apps
-```
-
-- [ ] **Step 2: Update permissions**
-
-Under permissions, add or expand:
-
-```markdown
-- ✅ **Accessibility** - Required for screen glow, system controls, and window management tools such as focus_window, move_window, resize_window, center_window, tile_windows, minimize, hide_app, and quit_app
-- ✅ **Screen Recording** - Required for screenshot capture tools and improves window title/id discovery for list_windows on recent macOS versions
-```
-
-- [ ] **Step 3: Add manual smoke tests**
-
-Add a section:
-
-```markdown
-## Manual Smoke Tests For Window Tools
-
-- Call `get_frontmost_app` while Finder or Safari is active.
-- Call `list_windows` and confirm visible app windows include frame and display metadata.
-- Open two apps, then call `tile_windows` with `layout="columns"` and their app names.
-- Call `center_window` with an app name and confirm the frontmost window is centered inside the visible display area.
-- Call `move_window` and `resize_window` with a test app window, then call `list_windows` to confirm the new frame.
-- Call `minimize` on a test app window and confirm it minimizes.
-- Call `hide_app` on a non-critical app and confirm the app is hidden.
-- Call `quit_app` only on a disposable test app.
-```
-
-- [ ] **Step 4: Update `skills/altic-studio/SKILL.md`**
-
-Add a concise tool group for window management. Use the exact tool names registered in `server.py`.
-
-- [ ] **Step 5: Commit docs**
-
-Run:
-
-```bash
-git add README.md skills/altic-studio/SKILL.md
-git commit -m "docs: document window workspace tools"
-```
-
----
-
-## Task 6: Full Verification
-
-**Files:**
-- All files touched by previous tasks
-
-- [ ] **Step 1: Run unit tests**
-
-Run:
-
-```bash
-uv run pytest -v
-```
-
-Expected: all existing file/clipboard tests plus new window wrapper tests pass.
-
-- [ ] **Step 2: Run Swift syntax check**
-
-Run:
-
-```bash
-swift tools/scripts/window-manager.swift get_frontmost_app '{}'
-```
-
-Expected: valid JSON or a clear macOS permission error. A Swift compile error is a failure.
-
-- [ ] **Step 3: Run MCP server import check**
-
-Run:
-
-```bash
-uv run python - <<'PY'
-import server
-print(server.mcp.name)
-PY
-```
-
-Expected: prints `Altic-MCP` with no import errors.
-
-- [ ] **Step 4: Final status check**
-
-Run:
-
-```bash
-git status --short
-```
-
-Expected: no uncommitted changes unless the executor intentionally keeps the branch unstaged for review.
-
----
-
-## Open Decisions
-
-- Add `list_windows` even though the feature list did not explicitly request it. This is necessary for reliable targeting and aligns with the competitor comparison.
-- Keep `open_app` in `tools/app.py`; this plan adds complementary app operations to `tools/window.py` because they share target resolution with windows.
-- Use Swift instead of AppleScript for window placement because the feature needs display-aware geometry, CG window ids, Accessibility window attributes, and more reliable multi-display behavior.
-
-## Self-Review
-
-- Spec coverage: the plan covers `move_window`, `resize_window`, `tile_windows`, `center_window`, `focus_window`, `minimize`, `hide_app`, `quit_app`, `get_frontmost_app`, and display-aware placement. It also adds `list_windows` to make targeting practical.
-- Placeholder scan: no task depends on undefined future work; each task names exact files, commands, and expected outcomes.
-- Type consistency: public Python signatures, server wrappers, JSON payload keys, and tests use the same names: `app_name`, `window_id`, `window_index`, `display_index`, `x`, `y`, `width`, `height`, `layout`, `app_names`, and `padding`.

From 4c9bad7346cddc158520ffd0a8a109c14e7309e9 Mon Sep 17 00:00:00 2001
From: grohith327 <grohith327@gmail.com>
Date: Sat, 13 Jun 2026 21:25:45 -0700
Subject: [PATCH 2/2] add text extraction from screen

---
 README.md                                     |  13 +-
 server.py                                     |  35 ++
 skills/altic-studio/SKILL.md                  |  49 ++-
 skills/altic-studio/scripts/README.md         |   1 +
 .../scripts/extract-screen-text.swift         | 405 ++++++++++++++++++
 tests/test_screenshot.py                      | 158 +++++++
 tools/screenshot.py                           |  87 ++++
 tools/scripts/extract-screen-text.swift       | 405 ++++++++++++++++++
 8 files changed, 1148 insertions(+), 5 deletions(-)
 create mode 100644 skills/altic-studio/scripts/extract-screen-text.swift
 create mode 100644 tests/test_screenshot.py
 create mode 100644 tools/scripts/extract-screen-text.swift

diff --git a/README.md b/README.md
index 8c8ca19..27e5655 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@
 - 🪟 **Window & Workspace** - List/focus apps and windows, move/resize/center/tile windows, minimize windows, hide apps, and quit apps
 - 🌐 **Safari** - Control tabs, navigate, execute JavaScript
 - 🌍 **Chrome (CDP)** - Open sessions, navigate, click/type, extract data, screenshots
-- 📸 **Screen Capture** - Capture the active display and share image output with the model
+- 📸 **Screen Capture & OCR** - Capture the active display, share image output, and extract visible text with local Vision OCR
 - 🖥️ **System** - Open apps, adjust brightness/volume, visual effects
 
 ## Available Skills
@@ -31,9 +31,10 @@ This repo currently includes one shareable skill:
 - Calendar: `create-calendar-event.applescript`, `list-all-calendar-events-for-day.applescript`
 - Safari: open/close/switch/navigate/reload/history/page-info scripts
 - System: `open-application.applescript`, brightness + volume scripts
-- Screenshot: `capture-screenshot.applescript`
+- Screenshot: `capture-screenshot.applescript`, `capture-active-screen.swift`, `extract-screen-text.swift`
 - Files/Finder MCP: `find_files`, `list_directory`, `get_file_info`, `copy_file`, `copy_directory`, `move_file`, `rename_file`, `trash_file`, `reveal_in_finder`, `get_finder_selection`
 - Clipboard MCP: `get_clipboard_text`, `set_clipboard_text`, `clear_clipboard`, `get_clipboard_files`, `set_clipboard_files`, `save_clipboard_image`, `set_clipboard_image`
+- Screen OCR MCP: `extract_screen_text`
 - Window/Workspace MCP: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app`
 - Clipboard script: `clipboard.swift`
 - Window script: `window-manager.swift`
@@ -115,7 +116,13 @@ Replace `/FULL/PATH/TO/altic-mcp` with your actual path (e.g., `/Users/johndoe/D
 - ✅ **Automation** - Allow Claude to control apps (Messages, Notes, Safari)
 - ✅ **Finder Automation** - For Finder selection, reveal, and Trash file tools
 - ✅ **Accessibility** - Required for screen glow, system controls, and window management tools such as focus_window, move_window, resize_window, center_window, tile_windows, minimize, hide_app, and quit_app
-- ✅ **Screen Recording** - Required for screenshot capture tools and improves window title/id discovery for list_windows on recent macOS versions
+- ✅ **Screen Recording** - Required for screenshot capture and `extract_screen_text`; also improves window title/id discovery for list_windows on recent macOS versions
+
+`extract_screen_text` uses Apple Foundation Models with Vision-backed OCRTool as
+the primary text extraction path on macOS 27 when Apple Intelligence is
+available, and falls back to deterministic local Vision OCR on older or
+unsupported runtimes. The optional visual understanding modes reuse that same
+Foundation Models session and add no Core AI model assets as a default dependency.
 
 Clipboard text operations normally do not require extra permissions. Clipboard
 file and image operations use macOS pasteboard APIs and may prompt for security
diff --git a/server.py b/server.py
index 2d98536..dee1a7c 100644
--- a/server.py
+++ b/server.py
@@ -1056,6 +1056,41 @@ async def capture_active_screen(
     return screenshot.capture_active_screen(output_path)
 
 
+@mcp.tool()
+async def extract_screen_text(
+    output_path: str = Field(default=""),
+    recognition_level: str = Field(default="accurate"),
+    languages: str = Field(default=""),
+    include_boxes: bool = Field(default=True),
+    max_chars: int = Field(default=20000, ge=1, le=200000),
+    visual_understanding: str = Field(default="none"),
+) -> str:
+    """
+    Capture the display containing the frontmost app and extract visible text
+    on-device (Foundation Models OCRTool, else Vision OCR). Requires Screen Recording permission.
+
+    Args:
+        output_path: Optional file path for the captured PNG used for OCR
+        recognition_level: OCR mode, either "accurate" or "fast"
+        languages: Optional comma-separated recognition language identifiers
+        include_boxes: Include recognized text bounding boxes in the JSON output
+        max_chars: Maximum characters to return in the combined text field
+        visual_understanding: Optional macOS 27 extension mode: "none", "summary", or "ui_map"
+
+    Returns:
+        JSON string with OCR text, engine, line metadata, screenshot path, and
+        optional visual understanding metadata; or an "Error: ..." message.
+    """
+    return screenshot.extract_screen_text(
+        output_path,
+        recognition_level,
+        languages,
+        include_boxes,
+        max_chars,
+        visual_understanding,
+    )
+
+
 @mcp.tool()
 async def add_screen_glow() -> str:
     """
diff --git a/skills/altic-studio/SKILL.md b/skills/altic-studio/SKILL.md
index 36d8fd7..767dd51 100644
--- a/skills/altic-studio/SKILL.md
+++ b/skills/altic-studio/SKILL.md
@@ -13,9 +13,10 @@ license: Apache-2.0
 3. MCP file mode for safe Finder and filesystem operations
 4. MCP clipboard mode for text, file, and image pasteboard operations
 5. MCP window/workspace mode for arranging macOS apps and windows
+6. MCP screen OCR mode for extracting visible text from the active display
 
 It also includes Swift utility scripts for active-display screenshots, clipboard
-file/image operations, and window/workspace management on macOS.
+file/image operations, screen OCR, and window/workspace management on macOS.
 
 ## Mode A: AppleScript (macOS apps)
 
@@ -61,6 +62,7 @@ The full Altic automation surface is exposed as scripts under `skills/altic-stud
 - `turn-down-volume.applescript` - args: `[amount_0_to_100]`
 - `capture-screenshot.applescript` - args: `[output_path] [full|interactive|window]`
 - `capture-active-screen.swift` - args: `<output_path>` (captures full display containing frontmost app)
+- `extract-screen-text.swift` - args: `<output_path> [accurate|fast] [languages_csv] [include_boxes] [none|summary|ui_map]`
 - `clipboard.swift` - subcommands: `get-files`, `set-files <paths...>`, `save-image <output_path>`, `set-image <image_path>`
 - `window-manager.swift` - subcommands: `get_frontmost_app`, `list_windows`, `focus_window`, `move_window`, `resize_window`, `center_window`, `tile_windows`, `minimize`, `hide_app`, `quit_app`
 
@@ -70,6 +72,12 @@ Swift command template (for active-display screenshots):
 swift "skills/altic-studio/scripts/capture-active-screen.swift" "/tmp/active-screen.png"
 ```
 
+Swift command template (for active-display OCR):
+
+```bash
+swift "skills/altic-studio/scripts/extract-screen-text.swift" "/tmp/screen-text.png" "accurate" "" "true" "none"
+```
+
 Swift command template (for window management):
 
 ```bash
@@ -90,6 +98,7 @@ Use MCP tools for deterministic Chrome automation:
 - `chrome_close_session`
 - `chrome_list_sessions`
 - `capture_active_screen`
+- `extract_screen_text`
 
 Execution pattern:
 
@@ -98,7 +107,8 @@ Execution pattern:
 3. Interact with click and type actions.
 4. Verify state with extraction.
 5. Capture screenshots on checkpoints or failures.
-6. Close session.
+6. Use `extract_screen_text` when visible page/app text is needed from the active display rather than DOM extraction.
+7. Close session.
 
 ## Mode C: File Finder and File Operations (MCP)
 
@@ -197,6 +207,34 @@ Window workflow rules:
 - If a window mutation fails with an Accessibility error, tell the user to grant
   Accessibility permission to the host app running the MCP server.
 
+## Mode F: Screen OCR (MCP)
+
+Use MCP screen OCR when the user asks to read visible text from the current app,
+inspect text in a screenshot-like view, or extract text from a non-browser app.
+This tool returns JSON with combined text, OCR line metadata, the screenshot
+path used for recognition, and optional visual understanding metadata.
+
+Available tools:
+
+- `extract_screen_text` - args: `[output_path] [recognition_level=accurate|fast] [languages] [include_boxes] [max_chars] [visual_understanding=none|summary|ui_map]`
+
+Screen OCR workflow rules:
+
+- Prefer `chrome_extract` for browser DOM text when a Chrome CDP session is
+  already available; use `extract_screen_text` for rendered text, canvas text,
+  screenshots, PDFs, images, and non-browser apps.
+- Use `recognition_level="accurate"` by default; use `"fast"` only when speed
+  matters more than precision.
+- Set `languages` to a comma-separated list such as `en-US,fr-FR` only when the
+  expected language is known.
+- Keep `visual_understanding="none"` unless the user asks for higher-level
+  image or UI interpretation. On macOS 27 with Apple Intelligence available,
+  `extract_screen_text` uses Apple Foundation Models plus Vision-backed OCRTool
+  as the primary extraction path; on older or unsupported runtimes it falls back
+  to deterministic Vision OCR.
+- If OCR fails with a permission error, tell the user to grant Screen Recording
+  permission to the host app running the MCP server.
+
 ## Operational Rules
 
 - Validate date/time format before running reminder/calendar scripts.
@@ -211,6 +249,9 @@ Window workflow rules:
   confirmation.
 - For window mutations, verify with `list_windows` when the user needs
   confirmation.
+- Use `extract_screen_text` instead of manual screenshot inspection when the
+  task depends on visible text in an app or page that is not accessible through
+  Chrome DOM extraction.
 
 ## Permissions Checklist
 
@@ -220,6 +261,10 @@ Window workflow rules:
 - Automation permission for app control
 - Accessibility permission for system controls and window management
 - Screen Recording permission for screenshots and improved window discovery
+- Screen Recording permission for `extract_screen_text`; its FoundationModels
+  primary extraction path and optional visual understanding modes additionally
+  require macOS 27, Apple Intelligence availability, and a
+  FoundationModels/OCRTool-capable SDK/runtime
 - Safari setting: Allow JavaScript from Apple Events
 - Google Chrome installed for CDP tools
 - Full Disk Access for reading Messages database
diff --git a/skills/altic-studio/scripts/README.md b/skills/altic-studio/scripts/README.md
index bf42305..b74b38a 100644
--- a/skills/altic-studio/scripts/README.md
+++ b/skills/altic-studio/scripts/README.md
@@ -19,6 +19,7 @@ osascript "skills/altic-studio/scripts/create-calendar-event.applescript" "Team
 osascript "skills/altic-studio/scripts/navigate-safari.applescript" "https://example.com"
 osascript "skills/altic-studio/scripts/capture-screenshot.applescript" "/tmp/screen.png" "full"
 swift "skills/altic-studio/scripts/capture-active-screen.swift" "/tmp/active-screen.png"
+swift "skills/altic-studio/scripts/extract-screen-text.swift" "/tmp/screen-text.png" "accurate" "" "true" "none"
 swift "skills/altic-studio/scripts/clipboard.swift" get-files
 swift "skills/altic-studio/scripts/clipboard.swift" set-files "/Users/example/Desktop/report.pdf"
 swift "skills/altic-studio/scripts/clipboard.swift" save-image "/tmp/clipboard.png"
diff --git a/skills/altic-studio/scripts/extract-screen-text.swift b/skills/altic-studio/scripts/extract-screen-text.swift
new file mode 100644
index 0000000..2284990
--- /dev/null
+++ b/skills/altic-studio/scripts/extract-screen-text.swift
@@ -0,0 +1,405 @@
+#!/usr/bin/env swift
+
+import AppKit
+import Foundation
+import ScreenCaptureKit
+import Vision
+
+#if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+import FoundationModels
+import _Vision_FoundationModels
+#endif
+
+func area(_ rect: CGRect) -> CGFloat {
+    max(0, rect.width) * max(0, rect.height)
+}
+
+func fail(_ message: String, code: Int32 = 1) -> Never {
+    fputs("\(message)\n", stderr)
+    exit(code)
+}
+
+func displayForFrontmostApp(content: SCShareableContent) -> SCDisplay? {
+    guard let app = NSWorkspace.shared.frontmostApplication else {
+        return nil
+    }
+
+    let targetPID = app.processIdentifier
+    let appWindows = content.windows.filter { window in
+        window.owningApplication?.processID == targetPID
+    }
+
+    guard
+        let frontWindow = appWindows.max(by: { lhs, rhs in
+            area(lhs.frame) < area(rhs.frame)
+        })
+    else {
+        return nil
+    }
+
+    let targetRect = frontWindow.frame
+    return content.displays.max(by: { lhs, rhs in
+        area(lhs.frame.intersection(targetRect)) < area(rhs.frame.intersection(targetRect))
+    })
+}
+
+func captureDisplay(to outputPath: String, display: SCDisplay) async throws -> CGImage {
+    let filter = SCContentFilter(display: display, excludingWindows: [])
+    let config = SCStreamConfiguration()
+
+    let image = try await SCScreenshotManager.captureImage(
+        contentFilter: filter,
+        configuration: config
+    )
+
+    let bitmap = NSBitmapImageRep(cgImage: image)
+    guard let pngData = bitmap.representation(using: .png, properties: [:]) else {
+        throw NSError(
+            domain: "altic-mcp.extract-screen-text",
+            code: 2,
+            userInfo: [NSLocalizedDescriptionKey: "Could not encode screenshot as PNG."]
+        )
+    }
+
+    let outputURL = URL(fileURLWithPath: outputPath)
+    try FileManager.default.createDirectory(
+        at: outputURL.deletingLastPathComponent(),
+        withIntermediateDirectories: true
+    )
+    try pngData.write(to: outputURL)
+
+    return image
+}
+
+func recognizedLines(
+    in image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool
+) throws -> [[String: Any]] {
+    let request = VNRecognizeTextRequest()
+    request.recognitionLevel = recognitionLevel == "fast" ? .fast : .accurate
+    request.usesLanguageCorrection = true
+    if !languages.isEmpty {
+        request.recognitionLanguages = languages
+    }
+
+    let handler = VNImageRequestHandler(cgImage: image, options: [:])
+    try handler.perform([request])
+
+    let observations = request.results ?? []
+    let imageWidth = Double(image.width)
+    let imageHeight = Double(image.height)
+
+    let entries = observations.compactMap { observation -> (line: [String: Any], x: Double, y: Double)? in
+        guard let candidate = observation.topCandidates(1).first else {
+            return nil
+        }
+
+        let box = observation.boundingBox
+        let frame = [
+            "x": Double(box.minX) * imageWidth,
+            "y": (1.0 - Double(box.maxY)) * imageHeight,
+            "width": Double(box.width) * imageWidth,
+            "height": Double(box.height) * imageHeight,
+        ]
+        var line: [String: Any] = [
+            "text": candidate.string,
+            "confidence": Double(candidate.confidence),
+        ]
+
+        if includeBoxes {
+            line["frame"] = frame
+        }
+
+        return (line, frame["x"] ?? 0, frame["y"] ?? 0)
+    }.sorted { lhs, rhs in
+        if abs(lhs.y - rhs.y) > 4 {
+            return lhs.y < rhs.y
+        }
+        return lhs.x < rhs.x
+    }
+
+    return entries.map { $0.line }
+}
+
+struct ScreenTextExtraction {
+    let engine: String
+    let text: String
+    let visualUnderstanding: Any
+}
+
+func linesFromText(_ text: String) -> [[String: Any]] {
+    text
+        .split(separator: "\n", omittingEmptySubsequences: false)
+        .map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) }
+        .filter { !$0.isEmpty }
+        .map { line in
+            [
+                "text": line,
+                "confidence": 1.0,
+            ]
+        }
+}
+
+func visualUnderstandingUnavailablePayload(mode: String, reason: String) -> Any {
+    if mode == "none" {
+        return NSNull()
+    }
+
+    return [
+        "available": false,
+        "mode": mode,
+        "reason": reason,
+    ]
+}
+
+#if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+@available(macOS 27.0, *)
+func foundationModelsTextPrompt(image: CGImage) -> Prompt {
+    return Prompt {
+        """
+        Use OCRTool on the attached image labeled "screen". Return only the exact visible text \
+        from the screen, preserving line breaks where practical. Do not summarize or add commentary.
+        """
+        Attachment(image)
+            .label("screen")
+    }
+}
+
+@available(macOS 27.0, *)
+func foundationModelsVisualPrompt(for mode: String, image: CGImage, extractedText: String) -> Prompt {
+    let instruction: String
+    switch mode {
+    case "summary":
+        instruction = """
+        Use the attached image labeled "screen" and the OCR text below to return a concise summary \
+        of what the screen is showing. Preserve important labels, warnings, numbers, and button text.
+
+        OCR text:
+        \(extractedText)
+        """
+    case "ui_map":
+        instruction = """
+        Use the attached image labeled "screen" and the OCR text below to describe the visible UI \
+        structure as compact JSON with sections, controls, and important labels. Keep the response \
+        short and machine-readable.
+
+        OCR text:
+        \(extractedText)
+        """
+    default:
+        instruction = extractedText
+    }
+
+    return Prompt {
+        instruction
+        Attachment(image)
+            .label("screen")
+    }
+}
+
+@available(macOS 27.0, *)
+func extractWithFoundationModels(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? {
+    let model = SystemLanguageModel.default
+    guard model.isAvailable else {
+        return nil
+    }
+
+    let session = LanguageModelSession(
+        model: model,
+        tools: [
+            OCRTool(),
+        ],
+        instructions: """
+        You are a local screen text extraction engine. Prefer OCRTool whenever text \
+        is needed from the attached image. Return concise, faithful output.
+        """
+    )
+
+    let textResponse = try await session.respond(to: foundationModelsTextPrompt(image: image))
+    let extractedText = textResponse.content.trimmingCharacters(in: .whitespacesAndNewlines)
+    guard !extractedText.isEmpty else {
+        return nil
+    }
+
+    let visualUnderstanding: Any
+    if mode == "none" {
+        visualUnderstanding = NSNull()
+    } else {
+        let visualResponse = try await session.respond(
+            to: foundationModelsVisualPrompt(for: mode, image: image, extractedText: extractedText)
+        )
+        let visualContent = visualResponse.content.trimmingCharacters(in: .whitespacesAndNewlines)
+        visualUnderstanding = [
+            "available": true,
+            "engine": "foundation_models",
+            "mode": mode,
+            "content": visualContent,
+        ]
+    }
+
+    return ScreenTextExtraction(
+        engine: "foundation_models",
+        text: extractedText,
+        visualUnderstanding: visualUnderstanding
+    )
+}
+#endif
+
+func extractWithFoundationModelsIfAvailable(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? {
+    #if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+    if #available(macOS 27.0, *) {  // optional primary path; any failure below is reported, not fatal
+        do { return try await extractWithFoundationModels(in: image, mode: mode) }
+        catch { fputs("FoundationModels extraction failed; falling back to Vision OCR: \(error.localizedDescription)\n", stderr) }
+    }
+    #endif
+    return nil  // nil tells primaryExtraction to use the deterministic Vision OCR path
+}
+
+func visionExtraction(
+    in image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool,
+    visualUnderstanding: String
+) throws -> (ScreenTextExtraction, [[String: Any]]) {
+    let lines = try recognizedLines(
+        in: image,
+        recognitionLevel: recognitionLevel,
+        languages: languages,
+        includeBoxes: includeBoxes
+    )
+    let text = lines.compactMap { $0["text"] as? String }.joined(separator: "\n")
+    let extraction = ScreenTextExtraction(
+        engine: "vision",
+        text: text,
+        visualUnderstanding: visualUnderstandingUnavailablePayload(
+            mode: visualUnderstanding,
+            reason: "requires macOS 27 runtime, Apple Intelligence availability, and FoundationModels SDK"
+        )
+    )
+    return (extraction, lines)
+}
+
+func lineMetadata(
+    for extraction: ScreenTextExtraction,
+    image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool
+) -> [[String: Any]] {
+    if extraction.engine == "foundation_models" {
+        do {
+            return try recognizedLines(
+                in: image,
+                recognitionLevel: recognitionLevel,
+                languages: languages,
+                includeBoxes: includeBoxes
+            )
+        } catch {
+            return linesFromText(extraction.text)
+        }
+    }
+
+    return linesFromText(extraction.text)
+}
+
+func topLevelVisualUnderstanding(for extraction: ScreenTextExtraction, mode: String) -> Any {
+    if extraction.engine == "foundation_models" {
+        return extraction.visualUnderstanding
+    }
+    if mode == "none" {
+        return NSNull()
+    }
+    return extraction.visualUnderstanding
+}
+
+func primaryExtraction(
+    in image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool,
+    visualUnderstanding: String
+) async throws -> (ScreenTextExtraction, [[String: Any]]) {
+    if let foundationExtraction = try await extractWithFoundationModelsIfAvailable(in: image, mode: visualUnderstanding) {
+        let lines = lineMetadata(
+            for: foundationExtraction,
+            image: image,
+            recognitionLevel: recognitionLevel,
+            languages: languages,
+            includeBoxes: includeBoxes
+        )
+        return (foundationExtraction, lines.isEmpty ? linesFromText(foundationExtraction.text) : lines)
+    }
+
+    return try visionExtraction(
+        in: image,
+        recognitionLevel: recognitionLevel,
+        languages: languages,
+        includeBoxes: includeBoxes,
+        visualUnderstanding: visualUnderstanding
+    )
+}
+
+let args = CommandLine.arguments
+guard args.count >= 2 else {
+    fail("Usage: extract-screen-text.swift <output_path> [accurate|fast] [languages_csv] [include_boxes] [none|summary|ui_map]")
+}
+
+let outputPath = args[1]
+let recognitionLevel = args.count >= 3 ? args[2].lowercased() : "accurate"
+let languages = args.count >= 4
+    ? args[3].split(separator: ",").map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }.filter { !$0.isEmpty }
+    : []
+let includeBoxes = args.count >= 5 ? (args[4].lowercased() != "false") : true
+let visualUnderstanding = args.count >= 6 ? args[5].lowercased() : "none"
+
+guard ["accurate", "fast"].contains(recognitionLevel) else {
+    fail("recognition_level must be one of: accurate, fast")
+}
+guard ["none", "summary", "ui_map"].contains(visualUnderstanding) else {
+    fail("visual_understanding must be one of: none, summary, ui_map")
+}
+
+do {
+    let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
+    let display = displayForFrontmostApp(content: content) ?? content.displays.first
+    guard let display else {
+        fail("Could not determine a display to capture.")
+    }
+
+    let image = try await captureDisplay(to: outputPath, display: display)
+    let (extraction, lines) = try await primaryExtraction(
+        in: image,
+        recognitionLevel: recognitionLevel,
+        languages: languages,
+        includeBoxes: includeBoxes,
+        visualUnderstanding: visualUnderstanding
+    )
+
+    let payload: [String: Any] = [
+        "action": "extract_screen_text",
+        "engine": extraction.engine,
+        "source": "active_screen",
+        "screenshot_path": outputPath,
+        "image_size": [
+            "width": image.width,
+            "height": image.height,
+        ],
+        "recognition_level": recognitionLevel,
+        "text": extraction.text,
+        "length_chars": extraction.text.count,
+        "truncated": false,
+        "lines": lines,
+        "visual_understanding": topLevelVisualUnderstanding(for: extraction, mode: visualUnderstanding),
+    ]
+
+    let data = try JSONSerialization.data(withJSONObject: payload, options: [.prettyPrinted, .sortedKeys])
+    guard let json = String(data: data, encoding: .utf8) else {
+        fail("Could not encode OCR payload as UTF-8.")
+    }
+    print(json)
+} catch {
+    fail(error.localizedDescription)
+}
diff --git a/tests/test_screenshot.py b/tests/test_screenshot.py
new file mode 100644
index 0000000..6ddf969
--- /dev/null
+++ b/tests/test_screenshot.py
@@ -0,0 +1,158 @@
+import json
+import subprocess
+from pathlib import Path
+
+from tools import screenshot
+
+
+def read_json(value: str):
+    assert not value.startswith("Error:"), value
+    return json.loads(value)
+
+
+def completed(args, stdout='{"action":"ok"}', stderr="", returncode=0):
+    return subprocess.CompletedProcess(
+        args=args,
+        returncode=returncode,
+        stdout=stdout,
+        stderr=stderr,
+    )
+
+
+def screen_text_payload(text: str = "Hello screen"):
+    return {
+        "action": "extract_screen_text",
+        "source": "active_screen",
+        "screenshot_path": "/tmp/screen-text.png",
+        "image_size": {"width": 1200, "height": 800},
+        "recognition_level": "accurate",
+        "text": text,
+        "length_chars": len(text),
+        "truncated": False,
+        "lines": [
+            {
+                "text": "Hello screen",
+                "confidence": 0.98,
+                "frame": {"x": 10, "y": 20, "width": 200, "height": 30},
+            }
+        ],
+        "visual_understanding": None,
+    }
+
+
+def test_extract_screen_text_invokes_swift_helper(monkeypatch):
+    seen = {}
+
+    def fake_run(args, **kwargs):
+        seen["args"] = args
+        seen["kwargs"] = kwargs
+        return completed(args, stdout=json.dumps(screen_text_payload()))
+
+    monkeypatch.setattr(screenshot.subprocess, "run", fake_run)
+
+    payload = read_json(
+        screenshot.extract_screen_text(
+            output_path="/tmp/out.png",
+            recognition_level="fast",
+            languages="en-US,fr-FR",
+            include_boxes=False,
+            visual_understanding="summary",
+        )
+    )
+
+    assert payload["action"] == "extract_screen_text"
+    assert seen["args"][0] == "swift"
+    assert Path(seen["args"][1]).name == "extract-screen-text.swift"
+    assert seen["args"][2:] == [
+        "/tmp/out.png",
+        "fast",
+        "en-US,fr-FR",
+        "false",
+        "summary",
+    ]
+    assert seen["kwargs"]["timeout"] == 90
+
+
+def test_extract_screen_text_uses_default_temp_screenshot_path(monkeypatch):
+    seen = {}
+
+    def fake_run(args, **kwargs):
+        seen["target"] = args[2]
+        return completed(args, stdout=json.dumps(screen_text_payload()))
+
+    monkeypatch.setattr(screenshot.subprocess, "run", fake_run)
+
+    read_json(screenshot.extract_screen_text())
+
+    target = Path(seen["target"])
+    assert target.parent == Path("/tmp/altic-mcp-screenshots")
+    assert target.name.startswith("screen-text-")
+    assert target.suffix == ".png"
+
+
+def test_extract_screen_text_rejects_invalid_recognition_level(monkeypatch):
+    def fail_run(*args, **kwargs):
+        raise AssertionError("subprocess should not run for invalid recognition level")
+
+    monkeypatch.setattr(screenshot.subprocess, "run", fail_run)
+
+    result = screenshot.extract_screen_text(recognition_level="balanced")
+
+    assert result.startswith("Error:")
+    assert "recognition_level must be one of" in result
+
+
+def test_extract_screen_text_rejects_invalid_visual_understanding(monkeypatch):
+    def fail_run(*args, **kwargs):
+        raise AssertionError("subprocess should not run for invalid visual understanding")
+
+    monkeypatch.setattr(screenshot.subprocess, "run", fail_run)
+
+    result = screenshot.extract_screen_text(visual_understanding="describe_everything")
+
+    assert result.startswith("Error:")
+    assert "visual_understanding must be one of" in result
+
+
+def test_extract_screen_text_invalid_swift_json_returns_error(monkeypatch):
+    def fake_run(args, **kwargs):
+        return completed(args, stdout="not-json")
+
+    monkeypatch.setattr(screenshot.subprocess, "run", fake_run)
+
+    result = screenshot.extract_screen_text()
+
+    assert result.startswith("Error: invalid screen text response:")
+
+
+def test_extract_screen_text_truncates_long_text(monkeypatch):
+    long_text = "abcdefghijklmnopqrstuvwxyz"
+
+    def fake_run(args, **kwargs):
+        return completed(args, stdout=json.dumps(screen_text_payload(long_text)))
+
+    monkeypatch.setattr(screenshot.subprocess, "run", fake_run)
+
+    payload = read_json(screenshot.extract_screen_text(max_chars=10))
+
+    assert payload["text"] == "abcdefghij"
+    assert payload["length_chars"] == len(long_text)
+    assert payload["truncated"] is True
+
+
+def test_server_exposes_extract_screen_text_tool():
+    import server
+
+    tool_names = set(server.mcp._tool_manager._tools)
+
+    assert "extract_screen_text" in tool_names
+
+
+def test_swift_helper_contains_macos_27_foundation_models_primary_path():
+    repo_root = Path(__file__).resolve().parents[1]  # independent of pytest's cwd
+    script = (repo_root / "tools/scripts/extract-screen-text.swift").read_text(encoding="utf-8")
+    assert "LanguageModelSession" in script
+    assert "SystemLanguageModel.default" in script
+    assert "OCRTool()" in script
+    assert "Attachment(image)" in script
+    assert "FoundationModels image-input OCRTool integration is gated" not in script
diff --git a/tools/screenshot.py b/tools/screenshot.py
index 6d422f7..892901c 100644
--- a/tools/screenshot.py
+++ b/tools/screenshot.py
@@ -1,3 +1,4 @@
+import json
 import subprocess
 import tempfile
 import time
@@ -8,6 +9,18 @@
 from .constants import SCRIPTS_PREFIX
 
 
+def _json(payload: dict) -> str:
+    return json.dumps(payload, indent=2, sort_keys=True)
+
+
+def _error(message: str) -> str:
+    return f"Error: {message}"
+
+
+def _screen_text_script() -> str:
+    return str(SCRIPTS_PREFIX / "extract-screen-text.swift")
+
+
 def capture_active_screen(output_path: str = "") -> str | list[object]:
     script_path = SCRIPTS_PREFIX / "capture-active-screen.swift"
 
@@ -40,3 +53,77 @@ def capture_active_screen(output_path: str = "") -> str | list[object]:
         return [f"Captured active screen: {saved_path}", Image(path=saved_path)]
     except Exception as e:
         return f"Error: Failed to capture active screen: {str(e)}"
+
+
+def extract_screen_text(
+    output_path: str = "",
+    recognition_level: str = "accurate",
+    languages: str = "",
+    include_boxes: bool = True,
+    max_chars: int = 20000,
+    visual_understanding: str = "none",
+) -> str:
+    """
+    Capture the active display and extract its visible text via the Swift helper.
+
+    Returns the helper's JSON with "text" cut to max_chars (clamped to 1..200000)
+    and "length_chars"/"truncated" describing the cut, or an "Error: ..." string.
+    """
+    valid_levels = {"accurate", "fast"}
+    valid_visual_modes = {"none", "summary", "ui_map"}
+
+    recognition_level = recognition_level.strip().lower()
+    visual_understanding = visual_understanding.strip().lower()
+
+    if recognition_level not in valid_levels:
+        return _error("recognition_level must be one of: accurate, fast")
+    if visual_understanding not in valid_visual_modes:
+        return _error("visual_understanding must be one of: none, summary, ui_map")
+
+    try:
+        max_chars = max(1, min(max_chars, 200000))
+        target_path = output_path.strip()
+        if not target_path:
+            timestamp = int(time.time())
+            shots_dir = Path("/tmp") / "altic-mcp-screenshots"
+            shots_dir.mkdir(parents=True, exist_ok=True)
+            target_path = str(shots_dir / f"screen-text-{timestamp}.png")
+
+        target = Path(target_path).expanduser()
+        target.parent.mkdir(parents=True, exist_ok=True)
+
+        result = subprocess.run(
+            [
+                "swift",
+                _screen_text_script(),
+                str(target),
+                recognition_level,
+                languages,
+                str(include_boxes).lower(),
+                visual_understanding,
+            ],
+            capture_output=True,
+            text=True,
+            timeout=90,
+        )
+
+        if result.returncode != 0:
+            return _error(result.stderr.strip() or "unable to extract screen text")
+
+        try:
+            payload = json.loads(result.stdout or "{}")
+        except json.JSONDecodeError as exc:
+            return _error(f"invalid screen text response: {exc}")
+        # Valid JSON that is not an object (list, string, number) is still a bad response.
+        if not isinstance(payload, dict):
+            return _error("invalid screen text response: expected a JSON object")
+
+        text = str(payload.get("text") or "")
+        truncated_text = text[:max_chars]
+        payload["text"] = truncated_text
+        payload["length_chars"] = len(text)
+        payload["truncated"] = len(text) > len(truncated_text)
+
+        return _json(payload)
+    except Exception as exc:
+        return _error(f"failed to extract screen text: {exc}")
diff --git a/tools/scripts/extract-screen-text.swift b/tools/scripts/extract-screen-text.swift
new file mode 100644
index 0000000..2284990
--- /dev/null
+++ b/tools/scripts/extract-screen-text.swift
@@ -0,0 +1,405 @@
+#!/usr/bin/env swift
+
+import AppKit
+import Foundation
+import ScreenCaptureKit
+import Vision
+
+#if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+import FoundationModels
+import _Vision_FoundationModels
+#endif
+
+/// Returns the area of `rect`, with width and height each clamped to be non-negative.
+/// Used in this file to rank windows and display overlaps by size.
+func area(_ rect: CGRect) -> CGFloat { max(0, rect.width) * max(0, rect.height) }
+
+func fail(_ message: String, code: Int32 = 1) -> Never {  // Report a fatal error and terminate; never returns.
+    fputs("\(message)\n", stderr)  // the message goes to stderr, newline-terminated
+    exit(code)  // `code` becomes the process exit status (defaults to 1)
+}
+
+/// Picks the display that shows most of the frontmost application's largest window.
+///
+/// - Parameter content: Shareable content supplying the candidate windows and displays.
+/// - Returns: The display whose frame overlaps that window the most, or `nil` when
+///   there is no frontmost application or it owns no window in `content`.
+func displayForFrontmostApp(content: SCShareableContent) -> SCDisplay? {
+    guard let frontmostPID = NSWorkspace.shared.frontmostApplication?.processIdentifier else {
+        return nil
+    }
+
+    // The application's largest window stands in for its main window.
+    let largestWindow = content.windows
+        .filter { $0.owningApplication?.processID == frontmostPID }
+        .max { area($0.frame) < area($1.frame) }
+    guard let windowFrame = largestWindow?.frame else {
+        return nil
+    }
+
+    // Rank displays by how much of the window each one covers.
+    return content.displays.max {
+        area($0.frame.intersection(windowFrame)) < area($1.frame.intersection(windowFrame))
+    }
+}
+
+/// Captures `display` at its native pixel size, saves it to `outputPath` as PNG, and returns it.
+///
+/// - Throws: Capture or file-system errors, or an `NSError` (code 2) if PNG encoding fails.
+func captureDisplay(to outputPath: String, display: SCDisplay) async throws -> CGImage {
+    let filter = SCContentFilter(display: display, excludingWindows: [])
+    let config = SCStreamConfiguration()
+    // A fresh SCStreamConfiguration targets a fixed 1920x1080 output. Size it to the display's
+    // real pixel dimensions so OCR sees unscaled glyphs and reported frames map to true pixels.
+    let scale = CGFloat(filter.pointPixelScale)
+    config.width = Int((filter.contentRect.width * scale).rounded())
+    config.height = Int((filter.contentRect.height * scale).rounded())
+
+    let image = try await SCScreenshotManager.captureImage(contentFilter: filter, configuration: config)
+    guard let pngData = NSBitmapImageRep(cgImage: image).representation(using: .png, properties: [:]) else {
+        throw NSError(
+            domain: "altic-mcp.extract-screen-text",
+            code: 2,
+            userInfo: [NSLocalizedDescriptionKey: "Could not encode screenshot as PNG."]
+        )
+    }
+
+    let outputURL = URL(fileURLWithPath: outputPath)
+    let directory = outputURL.deletingLastPathComponent()
+    try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
+    try pngData.write(to: outputURL)
+    return image
+}
+
+/// Runs Vision text recognition on `image` and returns one dictionary per recognized
+/// line, ordered top-to-bottom and then left-to-right within a row.
+///
+/// - Parameters:
+///   - recognitionLevel: `"fast"` selects `.fast`; any other value selects `.accurate`.
+///   - languages: Recognition languages; an empty array leaves the request's own setting.
+///   - includeBoxes: When true, each line carries a `"frame"` in top-left-origin pixels.
+/// - Returns: Dictionaries with `"text"`, `"confidence"`, and optionally `"frame"`.
+/// - Throws: Any error raised while performing the Vision request.
+func recognizedLines(
+    in image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool
+) throws -> [[String: Any]] {
+    typealias Entry = (line: [String: Any], x: Double, y: Double)
+    let request = VNRecognizeTextRequest()
+    request.recognitionLevel = recognitionLevel == "fast" ? .fast : .accurate
+    request.usesLanguageCorrection = true
+    if !languages.isEmpty {
+        request.recognitionLanguages = languages
+    }
+    try VNImageRequestHandler(cgImage: image, options: [:]).perform([request])
+
+    let imageWidth = Double(image.width)
+    let imageHeight = Double(image.height)
+    let entries = (request.results ?? []).compactMap { observation -> Entry? in
+        guard let candidate = observation.topCandidates(1).first else { return nil }
+        // Vision boxes are normalized with a bottom-left origin; flip Y into pixel space.
+        let box = observation.boundingBox
+        let x = Double(box.minX) * imageWidth
+        let y = (1.0 - Double(box.maxY)) * imageHeight
+        var line: [String: Any] = ["text": candidate.string, "confidence": Double(candidate.confidence)]
+        if includeBoxes {
+            line["frame"] = ["x": x, "y": y, "width": Double(box.width) * imageWidth, "height": Double(box.height) * imageHeight]
+        }
+        return (line, x, y)
+    }
+
+    // Group entries into visual rows (4 px tolerance) before ordering by x. A pairwise comparison
+    // with a Y tolerance is not a strict weak ordering, which `sorted` requires of its predicate.
+    var rows: [[Entry]] = []
+    for entry in entries.sorted(by: { $0.y < $1.y }) {
+        if let anchor = rows.last?.first, entry.y - anchor.y <= 4 {
+            rows[rows.count - 1].append(entry)
+        } else {
+            rows.append([entry])
+        }
+    }
+    return rows.flatMap { row in row.sorted(by: { $0.x < $1.x }).map { $0.line } }
+}
+
+struct ScreenTextExtraction {  // Result of one extraction pass, whichever engine produced it.
+    let engine: String  // "foundation_models" or "vision" in this file
+    let text: String  // the full extracted text
+    let visualUnderstanding: Any  // NSNull, or a dictionary describing the visual analysis or why it is unavailable
+}
+
+/// Splits `text` into line dictionaries, one per non-blank line.
+///
+/// Lines are broken on any newline character. Splitting on "\n" alone would miss CRLF,
+/// which Swift treats as a single `Character` distinct from "\n".
+/// - Returns: Trimmed lines as `["text": ..., "confidence": 1.0]`, in original order.
+func linesFromText(_ text: String) -> [[String: Any]] {
+    text
+        .split(whereSeparator: \.isNewline)
+        .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
+        .filter { !$0.isEmpty }
+        .map { ["text": $0, "confidence": 1.0] }
+}
+
+/// Builds the visual-understanding value used when no visual analysis was produced.
+///
+/// - Returns: `NSNull` when `mode` is `"none"`; otherwise a dictionary holding
+///   `available: false`, the requested `mode`, and the `reason`.
+func visualUnderstandingUnavailablePayload(mode: String, reason: String) -> Any {
+    guard mode != "none" else {
+        return NSNull()
+    }
+    let payload: [String: Any] = ["available": false, "mode": mode, "reason": reason]
+    return payload
+}
+
+#if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+@available(macOS 27.0, *)
+func foundationModelsTextPrompt(image: CGImage) -> Prompt {  // Builds the prompt that asks for a verbatim transcription of `image`.
+    return Prompt {
+        """
+        Use OCRTool on the attached image labeled "screen". Return only the exact visible text \
+        from the screen, preserving line breaks where practical. Do not summarize or add commentary.
+        """
+        Attachment(image)
+            .label("screen")  // same label the instruction text above refers to
+    }
+}
+
+@available(macOS 27.0, *)
+func foundationModelsVisualPrompt(for mode: String, image: CGImage, extractedText: String) -> Prompt {  // Builds the follow-up prompt for `mode`, embedding the OCR text.
+    let instruction: String
+    switch mode {
+    case "summary":  // concise prose summary of the screen
+        instruction = """
+        Use the attached image labeled "screen" and the OCR text below to return a concise summary \
+        of what the screen is showing. Preserve important labels, warnings, numbers, and button text.
+
+        OCR text:
+        \(extractedText)
+        """
+    case "ui_map":  // compact JSON description of the visible UI structure
+        instruction = """
+        Use the attached image labeled "screen" and the OCR text below to describe the visible UI \
+        structure as compact JSON with sections, controls, and important labels. Keep the response \
+        short and machine-readable.
+
+        OCR text:
+        \(extractedText)
+        """
+    default:  // any other mode: the OCR text alone becomes the instruction
+        instruction = extractedText
+    }
+
+    return Prompt {  // instruction text followed by the screenshot labeled "screen"
+        instruction
+        Attachment(image)
+            .label("screen")
+    }
+}
+
+@available(macOS 27.0, *)
+func extractWithFoundationModels(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? {  // Extracts text (and optional visual analysis) via the system language model.
+    let model = SystemLanguageModel.default
+    guard model.isAvailable else {  // model unavailable: report "no result" rather than an error
+        return nil
+    }
+
+    let session = LanguageModelSession(  // one session serves both the text and the visual request
+        model: model,
+        tools: [
+            OCRTool(),
+        ],
+        instructions: """
+        You are a local screen text extraction engine. Prefer OCRTool whenever text \
+        is needed from the attached image. Return concise, faithful output.
+        """
+    )
+
+    let textResponse = try await session.respond(to: foundationModelsTextPrompt(image: image))
+    let extractedText = textResponse.content.trimmingCharacters(in: .whitespacesAndNewlines)
+    guard !extractedText.isEmpty else {  // an empty transcription is also treated as "no result"
+        return nil
+    }
+
+    let visualUnderstanding: Any
+    if mode == "none" {  // visual analysis not requested
+        visualUnderstanding = NSNull()
+    } else {  // second request on the same session, seeded with the text extracted above
+        let visualResponse = try await session.respond(
+            to: foundationModelsVisualPrompt(for: mode, image: image, extractedText: extractedText)
+        )
+        let visualContent = visualResponse.content.trimmingCharacters(in: .whitespacesAndNewlines)
+        visualUnderstanding = [
+            "available": true,
+            "engine": "foundation_models",
+            "mode": mode,
+            "content": visualContent,
+        ]
+    }
+
+    return ScreenTextExtraction(
+        engine: "foundation_models",
+        text: extractedText,
+        visualUnderstanding: visualUnderstanding
+    )
+}
+#endif
+
+func extractWithFoundationModelsIfAvailable(in image: CGImage, mode: String) async throws -> ScreenTextExtraction? {  // Availability gate around extractWithFoundationModels.
+    #if canImport(FoundationModels) && canImport(_Vision_FoundationModels)
+    if #available(macOS 27.0, *) {  // run-time check; the surrounding #if is the compile-time check
+        return try await extractWithFoundationModels(in: image, mode: mode)
+    }
+    #endif
+
+    return nil  // modules missing at build time or OS older than macOS 27: no result
+}
+
+/// Extracts screen text using Vision OCR only.
+///
+/// - Returns: An extraction tagged `"vision"` whose text is the recognized lines joined by
+///   newlines and whose visual understanding is an "unavailable" payload, plus those lines.
+func visionExtraction(
+    in image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool,
+    visualUnderstanding: String
+) throws -> (ScreenTextExtraction, [[String: Any]]) {
+    let ocrLines = try recognizedLines(
+        in: image,
+        recognitionLevel: recognitionLevel,
+        languages: languages,
+        includeBoxes: includeBoxes
+    )
+    let joinedText = ocrLines.compactMap { $0["text"] as? String }.joined(separator: "\n")
+    let unavailable = visualUnderstandingUnavailablePayload(
+        mode: visualUnderstanding,
+        reason: "requires macOS 27 runtime, Apple Intelligence availability, and FoundationModels SDK"
+    )
+    return (ScreenTextExtraction(engine: "vision", text: joinedText, visualUnderstanding: unavailable), ocrLines)
+}
+
+/// Produces the per-line metadata reported alongside an extraction.
+///
+/// Foundation Models results are re-scanned with Vision to obtain per-line data; any
+/// other engine, or a failed re-scan, falls back to lines split from the extracted text.
+func lineMetadata(
+    for extraction: ScreenTextExtraction,
+    image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool
+) -> [[String: Any]] {
+    guard extraction.engine == "foundation_models" else {
+        return linesFromText(extraction.text)
+    }
+    let rescanned = try? recognizedLines(
+        in: image,
+        recognitionLevel: recognitionLevel,
+        languages: languages,
+        includeBoxes: includeBoxes
+    )
+    return rescanned ?? linesFromText(extraction.text)
+}
+
+/// Chooses the visual-understanding value to report for `extraction`.
+///
+/// An extraction from any engine other than `"foundation_models"` with mode `"none"`
+/// reports `NSNull`; every other case forwards the extraction's own value.
+func topLevelVisualUnderstanding(for extraction: ScreenTextExtraction, mode: String) -> Any {
+    let isFoundationModels = extraction.engine == "foundation_models"
+    let suppress = !isFoundationModels && mode == "none"
+    return suppress ? NSNull() as Any : extraction.visualUnderstanding
+}
+
+/// Extracts screen text with Foundation Models when possible, otherwise with Vision OCR.
+/// A Foundation Models error is reported on stderr and triggers the Vision fallback.
+func primaryExtraction(
+    in image: CGImage,
+    recognitionLevel: String,
+    languages: [String],
+    includeBoxes: Bool,
+    visualUnderstanding: String
+) async throws -> (ScreenTextExtraction, [[String: Any]]) {
+    do {
+        if let foundationExtraction = try await extractWithFoundationModelsIfAvailable(in: image, mode: visualUnderstanding) {
+            let lines = lineMetadata(
+                for: foundationExtraction, image: image, recognitionLevel: recognitionLevel,
+                languages: languages, includeBoxes: includeBoxes
+            )
+            return (foundationExtraction, lines.isEmpty ? linesFromText(foundationExtraction.text) : lines)
+        }
+    } catch {
+        fputs("Foundation Models extraction failed, using Vision OCR: \(error.localizedDescription)\n", stderr)
+    }
+
+    return try visionExtraction(
+        in: image, recognitionLevel: recognitionLevel, languages: languages,
+        includeBoxes: includeBoxes, visualUnderstanding: visualUnderstanding
+    )
+}
+
+// Entry point: capture the active screen, extract its text, and print a JSON payload on stdout.
+let args = CommandLine.arguments
+if args.count < 2 {
+    fail("Usage: extract-screen-text.swift <output_path> [accurate|fast] [languages_csv] [include_boxes] [none|summary|ui_map]")
+}
+
+let outputPath = args[1]
+let recognitionLevel = args.count > 2 ? args[2].lowercased() : "accurate"
+let languages: [String] = args.count > 3
+    ? args[3].split(separator: ",")
+        .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
+        .filter { !$0.isEmpty }
+    : []
+// Boxes stay enabled unless the argument is literally "false" (case-insensitive).
+let includeBoxes = args.count <= 4 || args[4].lowercased() != "false"
+let visualUnderstanding = args.count > 5 ? args[5].lowercased() : "none"
+
+if !["accurate", "fast"].contains(recognitionLevel) {
+    fail("recognition_level must be one of: accurate, fast")
+}
+if !["none", "summary", "ui_map"].contains(visualUnderstanding) {
+    fail("visual_understanding must be one of: none, summary, ui_map")
+}
+
+do {
+    let content = try await SCShareableContent.excludingDesktopWindows(false, onScreenWindowsOnly: true)
+    guard let display = displayForFrontmostApp(content: content) ?? content.displays.first else {
+        fail("Could not determine a display to capture.")
+    }
+
+    let image = try await captureDisplay(to: outputPath, display: display)
+    let (extraction, lines) = try await primaryExtraction(
+        in: image,
+        recognitionLevel: recognitionLevel,
+        languages: languages,
+        includeBoxes: includeBoxes,
+        visualUnderstanding: visualUnderstanding
+    )
+
+    let imageSize = ["width": image.width, "height": image.height]
+    let payload: [String: Any] = [
+        "action": "extract_screen_text",
+        "engine": extraction.engine,
+        "source": "active_screen",
+        "screenshot_path": outputPath,
+        "image_size": imageSize,
+        "recognition_level": recognitionLevel,
+        "text": extraction.text,
+        "length_chars": extraction.text.count,
+        "truncated": false,
+        "lines": lines,
+        "visual_understanding": topLevelVisualUnderstanding(for: extraction, mode: visualUnderstanding),
+    ]
+    let data = try JSONSerialization.data(withJSONObject: payload, options: [.prettyPrinted, .sortedKeys])
+    guard let json = String(data: data, encoding: .utf8) else {
+        fail("Could not encode OCR payload as UTF-8.")
+    }
+    print(json)
+} catch {
+    fail(error.localizedDescription)
+}