From 380d9af2fd8a682f3f35faf48ae8805f97411225 Mon Sep 17 00:00:00 2001
From: Tom di Mino
Date: Mon, 23 Feb 2026 17:31:21 -0500
Subject: [PATCH] docs: add Claude Code skill for desktop GUI automation

Add a community-contributed Claude Code skill that wraps Open
Interpreter's Computer API (pyautogui, pytesseract) for desktop GUI
automation. Provides standalone scripts for screenshot capture, mouse
clicking (by coordinates or OCR text), keyboard input, and screen text
detection.

Three integration modes:
- Library: Claude Code reasons from screenshots, dispatches actions via scripts
- OS subprocess: delegates entire GUI tasks to OI's --os agent loop
- Local agent: offline computer use via Ollama

No changes to Open Interpreter's source code or package.

Co-Authored-By: Claude Opus 4.6
---
 .claude/skills/open-interpreter/README.md     | 193 +++++++++++++++
 .claude/skills/open-interpreter/SKILL.md      | 219 ++++++++++++++++++
 .../references/computer-api.md                | 115 +++++++++
 .../open-interpreter/references/os-mode.md    |  99 ++++++++
 .../references/safety-and-permissions.md      | 107 +++++++++
 .../open-interpreter/scripts/oi_click.py      | 159 +++++++++++++
 .../open-interpreter/scripts/oi_computer.py   |  87 +++++++
 .../open-interpreter/scripts/oi_find_text.py  | 181 +++++++++++++++
 .../open-interpreter/scripts/oi_install.sh    |  58 +++++
 .../open-interpreter/scripts/oi_os_mode.py    | 171 ++++++++++++++
 .../scripts/oi_permission_check.py            | 140 +++++++++++
 .../open-interpreter/scripts/oi_screenshot.py | 186 +++++++++++++++
 .../open-interpreter/scripts/oi_type.py       | 177 ++++++++++++++
 13 files changed, 1892 insertions(+)
 create mode 100644 .claude/skills/open-interpreter/README.md
 create mode 100644 .claude/skills/open-interpreter/SKILL.md
 create mode 100644 .claude/skills/open-interpreter/references/computer-api.md
 create mode 100644 .claude/skills/open-interpreter/references/os-mode.md
 create mode 100644 .claude/skills/open-interpreter/references/safety-and-permissions.md
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_click.py
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_computer.py
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_find_text.py
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_install.sh
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_os_mode.py
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_permission_check.py
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_screenshot.py
 create mode 100755 .claude/skills/open-interpreter/scripts/oi_type.py

diff --git a/.claude/skills/open-interpreter/README.md b/.claude/skills/open-interpreter/README.md
new file mode 100644
index 0000000000..6118cad2fb
--- /dev/null
+++ b/.claude/skills/open-interpreter/README.md
@@ -0,0 +1,193 @@
# open-interpreter — Claude Code Skill

A [Claude Code skill](https://code.claude.com/docs/en/skills) for desktop GUI automation, built on top of Open Interpreter's Computer API. Provides mouse, keyboard, screenshot, and OCR control for native macOS/Linux applications that have no CLI or API.

## What is this?

[Claude Code](https://github.com/anthropics/claude-code) is Anthropic's terminal-based AI coding tool. It reads `.claude/skills/` directories for specialized capabilities. This skill gives Claude Code the ability to interact with desktop GUIs by wrapping Open Interpreter's pyautogui + pytesseract primitives in standalone scripts.
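Because each primitive is an ordinary script, it can be smoke-tested straight from a shell before Claude Code ever drives it. A minimal check (output values illustrative, run from the repository root):

```bash
# Report pyautogui's logical screen size as JSON
python3 .claude/skills/open-interpreter/scripts/oi_computer.py screen-size
# {"width": 1512, "height": 982}
```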
+ +## When to Use + +- Interacting with desktop apps (System Preferences, Calculator, browsers, any GUI) +- Automating GUI workflows (form filling, menu navigation, data extraction) +- Reading screen content via OCR (finding buttons, labels, prices, status text) +- Controlling mouse and keyboard programmatically + +## Modes + +| Mode | LLM | Script | Best For | +|------|-----|--------|----------| +| **Library** | Claude Code (native) | Individual scripts | Surgical GUI actions — Claude sees screenshots, reasons, dispatches | +| **OS subprocess** | Claude API (via OI) | `oi_os_mode.py` | Delegating entire GUI tasks to OI's agent loop | +| **Local agent** | Ollama (offline) | `oi_os_mode.py --local` | Offline computer use, no API costs | + +Use Library mode by default. OS subprocess for self-contained GUI tasks. Local agent when offline. + +## Prerequisites + +- Python 3.10+ +- [uv](https://github.com/astral-sh/uv) package manager +- macOS: Accessibility + Screen Recording permissions for terminal app +- tesseract (`brew install tesseract`) + +## Installation + +To use this skill, copy the folder into your Claude Code skills directory: + +```bash +cp -r .claude/skills/open-interpreter ~/.claude/skills/open-interpreter +``` + +Then run the install script: + +```bash +~/.claude/skills/open-interpreter/scripts/oi_install.sh +``` + +Verify permissions: + +```bash +python3 ~/.claude/skills/open-interpreter/scripts/oi_permission_check.py +``` + +## Directory Structure + +``` +open-interpreter/ +├── SKILL.md # Skill instructions for Claude Code +├── README.md # This file +├── scripts/ +│ ├── oi_install.sh # One-shot install + permissions check +│ ├── oi_screenshot.py # Screen capture with Retina metadata +│ ├── oi_click.py # Mouse click by coordinates or OCR text +│ ├── oi_type.py # Keyboard input, hotkeys, key presses +│ ├── oi_find_text.py # OCR: find text on screen → JSON coords +│ ├── oi_computer.py # Unified dispatch for all actions +│ ├── oi_os_mode.py # Launch OI as managed subprocess +│ └── oi_permission_check.py # Check macOS permissions +└── references/ + ├── computer-api.md # OI Computer API reference + ├── os-mode.md # OS Mode usage and architecture + └── safety-and-permissions.md # Permissions guide and safety model +``` + +## Scripts + +### oi_screenshot.py — Screen capture + +```bash +python3 scripts/oi_screenshot.py # Full screen +python3 scripts/oi_screenshot.py --region 0,0,800,600 # Region +python3 scripts/oi_screenshot.py --active-window # Active window only +``` + +Outputs file path + `SCALE_FACTOR` + `SCREEN_SIZE` metadata (3 lines to stdout). 
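Because the format is fixed at three lines, the metadata is easy to consume from a shell. A minimal sketch (variable names arbitrary):

```bash
# Capture once, then split the three output lines into variables
meta=$(python3 scripts/oi_screenshot.py)
img_path=$(echo "$meta" | sed -n '1p')
scale=$(echo "$meta" | sed -n '2p' | cut -d= -f2)
screen=$(echo "$meta" | sed -n '3p' | cut -d= -f2)
echo "captured $img_path at ${scale}x (screen ${screen})"
```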
+ +### oi_click.py — Mouse click + +```bash +python3 scripts/oi_click.py --x 450 --y 300 # Coordinate click +python3 scripts/oi_click.py --x 900 --y 600 --image-coords # Auto-divide by Retina scale +python3 scripts/oi_click.py --text "Submit" # OCR: find and click text +python3 scripts/oi_click.py --x 450 --y 300 --double # Double click +python3 scripts/oi_click.py --x 450 --y 300 --right # Right click +``` + +### oi_type.py — Keyboard input + +```bash +python3 scripts/oi_type.py --text "hello world" # Clipboard-paste (default) +python3 scripts/oi_type.py --key enter # Single key press +python3 scripts/oi_type.py --hotkey command space # Hotkey (AppleScript on macOS) +python3 scripts/oi_type.py --text "search" --method typewrite # Character-by-character +``` + +### oi_find_text.py — OCR screen reading + +```bash +python3 scripts/oi_find_text.py --text "Submit" +python3 scripts/oi_find_text.py --text "Price" --all --min-conf 80 +``` + +Returns JSON: `[{"text": "Submit", "x": 450, "y": 300, "w": 80, "h": 24, "confidence": 95}]` + +### oi_computer.py — Unified dispatch + +```bash +python3 scripts/oi_computer.py screenshot +python3 scripts/oi_computer.py click --x 450 --y 300 +python3 scripts/oi_computer.py type --text "hello" +python3 scripts/oi_computer.py find --text "Submit" +python3 scripts/oi_computer.py scroll --clicks 3 +python3 scripts/oi_computer.py mouse-position +python3 scripts/oi_computer.py screen-size +``` + +### oi_os_mode.py — Delegate full GUI tasks + +```bash +python3 scripts/oi_os_mode.py "Open Calculator and compute 2+2" +python3 scripts/oi_os_mode.py --local "What apps are open?" # Ollama (offline) +``` + +## Quick Examples + +### Open an app via Spotlight + +```bash +python3 scripts/oi_type.py --hotkey command space +sleep 0.5 +python3 scripts/oi_type.py --text "Calculator" +sleep 0.3 +python3 scripts/oi_type.py --key enter +``` + +### Click a button by label + +```bash +python3 scripts/oi_click.py --text "Save" +``` + +### Read text from screen + +```bash +python3 scripts/oi_find_text.py --text "Total" --all +``` + +### Fill a form + +```bash +python3 scripts/oi_click.py --text "Email" +python3 scripts/oi_type.py --text "user@example.com" +python3 scripts/oi_type.py --key tab +python3 scripts/oi_type.py --text "password123" +``` + +## Retina Display Handling + +macOS Retina displays render at 2x scaling. Screenshot image pixels differ from pyautogui screen coordinates. Use `--image-coords` on `oi_click.py` to auto-divide coordinates by the scale factor when targeting positions from screenshot pixels. + +## Safety + +1. Confirm with user before clicking Send, Delete, Submit, or Confirm buttons +2. Screenshot before and after every action for verification +3. No unbounded autonomous loops +4. pyautogui failsafe: moving mouse to screen corner raises exception +5. 
Every script logs actions to stderr: `[oi] click at (450, 300) button=left` + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| Black screenshot | Grant Screen Recording permission to terminal app | +| Click/type no effect | Grant Accessibility permission to terminal app | +| OCR finds no text | Verify tesseract: `which tesseract && tesseract --version` | +| Coordinates off by 2x | Use `--image-coords` flag on `oi_click.py` | +| OS Mode hangs | Verify `ANTHROPIC_API_KEY` is set | +| Local mode fails | Verify Ollama running: `ollama list` | + +## Credits + +- [OpenInterpreter](https://github.com/OpenInterpreter/open-interpreter) by Killian Lucas — the foundation this skill builds on +- [Claudicle](https://github.com/tdimino/claudicle) by Tom di Mino — open-source soul agent framework, LLM-agnostic at the cognitive level +- Built as a [Claude Code skill](https://code.claude.com/docs/en/skills) following the [Agent Skills](https://agentskills.io/) open standard diff --git a/.claude/skills/open-interpreter/SKILL.md b/.claude/skills/open-interpreter/SKILL.md new file mode 100644 index 0000000000..e5e22477f4 --- /dev/null +++ b/.claude/skills/open-interpreter/SKILL.md @@ -0,0 +1,219 @@ +--- +name: open-interpreter +description: Desktop GUI automation via OpenInterpreter — mouse, keyboard, screenshot, + and OCR control for native macOS/Linux applications. Three modes: Library (Claude + reasons, OI executes), OS subprocess (full autonomous computer use), and Local agent + (Ollama, offline). This skill should be used when interacting with desktop apps that + have no CLI or API, automating GUI workflows, reading screen content via OCR, or + controlling mouse/keyboard. +--- + +# OpenInterpreter — Desktop GUI Automation + +Desktop control for Claude Code via [OpenInterpreter](https://github.com/OpenInterpreter/open-interpreter) (62k stars, AGPL-3.0). Mouse, keyboard, screenshot, and OCR primitives backed by pyautogui + pytesseract. + +## Mode Selection + +| Mode | LLM | Script | Best For | +|------|-----|--------|----------| +| **Library** | Claude Code (native) | Individual scripts below | Surgical GUI actions — Claude sees screenshots, reasons, dispatches actions | +| **OS subprocess** | Claude API (via OI) | `oi_os_mode.py` | Full autonomous computer use — delegate entire GUI tasks | +| **Local agent** | Ollama (offline) | `oi_os_mode.py --local` | Offline computer use, no API costs, privacy-sensitive tasks | + +Use Library mode by default. Use OS subprocess to delegate self-contained GUI tasks. Use Local agent when offline or to avoid API costs. + +## Installation + +Run once: + +```bash +.claude/skills/open-interpreter/scripts/oi_install.sh +``` + +Installs `open-interpreter[os]` via uv, verifies pyautogui and tesseract, checks macOS permissions. + +**macOS permissions** (one-time, manual): +- System Settings > Privacy & Security > **Accessibility** > add terminal app (Ghostty/Terminal/iTerm2) +- System Settings > Privacy & Security > **Screen Recording** > add terminal app + +Verify permissions: + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_permission_check.py +``` + +## Library Mode: The Screenshot Loop + +The core pattern for GUI automation: + +``` +1. Take screenshot → oi_screenshot.py +2. Read PNG → Claude Read tool (native vision) +3. Decide action → Claude reasoning +4. Execute action → oi_click.py / oi_type.py +5. Verify → Take another screenshot +6. 
Loop until done +``` + +### Scripts + +**`oi_screenshot.py`** — Capture screen, return file path with Retina metadata + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_screenshot.py +python3 .claude/skills/open-interpreter/scripts/oi_screenshot.py --region 0,0,800,600 +python3 .claude/skills/open-interpreter/scripts/oi_screenshot.py --active-window +``` + +Output (3 lines): +``` +/tmp/oi_screenshot_1708789200.png +SCALE_FACTOR=2 +SCREEN_SIZE=1512x982 +``` + +**`oi_click.py`** — Mouse click by coordinates or OCR text + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 450 --y 300 +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 900 --y 600 --image-coords +python3 .claude/skills/open-interpreter/scripts/oi_click.py --text "Submit" +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 450 --y 300 --double +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 450 --y 300 --right +``` + +- `--image-coords`: auto-divides by Retina scale factor (use when coordinates come from screenshot image pixels) +- `--text`: OCR-based — screenshots, finds text via pytesseract, clicks center of match + +**`oi_type.py`** — Keyboard input + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_type.py --text "hello world" +python3 .claude/skills/open-interpreter/scripts/oi_type.py --key enter +python3 .claude/skills/open-interpreter/scripts/oi_type.py --hotkey command space +python3 .claude/skills/open-interpreter/scripts/oi_type.py --text "search" --method typewrite +``` + +- Default text input: clipboard-paste (Cmd+V) for speed and Unicode safety +- `--method typewrite`: character-by-character (use when clipboard is needed for other purposes) +- `--hotkey`: AppleScript on macOS for reliable modifier key handling + +**`oi_find_text.py`** — OCR screen reading + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_find_text.py --text "Submit" +python3 .claude/skills/open-interpreter/scripts/oi_find_text.py --text "Price" --screenshot /tmp/ss.png +``` + +Returns JSON array: `[{"text": "Submit", "x": 450, "y": 300, "w": 80, "h": 24, "confidence": 95}]` + +**`oi_computer.py`** — Unified dispatch for all actions + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_computer.py screenshot +python3 .claude/skills/open-interpreter/scripts/oi_computer.py click --x 450 --y 300 +python3 .claude/skills/open-interpreter/scripts/oi_computer.py type --text "hello" +python3 .claude/skills/open-interpreter/scripts/oi_computer.py find --text "Submit" +python3 .claude/skills/open-interpreter/scripts/oi_computer.py scroll --clicks 3 +python3 .claude/skills/open-interpreter/scripts/oi_computer.py mouse-position +python3 .claude/skills/open-interpreter/scripts/oi_computer.py screen-size +``` + +### Retina Display Handling + +macOS Retina displays render at 2x (or 3x) scaling. Screenshot image pixels differ from screen coordinates: + +| Metric | Example (14" MBP) | +|--------|-------------------| +| Image pixels (screenshot) | 3024 x 1964 | +| Screen coordinates (pyautogui) | 1512 x 982 | +| Scale factor | 2x | + +When estimating click targets from a screenshot image, use `--image-coords` on `oi_click.py` to auto-divide by the scale factor. The `oi_screenshot.py` output includes `SCALE_FACTOR` metadata. 
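The same conversion can be done by hand when a workflow needs it explicitly. A minimal sketch of the arithmetic that `--image-coords` performs (coordinates illustrative):

```bash
# A target estimated at image pixel (900, 600) on a SCALE_FACTOR=2 display
# lands at screen coordinate (450, 300)
scale=2
img_x=900; img_y=600
python3 .claude/skills/open-interpreter/scripts/oi_click.py \
  --x $((img_x / scale)) --y $((img_y / scale))

# Equivalent, letting the script divide for you:
python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 900 --y 600 --image-coords
```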
+ +## OS Mode: Delegate Full Tasks + +For self-contained GUI tasks, delegate to OI's full agent loop: + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_os_mode.py "Open Calculator and compute 2+2" +python3 .claude/skills/open-interpreter/scripts/oi_os_mode.py --provider anthropic "Change the desktop wallpaper" +``` + +OI runs its own screenshot → analyze → act loop using the Claude API. Requires `ANTHROPIC_API_KEY`. + +## Local Mode: Offline Computer Use + +Run OI with a local vision model via Ollama: + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_os_mode.py --local "What apps are open?" +``` + +Prerequisites: +1. Ollama running: `ollama serve` +2. Vision model pulled: `ollama pull llama3.2-vision` + +Limitation: Local models use OI's classic code-execution mode, not the screenshot-driven OS Mode (which requires Claude 3.5 Sonnet). Local mode generates and executes code to accomplish GUI tasks rather than using pixel-level screenshot analysis. + +## Common Recipes + +### Open an App via Spotlight + +```bash +python3 scripts/oi_type.py --hotkey command space +sleep 0.5 +python3 scripts/oi_type.py --text "Calculator" +sleep 0.3 +python3 scripts/oi_type.py --key enter +``` + +### Read Text from Screen + +```bash +python3 scripts/oi_screenshot.py > /tmp/ss_meta.txt +python3 scripts/oi_find_text.py --text "Total" --screenshot "$(head -1 /tmp/ss_meta.txt)" +``` + +### Click a Button by Label + +```bash +python3 scripts/oi_click.py --text "Save" +``` + +### Fill a Form Field + +```bash +python3 scripts/oi_click.py --text "Email" +python3 scripts/oi_type.py --text "user@example.com" +python3 scripts/oi_type.py --key tab +python3 scripts/oi_type.py --text "password123" +``` + +## Safety + +1. **Confirm before destructive actions** — before clicking Send, Delete, Submit, or Confirm buttons, verify with the user +2. **Screenshot before and after** every action for verification +3. **No unbounded autonomous loops** — confirm with user between multi-step GUI workflows +4. **pyautogui failsafe** — moving mouse to any screen corner raises `pyautogui.FailSafeException` (enabled by default) +5. 
**Action logging** — every script logs actions to stderr: `[oi] click at (450, 300) button=left` + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| `oi_screenshot.py` returns black image | Grant Screen Recording permission to terminal app | +| `oi_click.py` / `oi_type.py` no effect | Grant Accessibility permission to terminal app | +| OCR finds no text | Verify tesseract: `which tesseract && tesseract --version` | +| Retina coordinates off by 2x | Use `--image-coords` flag on `oi_click.py` | +| `oi_find_text.py` low confidence | Try larger text, ensure screen is not obstructed | +| OS Mode hangs | Verify `ANTHROPIC_API_KEY` is set, check OI stderr output | +| Local mode fails | Verify Ollama running (`ollama list`) and model pulled | + +## Reference Documentation + +| File | Contents | +|------|----------| +| `references/computer-api.md` | OI Computer API reference — mouse, keyboard, display, clipboard | +| `references/os-mode.md` | OS Mode usage, provider configuration, agent loop architecture | +| `references/safety-and-permissions.md` | macOS permissions guide, safety model, failsafe configuration | diff --git a/.claude/skills/open-interpreter/references/computer-api.md b/.claude/skills/open-interpreter/references/computer-api.md new file mode 100644 index 0000000000..d1597864ea --- /dev/null +++ b/.claude/skills/open-interpreter/references/computer-api.md @@ -0,0 +1,115 @@ +# OpenInterpreter Computer API Reference + +## Overview + +OpenInterpreter's Computer API (`interpreter/core/computer/`) provides programmatic access to desktop automation primitives. The skill wraps these via standalone scripts for use with Claude Code. + +## Script Reference + +### oi_screenshot.py + +Captures the screen using `screencapture` (macOS) or `scrot`/pyautogui (Linux). + +| Flag | Description | +|------|-------------| +| `--region X,Y,W,H` | Capture region only | +| `--active-window` | Capture frontmost window | +| `--output PATH` | Custom output path (default: `/tmp/oi_screenshot_TIMESTAMP.png`) | + +**Output** (3 lines to stdout): +``` +/tmp/oi_screenshot_1708789200.png +SCALE_FACTOR=2 +SCREEN_SIZE=1512x982 +``` + +### oi_click.py + +Performs mouse clicks via pyautogui. Two modes: coordinate and OCR text. + +| Flag | Description | +|------|-------------| +| `--x N --y N` | Click at screen coordinates | +| `--text "label"` | Find text via OCR, click center | +| `--image-coords` | Divide coords by Retina scale factor | +| `--double` | Double click | +| `--right` | Right click | +| `--clicks N` | Number of clicks (default: 1) | + +**Output**: JSON object to stdout, action log to stderr. + +### oi_type.py + +Keyboard input: text, single keys, and hotkey combos. + +| Flag | Description | +|------|-------------| +| `--text "string"` | Type text (default: clipboard paste) | +| `--key NAME` | Press single key (enter, tab, escape, etc.) | +| `--hotkey KEY KEY...` | Hotkey combo (e.g., command space) | +| `--method paste\|typewrite` | Text input method (default: paste) | + +**Text methods**: +- `paste` (default): Copy to clipboard, Cmd+V. Fast, Unicode-safe. +- `typewrite`: Character-by-character. Slower, but doesn't touch clipboard. + +**macOS hotkeys**: Uses AppleScript (`osascript`) for reliable modifier key handling. Key names: command, shift, option, control, plus key codes for special keys (space, enter, tab, escape, F1-F8, arrow keys). + +### oi_find_text.py + +OCR screen reading via pytesseract. 
+ +| Flag | Description | +|------|-------------| +| `--text "string"` | Text to search for (required) | +| `--screenshot PATH` | Use existing screenshot | +| `--all` | Return all matches, not just best | +| `--min-conf N` | Minimum confidence threshold (0-100) | + +**Output**: JSON array to stdout: +```json +[{"text": "Submit", "x": 450, "y": 300, "w": 80, "h": 24, "confidence": 95}] +``` + +Coordinates are in screen space (divided by Retina scale). + +### oi_computer.py + +Unified dispatch. Routes to the appropriate script. + +| Subcommand | Equivalent | +|------------|------------| +| `screenshot [args]` | `oi_screenshot.py [args]` | +| `click [args]` | `oi_click.py [args]` | +| `type [args]` | `oi_type.py [args]` | +| `find [args]` | `oi_find_text.py [args]` | +| `scroll --clicks N` | pyautogui.scroll() | +| `mouse-position` | Returns `{"x": N, "y": N}` | +| `screen-size` | Returns `{"width": N, "height": N}` | + +## Retina Coordinate Handling + +On macOS Retina displays, screenshot image pixels differ from pyautogui screen coordinates: + +| | Image Pixels | Screen Coordinates | +|--|-------------|-------------------| +| 14" MBP | 3024 x 1964 | 1512 x 982 | +| Scale factor | 2x | 1x (pyautogui native) | + +When Claude reads a screenshot and estimates a click target at pixel (900, 600) in the image: +- Without `--image-coords`: clicks at screen position (900, 600) — wrong +- With `--image-coords`: divides by 2, clicks at screen position (450, 300) — correct + +The `oi_screenshot.py` SCALE_FACTOR output enables this conversion. + +## Dependencies + +| Package | Purpose | Install | +|---------|---------|---------| +| pyautogui | Mouse/keyboard control | `uv pip install pyautogui` | +| pytesseract | OCR text detection | `uv pip install pytesseract` | +| Pillow | Image processing | `uv pip install Pillow` | +| pyperclip | Clipboard access | `uv pip install pyperclip` | +| tesseract | OCR engine (CLI) | `brew install tesseract` | + +All installed by `oi_install.sh`. diff --git a/.claude/skills/open-interpreter/references/os-mode.md b/.claude/skills/open-interpreter/references/os-mode.md new file mode 100644 index 0000000000..4e61e3bc41 --- /dev/null +++ b/.claude/skills/open-interpreter/references/os-mode.md @@ -0,0 +1,99 @@ +# OpenInterpreter OS Mode Reference + +## Overview + +OS Mode (`interpreter --os`) is OpenInterpreter's screenshot-driven desktop control system. It runs an autonomous loop: screenshot → Claude API analysis → pyautogui action → repeat. + +## Architecture + +``` +User task → OI agent loop: + 1. Take screenshot (screencapture / pyautogui) + 2. Send screenshot to Claude API (vision) + 3. Claude analyzes: what to do next? + 4. OI executes: click(x,y) / type("text") / hotkey(cmd+space) + 5. Take verification screenshot + 6. Repeat until task complete or max iterations +``` + +## Usage via oi_os_mode.py + +```bash +# Default: Claude API via Anthropic +python3 scripts/oi_os_mode.py "Open Calculator and compute 2+2" + +# Explicit provider +python3 scripts/oi_os_mode.py --provider anthropic "Change wallpaper" + +# Custom timeout (default: 300s) +python3 scripts/oi_os_mode.py --timeout 120 "Fill out the form" +``` + +**Requirements**: +- `ANTHROPIC_API_KEY` environment variable +- macOS Accessibility + Screen Recording permissions + +## Local Mode via Ollama + +```bash +# Local model (code-execution mode, not screenshot-driven) +python3 scripts/oi_os_mode.py --local "What apps are open?" 
+ +# Custom model +python3 scripts/oi_os_mode.py --local --model llama3.2-vision "Describe the screen" + +# Custom Ollama endpoint +python3 scripts/oi_os_mode.py --local --api-base http://192.168.1.100:11434 "List files" +``` + +**Requirements**: +- Ollama running: `ollama serve` +- Vision model: `ollama pull llama3.2-vision` + +**Limitation**: Local models use OI's classic code-execution mode (generates Python/Bash to accomplish tasks). The screenshot-driven OS Mode is hardcoded to Claude 3.5 Sonnet and cannot use local models. + +## When to Use Each Mode + +| Scenario | Mode | +|----------|------| +| Precise GUI action (one click, one type) | Library (oi_click.py, oi_type.py) | +| Multi-step GUI workflow Claude can reason about | Library with screenshot loop | +| Self-contained GUI task, no codebase context | OS subprocess (oi_os_mode.py) | +| Offline / no API costs / privacy | Local (oi_os_mode.py --local) | +| Complex multi-app workflow | OS subprocess | + +## OI CLI Flags Reference + +| Flag | Description | +|------|-------------| +| `--os` | Enable OS Mode (screenshot-driven) | +| `-y` | Auto-approve actions (skip confirmation prompts) | +| `--model NAME` | LLM model to use | +| `--api_base URL` | Custom API endpoint | +| `--local` | Use local model (bundled profile) | +| `--vision` | Enable vision capabilities | +| `--safe_mode ask\|auto\|off` | Safety confirmation level | + +## OI Python API (for advanced integration) + +```python +from interpreter import interpreter + +# OS Mode +interpreter.computer.display.screenshot() # Take screenshot +interpreter.computer.mouse.click(x, y) # Click at coordinates +interpreter.computer.keyboard.write("text") # Type text +interpreter.computer.keyboard.hotkey("command", "space") # Hotkey + +# Classic mode with Ollama +interpreter.llm.model = "ollama/llama3.2-vision" +interpreter.llm.api_base = "http://localhost:11434" +interpreter.auto_run = True +interpreter.chat("What time is it?") +``` + +## Development Status + +OpenInterpreter is in maintenance mode (last release v0.4.2, Oct 2024). The core team pivoted to the 01 App. The codebase is stable and the underlying primitives (pyautogui, pytesseract) are well-maintained independently. + +OS Mode is labeled "highly experimental" in OI's documentation. For production use, prefer Library mode (Claude Code reasons, scripts execute) over the OS subprocess approach. diff --git a/.claude/skills/open-interpreter/references/safety-and-permissions.md b/.claude/skills/open-interpreter/references/safety-and-permissions.md new file mode 100644 index 0000000000..b0e7f5807e --- /dev/null +++ b/.claude/skills/open-interpreter/references/safety-and-permissions.md @@ -0,0 +1,107 @@ +# Safety and Permissions Guide + +## macOS Permissions + +Desktop GUI automation requires two macOS permissions. Both are per-application (grant to your terminal app: Ghostty, Terminal.app, iTerm2, VS Code, etc.). + +### Accessibility + +**What it enables**: Mouse movement, clicks, keyboard input (pyautogui) + +**How to grant**: +1. System Settings > Privacy & Security > Accessibility +2. Click the lock icon to authenticate +3. Click "+" and add your terminal app +4. If already listed, toggle it off and on again + +**Symptoms when missing**: pyautogui operations silently fail or throw "This process is not trusted! Input event monitoring will not be possible until it is added to accessibility clients." + +### Screen Recording + +**What it enables**: Screen capture (screencapture, pyautogui.screenshot) + +**How to grant**: +1. 
System Settings > Privacy & Security > Screen Recording +2. Click the lock icon to authenticate +3. Click "+" and add your terminal app +4. Restart the terminal app after granting + +**Symptoms when missing**: Screenshots are blank (all black or all white), or screencapture produces 0-byte files. + +### Verifying Permissions + +```bash +python3 ~/.claude/skills/open-interpreter/scripts/oi_permission_check.py +``` + +This checks: +- pyautogui can read mouse position (Accessibility) +- screencapture produces a non-empty file (Screen Recording) +- tesseract is installed (OCR support) + +## Safety Model + +### Principles + +1. **Human-in-the-loop for destructive actions**: Before clicking Send, Delete, Submit, Confirm, or Purchase buttons, verify with the user. A misclick on a destructive button cannot be undone. + +2. **Screenshot-verify pattern**: Take a screenshot before and after every action. This provides an audit trail and catches misclicks early. + +3. **No unbounded autonomous loops**: Multi-step GUI workflows should checkpoint with the user. Do not run 50 uninterrupted click sequences without verification. + +4. **pyautogui failsafe**: Moving the mouse to any screen corner (0,0 or max,0 or 0,max or max,max) raises `pyautogui.FailSafeException`, which halts execution. This is enabled by default and should not be disabled. + +5. **Action logging**: Every script logs its actions to stderr with the `[oi]` prefix. This provides a record of what was done. + +### Risk Categories + +| Action | Risk | Mitigation | +|--------|------|-----------| +| Screenshot | None (read-only) | — | +| Find text (OCR) | None (read-only) | — | +| Mouse move | Low | Reversible | +| Click (left) | Medium | Screenshot before, verify target | +| Click (right) | Medium | Context menus are dismissible | +| Type text | Medium | Can undo (Cmd+Z) | +| Hotkey | Medium-High | Some hotkeys trigger irreversible actions | +| Click "Delete"/"Send" | High | Require user confirmation | +| Form submission | High | Require user confirmation | + +### OS Mode Safety + +When using `oi_os_mode.py` (delegated autonomous control), OI runs its own agent loop with `-y` (auto-approve). This means: +- OI will execute actions without asking for confirmation +- The timeout flag provides a hard limit on execution time +- Monitor stderr for OI's action log +- For high-risk tasks, prefer Library mode where Claude Code controls each step + +### pyautogui Failsafe + +```python +import pyautogui + +# Enabled by default — do not disable +pyautogui.FAILSAFE = True # Default: True + +# Pause between actions (seconds) +pyautogui.PAUSE = 0.1 # Default: 0.1 +``` + +To emergency-stop any pyautogui automation, quickly move the mouse to any screen corner. This raises `FailSafeException` and halts the script. + +## tesseract Installation + +tesseract provides OCR for text detection on screen. + +```bash +# macOS +brew install tesseract + +# With additional language packs +brew install tesseract-lang + +# Verify +tesseract --version +``` + +The Python binding (`pytesseract`) is installed by `oi_install.sh`. It requires the tesseract CLI to be available in PATH. diff --git a/.claude/skills/open-interpreter/scripts/oi_click.py b/.claude/skills/open-interpreter/scripts/oi_click.py new file mode 100755 index 0000000000..56cb1bbdf5 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_click.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +oi_click.py -- Mouse click by coordinates or OCR text. 

Usage:
    oi_click.py --x 450 --y 300                 # Click at screen coordinates
    oi_click.py --x 900 --y 600 --image-coords  # Auto-divide by Retina scale
    oi_click.py --text "Submit"                 # OCR: find text on screen, click center
    oi_click.py --x 450 --y 300 --double        # Double click
    oi_click.py --x 450 --y 300 --right         # Right click
    oi_click.py --x 450 --y 300 --clicks 3      # Triple click
"""

import argparse
import json
import os
import platform
import subprocess
import sys
import tempfile
import time


def get_scale_factor():
    """Detect Retina scale factor."""
    if platform.system() != "Darwin":
        return 1
    try:
        import pyautogui
        screen = pyautogui.size()
        tmp = os.path.join(tempfile.gettempdir(), "oi_scale_test.png")
        subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5)
        if os.path.exists(tmp):
            from PIL import Image
            img = Image.open(tmp)
            img_w = img.width
            img.close()
            os.unlink(tmp)
            factor = round(img_w / screen.width)
            return max(1, factor)
    except Exception:
        pass
    return 2


def parse_conf(value):
    """Normalize a pytesseract confidence value to a non-negative int.

    Depending on the tesseract/pytesseract version, confidences arrive as
    strings ("-1", "96.06"), ints, or floats; int() alone raises on float
    strings.
    """
    try:
        return max(0, int(float(value)))
    except (TypeError, ValueError):
        return 0


def find_text_on_screen(text):
    """Find text on screen using pytesseract OCR. Returns (x, y) center coordinates."""
    import pyautogui
    import pytesseract
    from PIL import Image

    # Take screenshot
    tmp = os.path.join(tempfile.gettempdir(), f"oi_ocr_{int(time.time())}.png")
    if platform.system() == "Darwin":
        subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5)
    else:
        img = pyautogui.screenshot()
        img.save(tmp)

    if not os.path.exists(tmp):
        return None

    img = Image.open(tmp)
    scale = get_scale_factor()

    # Run OCR with bounding box data
    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    img.close()
    os.unlink(tmp)

    # Search for matching text
    best_match = None
    best_conf = -1
    text_lower = text.lower()

    n = len(data["text"])
    for i in range(n):
        word = data["text"][i].strip()
        if not word:
            continue
        conf = parse_conf(data["conf"][i])
        if text_lower in word.lower() and conf > best_conf:
            # Center of bounding box in image space, then convert to screen coordinates
            x = int((data["left"][i] + data["width"][i] / 2) / scale)
            y = int((data["top"][i] + data["height"][i] / 2) / scale)
            best_match = (x, y)
            best_conf = conf

    # Also try matching across consecutive words
    if best_match is None and " " in text:
        words = text_lower.split()
        for i in range(n - len(words) + 1):
            segment = " ".join(data["text"][i:i + len(words)]).strip().lower()
            if text_lower in segment:
                # Span from first to last word
                x1 = data["left"][i]
                y1 = data["top"][i]
                last = i + len(words) - 1
                x2 = data["left"][last] + data["width"][last]
                y2 = data["top"][last] + data["height"][last]
                x = int((x1 + x2) / 2 / scale)
                y = int((y1 + y2) / 2 / scale)
                best_match = (x, y)
                break

    return best_match


def main():
    parser = argparse.ArgumentParser(description="Click at coordinates or OCR text location")
    parser.add_argument("--x", type=int, help="X coordinate")
    parser.add_argument("--y", type=int, help="Y coordinate")
    parser.add_argument("--text", metavar="TEXT", help="Find text on screen via OCR and click its center")
    parser.add_argument("--image-coords", action="store_true",
                        help="Divide coordinates by Retina scale factor (use when coords come from screenshot pixels)")
    parser.add_argument("--double", action="store_true", help="Double click")
    parser.add_argument("--right", action="store_true", help="Right click")
    
parser.add_argument("--clicks", type=int, default=1, help="Number of clicks (default: 1)") + + args = parser.parse_args() + + if not args.text and (args.x is None or args.y is None): + parser.error("Provide either --text or both --x and --y") + + try: + import pyautogui + except ImportError: + print("Error: pyautogui not installed. Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + + if args.text: + # OCR mode + result = find_text_on_screen(args.text) + if result is None: + print(f"Error: text '{args.text}' not found on screen", file=sys.stderr) + sys.exit(1) + x, y = result + print(f"[oi] found '{args.text}' at ({x}, {y})", file=sys.stderr) + else: + x, y = args.x, args.y + if args.image_coords: + scale = get_scale_factor() + x = x // scale + y = y // scale + print(f"[oi] image coords ({args.x}, {args.y}) -> screen coords ({x}, {y}) (scale={scale})", + file=sys.stderr) + + # Perform click + button = "right" if args.right else "left" + clicks = args.clicks if args.clicks > 1 else (2 if args.double else 1) + + pyautogui.click(x, y, clicks=clicks, button=button) + print(f"[oi] click at ({x}, {y}) button={button} clicks={clicks}", file=sys.stderr) + print(json.dumps({"action": "click", "x": x, "y": y, "button": button, "clicks": clicks})) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_computer.py b/.claude/skills/open-interpreter/scripts/oi_computer.py new file mode 100755 index 0000000000..b14a75a237 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_computer.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +""" +oi_computer.py -- Unified dispatch for all desktop automation actions. + +Usage: + oi_computer.py screenshot [--region X,Y,W,H] [--active-window] + oi_computer.py click --x 450 --y 300 [--image-coords] [--double] [--right] + oi_computer.py click --text "Submit" + oi_computer.py type --text "hello world" [--method typewrite] + oi_computer.py type --key enter + oi_computer.py type --hotkey command space + oi_computer.py find --text "Submit" [--all] [--min-conf 80] + oi_computer.py scroll --clicks 3 [--x 450 --y 300] + oi_computer.py mouse-position + oi_computer.py screen-size +""" + +import json +import os +import subprocess +import sys + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def run_script(name, args): + """Run a sibling script and pass through its output.""" + script = os.path.join(SCRIPT_DIR, name) + cmd = [sys.executable, script] + args + result = subprocess.run(cmd, capture_output=False) + return result.returncode + + +def main(): + if len(sys.argv) < 2: + print("Usage: oi_computer.py [args...]", file=sys.stderr) + print("Commands: screenshot, click, type, find, scroll, mouse-position, screen-size", file=sys.stderr) + sys.exit(1) + + command = sys.argv[1] + args = sys.argv[2:] + + if command == "screenshot": + sys.exit(run_script("oi_screenshot.py", args)) + + elif command == "click": + sys.exit(run_script("oi_click.py", args)) + + elif command == "type": + sys.exit(run_script("oi_type.py", args)) + + elif command == "find": + sys.exit(run_script("oi_find_text.py", args)) + + elif command == "scroll": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--clicks", type=int, default=3, help="Scroll clicks (positive=up, negative=down)") + parser.add_argument("--x", type=int, help="X position to scroll at") + parser.add_argument("--y", type=int, help="Y position to scroll at") + parsed = parser.parse_args(args) + + import pyautogui 
+ if parsed.x is not None and parsed.y is not None: + pyautogui.moveTo(parsed.x, parsed.y) + pyautogui.scroll(parsed.clicks) + print(f"[oi] scroll clicks={parsed.clicks}", file=sys.stderr) + print(json.dumps({"action": "scroll", "clicks": parsed.clicks})) + + elif command == "mouse-position": + import pyautogui + pos = pyautogui.position() + print(json.dumps({"x": pos.x, "y": pos.y})) + + elif command == "screen-size": + import pyautogui + size = pyautogui.size() + print(json.dumps({"width": size.width, "height": size.height})) + + else: + print(f"Error: unknown command '{command}'", file=sys.stderr) + print("Commands: screenshot, click, type, find, scroll, mouse-position, screen-size", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_find_text.py b/.claude/skills/open-interpreter/scripts/oi_find_text.py new file mode 100755 index 0000000000..4e10288610 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_find_text.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +""" +oi_find_text.py -- OCR screen reading: find text locations on screen. + +Returns JSON array of matches with coordinates (in screen space, not image pixels). + +Usage: + oi_find_text.py --text "Submit" + oi_find_text.py --text "Submit" --screenshot /tmp/screenshot.png + oi_find_text.py --text "Price" --all # Return all matches, not just best + oi_find_text.py --text "File" --min-conf 80 # Minimum confidence threshold +""" + +import argparse +import json +import os +import platform +import subprocess +import sys +import tempfile +import time + + +def get_scale_factor(): + """Detect Retina scale factor.""" + if platform.system() != "Darwin": + return 1 + try: + import pyautogui + screen = pyautogui.size() + tmp = os.path.join(tempfile.gettempdir(), "oi_scale_test.png") + subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5) + if os.path.exists(tmp): + from PIL import Image + img = Image.open(tmp) + img_w = img.width + img.close() + os.unlink(tmp) + factor = round(img_w / screen.width) + return max(1, factor) + except Exception: + pass + return 2 + + +def find_text(text, screenshot_path=None, return_all=False, min_conf=0): + """Find text on screen using pytesseract OCR. + + Returns list of dicts: [{"text": str, "x": int, "y": int, "w": int, "h": int, "confidence": int}] + Coordinates are in screen space (divided by Retina scale factor). 
+ """ + import pytesseract + from PIL import Image + + # Take screenshot if not provided + tmp_created = False + if screenshot_path is None: + screenshot_path = os.path.join(tempfile.gettempdir(), f"oi_ocr_{int(time.time())}.png") + if platform.system() == "Darwin": + subprocess.run(["screencapture", "-x", "-C", screenshot_path], capture_output=True, timeout=5) + else: + import pyautogui + img = pyautogui.screenshot() + img.save(screenshot_path) + tmp_created = True + + if not os.path.exists(screenshot_path): + return [] + + img = Image.open(screenshot_path) + scale = get_scale_factor() + + # Run OCR + data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT) + img.close() + + if tmp_created: + os.unlink(screenshot_path) + + # Find matches + matches = [] + text_lower = text.lower() + n = len(data["text"]) + + # Single-word matches + for i in range(n): + word = data["text"][i].strip() + if not word: + continue + conf = int(data["conf"][i]) if data["conf"][i] != "-1" else 0 + if conf < min_conf: + continue + if text_lower in word.lower(): + matches.append({ + "text": word, + "x": int((data["left"][i] + data["width"][i] / 2) / scale), + "y": int((data["top"][i] + data["height"][i] / 2) / scale), + "w": int(data["width"][i] / scale), + "h": int(data["height"][i] / scale), + "confidence": conf, + }) + + # Multi-word matches + if " " in text: + words = text_lower.split() + for i in range(n - len(words) + 1): + segment = " ".join(data["text"][i:i + len(words)]).strip().lower() + if text_lower in segment: + # Check minimum confidence across span + span_conf = min( + int(data["conf"][j]) if data["conf"][j] != "-1" else 0 + for j in range(i, i + len(words)) + ) + if span_conf < min_conf: + continue + + last = i + len(words) - 1 + x1 = data["left"][i] + y1 = data["top"][i] + x2 = data["left"][last] + data["width"][last] + y2 = data["top"][last] + data["height"][last] + + matches.append({ + "text": " ".join(data["text"][i:i + len(words)]), + "x": int((x1 + x2) / 2 / scale), + "y": int((y1 + y2) / 2 / scale), + "w": int((x2 - x1) / scale), + "h": int((y2 - y1) / scale), + "confidence": span_conf, + }) + + # Sort by confidence descending + matches.sort(key=lambda m: m["confidence"], reverse=True) + + if return_all: + return matches + elif matches: + return [matches[0]] + else: + return [] + + +def main(): + parser = argparse.ArgumentParser(description="Find text on screen via OCR") + parser.add_argument("--text", required=True, help="Text to search for") + parser.add_argument("--screenshot", metavar="PATH", help="Use existing screenshot instead of capturing") + parser.add_argument("--all", action="store_true", help="Return all matches, not just the best") + parser.add_argument("--min-conf", type=int, default=0, help="Minimum OCR confidence threshold (0-100)") + + args = parser.parse_args() + + try: + results = find_text( + args.text, + screenshot_path=args.screenshot, + return_all=args.all, + min_conf=args.min_conf, + ) + + print(json.dumps(results, indent=2)) + + if results: + print(f"[oi] found {len(results)} match(es) for '{args.text}'", file=sys.stderr) + else: + print(f"[oi] no matches for '{args.text}'", file=sys.stderr) + sys.exit(1) + + except ImportError as e: + print(f"Error: Missing dependency: {e}\n" + "Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + except KeyboardInterrupt: + sys.exit(130) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git 
a/.claude/skills/open-interpreter/scripts/oi_install.sh b/.claude/skills/open-interpreter/scripts/oi_install.sh new file mode 100755 index 0000000000..ddcaa52b19 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_install.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# oi_install.sh -- One-shot OpenInterpreter installation and verification +# +# Installs open-interpreter[os] via uv, verifies pyautogui, tesseract, +# and checks macOS permissions. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +echo "==> Installing OpenInterpreter with OS mode extras..." +# OI pins tiktoken==0.7.0 which has no prebuilt wheel for Python 3.13+. +# Use --override to force a newer tiktoken that ships prebuilt wheels. +OVERRIDE_FILE=$(mktemp) +echo "tiktoken>=0.8" > "$OVERRIDE_FILE" +uv pip install --system "open-interpreter[os]" --override "$OVERRIDE_FILE" +rm -f "$OVERRIDE_FILE" + +echo "" +echo "==> Verifying OpenInterpreter import..." +python3 -c "from interpreter import interpreter; print(' interpreter: OK')" + +echo "" +echo "==> Verifying pyautogui..." +python3 -c " +import pyautogui +size = pyautogui.size() +print(f' pyautogui: OK (screen: {size.width}x{size.height})') +" + +echo "" +echo "==> Verifying pytesseract..." +python3 -c " +import pytesseract +version = pytesseract.get_tesseract_version() +print(f' pytesseract: OK (tesseract {version})') +" 2>/dev/null || { + echo " pytesseract: MISSING" + echo " Install tesseract: brew install tesseract" +} + +echo "" +echo "==> Checking tesseract CLI..." +if command -v tesseract &>/dev/null; then + echo " tesseract: $(tesseract --version 2>&1 | head -1)" +else + echo " tesseract: NOT FOUND" + echo " Install: brew install tesseract" +fi + +echo "" +echo "==> Checking macOS permissions..." +python3 "$SCRIPT_DIR/oi_permission_check.py" + +echo "" +echo "OpenInterpreter installation complete." +echo "If permissions are missing, add your terminal app in:" +echo " System Settings > Privacy & Security > Accessibility" +echo " System Settings > Privacy & Security > Screen Recording" diff --git a/.claude/skills/open-interpreter/scripts/oi_os_mode.py b/.claude/skills/open-interpreter/scripts/oi_os_mode.py new file mode 100755 index 0000000000..a6bdce6410 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_os_mode.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +oi_os_mode.py -- Launch OpenInterpreter as a managed subprocess for full +autonomous computer use (OS Mode) or local agent mode (Ollama). + +OS Mode: OI runs its own screenshot → Claude API → pyautogui loop. +Local Mode: OI runs in classic code-execution mode with Ollama as backend. + +Usage: + oi_os_mode.py "Open Calculator and compute 2+2" + oi_os_mode.py --provider anthropic "Change the wallpaper" + oi_os_mode.py --local "What apps are open?" 
+ oi_os_mode.py --local --model llama3.2-vision "Describe the screen" + oi_os_mode.py --timeout 120 "Fill out the form" +""" + +import argparse +import os +import subprocess +import sys + + +def find_interpreter(): + """Find the interpreter CLI.""" + import shutil + path = shutil.which("interpreter") + if path: + return path + # Try common locations + for candidate in [ + os.path.expanduser("~/.local/bin/interpreter"), + "/usr/local/bin/interpreter", + ]: + if os.path.exists(candidate): + return candidate + return None + + +def run_os_mode(task, provider="anthropic", timeout=300): + """Run OI in OS Mode (screenshot-driven, Claude API).""" + interpreter_path = find_interpreter() + if not interpreter_path: + print("Error: 'interpreter' CLI not found. Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + + # Check API key + if provider == "anthropic" and not os.environ.get("ANTHROPIC_API_KEY"): + print("Error: ANTHROPIC_API_KEY not set. Required for OS Mode.", file=sys.stderr) + sys.exit(1) + + cmd = [interpreter_path, "--os", "-y"] + + print(f"[oi] OS Mode: provider={provider}, task={repr(task)}", file=sys.stderr) + + try: + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + # Send task and close stdin to signal end of input + stdout, stderr = proc.communicate(input=task + "\n", timeout=timeout) + + if stdout: + print(stdout) + if stderr: + print(stderr, file=sys.stderr) + + return proc.returncode + + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + print(f"Error: OS Mode timed out after {timeout}s", file=sys.stderr) + return 1 + except FileNotFoundError: + print("Error: 'interpreter' CLI not found", file=sys.stderr) + return 1 + + +def run_local_mode(task, model="llama3.2-vision", api_base="http://localhost:11434", timeout=300): + """Run OI in classic mode with Ollama as backend.""" + interpreter_path = find_interpreter() + if not interpreter_path: + print("Error: 'interpreter' CLI not found. Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + + # Verify Ollama is running + try: + import urllib.request + urllib.request.urlopen(f"{api_base}/api/tags", timeout=3) + except Exception: + print(f"Error: Ollama not reachable at {api_base}. 
Start with: ollama serve", file=sys.stderr) + sys.exit(1) + + cmd = [ + interpreter_path, + "--model", f"ollama/{model}", + "--api_base", api_base, + "-y", + ] + + print(f"[oi] Local Mode: model=ollama/{model}, task={repr(task)}", file=sys.stderr) + + try: + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + stdout, stderr = proc.communicate(input=task + "\n", timeout=timeout) + + if stdout: + print(stdout) + if stderr: + print(stderr, file=sys.stderr) + + return proc.returncode + + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + print(f"Error: Local Mode timed out after {timeout}s", file=sys.stderr) + return 1 + + +def main(): + parser = argparse.ArgumentParser( + description="Launch OpenInterpreter for autonomous computer use" + ) + parser.add_argument("task", help="Task description for OI to execute") + parser.add_argument("--local", action="store_true", + help="Use local Ollama model instead of Claude API") + parser.add_argument("--model", default="llama3.2-vision", + help="Ollama model for local mode (default: llama3.2-vision)") + parser.add_argument("--provider", default="anthropic", + help="API provider for OS Mode (default: anthropic). Currently only validates anthropic.") + parser.add_argument("--api-base", default="http://localhost:11434", + help="Ollama API base URL (default: http://localhost:11434)") + parser.add_argument("--timeout", type=int, default=300, + help="Timeout in seconds (default: 300)") + + args = parser.parse_args() + + if args.local: + rc = run_local_mode( + args.task, + model=args.model, + api_base=args.api_base, + timeout=args.timeout, + ) + else: + rc = run_os_mode( + args.task, + provider=args.provider, + timeout=args.timeout, + ) + + sys.exit(rc) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_permission_check.py b/.claude/skills/open-interpreter/scripts/oi_permission_check.py new file mode 100755 index 0000000000..cb99c9f6ac --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_permission_check.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +""" +oi_permission_check.py -- Check macOS permissions for desktop GUI automation. + +Verifies: + 1. Accessibility permission (required for pyautogui mouse/keyboard) + 2. Screen Recording permission (required for screenshots) + 3. 
tesseract installation (required for OCR) + +Usage: + python3 oi_permission_check.py +""" + +import platform +import shutil +import subprocess +import sys + + +def check_accessibility(): + """Check if Accessibility permission is granted (macOS only).""" + if platform.system() != "Darwin": + print(" Accessibility: N/A (not macOS)") + return True + + # Try a minimal pyautogui operation to detect permission + try: + import pyautogui + # position() requires Accessibility on macOS + pos = pyautogui.position() + print(f" Accessibility: OK (mouse at {pos.x}, {pos.y})") + return True + except Exception as e: + err = str(e).lower() + if "accessibility" in err or "permission" in err or "not allowed" in err: + print(" Accessibility: DENIED") + print(" -> System Settings > Privacy & Security > Accessibility > add your terminal app") + return False + # If the error is something else, pyautogui may still work + print(f" Accessibility: UNKNOWN ({e})") + return True + + +def check_screen_recording(): + """Check if Screen Recording permission is granted (macOS only).""" + if platform.system() != "Darwin": + print(" Screen Recording: N/A (not macOS)") + return True + + # Take a test screenshot with screencapture + import tempfile + import os + tmp = os.path.join(tempfile.gettempdir(), "oi_perm_test.png") + try: + result = subprocess.run( + ["screencapture", "-x", "-C", tmp], + capture_output=True, timeout=5 + ) + if os.path.exists(tmp): + size = os.path.getsize(tmp) + os.unlink(tmp) + if size > 100: + print(f" Screen Recording: OK (test screenshot {size} bytes)") + return True + else: + print(" Screen Recording: DENIED (screenshot is empty)") + print(" -> System Settings > Privacy & Security > Screen Recording > add your terminal app") + return False + else: + print(" Screen Recording: DENIED (no screenshot produced)") + print(" -> System Settings > Privacy & Security > Screen Recording > add your terminal app") + return False + except subprocess.TimeoutExpired: + print(" Screen Recording: TIMEOUT (screencapture hung — permission dialog may be showing)") + return False + except FileNotFoundError: + print(" Screen Recording: N/A (screencapture not found)") + return True + finally: + if os.path.exists(tmp): + os.unlink(tmp) + + +def check_tesseract(): + """Check if tesseract OCR is installed.""" + path = shutil.which("tesseract") + if path: + try: + result = subprocess.run( + ["tesseract", "--version"], + capture_output=True, text=True, timeout=5 + ) + version = result.stdout.strip().split("\n")[0] if result.stdout else result.stderr.strip().split("\n")[0] + print(f" tesseract: OK ({version} at {path})") + return True + except Exception: + print(f" tesseract: OK (at {path}, version check failed)") + return True + else: + print(" tesseract: NOT FOUND") + print(" -> Install: brew install tesseract") + return False + + +def check_pyautogui(): + """Check if pyautogui is installed.""" + try: + import pyautogui + size = pyautogui.size() + print(f" pyautogui: OK (screen: {size.width}x{size.height})") + return True + except ImportError: + print(" pyautogui: NOT INSTALLED") + print(" -> Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh") + return False + except Exception as e: + print(f" pyautogui: ERROR ({e})") + return False + + +def main(): + print("OpenInterpreter Permission Check") + print("=" * 40) + + all_ok = True + all_ok &= check_pyautogui() + all_ok &= check_accessibility() + all_ok &= check_screen_recording() + all_ok &= check_tesseract() + + print("=" * 40) + if all_ok: + print("All checks 
passed.") + else: + print("Some checks failed. See instructions above.") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_screenshot.py b/.claude/skills/open-interpreter/scripts/oi_screenshot.py new file mode 100755 index 0000000000..a2296b2ec0 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_screenshot.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +""" +oi_screenshot.py -- Capture screen and return file path with Retina metadata. + +Outputs 3 lines to stdout: + 1. File path to PNG screenshot + 2. SCALE_FACTOR=N (Retina multiplier) + 3. SCREEN_SIZE=WxH (pyautogui coordinates) + +Usage: + oi_screenshot.py # Full screen + oi_screenshot.py --region 0,0,800,600 # Region (x,y,w,h) + oi_screenshot.py --active-window # Active window only (macOS) + oi_screenshot.py --output /tmp/my.png # Custom output path +""" + +import argparse +import os +import platform +import subprocess +import sys +import tempfile +import time + + +def get_scale_factor(): + """Detect Retina scale factor on macOS.""" + if platform.system() != "Darwin": + return 1 + + try: + # Use system_profiler to get display info + result = subprocess.run( + ["system_profiler", "SPDisplaysDataType"], + capture_output=True, text=True, timeout=5 + ) + output = result.stdout + # Look for Resolution line with Retina indicator + for line in output.splitlines(): + if "Retina" in line or "Resolution" in line: + if "Retina" in line: + return 2 + # Fallback: compare screenshot size to pyautogui screen size + return _detect_scale_from_screenshot() + except Exception: + return _detect_scale_from_screenshot() + + +def _detect_scale_from_screenshot(): + """Detect scale factor by comparing screenshot dimensions to screen size.""" + try: + import pyautogui + screen = pyautogui.size() + + # Take a tiny test screenshot + tmp = os.path.join(tempfile.gettempdir(), "oi_scale_test.png") + subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5) + + if os.path.exists(tmp): + from PIL import Image + img = Image.open(tmp) + img_w = img.width + img.close() + os.unlink(tmp) + + factor = round(img_w / screen.width) + return max(1, factor) + except Exception: + pass + return 2 # Default assumption for modern Macs + + +def screenshot_macos(output_path, region=None, active_window=False): + """Take screenshot on macOS using screencapture.""" + cmd = ["screencapture", "-x", "-C"] + + if active_window: + # Get frontmost window ID via AppleScript + try: + result = subprocess.run( + ["osascript", "-e", + 'tell application "System Events" to get id of first window of (first process whose frontmost is true)'], + capture_output=True, text=True, timeout=5 + ) + window_id = result.stdout.strip() + if window_id and window_id.isdigit(): + cmd.extend(["-l", window_id]) + else: + # Fallback to full-screen capture (never use -w which hangs in automation) + print("[oi] warning: could not get window ID, falling back to full screen", file=sys.stderr) + except Exception: + print("[oi] warning: could not get window ID, falling back to full screen", file=sys.stderr) + + if region: + cmd.extend(["-R", region]) + + cmd.append(output_path) + subprocess.run(cmd, capture_output=True, timeout=10) + return os.path.exists(output_path) + + +def screenshot_linux(output_path, region=None, active_window=False): + """Take screenshot on Linux using scrot or import.""" + for tool in ["scrot", "import"]: + if subprocess.run(["which", tool], capture_output=True).returncode == 0: + if tool == "scrot": + cmd 
= ["scrot", output_path] + if active_window: + cmd = ["scrot", "-u", output_path] + elif region: + x, y, w, h = region.split(",") + cmd = ["scrot", "-a", f"{x},{y},{w},{h}", output_path] + else: # import (ImageMagick) + cmd = ["import", "-window", "root", output_path] + if active_window: + cmd = ["import", output_path] # Interactive + + subprocess.run(cmd, capture_output=True, timeout=10) + return os.path.exists(output_path) + + # Fallback: pyautogui + try: + import pyautogui + img = pyautogui.screenshot(region=tuple(map(int, region.split(","))) if region else None) + img.save(output_path) + return True + except Exception: + return False + + +def main(): + parser = argparse.ArgumentParser(description="Capture screen, return path + Retina metadata") + parser.add_argument("--region", metavar="X,Y,W,H", help="Capture region (x,y,width,height)") + parser.add_argument("--active-window", action="store_true", help="Capture active window only") + parser.add_argument("--output", "-o", metavar="PATH", help="Custom output file path") + + args = parser.parse_args() + + # Generate output path + if args.output: + output_path = args.output + else: + timestamp = int(time.time()) + output_path = os.path.join(tempfile.gettempdir(), f"oi_screenshot_{timestamp}.png") + + # Take screenshot + system = platform.system() + if system == "Darwin": + ok = screenshot_macos(output_path, region=args.region, active_window=args.active_window) + elif system == "Linux": + ok = screenshot_linux(output_path, region=args.region, active_window=args.active_window) + else: + # Fallback: pyautogui + try: + import pyautogui + region_tuple = tuple(map(int, args.region.split(","))) if args.region else None + img = pyautogui.screenshot(region=region_tuple) + img.save(output_path) + ok = True + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + ok = False + + if not ok or not os.path.exists(output_path): + print("Error: screenshot failed", file=sys.stderr) + sys.exit(1) + + # Get metadata + scale_factor = get_scale_factor() + try: + import pyautogui + screen = pyautogui.size() + screen_size = f"{screen.width}x{screen.height}" + except Exception: + screen_size = "unknown" + + # Output: path + metadata + print(output_path) + print(f"SCALE_FACTOR={scale_factor}") + print(f"SCREEN_SIZE={screen_size}") + + print(f"[oi] screenshot saved: {output_path} (scale={scale_factor}, screen={screen_size})", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_type.py b/.claude/skills/open-interpreter/scripts/oi_type.py new file mode 100755 index 0000000000..38a4844335 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_type.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +oi_type.py -- Keyboard input: text, keys, and hotkeys. + +Usage: + oi_type.py --text "hello world" # Clipboard-paste (fast, unicode-safe) + oi_type.py --text "search" --method typewrite # Character-by-character + oi_type.py --key enter # Single key press + oi_type.py --key tab # Tab key + oi_type.py --hotkey command space # Hotkey combo (AppleScript on macOS) + oi_type.py --hotkey command shift 3 # Multi-modifier hotkey +""" + +import argparse +import json +import platform +import subprocess +import sys + + +def paste_text(text): + """Type text via clipboard-paste (Cmd+V on macOS, Ctrl+V elsewhere). 
diff --git a/.claude/skills/open-interpreter/scripts/oi_type.py b/.claude/skills/open-interpreter/scripts/oi_type.py
new file mode 100755
index 0000000000..38a4844335
--- /dev/null
+++ b/.claude/skills/open-interpreter/scripts/oi_type.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""
+oi_type.py -- Keyboard input: text, keys, and hotkeys.
+
+Usage:
+  oi_type.py --text "hello world"               # Clipboard-paste (fast, Unicode-safe)
+  oi_type.py --text "search" --method typewrite # Character-by-character
+  oi_type.py --key enter                        # Single key press
+  oi_type.py --key tab                          # Tab key
+  oi_type.py --hotkey command space             # Hotkey combo (AppleScript on macOS)
+  oi_type.py --hotkey command shift 3           # Multi-modifier hotkey
+"""
+
+import argparse
+import json
+import platform
+import subprocess
+import sys
+import time
+
+
+def paste_text(text):
+    """Type text via clipboard-paste (Cmd+V on macOS, Ctrl+V elsewhere).
+
+    Faster and Unicode-safe compared to character-by-character typing."""
+    import pyperclip
+    import pyautogui
+
+    # Save current clipboard
+    try:
+        old_clipboard = pyperclip.paste()
+    except Exception:
+        old_clipboard = None
+
+    # Copy text to clipboard and paste
+    pyperclip.copy(text)
+    if platform.system() == "Darwin":
+        pyautogui.hotkey("command", "v")
+    else:
+        pyautogui.hotkey("ctrl", "v")
+
+    # Restore clipboard after a brief delay so the paste lands first
+    time.sleep(0.1)
+    if old_clipboard is not None:
+        try:
+            pyperclip.copy(old_clipboard)
+        except Exception:
+            pass
+
+
+def typewrite_text(text, interval=0.02):
+    """Type text character-by-character. Slower but doesn't use clipboard."""
+    import pyautogui
+    pyautogui.typewrite(text, interval=interval)
+
+
+def press_key(key):
+    """Press a single key."""
+    import pyautogui
+    pyautogui.press(key)
+
+
+def hotkey_applescript(*keys):
+    """Execute hotkey via AppleScript (macOS). More reliable for modifier keys."""
+    # Map modifier names to AppleScript "using" clauses
+    modifier_map = {
+        "command": "command down",
+        "cmd": "command down",
+        "shift": "shift down",
+        "option": "option down",
+        "alt": "option down",
+        "control": "control down",
+        "ctrl": "control down",
+    }
+
+    modifiers = []
+    key_char = None
+
+    for k in keys:
+        k_lower = k.lower()
+        if k_lower in modifier_map:
+            modifiers.append(modifier_map[k_lower])
+        else:
+            key_char = k_lower
+
+    if key_char is None:
+        # All modifiers, no key — just press the last modifier as a key
+        key_char = keys[-1].lower()
+        modifiers = modifiers[:-1]
+
+    # Map special key names to AppleScript key codes
+    key_code_map = {
+        "space": 49, "return": 36, "enter": 36, "tab": 48,
+        "escape": 53, "esc": 53, "delete": 51, "backspace": 51,
+        "up": 126, "down": 125, "left": 123, "right": 124,
+        "f1": 122, "f2": 120, "f3": 99, "f4": 118,
+        "f5": 96, "f6": 97, "f7": 98, "f8": 100,
+    }
+
+    modifier_str = ", ".join(modifiers) if modifiers else ""
+
+    if key_char in key_code_map:
+        code = key_code_map[key_char]
+        if modifier_str:
+            script = f'tell application "System Events" to key code {code} using {{{modifier_str}}}'
+        else:
+            script = f'tell application "System Events" to key code {code}'
+    else:
+        # Single character — sanitize to prevent AppleScript injection
+        if len(key_char) != 1 or key_char in ('"', '\\'):
+            print(f"Error: invalid key character for keystroke: {repr(key_char)}", file=sys.stderr)
+            sys.exit(1)
+        if modifier_str:
+            script = f'tell application "System Events" to keystroke "{key_char}" using {{{modifier_str}}}'
+        else:
+            script = f'tell application "System Events" to keystroke "{key_char}"'
+
+    subprocess.run(["osascript", "-e", script], capture_output=True, timeout=5)
+
+
+def hotkey_pyautogui(*keys):
+    """Execute hotkey via pyautogui (cross-platform fallback)."""
+    import pyautogui
+    pyautogui.hotkey(*keys)
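+
+
+# Worked example (derived from the maps above): on macOS,
+#
+#     oi_type.py --hotkey command space
+#
+# builds and runs the AppleScript
+#
+#     tell application "System Events" to key code 49 using {command down}
+#
+# On other platforms the same arguments fall through to
+# pyautogui.hotkey("command", "space").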
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Keyboard input: text, keys, and hotkeys")
+    parser.add_argument("--text", metavar="TEXT", help="Text to type")
+    parser.add_argument("--key", metavar="KEY", help="Single key to press (enter, tab, escape, etc.)")
+    parser.add_argument("--hotkey", nargs="+", metavar="KEY", help="Hotkey combination (e.g., command space)")
+    parser.add_argument("--method", choices=["paste", "typewrite"], default="paste",
+                        help="Text input method: paste (clipboard, default) or typewrite (character-by-character)")
+
+    args = parser.parse_args()
+
+    if not any([args.text, args.key, args.hotkey]):
+        parser.error("Provide one of --text, --key, or --hotkey")
+
+    try:
+        if args.text:
+            if args.method == "typewrite":
+                typewrite_text(args.text)
+                print(f"[oi] typewrite: {repr(args.text)}", file=sys.stderr)
+            else:
+                paste_text(args.text)
+                print(f"[oi] paste: {repr(args.text)}", file=sys.stderr)
+            print(json.dumps({"action": "type", "text": args.text, "method": args.method}))
+
+        elif args.key:
+            press_key(args.key)
+            print(f"[oi] key: {args.key}", file=sys.stderr)
+            print(json.dumps({"action": "key", "key": args.key}))
+
+        elif args.hotkey:
+            if platform.system() == "Darwin":
+                hotkey_applescript(*args.hotkey)
+            else:
+                hotkey_pyautogui(*args.hotkey)
+            combo = "+".join(args.hotkey)
+            print(f"[oi] hotkey: {combo}", file=sys.stderr)
+            print(json.dumps({"action": "hotkey", "keys": args.hotkey}))
+
+    except ImportError as e:
+        print(f"Error: Missing dependency: {e}\n"
+              "Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh",
+              file=sys.stderr)
+        sys.exit(1)
+    except KeyboardInterrupt:
+        sys.exit(130)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
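+
+# Example stdout lines, mirroring the json.dumps calls above (callers in the
+# skill are assumed to parse these one-line JSON records):
+#
+#     {"action": "type", "text": "hello world", "method": "paste"}
+#     {"action": "key", "key": "enter"}
+#     {"action": "hotkey", "keys": ["command", "space"]}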