diff --git a/.claude/skills/open-interpreter/README.md b/.claude/skills/open-interpreter/README.md new file mode 100644 index 0000000000..6118cad2fb --- /dev/null +++ b/.claude/skills/open-interpreter/README.md @@ -0,0 +1,193 @@ +# open-interpreter — Claude Code Skill + +A [Claude Code skill](https://code.claude.com/docs/en/skills) for desktop GUI automation, built on top of Open Interpreter's Computer API. Provides mouse, keyboard, screenshot, and OCR control for native macOS/Linux applications that have no CLI or API. + +## What is this? + +[Claude Code](https://github.com/anthropics/claude-code) is Anthropic's terminal-based AI coding tool. It reads `.claude/skills/` directories for specialized capabilities. This skill gives Claude Code the ability to interact with desktop GUIs by wrapping Open Interpreter's pyautogui + pytesseract primitives in standalone scripts. + +## When to Use + +- Interacting with desktop apps (System Preferences, Calculator, browsers, any GUI) +- Automating GUI workflows (form filling, menu navigation, data extraction) +- Reading screen content via OCR (finding buttons, labels, prices, status text) +- Controlling mouse and keyboard programmatically + +## Modes + +| Mode | LLM | Script | Best For | +|------|-----|--------|----------| +| **Library** | Claude Code (native) | Individual scripts | Surgical GUI actions — Claude sees screenshots, reasons, dispatches | +| **OS subprocess** | Claude API (via OI) | `oi_os_mode.py` | Delegating entire GUI tasks to OI's agent loop | +| **Local agent** | Ollama (offline) | `oi_os_mode.py --local` | Offline computer use, no API costs | + +Use Library mode by default. OS subprocess for self-contained GUI tasks. Local agent when offline. + +## Prerequisites + +- Python 3.10+ +- [uv](https://github.com/astral-sh/uv) package manager +- macOS: Accessibility + Screen Recording permissions for terminal app +- tesseract (`brew install tesseract`) + +## Installation + +To use this skill, copy the folder into your Claude Code skills directory: + +```bash +cp -r .claude/skills/open-interpreter ~/.claude/skills/open-interpreter +``` + +Then run the install script: + +```bash +~/.claude/skills/open-interpreter/scripts/oi_install.sh +``` + +Verify permissions: + +```bash +python3 ~/.claude/skills/open-interpreter/scripts/oi_permission_check.py +``` + +## Directory Structure + +``` +open-interpreter/ +├── SKILL.md # Skill instructions for Claude Code +├── README.md # This file +├── scripts/ +│ ├── oi_install.sh # One-shot install + permissions check +│ ├── oi_screenshot.py # Screen capture with Retina metadata +│ ├── oi_click.py # Mouse click by coordinates or OCR text +│ ├── oi_type.py # Keyboard input, hotkeys, key presses +│ ├── oi_find_text.py # OCR: find text on screen → JSON coords +│ ├── oi_computer.py # Unified dispatch for all actions +│ ├── oi_os_mode.py # Launch OI as managed subprocess +│ └── oi_permission_check.py # Check macOS permissions +└── references/ + ├── computer-api.md # OI Computer API reference + ├── os-mode.md # OS Mode usage and architecture + └── safety-and-permissions.md # Permissions guide and safety model +``` + +## Scripts + +### oi_screenshot.py — Screen capture + +```bash +python3 scripts/oi_screenshot.py # Full screen +python3 scripts/oi_screenshot.py --region 0,0,800,600 # Region +python3 scripts/oi_screenshot.py --active-window # Active window only +``` + +Outputs file path + `SCALE_FACTOR` + `SCREEN_SIZE` metadata (3 lines to stdout). 
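+
+For scripting, the three output lines can be captured and split in shell; a minimal sketch (variable names are illustrative):
+
+```bash
+out=$(python3 scripts/oi_screenshot.py)
+img=$(echo "$out" | sed -n 1p)                  # PNG path
+scale=$(echo "$out" | sed -n 2p | cut -d= -f2)  # e.g. 2 on Retina
+echo "screenshot: $img (scale ${scale}x)"
+```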
+ +### oi_click.py — Mouse click + +```bash +python3 scripts/oi_click.py --x 450 --y 300 # Coordinate click +python3 scripts/oi_click.py --x 900 --y 600 --image-coords # Auto-divide by Retina scale +python3 scripts/oi_click.py --text "Submit" # OCR: find and click text +python3 scripts/oi_click.py --x 450 --y 300 --double # Double click +python3 scripts/oi_click.py --x 450 --y 300 --right # Right click +``` + +### oi_type.py — Keyboard input + +```bash +python3 scripts/oi_type.py --text "hello world" # Clipboard-paste (default) +python3 scripts/oi_type.py --key enter # Single key press +python3 scripts/oi_type.py --hotkey command space # Hotkey (AppleScript on macOS) +python3 scripts/oi_type.py --text "search" --method typewrite # Character-by-character +``` + +### oi_find_text.py — OCR screen reading + +```bash +python3 scripts/oi_find_text.py --text "Submit" +python3 scripts/oi_find_text.py --text "Price" --all --min-conf 80 +``` + +Returns JSON: `[{"text": "Submit", "x": 450, "y": 300, "w": 80, "h": 24, "confidence": 95}]` + +### oi_computer.py — Unified dispatch + +```bash +python3 scripts/oi_computer.py screenshot +python3 scripts/oi_computer.py click --x 450 --y 300 +python3 scripts/oi_computer.py type --text "hello" +python3 scripts/oi_computer.py find --text "Submit" +python3 scripts/oi_computer.py scroll --clicks 3 +python3 scripts/oi_computer.py mouse-position +python3 scripts/oi_computer.py screen-size +``` + +### oi_os_mode.py — Delegate full GUI tasks + +```bash +python3 scripts/oi_os_mode.py "Open Calculator and compute 2+2" +python3 scripts/oi_os_mode.py --local "What apps are open?" # Ollama (offline) +``` + +## Quick Examples + +### Open an app via Spotlight + +```bash +python3 scripts/oi_type.py --hotkey command space +sleep 0.5 +python3 scripts/oi_type.py --text "Calculator" +sleep 0.3 +python3 scripts/oi_type.py --key enter +``` + +### Click a button by label + +```bash +python3 scripts/oi_click.py --text "Save" +``` + +### Read text from screen + +```bash +python3 scripts/oi_find_text.py --text "Total" --all +``` + +### Fill a form + +```bash +python3 scripts/oi_click.py --text "Email" +python3 scripts/oi_type.py --text "user@example.com" +python3 scripts/oi_type.py --key tab +python3 scripts/oi_type.py --text "password123" +``` + +## Retina Display Handling + +macOS Retina displays render at 2x scaling. Screenshot image pixels differ from pyautogui screen coordinates. Use `--image-coords` on `oi_click.py` to auto-divide coordinates by the scale factor when targeting positions from screenshot pixels. + +## Safety + +1. Confirm with user before clicking Send, Delete, Submit, or Confirm buttons +2. Screenshot before and after every action for verification +3. No unbounded autonomous loops +4. pyautogui failsafe: moving mouse to screen corner raises exception +5. 
Every script logs actions to stderr: `[oi] click at (450, 300) button=left`
+
+## Troubleshooting
+
+| Symptom | Fix |
+|---------|-----|
+| Black screenshot | Grant Screen Recording permission to terminal app |
+| Click/type no effect | Grant Accessibility permission to terminal app |
+| OCR finds no text | Verify tesseract: `which tesseract && tesseract --version` |
+| Coordinates off by 2x | Use `--image-coords` flag on `oi_click.py` |
+| OS Mode hangs | Verify `ANTHROPIC_API_KEY` is set |
+| Local mode fails | Verify Ollama running: `ollama list` |
+
+## Credits
+
+- [OpenInterpreter](https://github.com/OpenInterpreter/open-interpreter) by Killian Lucas — the foundation this skill builds on
+- [Claudicle](https://github.com/tdimino/claudicle) by Tom di Mino — open-source soul agent framework, LLM-agnostic at the cognitive level
+- Built as a [Claude Code skill](https://code.claude.com/docs/en/skills) following the [Agent Skills](https://agentskills.io/) open standard
diff --git a/.claude/skills/open-interpreter/SKILL.md b/.claude/skills/open-interpreter/SKILL.md
new file mode 100644
index 0000000000..e5e22477f4
--- /dev/null
+++ b/.claude/skills/open-interpreter/SKILL.md
@@ -0,0 +1,219 @@
+---
+name: open-interpreter
+description: Desktop GUI automation via OpenInterpreter — mouse, keyboard, screenshot,
+  and OCR control for native macOS/Linux applications. Three modes are Library (Claude
+  reasons, OI executes), OS subprocess (full autonomous computer use), and Local agent
+  (Ollama, offline). This skill should be used when interacting with desktop apps that
+  have no CLI or API, automating GUI workflows, reading screen content via OCR, or
+  controlling mouse/keyboard.
+---
+
+# OpenInterpreter — Desktop GUI Automation
+
+Desktop control for Claude Code via [OpenInterpreter](https://github.com/OpenInterpreter/open-interpreter) (62k stars, AGPL-3.0). Mouse, keyboard, screenshot, and OCR primitives backed by pyautogui + pytesseract.
+
+## Mode Selection
+
+| Mode | LLM | Script | Best For |
+|------|-----|--------|----------|
+| **Library** | Claude Code (native) | Individual scripts below | Surgical GUI actions — Claude sees screenshots, reasons, dispatches actions |
+| **OS subprocess** | Claude API (via OI) | `oi_os_mode.py` | Full autonomous computer use — delegate entire GUI tasks |
+| **Local agent** | Ollama (offline) | `oi_os_mode.py --local` | Offline computer use, no API costs, privacy-sensitive tasks |
+
+Use Library mode by default. Use OS subprocess to delegate self-contained GUI tasks. Use Local agent when offline or to avoid API costs.
+
+## Installation
+
+Run once:
+
+```bash
+.claude/skills/open-interpreter/scripts/oi_install.sh
+```
+
+Installs `open-interpreter[os]` via uv, verifies pyautogui and tesseract, and checks macOS permissions.
+
+**macOS permissions** (one-time, manual):
+- System Settings > Privacy & Security > **Accessibility** > add terminal app (Ghostty/Terminal/iTerm2)
+- System Settings > Privacy & Security > **Screen Recording** > add terminal app
+
+Verify permissions:
+
+```bash
+python3 .claude/skills/open-interpreter/scripts/oi_permission_check.py
+```
+
+## Library Mode: The Screenshot Loop
+
+The core pattern for GUI automation:
+
+```
+1. Take screenshot → oi_screenshot.py
+2. Read PNG → Claude Read tool (native vision)
+3. Decide action → Claude reasoning
+4. Execute action → oi_click.py / oi_type.py
+5. Verify → Take another screenshot
+6.
Loop until done +``` + +### Scripts + +**`oi_screenshot.py`** — Capture screen, return file path with Retina metadata + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_screenshot.py +python3 .claude/skills/open-interpreter/scripts/oi_screenshot.py --region 0,0,800,600 +python3 .claude/skills/open-interpreter/scripts/oi_screenshot.py --active-window +``` + +Output (3 lines): +``` +/tmp/oi_screenshot_1708789200.png +SCALE_FACTOR=2 +SCREEN_SIZE=1512x982 +``` + +**`oi_click.py`** — Mouse click by coordinates or OCR text + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 450 --y 300 +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 900 --y 600 --image-coords +python3 .claude/skills/open-interpreter/scripts/oi_click.py --text "Submit" +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 450 --y 300 --double +python3 .claude/skills/open-interpreter/scripts/oi_click.py --x 450 --y 300 --right +``` + +- `--image-coords`: auto-divides by Retina scale factor (use when coordinates come from screenshot image pixels) +- `--text`: OCR-based — screenshots, finds text via pytesseract, clicks center of match + +**`oi_type.py`** — Keyboard input + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_type.py --text "hello world" +python3 .claude/skills/open-interpreter/scripts/oi_type.py --key enter +python3 .claude/skills/open-interpreter/scripts/oi_type.py --hotkey command space +python3 .claude/skills/open-interpreter/scripts/oi_type.py --text "search" --method typewrite +``` + +- Default text input: clipboard-paste (Cmd+V) for speed and Unicode safety +- `--method typewrite`: character-by-character (use when clipboard is needed for other purposes) +- `--hotkey`: AppleScript on macOS for reliable modifier key handling + +**`oi_find_text.py`** — OCR screen reading + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_find_text.py --text "Submit" +python3 .claude/skills/open-interpreter/scripts/oi_find_text.py --text "Price" --screenshot /tmp/ss.png +``` + +Returns JSON array: `[{"text": "Submit", "x": 450, "y": 300, "w": 80, "h": 24, "confidence": 95}]` + +**`oi_computer.py`** — Unified dispatch for all actions + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_computer.py screenshot +python3 .claude/skills/open-interpreter/scripts/oi_computer.py click --x 450 --y 300 +python3 .claude/skills/open-interpreter/scripts/oi_computer.py type --text "hello" +python3 .claude/skills/open-interpreter/scripts/oi_computer.py find --text "Submit" +python3 .claude/skills/open-interpreter/scripts/oi_computer.py scroll --clicks 3 +python3 .claude/skills/open-interpreter/scripts/oi_computer.py mouse-position +python3 .claude/skills/open-interpreter/scripts/oi_computer.py screen-size +``` + +### Retina Display Handling + +macOS Retina displays render at 2x (or 3x) scaling. Screenshot image pixels differ from screen coordinates: + +| Metric | Example (14" MBP) | +|--------|-------------------| +| Image pixels (screenshot) | 3024 x 1964 | +| Screen coordinates (pyautogui) | 1512 x 982 | +| Scale factor | 2x | + +When estimating click targets from a screenshot image, use `--image-coords` on `oi_click.py` to auto-divide by the scale factor. The `oi_screenshot.py` output includes `SCALE_FACTOR` metadata. 
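+
+As a sketch, the same conversion can be done by hand; `--image-coords` performs this integer division internally:
+
+```bash
+scale=$(python3 .claude/skills/open-interpreter/scripts/oi_screenshot.py | sed -n 2p | cut -d= -f2)
+# Image pixel (900, 600) → screen (450, 300) at scale=2
+python3 .claude/skills/open-interpreter/scripts/oi_click.py --x $((900 / scale)) --y $((600 / scale))
+```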
+ +## OS Mode: Delegate Full Tasks + +For self-contained GUI tasks, delegate to OI's full agent loop: + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_os_mode.py "Open Calculator and compute 2+2" +python3 .claude/skills/open-interpreter/scripts/oi_os_mode.py --provider anthropic "Change the desktop wallpaper" +``` + +OI runs its own screenshot → analyze → act loop using the Claude API. Requires `ANTHROPIC_API_KEY`. + +## Local Mode: Offline Computer Use + +Run OI with a local vision model via Ollama: + +```bash +python3 .claude/skills/open-interpreter/scripts/oi_os_mode.py --local "What apps are open?" +``` + +Prerequisites: +1. Ollama running: `ollama serve` +2. Vision model pulled: `ollama pull llama3.2-vision` + +Limitation: Local models use OI's classic code-execution mode, not the screenshot-driven OS Mode (which requires Claude 3.5 Sonnet). Local mode generates and executes code to accomplish GUI tasks rather than using pixel-level screenshot analysis. + +## Common Recipes + +### Open an App via Spotlight + +```bash +python3 scripts/oi_type.py --hotkey command space +sleep 0.5 +python3 scripts/oi_type.py --text "Calculator" +sleep 0.3 +python3 scripts/oi_type.py --key enter +``` + +### Read Text from Screen + +```bash +python3 scripts/oi_screenshot.py > /tmp/ss_meta.txt +python3 scripts/oi_find_text.py --text "Total" --screenshot "$(head -1 /tmp/ss_meta.txt)" +``` + +### Click a Button by Label + +```bash +python3 scripts/oi_click.py --text "Save" +``` + +### Fill a Form Field + +```bash +python3 scripts/oi_click.py --text "Email" +python3 scripts/oi_type.py --text "user@example.com" +python3 scripts/oi_type.py --key tab +python3 scripts/oi_type.py --text "password123" +``` + +## Safety + +1. **Confirm before destructive actions** — before clicking Send, Delete, Submit, or Confirm buttons, verify with the user +2. **Screenshot before and after** every action for verification +3. **No unbounded autonomous loops** — confirm with user between multi-step GUI workflows +4. **pyautogui failsafe** — moving mouse to any screen corner raises `pyautogui.FailSafeException` (enabled by default) +5. 
**Action logging** — every script logs actions to stderr: `[oi] click at (450, 300) button=left` + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| `oi_screenshot.py` returns black image | Grant Screen Recording permission to terminal app | +| `oi_click.py` / `oi_type.py` no effect | Grant Accessibility permission to terminal app | +| OCR finds no text | Verify tesseract: `which tesseract && tesseract --version` | +| Retina coordinates off by 2x | Use `--image-coords` flag on `oi_click.py` | +| `oi_find_text.py` low confidence | Try larger text, ensure screen is not obstructed | +| OS Mode hangs | Verify `ANTHROPIC_API_KEY` is set, check OI stderr output | +| Local mode fails | Verify Ollama running (`ollama list`) and model pulled | + +## Reference Documentation + +| File | Contents | +|------|----------| +| `references/computer-api.md` | OI Computer API reference — mouse, keyboard, display, clipboard | +| `references/os-mode.md` | OS Mode usage, provider configuration, agent loop architecture | +| `references/safety-and-permissions.md` | macOS permissions guide, safety model, failsafe configuration | diff --git a/.claude/skills/open-interpreter/references/computer-api.md b/.claude/skills/open-interpreter/references/computer-api.md new file mode 100644 index 0000000000..d1597864ea --- /dev/null +++ b/.claude/skills/open-interpreter/references/computer-api.md @@ -0,0 +1,115 @@ +# OpenInterpreter Computer API Reference + +## Overview + +OpenInterpreter's Computer API (`interpreter/core/computer/`) provides programmatic access to desktop automation primitives. The skill wraps these via standalone scripts for use with Claude Code. + +## Script Reference + +### oi_screenshot.py + +Captures the screen using `screencapture` (macOS) or `scrot`/pyautogui (Linux). + +| Flag | Description | +|------|-------------| +| `--region X,Y,W,H` | Capture region only | +| `--active-window` | Capture frontmost window | +| `--output PATH` | Custom output path (default: `/tmp/oi_screenshot_TIMESTAMP.png`) | + +**Output** (3 lines to stdout): +``` +/tmp/oi_screenshot_1708789200.png +SCALE_FACTOR=2 +SCREEN_SIZE=1512x982 +``` + +### oi_click.py + +Performs mouse clicks via pyautogui. Two modes: coordinate and OCR text. + +| Flag | Description | +|------|-------------| +| `--x N --y N` | Click at screen coordinates | +| `--text "label"` | Find text via OCR, click center | +| `--image-coords` | Divide coords by Retina scale factor | +| `--double` | Double click | +| `--right` | Right click | +| `--clicks N` | Number of clicks (default: 1) | + +**Output**: JSON object to stdout, action log to stderr. + +### oi_type.py + +Keyboard input: text, single keys, and hotkey combos. + +| Flag | Description | +|------|-------------| +| `--text "string"` | Type text (default: clipboard paste) | +| `--key NAME` | Press single key (enter, tab, escape, etc.) | +| `--hotkey KEY KEY...` | Hotkey combo (e.g., command space) | +| `--method paste\|typewrite` | Text input method (default: paste) | + +**Text methods**: +- `paste` (default): Copy to clipboard, Cmd+V. Fast, Unicode-safe. +- `typewrite`: Character-by-character. Slower, but doesn't touch clipboard. + +**macOS hotkeys**: Uses AppleScript (`osascript`) for reliable modifier key handling. Key names: command, shift, option, control, plus key codes for special keys (space, enter, tab, escape, F1-F8, arrow keys). + +### oi_find_text.py + +OCR screen reading via pytesseract. 
+ +| Flag | Description | +|------|-------------| +| `--text "string"` | Text to search for (required) | +| `--screenshot PATH` | Use existing screenshot | +| `--all` | Return all matches, not just best | +| `--min-conf N` | Minimum confidence threshold (0-100) | + +**Output**: JSON array to stdout: +```json +[{"text": "Submit", "x": 450, "y": 300, "w": 80, "h": 24, "confidence": 95}] +``` + +Coordinates are in screen space (divided by Retina scale). + +### oi_computer.py + +Unified dispatch. Routes to the appropriate script. + +| Subcommand | Equivalent | +|------------|------------| +| `screenshot [args]` | `oi_screenshot.py [args]` | +| `click [args]` | `oi_click.py [args]` | +| `type [args]` | `oi_type.py [args]` | +| `find [args]` | `oi_find_text.py [args]` | +| `scroll --clicks N` | pyautogui.scroll() | +| `mouse-position` | Returns `{"x": N, "y": N}` | +| `screen-size` | Returns `{"width": N, "height": N}` | + +## Retina Coordinate Handling + +On macOS Retina displays, screenshot image pixels differ from pyautogui screen coordinates: + +| | Image Pixels | Screen Coordinates | +|--|-------------|-------------------| +| 14" MBP | 3024 x 1964 | 1512 x 982 | +| Scale factor | 2x | 1x (pyautogui native) | + +When Claude reads a screenshot and estimates a click target at pixel (900, 600) in the image: +- Without `--image-coords`: clicks at screen position (900, 600) — wrong +- With `--image-coords`: divides by 2, clicks at screen position (450, 300) — correct + +The `oi_screenshot.py` SCALE_FACTOR output enables this conversion. + +## Dependencies + +| Package | Purpose | Install | +|---------|---------|---------| +| pyautogui | Mouse/keyboard control | `uv pip install pyautogui` | +| pytesseract | OCR text detection | `uv pip install pytesseract` | +| Pillow | Image processing | `uv pip install Pillow` | +| pyperclip | Clipboard access | `uv pip install pyperclip` | +| tesseract | OCR engine (CLI) | `brew install tesseract` | + +All installed by `oi_install.sh`. diff --git a/.claude/skills/open-interpreter/references/os-mode.md b/.claude/skills/open-interpreter/references/os-mode.md new file mode 100644 index 0000000000..4e61e3bc41 --- /dev/null +++ b/.claude/skills/open-interpreter/references/os-mode.md @@ -0,0 +1,99 @@ +# OpenInterpreter OS Mode Reference + +## Overview + +OS Mode (`interpreter --os`) is OpenInterpreter's screenshot-driven desktop control system. It runs an autonomous loop: screenshot → Claude API analysis → pyautogui action → repeat. + +## Architecture + +``` +User task → OI agent loop: + 1. Take screenshot (screencapture / pyautogui) + 2. Send screenshot to Claude API (vision) + 3. Claude analyzes: what to do next? + 4. OI executes: click(x,y) / type("text") / hotkey(cmd+space) + 5. Take verification screenshot + 6. Repeat until task complete or max iterations +``` + +## Usage via oi_os_mode.py + +```bash +# Default: Claude API via Anthropic +python3 scripts/oi_os_mode.py "Open Calculator and compute 2+2" + +# Explicit provider +python3 scripts/oi_os_mode.py --provider anthropic "Change wallpaper" + +# Custom timeout (default: 300s) +python3 scripts/oi_os_mode.py --timeout 120 "Fill out the form" +``` + +**Requirements**: +- `ANTHROPIC_API_KEY` environment variable +- macOS Accessibility + Screen Recording permissions + +## Local Mode via Ollama + +```bash +# Local model (code-execution mode, not screenshot-driven) +python3 scripts/oi_os_mode.py --local "What apps are open?" 
+ +# Custom model +python3 scripts/oi_os_mode.py --local --model llama3.2-vision "Describe the screen" + +# Custom Ollama endpoint +python3 scripts/oi_os_mode.py --local --api-base http://192.168.1.100:11434 "List files" +``` + +**Requirements**: +- Ollama running: `ollama serve` +- Vision model: `ollama pull llama3.2-vision` + +**Limitation**: Local models use OI's classic code-execution mode (generates Python/Bash to accomplish tasks). The screenshot-driven OS Mode is hardcoded to Claude 3.5 Sonnet and cannot use local models. + +## When to Use Each Mode + +| Scenario | Mode | +|----------|------| +| Precise GUI action (one click, one type) | Library (oi_click.py, oi_type.py) | +| Multi-step GUI workflow Claude can reason about | Library with screenshot loop | +| Self-contained GUI task, no codebase context | OS subprocess (oi_os_mode.py) | +| Offline / no API costs / privacy | Local (oi_os_mode.py --local) | +| Complex multi-app workflow | OS subprocess | + +## OI CLI Flags Reference + +| Flag | Description | +|------|-------------| +| `--os` | Enable OS Mode (screenshot-driven) | +| `-y` | Auto-approve actions (skip confirmation prompts) | +| `--model NAME` | LLM model to use | +| `--api_base URL` | Custom API endpoint | +| `--local` | Use local model (bundled profile) | +| `--vision` | Enable vision capabilities | +| `--safe_mode ask\|auto\|off` | Safety confirmation level | + +## OI Python API (for advanced integration) + +```python +from interpreter import interpreter + +# OS Mode +interpreter.computer.display.screenshot() # Take screenshot +interpreter.computer.mouse.click(x, y) # Click at coordinates +interpreter.computer.keyboard.write("text") # Type text +interpreter.computer.keyboard.hotkey("command", "space") # Hotkey + +# Classic mode with Ollama +interpreter.llm.model = "ollama/llama3.2-vision" +interpreter.llm.api_base = "http://localhost:11434" +interpreter.auto_run = True +interpreter.chat("What time is it?") +``` + +## Development Status + +OpenInterpreter is in maintenance mode (last release v0.4.2, Oct 2024). The core team pivoted to the 01 App. The codebase is stable and the underlying primitives (pyautogui, pytesseract) are well-maintained independently. + +OS Mode is labeled "highly experimental" in OI's documentation. For production use, prefer Library mode (Claude Code reasons, scripts execute) over the OS subprocess approach. diff --git a/.claude/skills/open-interpreter/references/safety-and-permissions.md b/.claude/skills/open-interpreter/references/safety-and-permissions.md new file mode 100644 index 0000000000..b0e7f5807e --- /dev/null +++ b/.claude/skills/open-interpreter/references/safety-and-permissions.md @@ -0,0 +1,107 @@ +# Safety and Permissions Guide + +## macOS Permissions + +Desktop GUI automation requires two macOS permissions. Both are per-application (grant to your terminal app: Ghostty, Terminal.app, iTerm2, VS Code, etc.). + +### Accessibility + +**What it enables**: Mouse movement, clicks, keyboard input (pyautogui) + +**How to grant**: +1. System Settings > Privacy & Security > Accessibility +2. Click the lock icon to authenticate +3. Click "+" and add your terminal app +4. If already listed, toggle it off and on again + +**Symptoms when missing**: pyautogui operations silently fail or throw "This process is not trusted! Input event monitoring will not be possible until it is added to accessibility clients." + +### Screen Recording + +**What it enables**: Screen capture (screencapture, pyautogui.screenshot) + +**How to grant**: +1. 
System Settings > Privacy & Security > Screen Recording +2. Click the lock icon to authenticate +3. Click "+" and add your terminal app +4. Restart the terminal app after granting + +**Symptoms when missing**: Screenshots are blank (all black or all white), or screencapture produces 0-byte files. + +### Verifying Permissions + +```bash +python3 ~/.claude/skills/open-interpreter/scripts/oi_permission_check.py +``` + +This checks: +- pyautogui can read mouse position (Accessibility) +- screencapture produces a non-empty file (Screen Recording) +- tesseract is installed (OCR support) + +## Safety Model + +### Principles + +1. **Human-in-the-loop for destructive actions**: Before clicking Send, Delete, Submit, Confirm, or Purchase buttons, verify with the user. A misclick on a destructive button cannot be undone. + +2. **Screenshot-verify pattern**: Take a screenshot before and after every action. This provides an audit trail and catches misclicks early. + +3. **No unbounded autonomous loops**: Multi-step GUI workflows should checkpoint with the user. Do not run 50 uninterrupted click sequences without verification. + +4. **pyautogui failsafe**: Moving the mouse to any screen corner (0,0 or max,0 or 0,max or max,max) raises `pyautogui.FailSafeException`, which halts execution. This is enabled by default and should not be disabled. + +5. **Action logging**: Every script logs its actions to stderr with the `[oi]` prefix. This provides a record of what was done. + +### Risk Categories + +| Action | Risk | Mitigation | +|--------|------|-----------| +| Screenshot | None (read-only) | — | +| Find text (OCR) | None (read-only) | — | +| Mouse move | Low | Reversible | +| Click (left) | Medium | Screenshot before, verify target | +| Click (right) | Medium | Context menus are dismissible | +| Type text | Medium | Can undo (Cmd+Z) | +| Hotkey | Medium-High | Some hotkeys trigger irreversible actions | +| Click "Delete"/"Send" | High | Require user confirmation | +| Form submission | High | Require user confirmation | + +### OS Mode Safety + +When using `oi_os_mode.py` (delegated autonomous control), OI runs its own agent loop with `-y` (auto-approve). This means: +- OI will execute actions without asking for confirmation +- The timeout flag provides a hard limit on execution time +- Monitor stderr for OI's action log +- For high-risk tasks, prefer Library mode where Claude Code controls each step + +### pyautogui Failsafe + +```python +import pyautogui + +# Enabled by default — do not disable +pyautogui.FAILSAFE = True # Default: True + +# Pause between actions (seconds) +pyautogui.PAUSE = 0.1 # Default: 0.1 +``` + +To emergency-stop any pyautogui automation, quickly move the mouse to any screen corner. This raises `FailSafeException` and halts the script. + +## tesseract Installation + +tesseract provides OCR for text detection on screen. + +```bash +# macOS +brew install tesseract + +# With additional language packs +brew install tesseract-lang + +# Verify +tesseract --version +``` + +The Python binding (`pytesseract`) is installed by `oi_install.sh`. It requires the tesseract CLI to be available in PATH. diff --git a/.claude/skills/open-interpreter/scripts/oi_click.py b/.claude/skills/open-interpreter/scripts/oi_click.py new file mode 100755 index 0000000000..56cb1bbdf5 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_click.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 +""" +oi_click.py -- Mouse click by coordinates or OCR text. 
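+
+Coordinates are pyautogui screen coordinates; pass --image-coords when they come
+from screenshot image pixels (values are divided by the Retina scale factor).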
+ +Usage: + oi_click.py --x 450 --y 300 # Click at screen coordinates + oi_click.py --x 900 --y 600 --image-coords # Auto-divide by Retina scale + oi_click.py --text "Submit" # OCR: find text on screen, click center + oi_click.py --x 450 --y 300 --double # Double click + oi_click.py --x 450 --y 300 --right # Right click + oi_click.py --x 450 --y 300 --clicks 3 # Triple click +""" + +import argparse +import json +import os +import platform +import subprocess +import sys +import tempfile +import time + + +def get_scale_factor(): + """Detect Retina scale factor.""" + if platform.system() != "Darwin": + return 1 + try: + import pyautogui + screen = pyautogui.size() + tmp = os.path.join(tempfile.gettempdir(), "oi_scale_test.png") + subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5) + if os.path.exists(tmp): + from PIL import Image + img = Image.open(tmp) + img_w = img.width + img.close() + os.unlink(tmp) + factor = round(img_w / screen.width) + return max(1, factor) + except Exception: + pass + return 2 + + +def find_text_on_screen(text): + """Find text on screen using pytesseract OCR. Returns (x, y) center coordinates.""" + import pyautogui + import pytesseract + from PIL import Image + + # Take screenshot + tmp = os.path.join(tempfile.gettempdir(), f"oi_ocr_{int(time.time())}.png") + if platform.system() == "Darwin": + subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5) + else: + img = pyautogui.screenshot() + img.save(tmp) + + if not os.path.exists(tmp): + return None + + img = Image.open(tmp) + scale = get_scale_factor() + + # Run OCR with bounding box data + data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT) + img.close() + os.unlink(tmp) + + # Search for matching text + best_match = None + best_conf = -1 + text_lower = text.lower() + + n = len(data["text"]) + for i in range(n): + word = data["text"][i].strip() + if not word: + continue + conf = int(data["conf"][i]) if data["conf"][i] != "-1" else 0 + if text_lower in word.lower() and conf > best_conf: + # Center of bounding box in image space, then convert to screen coordinates + x = int((data["left"][i] + data["width"][i] / 2) / scale) + y = int((data["top"][i] + data["height"][i] / 2) / scale) + best_match = (x, y) + best_conf = conf + + # Also try matching across consecutive words + if best_match is None and " " in text: + words = text_lower.split() + for i in range(n - len(words) + 1): + segment = " ".join(data["text"][i:i + len(words)]).strip().lower() + if text_lower in segment: + # Span from first to last word + x1 = data["left"][i] + y1 = data["top"][i] + last = i + len(words) - 1 + x2 = data["left"][last] + data["width"][last] + y2 = data["top"][last] + data["height"][last] + x = int((x1 + x2) / 2 / scale) + y = int((y1 + y2) / 2 / scale) + best_match = (x, y) + break + + return best_match + + +def main(): + parser = argparse.ArgumentParser(description="Click at coordinates or OCR text location") + parser.add_argument("--x", type=int, help="X coordinate") + parser.add_argument("--y", type=int, help="Y coordinate") + parser.add_argument("--text", metavar="TEXT", help="Find text on screen via OCR and click its center") + parser.add_argument("--image-coords", action="store_true", + help="Divide coordinates by Retina scale factor (use when coords come from screenshot pixels)") + parser.add_argument("--double", action="store_true", help="Double click") + parser.add_argument("--right", action="store_true", help="Right click") + 
parser.add_argument("--clicks", type=int, default=1, help="Number of clicks (default: 1)") + + args = parser.parse_args() + + if not args.text and (args.x is None or args.y is None): + parser.error("Provide either --text or both --x and --y") + + try: + import pyautogui + except ImportError: + print("Error: pyautogui not installed. Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + + if args.text: + # OCR mode + result = find_text_on_screen(args.text) + if result is None: + print(f"Error: text '{args.text}' not found on screen", file=sys.stderr) + sys.exit(1) + x, y = result + print(f"[oi] found '{args.text}' at ({x}, {y})", file=sys.stderr) + else: + x, y = args.x, args.y + if args.image_coords: + scale = get_scale_factor() + x = x // scale + y = y // scale + print(f"[oi] image coords ({args.x}, {args.y}) -> screen coords ({x}, {y}) (scale={scale})", + file=sys.stderr) + + # Perform click + button = "right" if args.right else "left" + clicks = args.clicks if args.clicks > 1 else (2 if args.double else 1) + + pyautogui.click(x, y, clicks=clicks, button=button) + print(f"[oi] click at ({x}, {y}) button={button} clicks={clicks}", file=sys.stderr) + print(json.dumps({"action": "click", "x": x, "y": y, "button": button, "clicks": clicks})) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_computer.py b/.claude/skills/open-interpreter/scripts/oi_computer.py new file mode 100755 index 0000000000..b14a75a237 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_computer.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +""" +oi_computer.py -- Unified dispatch for all desktop automation actions. + +Usage: + oi_computer.py screenshot [--region X,Y,W,H] [--active-window] + oi_computer.py click --x 450 --y 300 [--image-coords] [--double] [--right] + oi_computer.py click --text "Submit" + oi_computer.py type --text "hello world" [--method typewrite] + oi_computer.py type --key enter + oi_computer.py type --hotkey command space + oi_computer.py find --text "Submit" [--all] [--min-conf 80] + oi_computer.py scroll --clicks 3 [--x 450 --y 300] + oi_computer.py mouse-position + oi_computer.py screen-size +""" + +import json +import os +import subprocess +import sys + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def run_script(name, args): + """Run a sibling script and pass through its output.""" + script = os.path.join(SCRIPT_DIR, name) + cmd = [sys.executable, script] + args + result = subprocess.run(cmd, capture_output=False) + return result.returncode + + +def main(): + if len(sys.argv) < 2: + print("Usage: oi_computer.py [args...]", file=sys.stderr) + print("Commands: screenshot, click, type, find, scroll, mouse-position, screen-size", file=sys.stderr) + sys.exit(1) + + command = sys.argv[1] + args = sys.argv[2:] + + if command == "screenshot": + sys.exit(run_script("oi_screenshot.py", args)) + + elif command == "click": + sys.exit(run_script("oi_click.py", args)) + + elif command == "type": + sys.exit(run_script("oi_type.py", args)) + + elif command == "find": + sys.exit(run_script("oi_find_text.py", args)) + + elif command == "scroll": + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--clicks", type=int, default=3, help="Scroll clicks (positive=up, negative=down)") + parser.add_argument("--x", type=int, help="X position to scroll at") + parser.add_argument("--y", type=int, help="Y position to scroll at") + parsed = parser.parse_args(args) + + import pyautogui 
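+        # pyautogui scrolls at the current pointer position; move there first if x/y given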
+ if parsed.x is not None and parsed.y is not None: + pyautogui.moveTo(parsed.x, parsed.y) + pyautogui.scroll(parsed.clicks) + print(f"[oi] scroll clicks={parsed.clicks}", file=sys.stderr) + print(json.dumps({"action": "scroll", "clicks": parsed.clicks})) + + elif command == "mouse-position": + import pyautogui + pos = pyautogui.position() + print(json.dumps({"x": pos.x, "y": pos.y})) + + elif command == "screen-size": + import pyautogui + size = pyautogui.size() + print(json.dumps({"width": size.width, "height": size.height})) + + else: + print(f"Error: unknown command '{command}'", file=sys.stderr) + print("Commands: screenshot, click, type, find, scroll, mouse-position, screen-size", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_find_text.py b/.claude/skills/open-interpreter/scripts/oi_find_text.py new file mode 100755 index 0000000000..4e10288610 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_find_text.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +""" +oi_find_text.py -- OCR screen reading: find text locations on screen. + +Returns JSON array of matches with coordinates (in screen space, not image pixels). + +Usage: + oi_find_text.py --text "Submit" + oi_find_text.py --text "Submit" --screenshot /tmp/screenshot.png + oi_find_text.py --text "Price" --all # Return all matches, not just best + oi_find_text.py --text "File" --min-conf 80 # Minimum confidence threshold +""" + +import argparse +import json +import os +import platform +import subprocess +import sys +import tempfile +import time + + +def get_scale_factor(): + """Detect Retina scale factor.""" + if platform.system() != "Darwin": + return 1 + try: + import pyautogui + screen = pyautogui.size() + tmp = os.path.join(tempfile.gettempdir(), "oi_scale_test.png") + subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5) + if os.path.exists(tmp): + from PIL import Image + img = Image.open(tmp) + img_w = img.width + img.close() + os.unlink(tmp) + factor = round(img_w / screen.width) + return max(1, factor) + except Exception: + pass + return 2 + + +def find_text(text, screenshot_path=None, return_all=False, min_conf=0): + """Find text on screen using pytesseract OCR. + + Returns list of dicts: [{"text": str, "x": int, "y": int, "w": int, "h": int, "confidence": int}] + Coordinates are in screen space (divided by Retina scale factor). 
+ """ + import pytesseract + from PIL import Image + + # Take screenshot if not provided + tmp_created = False + if screenshot_path is None: + screenshot_path = os.path.join(tempfile.gettempdir(), f"oi_ocr_{int(time.time())}.png") + if platform.system() == "Darwin": + subprocess.run(["screencapture", "-x", "-C", screenshot_path], capture_output=True, timeout=5) + else: + import pyautogui + img = pyautogui.screenshot() + img.save(screenshot_path) + tmp_created = True + + if not os.path.exists(screenshot_path): + return [] + + img = Image.open(screenshot_path) + scale = get_scale_factor() + + # Run OCR + data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT) + img.close() + + if tmp_created: + os.unlink(screenshot_path) + + # Find matches + matches = [] + text_lower = text.lower() + n = len(data["text"]) + + # Single-word matches + for i in range(n): + word = data["text"][i].strip() + if not word: + continue + conf = int(data["conf"][i]) if data["conf"][i] != "-1" else 0 + if conf < min_conf: + continue + if text_lower in word.lower(): + matches.append({ + "text": word, + "x": int((data["left"][i] + data["width"][i] / 2) / scale), + "y": int((data["top"][i] + data["height"][i] / 2) / scale), + "w": int(data["width"][i] / scale), + "h": int(data["height"][i] / scale), + "confidence": conf, + }) + + # Multi-word matches + if " " in text: + words = text_lower.split() + for i in range(n - len(words) + 1): + segment = " ".join(data["text"][i:i + len(words)]).strip().lower() + if text_lower in segment: + # Check minimum confidence across span + span_conf = min( + int(data["conf"][j]) if data["conf"][j] != "-1" else 0 + for j in range(i, i + len(words)) + ) + if span_conf < min_conf: + continue + + last = i + len(words) - 1 + x1 = data["left"][i] + y1 = data["top"][i] + x2 = data["left"][last] + data["width"][last] + y2 = data["top"][last] + data["height"][last] + + matches.append({ + "text": " ".join(data["text"][i:i + len(words)]), + "x": int((x1 + x2) / 2 / scale), + "y": int((y1 + y2) / 2 / scale), + "w": int((x2 - x1) / scale), + "h": int((y2 - y1) / scale), + "confidence": span_conf, + }) + + # Sort by confidence descending + matches.sort(key=lambda m: m["confidence"], reverse=True) + + if return_all: + return matches + elif matches: + return [matches[0]] + else: + return [] + + +def main(): + parser = argparse.ArgumentParser(description="Find text on screen via OCR") + parser.add_argument("--text", required=True, help="Text to search for") + parser.add_argument("--screenshot", metavar="PATH", help="Use existing screenshot instead of capturing") + parser.add_argument("--all", action="store_true", help="Return all matches, not just the best") + parser.add_argument("--min-conf", type=int, default=0, help="Minimum OCR confidence threshold (0-100)") + + args = parser.parse_args() + + try: + results = find_text( + args.text, + screenshot_path=args.screenshot, + return_all=args.all, + min_conf=args.min_conf, + ) + + print(json.dumps(results, indent=2)) + + if results: + print(f"[oi] found {len(results)} match(es) for '{args.text}'", file=sys.stderr) + else: + print(f"[oi] no matches for '{args.text}'", file=sys.stderr) + sys.exit(1) + + except ImportError as e: + print(f"Error: Missing dependency: {e}\n" + "Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + except KeyboardInterrupt: + sys.exit(130) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git 
a/.claude/skills/open-interpreter/scripts/oi_install.sh b/.claude/skills/open-interpreter/scripts/oi_install.sh new file mode 100755 index 0000000000..ddcaa52b19 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_install.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# oi_install.sh -- One-shot OpenInterpreter installation and verification +# +# Installs open-interpreter[os] via uv, verifies pyautogui, tesseract, +# and checks macOS permissions. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" + +echo "==> Installing OpenInterpreter with OS mode extras..." +# OI pins tiktoken==0.7.0 which has no prebuilt wheel for Python 3.13+. +# Use --override to force a newer tiktoken that ships prebuilt wheels. +OVERRIDE_FILE=$(mktemp) +echo "tiktoken>=0.8" > "$OVERRIDE_FILE" +uv pip install --system "open-interpreter[os]" --override "$OVERRIDE_FILE" +rm -f "$OVERRIDE_FILE" + +echo "" +echo "==> Verifying OpenInterpreter import..." +python3 -c "from interpreter import interpreter; print(' interpreter: OK')" + +echo "" +echo "==> Verifying pyautogui..." +python3 -c " +import pyautogui +size = pyautogui.size() +print(f' pyautogui: OK (screen: {size.width}x{size.height})') +" + +echo "" +echo "==> Verifying pytesseract..." +python3 -c " +import pytesseract +version = pytesseract.get_tesseract_version() +print(f' pytesseract: OK (tesseract {version})') +" 2>/dev/null || { + echo " pytesseract: MISSING" + echo " Install tesseract: brew install tesseract" +} + +echo "" +echo "==> Checking tesseract CLI..." +if command -v tesseract &>/dev/null; then + echo " tesseract: $(tesseract --version 2>&1 | head -1)" +else + echo " tesseract: NOT FOUND" + echo " Install: brew install tesseract" +fi + +echo "" +echo "==> Checking macOS permissions..." +python3 "$SCRIPT_DIR/oi_permission_check.py" + +echo "" +echo "OpenInterpreter installation complete." +echo "If permissions are missing, add your terminal app in:" +echo " System Settings > Privacy & Security > Accessibility" +echo " System Settings > Privacy & Security > Screen Recording" diff --git a/.claude/skills/open-interpreter/scripts/oi_os_mode.py b/.claude/skills/open-interpreter/scripts/oi_os_mode.py new file mode 100755 index 0000000000..a6bdce6410 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_os_mode.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +oi_os_mode.py -- Launch OpenInterpreter as a managed subprocess for full +autonomous computer use (OS Mode) or local agent mode (Ollama). + +OS Mode: OI runs its own screenshot → Claude API → pyautogui loop. +Local Mode: OI runs in classic code-execution mode with Ollama as backend. + +Usage: + oi_os_mode.py "Open Calculator and compute 2+2" + oi_os_mode.py --provider anthropic "Change the wallpaper" + oi_os_mode.py --local "What apps are open?" 
+ oi_os_mode.py --local --model llama3.2-vision "Describe the screen" + oi_os_mode.py --timeout 120 "Fill out the form" +""" + +import argparse +import os +import subprocess +import sys + + +def find_interpreter(): + """Find the interpreter CLI.""" + import shutil + path = shutil.which("interpreter") + if path: + return path + # Try common locations + for candidate in [ + os.path.expanduser("~/.local/bin/interpreter"), + "/usr/local/bin/interpreter", + ]: + if os.path.exists(candidate): + return candidate + return None + + +def run_os_mode(task, provider="anthropic", timeout=300): + """Run OI in OS Mode (screenshot-driven, Claude API).""" + interpreter_path = find_interpreter() + if not interpreter_path: + print("Error: 'interpreter' CLI not found. Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + + # Check API key + if provider == "anthropic" and not os.environ.get("ANTHROPIC_API_KEY"): + print("Error: ANTHROPIC_API_KEY not set. Required for OS Mode.", file=sys.stderr) + sys.exit(1) + + cmd = [interpreter_path, "--os", "-y"] + + print(f"[oi] OS Mode: provider={provider}, task={repr(task)}", file=sys.stderr) + + try: + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + # Send task and close stdin to signal end of input + stdout, stderr = proc.communicate(input=task + "\n", timeout=timeout) + + if stdout: + print(stdout) + if stderr: + print(stderr, file=sys.stderr) + + return proc.returncode + + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + print(f"Error: OS Mode timed out after {timeout}s", file=sys.stderr) + return 1 + except FileNotFoundError: + print("Error: 'interpreter' CLI not found", file=sys.stderr) + return 1 + + +def run_local_mode(task, model="llama3.2-vision", api_base="http://localhost:11434", timeout=300): + """Run OI in classic mode with Ollama as backend.""" + interpreter_path = find_interpreter() + if not interpreter_path: + print("Error: 'interpreter' CLI not found. Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh", + file=sys.stderr) + sys.exit(1) + + # Verify Ollama is running + try: + import urllib.request + urllib.request.urlopen(f"{api_base}/api/tags", timeout=3) + except Exception: + print(f"Error: Ollama not reachable at {api_base}. 
Start with: ollama serve", file=sys.stderr) + sys.exit(1) + + cmd = [ + interpreter_path, + "--model", f"ollama/{model}", + "--api_base", api_base, + "-y", + ] + + print(f"[oi] Local Mode: model=ollama/{model}, task={repr(task)}", file=sys.stderr) + + try: + proc = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + stdout, stderr = proc.communicate(input=task + "\n", timeout=timeout) + + if stdout: + print(stdout) + if stderr: + print(stderr, file=sys.stderr) + + return proc.returncode + + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + print(f"Error: Local Mode timed out after {timeout}s", file=sys.stderr) + return 1 + + +def main(): + parser = argparse.ArgumentParser( + description="Launch OpenInterpreter for autonomous computer use" + ) + parser.add_argument("task", help="Task description for OI to execute") + parser.add_argument("--local", action="store_true", + help="Use local Ollama model instead of Claude API") + parser.add_argument("--model", default="llama3.2-vision", + help="Ollama model for local mode (default: llama3.2-vision)") + parser.add_argument("--provider", default="anthropic", + help="API provider for OS Mode (default: anthropic). Currently only validates anthropic.") + parser.add_argument("--api-base", default="http://localhost:11434", + help="Ollama API base URL (default: http://localhost:11434)") + parser.add_argument("--timeout", type=int, default=300, + help="Timeout in seconds (default: 300)") + + args = parser.parse_args() + + if args.local: + rc = run_local_mode( + args.task, + model=args.model, + api_base=args.api_base, + timeout=args.timeout, + ) + else: + rc = run_os_mode( + args.task, + provider=args.provider, + timeout=args.timeout, + ) + + sys.exit(rc) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_permission_check.py b/.claude/skills/open-interpreter/scripts/oi_permission_check.py new file mode 100755 index 0000000000..cb99c9f6ac --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_permission_check.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +""" +oi_permission_check.py -- Check macOS permissions for desktop GUI automation. + +Verifies: + 1. Accessibility permission (required for pyautogui mouse/keyboard) + 2. Screen Recording permission (required for screenshots) + 3. 
tesseract installation (required for OCR) + +Usage: + python3 oi_permission_check.py +""" + +import platform +import shutil +import subprocess +import sys + + +def check_accessibility(): + """Check if Accessibility permission is granted (macOS only).""" + if platform.system() != "Darwin": + print(" Accessibility: N/A (not macOS)") + return True + + # Try a minimal pyautogui operation to detect permission + try: + import pyautogui + # position() requires Accessibility on macOS + pos = pyautogui.position() + print(f" Accessibility: OK (mouse at {pos.x}, {pos.y})") + return True + except Exception as e: + err = str(e).lower() + if "accessibility" in err or "permission" in err or "not allowed" in err: + print(" Accessibility: DENIED") + print(" -> System Settings > Privacy & Security > Accessibility > add your terminal app") + return False + # If the error is something else, pyautogui may still work + print(f" Accessibility: UNKNOWN ({e})") + return True + + +def check_screen_recording(): + """Check if Screen Recording permission is granted (macOS only).""" + if platform.system() != "Darwin": + print(" Screen Recording: N/A (not macOS)") + return True + + # Take a test screenshot with screencapture + import tempfile + import os + tmp = os.path.join(tempfile.gettempdir(), "oi_perm_test.png") + try: + result = subprocess.run( + ["screencapture", "-x", "-C", tmp], + capture_output=True, timeout=5 + ) + if os.path.exists(tmp): + size = os.path.getsize(tmp) + os.unlink(tmp) + if size > 100: + print(f" Screen Recording: OK (test screenshot {size} bytes)") + return True + else: + print(" Screen Recording: DENIED (screenshot is empty)") + print(" -> System Settings > Privacy & Security > Screen Recording > add your terminal app") + return False + else: + print(" Screen Recording: DENIED (no screenshot produced)") + print(" -> System Settings > Privacy & Security > Screen Recording > add your terminal app") + return False + except subprocess.TimeoutExpired: + print(" Screen Recording: TIMEOUT (screencapture hung — permission dialog may be showing)") + return False + except FileNotFoundError: + print(" Screen Recording: N/A (screencapture not found)") + return True + finally: + if os.path.exists(tmp): + os.unlink(tmp) + + +def check_tesseract(): + """Check if tesseract OCR is installed.""" + path = shutil.which("tesseract") + if path: + try: + result = subprocess.run( + ["tesseract", "--version"], + capture_output=True, text=True, timeout=5 + ) + version = result.stdout.strip().split("\n")[0] if result.stdout else result.stderr.strip().split("\n")[0] + print(f" tesseract: OK ({version} at {path})") + return True + except Exception: + print(f" tesseract: OK (at {path}, version check failed)") + return True + else: + print(" tesseract: NOT FOUND") + print(" -> Install: brew install tesseract") + return False + + +def check_pyautogui(): + """Check if pyautogui is installed.""" + try: + import pyautogui + size = pyautogui.size() + print(f" pyautogui: OK (screen: {size.width}x{size.height})") + return True + except ImportError: + print(" pyautogui: NOT INSTALLED") + print(" -> Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh") + return False + except Exception as e: + print(f" pyautogui: ERROR ({e})") + return False + + +def main(): + print("OpenInterpreter Permission Check") + print("=" * 40) + + all_ok = True + all_ok &= check_pyautogui() + all_ok &= check_accessibility() + all_ok &= check_screen_recording() + all_ok &= check_tesseract() + + print("=" * 40) + if all_ok: + print("All checks 
passed.") + else: + print("Some checks failed. See instructions above.") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_screenshot.py b/.claude/skills/open-interpreter/scripts/oi_screenshot.py new file mode 100755 index 0000000000..a2296b2ec0 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_screenshot.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +""" +oi_screenshot.py -- Capture screen and return file path with Retina metadata. + +Outputs 3 lines to stdout: + 1. File path to PNG screenshot + 2. SCALE_FACTOR=N (Retina multiplier) + 3. SCREEN_SIZE=WxH (pyautogui coordinates) + +Usage: + oi_screenshot.py # Full screen + oi_screenshot.py --region 0,0,800,600 # Region (x,y,w,h) + oi_screenshot.py --active-window # Active window only (macOS) + oi_screenshot.py --output /tmp/my.png # Custom output path +""" + +import argparse +import os +import platform +import subprocess +import sys +import tempfile +import time + + +def get_scale_factor(): + """Detect Retina scale factor on macOS.""" + if platform.system() != "Darwin": + return 1 + + try: + # Use system_profiler to get display info + result = subprocess.run( + ["system_profiler", "SPDisplaysDataType"], + capture_output=True, text=True, timeout=5 + ) + output = result.stdout + # Look for Resolution line with Retina indicator + for line in output.splitlines(): + if "Retina" in line or "Resolution" in line: + if "Retina" in line: + return 2 + # Fallback: compare screenshot size to pyautogui screen size + return _detect_scale_from_screenshot() + except Exception: + return _detect_scale_from_screenshot() + + +def _detect_scale_from_screenshot(): + """Detect scale factor by comparing screenshot dimensions to screen size.""" + try: + import pyautogui + screen = pyautogui.size() + + # Take a tiny test screenshot + tmp = os.path.join(tempfile.gettempdir(), "oi_scale_test.png") + subprocess.run(["screencapture", "-x", "-C", tmp], capture_output=True, timeout=5) + + if os.path.exists(tmp): + from PIL import Image + img = Image.open(tmp) + img_w = img.width + img.close() + os.unlink(tmp) + + factor = round(img_w / screen.width) + return max(1, factor) + except Exception: + pass + return 2 # Default assumption for modern Macs + + +def screenshot_macos(output_path, region=None, active_window=False): + """Take screenshot on macOS using screencapture.""" + cmd = ["screencapture", "-x", "-C"] + + if active_window: + # Get frontmost window ID via AppleScript + try: + result = subprocess.run( + ["osascript", "-e", + 'tell application "System Events" to get id of first window of (first process whose frontmost is true)'], + capture_output=True, text=True, timeout=5 + ) + window_id = result.stdout.strip() + if window_id and window_id.isdigit(): + cmd.extend(["-l", window_id]) + else: + # Fallback to full-screen capture (never use -w which hangs in automation) + print("[oi] warning: could not get window ID, falling back to full screen", file=sys.stderr) + except Exception: + print("[oi] warning: could not get window ID, falling back to full screen", file=sys.stderr) + + if region: + cmd.extend(["-R", region]) + + cmd.append(output_path) + subprocess.run(cmd, capture_output=True, timeout=10) + return os.path.exists(output_path) + + +def screenshot_linux(output_path, region=None, active_window=False): + """Take screenshot on Linux using scrot or import.""" + for tool in ["scrot", "import"]: + if subprocess.run(["which", tool], capture_output=True).returncode == 0: + if tool == "scrot": + cmd 
= ["scrot", output_path] + if active_window: + cmd = ["scrot", "-u", output_path] + elif region: + x, y, w, h = region.split(",") + cmd = ["scrot", "-a", f"{x},{y},{w},{h}", output_path] + else: # import (ImageMagick) + cmd = ["import", "-window", "root", output_path] + if active_window: + cmd = ["import", output_path] # Interactive + + subprocess.run(cmd, capture_output=True, timeout=10) + return os.path.exists(output_path) + + # Fallback: pyautogui + try: + import pyautogui + img = pyautogui.screenshot(region=tuple(map(int, region.split(","))) if region else None) + img.save(output_path) + return True + except Exception: + return False + + +def main(): + parser = argparse.ArgumentParser(description="Capture screen, return path + Retina metadata") + parser.add_argument("--region", metavar="X,Y,W,H", help="Capture region (x,y,width,height)") + parser.add_argument("--active-window", action="store_true", help="Capture active window only") + parser.add_argument("--output", "-o", metavar="PATH", help="Custom output file path") + + args = parser.parse_args() + + # Generate output path + if args.output: + output_path = args.output + else: + timestamp = int(time.time()) + output_path = os.path.join(tempfile.gettempdir(), f"oi_screenshot_{timestamp}.png") + + # Take screenshot + system = platform.system() + if system == "Darwin": + ok = screenshot_macos(output_path, region=args.region, active_window=args.active_window) + elif system == "Linux": + ok = screenshot_linux(output_path, region=args.region, active_window=args.active_window) + else: + # Fallback: pyautogui + try: + import pyautogui + region_tuple = tuple(map(int, args.region.split(","))) if args.region else None + img = pyautogui.screenshot(region=region_tuple) + img.save(output_path) + ok = True + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + ok = False + + if not ok or not os.path.exists(output_path): + print("Error: screenshot failed", file=sys.stderr) + sys.exit(1) + + # Get metadata + scale_factor = get_scale_factor() + try: + import pyautogui + screen = pyautogui.size() + screen_size = f"{screen.width}x{screen.height}" + except Exception: + screen_size = "unknown" + + # Output: path + metadata + print(output_path) + print(f"SCALE_FACTOR={scale_factor}") + print(f"SCREEN_SIZE={screen_size}") + + print(f"[oi] screenshot saved: {output_path} (scale={scale_factor}, screen={screen_size})", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/open-interpreter/scripts/oi_type.py b/.claude/skills/open-interpreter/scripts/oi_type.py new file mode 100755 index 0000000000..38a4844335 --- /dev/null +++ b/.claude/skills/open-interpreter/scripts/oi_type.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 +""" +oi_type.py -- Keyboard input: text, keys, and hotkeys. + +Usage: + oi_type.py --text "hello world" # Clipboard-paste (fast, unicode-safe) + oi_type.py --text "search" --method typewrite # Character-by-character + oi_type.py --key enter # Single key press + oi_type.py --key tab # Tab key + oi_type.py --hotkey command space # Hotkey combo (AppleScript on macOS) + oi_type.py --hotkey command shift 3 # Multi-modifier hotkey +""" + +import argparse +import json +import platform +import subprocess +import sys + + +def paste_text(text): + """Type text via clipboard-paste (Cmd+V on macOS, Ctrl+V elsewhere). 
diff --git a/.claude/skills/open-interpreter/scripts/oi_type.py b/.claude/skills/open-interpreter/scripts/oi_type.py
new file mode 100755
index 0000000000..38a4844335
--- /dev/null
+++ b/.claude/skills/open-interpreter/scripts/oi_type.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python3
+"""
+oi_type.py -- Keyboard input: text, keys, and hotkeys.
+
+Usage:
+  oi_type.py --text "hello world"                # Clipboard-paste (fast, unicode-safe)
+  oi_type.py --text "search" --method typewrite  # Character-by-character
+  oi_type.py --key enter                         # Single key press
+  oi_type.py --key tab                           # Tab key
+  oi_type.py --hotkey command space              # Hotkey combo (AppleScript on macOS)
+  oi_type.py --hotkey command shift 3            # Multi-modifier hotkey
+"""
+
+import argparse
+import json
+import platform
+import subprocess
+import sys
+import time
+
+
+def paste_text(text):
+    """Type text via clipboard-paste (Cmd+V on macOS, Ctrl+V elsewhere).
+
+    Faster and Unicode-safe compared to character-by-character typing."""
+    import pyperclip
+    import pyautogui
+
+    # Save the current clipboard so it can be restored afterwards
+    try:
+        old_clipboard = pyperclip.paste()
+    except Exception:
+        old_clipboard = None
+
+    # Copy the text to the clipboard and paste it
+    pyperclip.copy(text)
+    if platform.system() == "Darwin":
+        pyautogui.hotkey("command", "v")
+    else:
+        pyautogui.hotkey("ctrl", "v")
+
+    # Restore the clipboard after a brief delay so the paste lands first
+    time.sleep(0.1)
+    if old_clipboard is not None:
+        try:
+            pyperclip.copy(old_clipboard)
+        except Exception:
+            pass
+
+
+def typewrite_text(text, interval=0.02):
+    """Type text character-by-character. Slower, but doesn't touch the clipboard."""
+    import pyautogui
+    pyautogui.typewrite(text, interval=interval)
+
+
+def press_key(key):
+    """Press a single key."""
+    import pyautogui
+    pyautogui.press(key)
+
+
+def hotkey_applescript(*keys):
+    """Execute a hotkey via AppleScript (macOS). More reliable than pyautogui for modifier keys."""
+    # Map modifier names to AppleScript modifier phrases
+    modifier_map = {
+        "command": "command down",
+        "cmd": "command down",
+        "shift": "shift down",
+        "option": "option down",
+        "alt": "option down",
+        "control": "control down",
+        "ctrl": "control down",
+    }
+
+    modifiers = []
+    key_char = None
+
+    for k in keys:
+        k_lower = k.lower()
+        if k_lower in modifier_map:
+            modifiers.append(modifier_map[k_lower])
+        else:
+            key_char = k_lower
+
+    if key_char is None:
+        # All modifiers, no key -- treat the last modifier as the key itself
+        key_char = keys[-1].lower()
+        modifiers = modifiers[:-1]
+
+    # Map special key names to macOS virtual key codes
+    key_code_map = {
+        "space": 49, "return": 36, "enter": 36, "tab": 48,
+        "escape": 53, "esc": 53, "delete": 51, "backspace": 51,
+        "up": 126, "down": 125, "left": 123, "right": 124,
+        "f1": 122, "f2": 120, "f3": 99, "f4": 118,
+        "f5": 96, "f6": 97, "f7": 98, "f8": 100,
+    }
+
+    modifier_str = ", ".join(modifiers) if modifiers else ""
+
+    if key_char in key_code_map:
+        code = key_code_map[key_char]
+        if modifier_str:
+            script = f'tell application "System Events" to key code {code} using {{{modifier_str}}}'
+        else:
+            script = f'tell application "System Events" to key code {code}'
+    else:
+        # Single character -- sanitize to prevent AppleScript injection
+        if len(key_char) != 1 or key_char in ('"', '\\'):
+            print(f"Error: invalid key character for keystroke: {repr(key_char)}", file=sys.stderr)
+            sys.exit(1)
+        if modifier_str:
+            script = f'tell application "System Events" to keystroke "{key_char}" using {{{modifier_str}}}'
+        else:
+            script = f'tell application "System Events" to keystroke "{key_char}"'
+
+    subprocess.run(["osascript", "-e", script], capture_output=True, timeout=5)
+
+
+def hotkey_pyautogui(*keys):
+    """Execute a hotkey via pyautogui (cross-platform fallback)."""
+    import pyautogui
+    pyautogui.hotkey(*keys)
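+
+# Illustrative example (comment only): `--hotkey command space` resolves to
+# the modifier phrase "command down" plus key code 49 (space), so
+# hotkey_applescript() runs this AppleScript:
+#
+#     tell application "System Events" to key code 49 using {command down}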
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Keyboard input: text, keys, and hotkeys")
+    parser.add_argument("--text", metavar="TEXT", help="Text to type")
+    parser.add_argument("--key", metavar="KEY", help="Single key to press (enter, tab, escape, etc.)")
+    parser.add_argument("--hotkey", nargs="+", metavar="KEY", help="Hotkey combination (e.g., command space)")
+    parser.add_argument("--method", choices=["paste", "typewrite"], default="paste",
+                        help="Text input method: paste (clipboard, default) or typewrite (character-by-character)")
+
+    args = parser.parse_args()
+
+    if not any([args.text, args.key, args.hotkey]):
+        parser.error("Provide one of --text, --key, or --hotkey")
+
+    try:
+        if args.text:
+            if args.method == "typewrite":
+                typewrite_text(args.text)
+                print(f"[oi] typewrite: {repr(args.text)}", file=sys.stderr)
+            else:
+                paste_text(args.text)
+                print(f"[oi] paste: {repr(args.text)}", file=sys.stderr)
+            print(json.dumps({"action": "type", "text": args.text, "method": args.method}))
+
+        elif args.key:
+            press_key(args.key)
+            print(f"[oi] key: {args.key}", file=sys.stderr)
+            print(json.dumps({"action": "key", "key": args.key}))
+
+        elif args.hotkey:
+            if platform.system() == "Darwin":
+                hotkey_applescript(*args.hotkey)
+            else:
+                hotkey_pyautogui(*args.hotkey)
+            combo = "+".join(args.hotkey)
+            print(f"[oi] hotkey: {combo}", file=sys.stderr)
+            print(json.dumps({"action": "hotkey", "keys": args.hotkey}))
+
+    except ImportError as e:
+        print(f"Error: Missing dependency: {e}\n"
+              "Run: ~/.claude/skills/open-interpreter/scripts/oi_install.sh",
+              file=sys.stderr)
+        sys.exit(1)
+    except KeyboardInterrupt:
+        sys.exit(130)
+    except Exception as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
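+
+# Example stdout (comment only): each action emits one JSON confirmation line,
+# e.g. `oi_type.py --hotkey command space` prints:
+#
+#     {"action": "hotkey", "keys": ["command", "space"]}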