Skip to content

Commit d6b11b0

Browse files
committed
feat: add vision support for image analysis
- New /paste command to paste images from the clipboard
- @mention image files (e.g. @photo.jpg) to send them as vision input
- Multimodal messages with base64-encoded images
- Supports jpg, png, gif, webp, bmp, tiff
- macOS clipboard paste via osascript/pngpaste
- Streaming output for vision responses
1 parent 09e159c commit d6b11b0

3 files changed

Lines changed: 201 additions & 4 deletions

File tree

iclaw/completer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"/cmd",
88
"/provider_model",
99
"/model",
10+
"/paste",
1011
"/search",
1112
"/provider_search",
1213
"/proxy",

iclaw/main.py

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88

99
from iclaw import http
1010
from iclaw import log
11-
from iclaw.at_mention import resolve_at_mentions
11+
from iclaw.vision import (
12+
get_clipboard_image,
13+
make_multimodal_message,
14+
resolve_at_mentions_with_vision,
15+
)
1216
from iclaw.commands.compact import handle_compact_command
1317
from iclaw.commands.export import handle_export_command
1418
from iclaw.commands.log import handle_log_command
@@ -37,6 +41,7 @@
3741
("/cmd", "Run shell command directly (usage: /cmd <command>)"),
3842
("/provider_model", "Select and authenticate with the model provider"),
3943
("/model", "Select specific model from your provider"),
44+
("/paste", "Paste image from clipboard for vision analysis"),
4045
("/search", "Web search (usage: /search <query>)"),
4146
("/provider_search", "Select the web search provider"),
4247
("/proxy", "Set HTTP/HTTPS proxy (usage: /proxy [url|off])"),
@@ -322,6 +327,60 @@ def main():
322327
output = exec(parts[1])
323328
print(output)
324329
continue
330+
if user_input == "/paste":
331+
img_b64, mime = get_clipboard_image()
332+
if not img_b64:
333+
print(
334+
"No image found in clipboard. Copy an image first.", file=sys.stderr
335+
)
336+
continue
337+
print(
338+
"Image pasted from clipboard. Enter your question (or press Enter for general analysis):"
339+
)
340+
try:
341+
question = input("> ").strip()
342+
except (EOFError, KeyboardInterrupt):
343+
print()
344+
continue
345+
if not question:
346+
question = "What's in this image? Describe it in detail."
347+
img_part = {"base64": img_b64, "mime": mime, "path": "clipboard"}
348+
msg = make_multimodal_message(question, [img_part])
349+
messages.append(msg)
350+
try:
351+
if (
352+
model_provider == "copilot"
353+
and time.monotonic() >= token_expiry
354+
and github_token
355+
):
356+
provider_token = get_copilot_token(github_token)
357+
token_expiry = time.monotonic() + TOKEN_REFRESH_INTERVAL
358+
chunks = _chat(
359+
model_provider,
360+
provider_token,
361+
messages,
362+
current_model,
363+
tools=TOOLS,
364+
stream=True,
365+
)
366+
print()
367+
reply = ""
368+
for chunk in chunks:
369+
print(chunk, end="", flush=True)
370+
reply += chunk
371+
print("\n")
372+
messages.append({"role": "assistant", "content": reply})
373+
last_reply = reply
374+
except UnsupportedModelError as e:
375+
print(f"Error: {e}", file=sys.stderr)
376+
print(
377+
"Please select a vision-capable model with /model", file=sys.stderr
378+
)
379+
messages.pop()
380+
except Exception as e:
381+
print(f"Error: {e}", file=sys.stderr)
382+
messages.pop()
383+
continue
325384

326385
if not provider_token:
327386
print("Not authenticated. Type /provider_model first.", file=sys.stderr)
@@ -336,9 +395,10 @@ def main():
336395
provider_token = get_copilot_token(github_token)
337396
token_expiry = time.monotonic() + TOKEN_REFRESH_INTERVAL
338397

339-
messages.append(
340-
{"role": "user", "content": resolve_at_mentions(user_input)}
341-
)
398+
# Resolve @mentions and handle images
399+
resolved_text, image_parts = resolve_at_mentions_with_vision(user_input)
400+
msg = make_multimodal_message(resolved_text, image_parts)
401+
messages.append(msg)
342402
response_message = _chat(
343403
model_provider, provider_token, messages, current_model, tools=TOOLS
344404
)

iclaw/vision.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""Vision support for iclaw - handle image files and clipboard."""
2+
3+
import base64
4+
import os
5+
import subprocess
6+
7+
# Lower-case file suffixes (with leading dot) that are treated as images.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}


def is_image_file(path):
    """Return True when *path* ends in one of ``IMAGE_EXTENSIONS``.

    The comparison is case-insensitive and looks only at the suffix; the
    file itself is never opened.
    """
    suffix = os.path.splitext(path.lower())[1]
    return suffix in IMAGE_EXTENSIONS
14+
15+
16+
def read_image_base64(path):
    """Return the bytes of the file at *path* as a base64 text string.

    The file is opened in binary mode and read in full; any error raised
    while opening or reading it propagates to the caller.
    """
    with open(path, "rb") as image_file:
        raw = image_file.read()
    encoded = base64.b64encode(raw)
    return encoded.decode("utf-8")
20+
21+
22+
def get_image_mime_type(path):
    """Map the suffix of *path* to an image MIME type.

    Matching is case-insensitive. A suffix that is not in the table
    (including no suffix at all) yields ``"image/jpeg"``.
    """
    suffix = os.path.splitext(path.lower())[1]
    table = (
        ("image/jpeg", (".jpg", ".jpeg")),
        ("image/png", (".png",)),
        ("image/gif", (".gif",)),
        ("image/webp", (".webp",)),
        ("image/bmp", (".bmp",)),
        ("image/tiff", (".tiff",)),
    )
    for mime, suffixes in table:
        if suffix in suffixes:
            return mime
    # Unknown suffix: same default the lookup has always used.
    return "image/jpeg"
35+
36+
37+
def get_clipboard_image():
    """Try to read a PNG image from the macOS clipboard.

    Two tools are tried in order: ``osascript`` first, then ``pngpaste``
    as a fallback. Each attempt is best-effort: a missing or
    non-executable tool, a timeout, or unparsable output simply moves on
    to the next attempt.

    Returns:
        ``(base64_string, "image/png")`` when image data was obtained,
        otherwise ``(None, None)``.
    """
    # Attempt 1: AppleScript. The image comes back as text, hex digits
    # wrapped in «data PNGf…».
    try:
        result = subprocess.run(
            ["osascript", "-e", "the clipboard as «class PNGf»"],
            capture_output=True,
            timeout=5,
        )
        if result.returncode == 0 and result.stdout:
            hex_str = result.stdout.decode("utf-8").strip()
            # Remove the «data PNGf» wrapper
            if "«data PNGf" in hex_str:
                hex_str = hex_str.split("«data PNGf")[1].rstrip("»").strip()
            image_bytes = bytes.fromhex(hex_str)
            # An empty payload is not an image; fall through to pngpaste
            # instead of returning an empty base64 string.
            if image_bytes:
                return base64.b64encode(image_bytes).decode("utf-8"), "image/png"
    except (subprocess.TimeoutExpired, OSError, ValueError):
        # OSError: osascript missing or not executable.
        # ValueError: output was not UTF-8 or not valid hex.
        pass

    # Attempt 2: pngpaste writes raw PNG bytes to stdout.
    try:
        result = subprocess.run(
            ["pngpaste", "-"],
            capture_output=True,
            timeout=5,
        )
        if result.returncode == 0 and result.stdout:
            return base64.b64encode(result.stdout).decode("utf-8"), "image/png"
    except (subprocess.TimeoutExpired, OSError):
        # OSError also covers a pngpaste that exists but cannot be run.
        pass

    return None, None
70+
71+
72+
def make_image_message(text, image_base64, mime_type="image/jpeg"):
    """Build a user message holding one text part followed by one image.

    The image is embedded as a ``data:`` URL assembled from *mime_type*
    and the already base64-encoded *image_base64* payload.
    """
    data_url = f"data:{mime_type};base64,{image_base64}"
    text_part = {"type": "text", "text": text}
    image_part = {"type": "image_url", "image_url": {"url": data_url}}
    return {"role": "user", "content": [text_part, image_part]}
84+
85+
86+
def resolve_at_mentions_with_vision(text):
    """Extract @file references, handle images as vision content.

    Every ``@token`` (``@`` followed by non-whitespace) that names an
    existing file is resolved. Image files are read and base64-encoded;
    any other file is read as text and wrapped in a ``<file path=...>``
    element that is prepended to *text*. Mentions that are not files, or
    files that cannot be read or decoded, are skipped.

    Returns (resolved_text, list_of_image_parts).
    Each image_part is {"base64": str, "mime": str, "path": str}.
    """
    import re
    from pathlib import Path

    mentions = re.findall(r"@(\S+)", text)
    if not mentions:
        return text, []

    text_parts = []
    image_parts = []

    for path in mentions:
        if not os.path.isfile(path):
            continue
        try:
            if is_image_file(path):
                b64 = read_image_base64(path)
                mime = get_image_mime_type(path)
                image_parts.append({"base64": b64, "mime": mime, "path": path})
            else:
                contents = Path(path).read_text()
                text_parts.append(f'<file path="{path}">\n{contents}\n</file>')
        except (OSError, UnicodeDecodeError):
            # Best-effort: an unreadable image or a non-text file must not
            # abort the whole prompt. UnicodeDecodeError is not an OSError,
            # so it has to be listed explicitly.
            continue

    result = text
    if text_parts:
        result = "\n".join(text_parts) + "\n\n" + text
    return result, image_parts
120+
121+
122+
def make_multimodal_message(text, image_parts):
    """Build a user message from *text* and zero or more images.

    With no images the content is the plain *text* string. Otherwise the
    content is a list: one ``image_url`` entry per item of *image_parts*
    (each a dict with ``"mime"`` and ``"base64"`` keys), in order,
    followed by a single text entry.
    """
    if image_parts:
        parts = [
            {
                "type": "image_url",
                "image_url": {"url": f"data:{img['mime']};base64,{img['base64']}"},
            }
            for img in image_parts
        ]
        parts.append({"type": "text", "text": text})
        return {"role": "user", "content": parts}
    return {"role": "user", "content": text}

0 commit comments

Comments
 (0)