Skip to content

Commit 359a3a2

Browse files
Merge pull request #3 from vaibhavqrcg/VPZ/support-for-file-prompts
Add image and audio to prompt/exec
2 parents d61c59a + 06f194c commit 359a3a2

15 files changed

Lines changed: 232 additions & 69 deletions

README.md

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -101,17 +101,21 @@ codeye --agent "my-custom-agent --stdio" prompt <session-id> "review security po
101101

102102
### Global Options
103103

104-
| Flag | Description |
105-
| ----------------- | ----------------------------------------------------------- |
106-
| `--cwd <path>` | Run in a specific working directory |
107-
| `--agent "<cmd>"` | Use a custom ACP-compatible agent command |
108-
| `--format <mode>` | Output format: `text`, `json`, `json-strict`, `quiet` |
109-
| `--json-strict` | Shorthand for `--format json-strict` |
110-
| `--approve-all` | Allow all agent tool requests |
111-
| `--approve-reads` | Allow read-only tool requests, deny writes |
112-
| `--deny-all` | Deny all agent tool requests |
113-
| `--ask` | Prompt to approve or reject each tool request (interactive) |
114-
| `--version`, `-V` | Print version |
104+
| Flag | Description |
105+
| ------------------- | ----------------------------------------------------------- |
106+
| `--cwd <path>` | Run in a specific working directory |
107+
| `--agent "<cmd>"` | Use a custom ACP-compatible agent command |
108+
| `--format <mode>` | Output format: `text`, `json`, `json-strict`, `quiet` |
109+
| `--json-strict` | Shorthand for `--format json-strict` |
110+
| `--audio <path>` | Add audio file(s) to prompt/exec (repeatable). Supports .wav, .mp3, .mpeg, .ogg, .flac, .m4a |
111+
| `--image <path>` | Add image file(s) to prompt/exec (repeatable). Supports .png, .jpg, .jpeg, .gif, .webp |
112+
| `--approve-all` | Allow all agent tool requests |
113+
| `--approve-reads` | Allow read-only tool requests, deny writes |
114+
| `--deny-all` | Deny all agent tool requests |
115+
| `--ask` | Prompt to approve or reject each tool request (interactive) |
116+
| `--version`, `-V` | Print version |
117+
118+
For `prompt` and `exec`, place `--audio` and `--image` before the command (e.g. `codeye --image diagram.png prompt <session-id> "describe this"`); repeat a flag to attach several files. The agent must advertise the corresponding prompt capability (image and/or audio) during initialization.
115119

116120
## Configuration
117121

internal/acp/payloads.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,23 @@ type SessionListResponse struct {
4646
Sessions []SessionListEntry `json:"sessions"`
4747
}
4848

49-
type PromptTextPart struct {
50-
Type string `json:"type"`
51-
Text string `json:"text"`
49+
// PromptPart is a single content block of a session/prompt request.
//
// Type selects which of the remaining fields are meaningful:
//   - "text": Text carries the prompt text.
//   - "image" or "audio": MimeType names the media type and Data carries the
//     base64-encoded payload.
//
// Every field except Type is tagged omitempty, so it is left out of the JSON
// encoding when empty. In particular, a text block whose Text is "" is
// encoded with no "text" key at all.
type PromptPart struct {
	Type     string `json:"type"`               // block kind: "text", "image", or "audio"
	Text     string `json:"text,omitempty"`     // prompt text, for text blocks
	MimeType string `json:"mimeType,omitempty"` // media type, for image/audio blocks
	Data     string `json:"data,omitempty"`     // base64-encoded payload, for image/audio blocks
}
57+
58+
// TextPrompt returns a single text prompt part (convenience for callers that have only text).
59+
func TextPrompt(text string) []PromptPart {
60+
return []PromptPart{{Type: "text", Text: text}}
5261
}
5362

5463
type SessionPromptRequest struct {
55-
SessionID string `json:"sessionId"`
56-
Prompt []PromptTextPart `json:"prompt"`
64+
SessionID string `json:"sessionId"`
65+
Prompt []PromptPart `json:"prompt"`
5766
}
5867

5968
type SessionPromptResponse struct {

internal/cli/prompt_parts.go

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
package cli
2+
3+
import (
4+
"encoding/base64"
5+
"fmt"
6+
"os"
7+
"path/filepath"
8+
"strings"
9+
10+
"github.com/one710/codeye/internal/acp"
11+
)
12+
13+
// BuildPromptParts returns ACP prompt parts: one text block (if text is non-empty), then image blocks, then audio blocks.
14+
// Image and audio files are read and base64-encoded; mime types are inferred from extension.
15+
func BuildPromptParts(text string, imagePaths, audioPaths []string) ([]acp.PromptPart, error) {
16+
var parts []acp.PromptPart
17+
text = strings.TrimSpace(text)
18+
if text != "" {
19+
parts = append(parts, acp.PromptPart{Type: "text", Text: text})
20+
}
21+
for _, p := range imagePaths {
22+
mime, data, err := readFileAsBase64(p, imageMimeTypes)
23+
if err != nil {
24+
return nil, fmt.Errorf("image %s: %w", p, err)
25+
}
26+
parts = append(parts, acp.PromptPart{Type: "image", MimeType: mime, Data: data})
27+
}
28+
for _, p := range audioPaths {
29+
mime, data, err := readFileAsBase64(p, audioMimeTypes)
30+
if err != nil {
31+
return nil, fmt.Errorf("audio %s: %w", p, err)
32+
}
33+
parts = append(parts, acp.PromptPart{Type: "audio", MimeType: mime, Data: data})
34+
}
35+
if len(parts) == 0 {
36+
parts = []acp.PromptPart{{Type: "text", Text: ""}}
37+
}
38+
return parts, nil
39+
}
40+
41+
// readFileAsBase64 reads the file at path and returns its MIME type together
// with its contents encoded as standard (padded) base64.
//
// The MIME type is looked up in mimeMap by the file extension, lowercased and
// without the leading dot. The lookup happens before the file is read, so a
// missing or unsupported extension is reported without touching the
// filesystem. An error from reading the file is returned unchanged.
func readFileAsBase64(path string, mimeMap map[string]string) (mimeType, b64 string, err error) {
	ext := strings.ToLower(strings.TrimPrefix(filepath.Ext(path), "."))
	if ext == "" {
		// Without this guard the error below would read
		// "unsupported extension .", which hides the actual problem.
		return "", "", fmt.Errorf("missing file extension (cannot infer MIME type)")
	}
	// An extension mapped to "" is treated the same as an absent one.
	if mimeType = mimeMap[ext]; mimeType == "" {
		return "", "", fmt.Errorf("unsupported extension .%s", ext)
	}
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", "", err
	}
	return mimeType, base64.StdEncoding.EncodeToString(raw), nil
}
53+
54+
// imageMimeTypes maps a lowercase image file extension (no leading dot) to the
// MIME type sent with the image block. Extensions absent from this table are
// rejected when image files are loaded.
var imageMimeTypes = map[string]string{
	"gif":  "image/gif",
	"jpeg": "image/jpeg",
	"jpg":  "image/jpeg",
	"png":  "image/png",
	"webp": "image/webp",
}
61+
62+
// audioMimeTypes maps a lowercase audio file extension (no leading dot) to the
// MIME type sent with the audio block. Extensions absent from this table are
// rejected when audio files are loaded.
var audioMimeTypes = map[string]string{
	"flac": "audio/flac",
	"m4a":  "audio/mp4",
	"mp3":  "audio/mpeg",
	"mpeg": "audio/mpeg",
	"ogg":  "audio/ogg",
	"wav":  "audio/wav",
}

internal/cli/root.go

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ type globalFlags struct {
3232
AgentCommand string
3333
Format output.Format
3434
PermMode permissions.Mode
35+
AudioPaths []string
36+
ImagePaths []string
3537
}
3638

3739
func Run(argv []string) int {
@@ -114,7 +116,7 @@ func Run(argv []string) int {
114116
)
115117

116118
ctx := context.Background()
117-
code := dispatch(ctx, out, repo, rt, adapter, agent, flags.Cwd, cmd, cmdArgs, flags.PermMode, flags.Format)
119+
code := dispatch(ctx, out, repo, rt, adapter, agent, flags.Cwd, cmd, cmdArgs, flags.PermMode, flags.Format, flags.AudioPaths, flags.ImagePaths)
118120
return code
119121
}
120122

@@ -130,15 +132,21 @@ func dispatch(
130132
args []string,
131133
permMode permissions.Mode,
132134
format output.Format,
135+
audioPaths, imagePaths []string,
133136
) int {
134137
switch cmd {
135138
case "prompt":
136139
if len(args) < 2 {
137-
out.PrintRPCError(-32602, "usage: prompt <session-id> <text...>", map[string]interface{}{"errorCode": cerr.CodeUsage})
140+
out.PrintRPCError(-32602, "usage: prompt <session-id> <text...> [--audio <path>...] [--image <path>...]", map[string]interface{}{"errorCode": cerr.CodeUsage})
138141
return 2
139142
}
140143
sessionID := args[0]
141144
promptText := strings.Join(args[1:], " ")
145+
parts, err := BuildPromptParts(promptText, imagePaths, audioPaths)
146+
if err != nil {
147+
out.PrintError(err.Error())
148+
return 1
149+
}
142150
rec, err := repo.Load(sessionID)
143151
if err != nil {
144152
out.PrintError("session not found: " + sessionID)
@@ -149,7 +157,7 @@ func dispatch(
149157
return 1
150158
}
151159
// ACP: session/load (by agent) restores conversation history; then session/prompt. All in-process.
152-
stopReason, responseText, err := rt.PromptWithOutput(ctx, rec, promptText)
160+
stopReason, responseText, err := rt.PromptWithOutput(ctx, rec, parts)
153161
if err != nil {
154162
out.PrintError(err.Error())
155163
return 1
@@ -163,11 +171,16 @@ func dispatch(
163171
}, "")
164172
return 0
165173
case "exec":
166-
if len(args) == 0 {
167-
out.PrintRPCError(-32602, "prompt is required", map[string]interface{}{"errorCode": cerr.CodeUsage})
174+
if len(args) == 0 && len(audioPaths) == 0 && len(imagePaths) == 0 {
175+
out.PrintRPCError(-32602, "prompt text or --audio/--image is required", map[string]interface{}{"errorCode": cerr.CodeUsage})
168176
return 2
169177
}
170-
stopReason, responseText, err := rt.RunOnceWithOutput(ctx, cwd, strings.Join(args, " "))
178+
parts, err := BuildPromptParts(strings.Join(args, " "), imagePaths, audioPaths)
179+
if err != nil {
180+
out.PrintError(err.Error())
181+
return 1
182+
}
183+
stopReason, responseText, err := rt.RunOnceWithOutput(ctx, cwd, parts)
171184
if err != nil {
172185
out.PrintError(err.Error())
173186
return 1
@@ -428,6 +441,16 @@ func parseGlobals(args []string) (globalFlags, []string, error) {
428441
flags.PermMode = permissions.DenyAll
429442
case "--ask":
430443
flags.PermMode = permissions.Ask
444+
case "--audio":
445+
i++
446+
if i < len(args) {
447+
flags.AudioPaths = append(flags.AudioPaths, args[i])
448+
}
449+
case "--image":
450+
i++
451+
if i < len(args) {
452+
flags.ImagePaths = append(flags.ImagePaths, args[i])
453+
}
431454
default:
432455
if strings.HasPrefix(arg, "--") {
433456
return flags, nil, fmt.Errorf("unknown flag: %s", arg)

internal/client/client.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,10 @@ func (c *Client) ListSessions(ctx context.Context, cwd string) ([]string, error)
242242
return out, nil
243243
}
244244

245-
func (c *Client) Prompt(ctx context.Context, sessionID, text string) (PromptResult, error) {
245+
func (c *Client) Prompt(ctx context.Context, sessionID string, parts []acp.PromptPart) (PromptResult, error) {
246+
if len(parts) == 0 {
247+
parts = []acp.PromptPart{{Type: "text", Text: ""}}
248+
}
246249
c.updateMu.Lock()
247250
c.activePromptSessionID = sessionID
248251
c.activePromptChunks = nil
@@ -256,9 +259,7 @@ func (c *Client) Prompt(ctx context.Context, sessionID, text string) (PromptResu
256259

257260
res, err := c.call(ctx, acp.MethodSessionPrompt, acp.SessionPromptRequest{
258261
SessionID: sessionID,
259-
Prompt: []acp.PromptTextPart{
260-
{Type: "text", Text: text},
261-
},
262+
Prompt: parts,
262263
})
263264
if err != nil {
264265
return PromptResult{}, err

internal/queue/protocol.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
package queue
22

3+
import "github.com/one710/codeye/internal/acp"
4+
35
type Command string
46

57
const (
@@ -11,13 +13,14 @@ const (
1113
)
1214

1315
type Request struct {
14-
RequestID string `json:"requestId,omitempty"`
15-
Command Command `json:"command"`
16-
SessionID string `json:"sessionId"`
17-
Prompt string `json:"prompt,omitempty"`
18-
Mode string `json:"mode,omitempty"`
19-
Key string `json:"key,omitempty"`
20-
Value string `json:"value,omitempty"`
16+
RequestID string `json:"requestId,omitempty"`
17+
Command Command `json:"command"`
18+
SessionID string `json:"sessionId"`
19+
Prompt string `json:"prompt,omitempty"` // legacy: single text part
20+
PromptParts []acp.PromptPart `json:"promptParts,omitempty"` // when set, used instead of Prompt
21+
Mode string `json:"mode,omitempty"`
22+
Key string `json:"key,omitempty"`
23+
Value string `json:"value,omitempty"`
2124
}
2225

2326
type Response struct {

internal/queue/server.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@ import (
88
"os"
99
"sync"
1010
"time"
11+
12+
"github.com/one710/codeye/internal/acp"
1113
)
1214

1315
type Handler interface {
14-
Prompt(ctx context.Context, sessionID, prompt string) (PromptResult, error)
16+
Prompt(ctx context.Context, sessionID string, parts []acp.PromptPart) (PromptResult, error)
1517
Cancel(ctx context.Context, sessionID string) error
1618
SetMode(ctx context.Context, sessionID, mode string) error
1719
SetConfigOption(ctx context.Context, sessionID, key, value string) error
@@ -122,7 +124,11 @@ func (s *Server) Run(ctx context.Context) error {
122124
func (s *Server) dispatch(ctx context.Context, req Request) (Response, error) {
123125
switch req.Command {
124126
case CmdPrompt:
125-
result, err := s.Handler.Prompt(ctx, req.SessionID, req.Prompt)
127+
parts := req.PromptParts
128+
if len(parts) == 0 {
129+
parts = []acp.PromptPart{{Type: "text", Text: req.Prompt}}
130+
}
131+
result, err := s.Handler.Prompt(ctx, req.SessionID, parts)
126132
if err != nil {
127133
return Response{}, err
128134
}

internal/session/runtime.go

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"sync"
1212
"time"
1313

14+
"github.com/one710/codeye/internal/acp"
1415
"github.com/one710/codeye/internal/client"
1516
"github.com/one710/codeye/internal/queue"
1617
"github.com/one710/codeye/internal/session/persistence"
@@ -59,18 +60,18 @@ func (r *Runtime) CreateSession(ctx context.Context, agent, cwd, name string) (p
5960
return rec, nil
6061
}
6162

62-
func (r *Runtime) Prompt(ctx context.Context, rec persistence.Record, prompt string) (string, error) {
63-
stopReason, _, err := r.PromptWithOutput(ctx, rec, prompt)
63+
// Prompt sends parts to the session described by rec and returns only the
// stop reason; the response text produced by PromptWithOutput is discarded.
func (r *Runtime) Prompt(ctx context.Context, rec persistence.Record, parts []acp.PromptPart) (string, error) {
	reason, _, promptErr := r.PromptWithOutput(ctx, rec, parts)
	return reason, promptErr
}
6667

67-
func (r *Runtime) PromptWithOutput(ctx context.Context, rec persistence.Record, prompt string) (string, string, error) {
68-
return r.PromptInProcess(ctx, rec, prompt)
68+
func (r *Runtime) PromptWithOutput(ctx context.Context, rec persistence.Record, parts []acp.PromptPart) (string, string, error) {
69+
return r.PromptInProcess(ctx, rec, parts)
6970
}
7071

7172
// PromptInProcess runs the prompt in the current process with the same client as exec,
7273
// so streaming and permission prompts behave identically to exec.
73-
func (r *Runtime) PromptInProcess(ctx context.Context, rec persistence.Record, prompt string) (string, string, error) {
74+
func (r *Runtime) PromptInProcess(ctx context.Context, rec persistence.Record, parts []acp.PromptPart) (string, string, error) {
7475
c := r.ClientFactory()
7576
if err := c.Start(ctx); err != nil {
7677
return "", "", err
@@ -86,7 +87,7 @@ func (r *Runtime) PromptInProcess(ctx context.Context, rec persistence.Record, p
8687
rec.ACPSession = newSid
8788
_ = r.Repo.Save(rec)
8889
}
89-
result, err := c.Prompt(ctx, sid, prompt)
90+
result, err := c.Prompt(ctx, sid, parts)
9091
if err != nil {
9192
return "", "", err
9293
}
@@ -193,12 +194,12 @@ func (r *Runtime) RunWorkingSession(ctx context.Context, initial persistence.Rec
193194
return server.Run(ctx)
194195
}
195196

196-
func (r *Runtime) RunOnce(ctx context.Context, cwd, prompt string) (string, error) {
197-
stopReason, _, err := r.RunOnceWithOutput(ctx, cwd, prompt)
197+
// RunOnce runs a one-shot prompt in cwd and returns only the stop reason; the
// response text produced by RunOnceWithOutput is discarded.
func (r *Runtime) RunOnce(ctx context.Context, cwd string, parts []acp.PromptPart) (string, error) {
	reason, _, runErr := r.RunOnceWithOutput(ctx, cwd, parts)
	return reason, runErr
}
200201

201-
func (r *Runtime) RunOnceWithOutput(ctx context.Context, cwd, prompt string) (string, string, error) {
202+
func (r *Runtime) RunOnceWithOutput(ctx context.Context, cwd string, parts []acp.PromptPart) (string, string, error) {
202203
c := r.ClientFactory()
203204
if err := c.Start(ctx); err != nil {
204205
return "", "", err
@@ -208,7 +209,7 @@ func (r *Runtime) RunOnceWithOutput(ctx context.Context, cwd, prompt string) (st
208209
if err != nil {
209210
return "", "", err
210211
}
211-
result, err := c.Prompt(ctx, sid, prompt)
212+
result, err := c.Prompt(ctx, sid, parts)
212213
if err != nil {
213214
return "", "", err
214215
}
@@ -250,14 +251,14 @@ type workingSessionHandler struct {
250251
fullMessage string // set when we get a non-chunk final message to avoid duplicating chunk content
251252
}
252253

253-
func (w *workingSessionHandler) Prompt(ctx context.Context, sessionID, prompt string) (queue.PromptResult, error) {
254+
func (w *workingSessionHandler) Prompt(ctx context.Context, sessionID string, parts []acp.PromptPart) (queue.PromptResult, error) {
254255
w.mu.Lock()
255256
activeSessionID := w.liveSessionID
256257
w.chunks = nil
257258
w.fullMessage = ""
258259
w.mu.Unlock()
259260

260-
result, err := w.client.Prompt(ctx, activeSessionID, prompt)
261+
result, err := w.client.Prompt(ctx, activeSessionID, parts)
261262
if err != nil && shouldRecreateSessionOnPromptError(err) {
262263
// Agent rejected stale/unknown session; recreate once and retry.
263264
if sid, createErr := w.client.CreateSession(ctx, w.cwd); createErr == nil {
@@ -268,7 +269,7 @@ func (w *workingSessionHandler) Prompt(ctx context.Context, sessionID, prompt st
268269
_ = w.repo.Save(w.record)
269270
}
270271
w.mu.Unlock()
271-
result, err = w.client.Prompt(ctx, sid, prompt)
272+
result, err = w.client.Prompt(ctx, sid, parts)
272273
}
273274
}
274275
if err != nil {

0 commit comments

Comments
 (0)