Skip to content

Commit 1ff1aeb

Browse files
feat: VisionPaste - intelligent screenshot context pipeline (v1)
1 parent d682bdd commit 1ff1aeb

13 files changed

Lines changed: 385 additions & 8 deletions

File tree

README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,17 @@ make build
7272
### Security & Privacy
7373
- **Concealed / Sensitive Clipboard Filtering**: When the clipboard watcher is enabled (`watch.enabled = true`), PasteLocal automatically skips items marked as secrets by password managers. On macOS this uses the standard `org.nspasteboard.ConcealedType` signal (the same one Raycast and other good clipboard managers respect). These items are never relayed to remotes. Explicit access returns error `CB1013`. The feature is on by default for safety. See `[watch.sensitive]` in RELAY.md for configuration and detector-failure logging.
7474

75+
- **VisionPaste / Intelligent Screenshot Context (v1)**: Optional local analysis pipeline (`[vision]` in config.toml) that runs external commands (tesseract etc.) on screenshots **at explicit read time** on the serving daemon. Produces OCR text + descriptions included in `/clipboard` (and `/clipboard/history/{id}`) responses and written as `.analysis.txt` sidecars by `pastelocal-remote`. Skills present the rich text first.
76+
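- v1 limitations (by design, per scoped plan): Analysis is demand-driven on read (not pre-computed in the watcher goroutine); relay carries raw bytes only; history stores raw bytes only (the stored bytes of a PNG entry are re-analyzed each time that entry is fetched). Concealed items are never analyzed on the live `/clipboard` read path; history fetches cannot repeat the live concealed/redaction checks and rely on the filtering applied when the entry was captured.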
77+
Configure example:
78+
```toml
79+
[vision]
80+
enabled = true
81+
[[vision.chain]]
82+
name = "ocr"
83+
command = "tesseract - - 2>/dev/null || true"
84+
```
85+
7586
### Experimental
7687
- **Multi-device Relay** — E2E encrypted clipboard sync without SSH tunnels (see RELAY.md for full details, including sensitive clipboard filtering for password managers).
7788

cmd/pastelocal-remote/main.go

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,26 @@ func run(port int, outDir string, timeout time.Duration, tokenFile string, sendP
171171
fmt.Fprintf(os.Stderr, "error resolving absolute path: %v\n", err)
172172
return 10
173173
}
174+
175+
// VisionPaste v1: if the daemon provided analysis (OCR/description), write a
176+
// lightweight sidecar next to the image so the agent skill can present rich
177+
// text context first without extra vision calls. Sidecar is best-effort.
178+
if clipResp.Analysis != nil {
179+
var analysisText string
180+
if clipResp.Analysis.OCRText != "" {
181+
analysisText += "OCR Text:\n" + clipResp.Analysis.OCRText + "\n\n"
182+
}
183+
if clipResp.Analysis.Description != "" {
184+
analysisText += "Description:\n" + clipResp.Analysis.Description + "\n"
185+
}
186+
if analysisText != "" {
187+
analysisPath := strings.TrimSuffix(path, filepath.Ext(path)) + ".analysis.txt"
188+
if writeErr := os.WriteFile(analysisPath, []byte(analysisText), 0o600); writeErr != nil {
189+
fmt.Fprintf(os.Stderr, "warning: failed to write analysis sidecar %s: %v\n", analysisPath, writeErr)
190+
}
191+
}
192+
}
193+
174194
fmt.Println(absPath)
175195

176196
case "text":
@@ -610,6 +630,26 @@ func runHistoryFetch(client *http.Client, baseURL, token, outDir string, index i
610630
fmt.Fprintf(os.Stderr, "error resolving absolute path: %v\n", err)
611631
return 10
612632
}
633+
634+
// VisionPaste v1: mirror sidecar logic for history fetches (when daemon
635+
// populates Analysis on /history/{id} responses). Keeps agent experience
636+
// consistent for --list + --index usage.
637+
if clipResp.Analysis != nil {
638+
var analysisText string
639+
if clipResp.Analysis.OCRText != "" {
640+
analysisText += "OCR Text:\n" + clipResp.Analysis.OCRText + "\n\n"
641+
}
642+
if clipResp.Analysis.Description != "" {
643+
analysisText += "Description:\n" + clipResp.Analysis.Description + "\n"
644+
}
645+
if analysisText != "" {
646+
analysisPath := strings.TrimSuffix(path, filepath.Ext(path)) + ".analysis.txt"
647+
if writeErr := os.WriteFile(analysisPath, []byte(analysisText), 0o600); writeErr != nil {
648+
fmt.Fprintf(os.Stderr, "warning: failed to write analysis sidecar %s: %v\n", analysisPath, writeErr)
649+
}
650+
}
651+
}
652+
613653
fmt.Println(absPath)
614654

615655
case "text":

docs/RELAY.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,4 +135,10 @@ See [SECURITY.md](./SECURITY.md) for full threat model.
135135

136136
Relay v1.0 completes the "Future Vision" section of the README. The core SSH path remains the recommended fast path; relay is the perfect fallback / multi-device / no-tunnel solution.
137137

138+
VisionPaste (Intelligent Screenshot Context) is a complementary local-first feature (v1): when the daemon's `[vision]` pipeline is configured, screenshots served over the direct (SSH) read path (and history fetches) include rich OCR + description text in the `ClipboardResponse` and as a `.analysis.txt` sidecar written by `pastelocal-remote`.
139+
140+
**v1 scoping (intentional)**: Analysis is performed demand-driven at explicit read time on the serving daemon only (never inside the watcher goroutine or at push time, to keep the 2s poll fast). Relay carries raw bytes only. History responses carry Analysis on specific-entry fetches via on-demand re-analysis (storage remains raw bytes only; the entry's stored PNG bytes are re-analyzed at fetch time, without re-running the live concealed/redaction checks that were applied at capture). This resolves the plan's open question on watcher aggressiveness while delivering the highest-value path for agents. See README Features for config + full limitations, and the paste skills for agent UX.
141+
142+
See README and the Grok/Claude paste skills for usage.
143+
138144
Bugs / feedback: open an issue or PR.

internal/config/config.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ type Config struct {
4040
History HistoryConfig `toml:"history"`
4141
Redaction RedactionConfig `toml:"redaction"`
4242
Processors ProcessorConfig `toml:"processors"`
43+
Vision VisionConfig `toml:"vision"`
4344
Relay RelayConfig `toml:"relay"`
4445
Watch WatchConfig `toml:"watch"`
4546
}
@@ -96,6 +97,23 @@ type ProcessorEntry struct {
9697
Command string `toml:"command"`
9798
}
9899

100+
// VisionConfig configures the VisionPaste screenshot analysis pipeline (v1).
//
// With Enabled set and at least one Chain entry, image clipboard content is
// fed to the configured external commands (for example tesseract for OCR) and
// the text they print is returned in read responses; pastelocal-remote also
// writes it to a sidecar file for agents. Concealed/sensitive items are not
// meant to reach the pipeline.
type VisionConfig struct {
	// Enabled switches the pipeline on; it is off by default.
	Enabled bool `toml:"enabled"`
	// Timeout is the limit, in seconds, applied to each analysis command
	// (defaults to 15).
	Timeout int `toml:"timeout_seconds"`
	// Chain lists the analysis commands in the order they are run.
	Chain []VisionEntry `toml:"chain"`
}

// VisionEntry is a single analysis step of the vision pipeline.
type VisionEntry struct {
	// Name selects the result field the step's output is stored in
	// ("ocr", "describe", etc.).
	Name string `toml:"name"`
	// Command is a shell command that receives the image bytes on stdin and
	// prints its text analysis on stdout.
	Command string `toml:"command"`
}
116+
99117
// RelayConfig controls the E2E encrypted relay for multi-device sync.
100118
type RelayConfig struct {
101119
Enabled bool `toml:"enabled"`
@@ -161,6 +179,10 @@ func Default() *Config {
161179
Enabled: false,
162180
Timeout: 5,
163181
},
182+
Vision: VisionConfig{
183+
Enabled: false,
184+
Timeout: 15,
185+
},
164186
Relay: RelayConfig{
165187
Enabled: false,
166188
RelayURL: "http://localhost:7332",
@@ -381,6 +403,9 @@ func mergeDefaults(cfg *Config) {
381403
if cfg.Processors.Timeout == 0 {
382404
cfg.Processors.Timeout = 5
383405
}
406+
if cfg.Vision.Timeout == 0 {
407+
cfg.Vision.Timeout = 15
408+
}
384409

385410
// Watch.Sensitive: safe defaults (FilterConcealed=true, LogFilteredItems=true)
386411
// are injected by Default() before toml.Decode and applyEnvOverrides.

internal/proto/types.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package proto
22

33
// ProtocolVersion is the current wire protocol version.
4-
const ProtocolVersion = 2
4+
const ProtocolVersion = 3
55

66
// ClipboardResponse is the JSON response for GET /clipboard on success.
77
type ClipboardResponse struct {
@@ -12,6 +12,7 @@ type ClipboardResponse struct {
1212
ByteCount int64 `json:"byte_count"`
1313
CapturedAt string `json:"captured_at"` // RFC3339
1414
ID string `json:"id,omitempty"` // unique ID for history entries
15+
Analysis *ClipboardAnalysis `json:"analysis,omitempty"`
1516
}
1617

1718
// ClipboardWriteRequest is the JSON request for POST /clipboard.
@@ -28,6 +29,14 @@ type ClipboardWriteResponse struct {
2829
ByteCount int64 `json:"byte_count"`
2930
}
3031

32+
// ClipboardAnalysis is the optional text context that the VisionPaste
// analysis pipeline attaches to an image response. It is populated only for
// png responses, and only when vision analysis is configured and produced
// output. Empty fields are omitted from the JSON encoding.
type ClipboardAnalysis struct {
	// OCRText is the text extracted from the image.
	OCRText string `json:"ocr_text,omitempty"`
	// Description is a natural-language description of the image.
	Description string `json:"description,omitempty"`
}
39+
3140
// ErrorResponse is the JSON response for any error.
3241
type ErrorResponse struct {
3342
OK bool `json:"ok"`

internal/proto/types_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,9 @@ func TestHealthResponseSerialization(t *testing.T) {
225225
}
226226

227227
func TestProtocolVersionConstant(t *testing.T) {
228-
if ProtocolVersion != 2 {
229-
t.Errorf("ProtocolVersion = %d, want 2", ProtocolVersion)
228+
// Bumped to 3 for VisionPaste (additive Analysis field in ClipboardResponse).
229+
// Use >= to tolerate future additive bumps without test churn.
230+
if ProtocolVersion < 3 {
231+
t.Errorf("ProtocolVersion = %d, want >= 3", ProtocolVersion)
230232
}
231233
}

internal/server/handlers.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,12 @@ func (s *Server) handleClipboardGet(w http.ResponseWriter, r *http.Request) {
156156
s.lastReadFormat = content.Format
157157
s.mu.Unlock()
158158

159+
// Step 5e: Run vision analysis pipeline (post-processors, post-unlock so we do not
160+
// hold the read mutex during potentially slow external commands like tesseract).
161+
// Only images; concealed items never reach here. Fail-open inside Analyze.
162+
var analysis *AnalysisResult
163+
analysis = s.analysis.Analyze(r.Context(), content)
164+
159165
// Step 6: Build response based on format.
160166
resp := proto.ClipboardResponse{
161167
OK: true,
@@ -170,6 +176,13 @@ func (s *Server) handleClipboardGet(w http.ResponseWriter, r *http.Request) {
170176
resp.Text = string(content.Data)
171177
}
172178

179+
if analysis != nil {
180+
resp.Analysis = &proto.ClipboardAnalysis{
181+
OCRText: analysis.OCRText,
182+
Description: analysis.Description,
183+
}
184+
}
185+
173186
// Step 7: Add to history if enabled.
174187
entryID := ""
175188
if s.history != nil {
@@ -405,6 +418,25 @@ func (s *Server) handleClipboardHistory(w http.ResponseWriter, r *http.Request)
405418
resp.Text = string(data)
406419
}
407420

421+
// VisionPaste: demand-driven re-analysis on history fetch (explicit read of
422+
// historical bytes). Keeps history storage unchanged (raw only) while still
423+
// delivering rich context + sidecar for agents using --list/--index.
424+
// Only runs if vision enabled; fail-open.
425+
//
426+
// Note: intentionally operates on stored historical bytes and therefore
427+
// skips the live IsConcealed + redaction + processor gates that protect
428+
// the primary /clipboard read path (those checks are impossible on past data).
429+
// Historical entries were already vetted at original capture time.
430+
if entry.Format == "png" {
431+
tmp := &clipboard.Content{Data: data, Format: entry.Format}
432+
if ar := s.analysis.Analyze(r.Context(), tmp); ar != nil {
433+
resp.Analysis = &proto.ClipboardAnalysis{
434+
OCRText: ar.OCRText,
435+
Description: ar.Description,
436+
}
437+
}
438+
}
439+
408440
w.Header().Set("Content-Type", "application/json")
409441
json.NewEncoder(w).Encode(resp)
410442
return

internal/server/processor.go

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"context"
66
"fmt"
77
"os/exec"
8+
"strings"
89
"time"
910

1011
"github.com/pastelocal/pastelocal/internal/clipboard"
@@ -122,3 +123,130 @@ func (p *ProcessorPipeline) runCommand(ctx context.Context, name, command string
122123
func (p *ProcessorPipeline) StepCount() int {
123124
return len(p.chain)
124125
}
126+
127+
// --- VisionPaste analysis pipeline (v1) ---
128+
129+
// AnalysisResult is the text metadata the vision pipeline extracted from one
// image.
type AnalysisResult struct {
	// OCRText collects the output of OCR-style steps, one step per line.
	OCRText string
	// Description collects the output of description-style steps.
	Description string
}
134+
135+
// analysisLogger is the logging surface the analysis pipeline depends on.
type analysisLogger interface {
	Error(string, ...interface{})
	Warn(string, ...interface{})
}

// AnalysisPipeline runs the configured external commands that turn a
// screenshot into OCR text and descriptions. Each command is given the PNG
// bytes on stdin and is expected to print useful text on stdout. The pipeline
// is best-effort: a failing step is logged and skipped (fail-open) so that
// clipboard flow is never blocked by analysis.
type AnalysisPipeline struct {
	chain   []analysisStep // steps, in execution order
	timeout time.Duration  // limit applied to each command
	logger  analysisLogger
}

// analysisStep is one configured command plus the name that decides which
// result field receives its output.
type analysisStep struct {
	name    string
	command string
}
152+
153+
// NewAnalysisPipeline builds from config (parallel to NewProcessorPipeline).
154+
func NewAnalysisPipeline(cfg *config.Config, logger interface {
155+
Error(string, ...interface{})
156+
Warn(string, ...interface{})
157+
}) *AnalysisPipeline {
158+
if !cfg.Vision.Enabled || len(cfg.Vision.Chain) == 0 {
159+
return &AnalysisPipeline{timeout: time.Duration(cfg.Vision.Timeout) * time.Second, logger: logger}
160+
}
161+
timeout := time.Duration(cfg.Vision.Timeout) * time.Second
162+
if timeout == 0 {
163+
timeout = 15 * time.Second
164+
}
165+
166+
var steps []analysisStep
167+
for _, v := range cfg.Vision.Chain {
168+
steps = append(steps, analysisStep{
169+
name: v.Name,
170+
command: v.Command,
171+
})
172+
}
173+
return &AnalysisPipeline{
174+
chain: steps,
175+
timeout: timeout,
176+
logger: logger,
177+
}
178+
}
179+
180+
// Analyze runs the analysis chain on png content and returns collected results.
181+
// Returns nil if disabled, not an image, or no useful output produced.
182+
// Errors from individual steps are logged as warnings and skipped (fail-open).
183+
func (a *AnalysisPipeline) Analyze(ctx context.Context, content *clipboard.Content) *AnalysisResult {
184+
if len(a.chain) == 0 || content == nil || content.Format != "png" || len(content.Data) == 0 {
185+
return nil
186+
}
187+
188+
res := &AnalysisResult{}
189+
for _, step := range a.chain {
190+
text, err := a.runTextCommand(ctx, step.name, step.command, content.Data)
191+
if err != nil {
192+
a.logger.Warn("analysis step failed, skipping", "step", step.name, "error", err)
193+
continue
194+
}
195+
text = strings.TrimSpace(text)
196+
if text == "" {
197+
continue
198+
}
199+
// Map step name to result field. Unknown names go to OCRText with prefix.
200+
switch strings.ToLower(step.name) {
201+
case "ocr", "tesseract", "text", "extract":
202+
if res.OCRText != "" {
203+
res.OCRText += "\n"
204+
}
205+
res.OCRText += text
206+
case "describe", "caption", "summary", "vision", "alt":
207+
if res.Description != "" {
208+
res.Description += " "
209+
}
210+
res.Description += text
211+
default:
212+
if res.OCRText != "" {
213+
res.OCRText += "\n"
214+
}
215+
res.OCRText += step.name + ": " + text
216+
}
217+
}
218+
219+
if res.OCRText == "" && res.Description == "" {
220+
return nil
221+
}
222+
return res
223+
}
224+
225+
// runTextCommand executes a shell command with image data on stdin and returns
226+
// the stdout as text (analysis output). Mirrors the processor runCommand pattern
227+
// but for text-producing commands and with slightly longer default timeout.
228+
func (a *AnalysisPipeline) runTextCommand(ctx context.Context, name, command string, data []byte) (string, error) {
229+
procCtx, cancel := context.WithTimeout(ctx, a.timeout)
230+
defer cancel()
231+
232+
cmd := exec.CommandContext(procCtx, "sh", "-c", command)
233+
cmd.Stdin = bytes.NewReader(data)
234+
var stdout, stderr bytes.Buffer
235+
cmd.Stdout = &stdout
236+
cmd.Stderr = &stderr
237+
238+
if err := cmd.Run(); err != nil {
239+
return "", fmt.Errorf("analysis %q failed: %w: %s", name, err, stderr.String())
240+
}
241+
242+
result := strings.TrimSpace(stdout.String())
243+
if result == "" {
244+
return "", fmt.Errorf("analysis %q produced no output", name)
245+
}
246+
return result, nil
247+
}
248+
249+
// AnalysisStepCount returns how many analysis steps are configured.
250+
func (a *AnalysisPipeline) AnalysisStepCount() int {
251+
return len(a.chain)
252+
}

0 commit comments

Comments
 (0)