Skip to content

Commit d02100b

Browse files
committed
fix gemini image responses, gate screenshot tool appropriately based on api capabilities
1 parent 1fbc09b commit d02100b

6 files changed

Lines changed: 57 additions & 11 deletions

File tree

docs/docs/waveai.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Controls AI's access to your workspace:
3434

3535
## File Attachments
3636

37-
Drag files onto the AI panel to attach:
37+
Drag files onto the AI panel to attach (not supported with all models):
3838

3939
| Type | Formats | Size Limit | Notes |
4040
|------|---------|------------|-------|
@@ -68,7 +68,7 @@ Supports text files, images, PDFs, and directories. Use `-n` for new chat, `-s`
6868
- **Navigate Web**: Changes URLs in web browser widgets
6969

7070
### All Widgets
71-
- **Capture Screenshots**: Takes screenshots of any widget for visual analysis
71+
- **Capture Screenshots**: Takes screenshots of any widget for visual analysis (not supported with all models)
7272

7373
:::warning Security
7474
File system operations require explicit approval. You control all file access.

pkg/aiusechat/aiutil/aiutil.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,13 @@ func CheckModelSubPrefix(model string, prefix string) bool {
209209
return false
210210
}
211211

212+
// GeminiSupportsImageToolResults returns true if the model supports multimodal function responses (images in tool results)
213+
// This is only supported by Gemini 3 Pro and later models; detection is a case-insensitive substring match on "gemini-3" or "gemini-4" in the model name
214+
func GeminiSupportsImageToolResults(model string) bool {
215+
m := strings.ToLower(model)
216+
return strings.Contains(m, "gemini-3") || strings.Contains(m, "gemini-4")
217+
}
218+
212219
// CreateToolUseData creates a UIMessageDataToolUse from tool call information
213220
func CreateToolUseData(toolCallID, toolName string, arguments string, chatOpts uctypes.WaveChatOpts) uctypes.UIMessageDataToolUse {
214221
toolUseData := uctypes.UIMessageDataToolUse{

pkg/aiusechat/gemini/gemini-convertmessage.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/google/uuid"
1414
"github.com/wavetermdev/waveterm/pkg/aiusechat/aiutil"
1515
"github.com/wavetermdev/waveterm/pkg/aiusechat/uctypes"
16+
"github.com/wavetermdev/waveterm/pkg/util/utilfn"
1617
)
1718

1819
// cleanSchemaForGemini removes fields from JSON Schema that Gemini doesn't accept
@@ -232,9 +233,36 @@ func ConvertToolResultsToGeminiChatMessage(toolResults []uctypes.AIToolResult) (
232233
}
233234

234235
response := make(map[string]any)
236+
var nestedParts []GeminiMessagePart
237+
235238
if result.ErrorText != "" {
236239
response["ok"] = false
237240
response["error"] = result.ErrorText
241+
} else if strings.HasPrefix(result.Text, "data:") {
242+
mimeType, base64Data, err := utilfn.DecodeDataURL(result.Text)
243+
if err != nil {
244+
log.Printf("gemini: failed to decode data URL in tool result: %v\n", err)
245+
response["ok"] = false
246+
response["error"] = fmt.Sprintf("failed to decode data URL: %v", err)
247+
} else if strings.HasPrefix(mimeType, "image/") {
248+
// For image data URLs, use multimodal function response (Gemini 3 Pro+)
249+
displayName := fmt.Sprintf("result_%s.%s", result.ToolUseID[:8], strings.TrimPrefix(mimeType, "image/"))
250+
response["ok"] = true
251+
response["image"] = map[string]string{"$ref": displayName}
252+
253+
// Add the image data as a nested part
254+
nestedParts = append(nestedParts, GeminiMessagePart{
255+
InlineData: &GeminiInlineData{
256+
MimeType: mimeType,
257+
Data: base64.StdEncoding.EncodeToString(base64Data),
258+
DisplayName: displayName,
259+
},
260+
})
261+
} else {
262+
log.Printf("gemini: unsupported data URL mimetype in tool result: %s\n", mimeType)
263+
response["ok"] = false
264+
response["error"] = fmt.Sprintf("unsupported data URL mimetype: %s", mimeType)
265+
}
238266
} else {
239267
response["ok"] = true
240268
response["result"] = result.Text
@@ -244,6 +272,7 @@ func ConvertToolResultsToGeminiChatMessage(toolResults []uctypes.AIToolResult) (
244272
FunctionResponse: &GeminiFunctionResponse{
245273
Name: result.ToolName,
246274
Response: response,
275+
Parts: nestedParts,
247276
},
248277
})
249278
}

pkg/aiusechat/gemini/gemini-types.go

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,16 @@ func (p *GeminiMessagePart) Clean() *GeminiMessagePart {
7979

8080
// GeminiInlineData represents inline binary data
8181
type GeminiInlineData struct {
82-
MimeType string `json:"mimeType"`
83-
Data string `json:"data"` // base64 encoded
82+
MimeType string `json:"mimeType"`
83+
Data string `json:"data"` // base64 encoded
84+
DisplayName string `json:"displayName,omitempty"` // for multimodal function responses
8485
}
8586

8687
// GeminiFileData represents uploaded file reference
8788
type GeminiFileData struct {
88-
MimeType string `json:"mimeType"`
89-
FileUri string `json:"fileUri"` // gs:// URI from file upload
89+
MimeType string `json:"mimeType"`
90+
FileUri string `json:"fileUri"` // gs:// URI from file upload
91+
DisplayName string `json:"displayName,omitempty"` // for multimodal function responses
9092
}
9193

9294
// GeminiFunctionCall represents a function call from the model
@@ -97,8 +99,9 @@ type GeminiFunctionCall struct {
9799

98100
// GeminiFunctionResponse represents a function execution result
99101
type GeminiFunctionResponse struct {
100-
Name string `json:"name"`
101-
Response map[string]any `json:"response"`
102+
Name string `json:"name"`
103+
Response map[string]any `json:"response"`
104+
Parts []GeminiMessagePart `json:"parts,omitempty"` // nested parts for multimodal content (Gemini 3 Pro and later)
102105
}
103106

104107
// GeminiUsageMetadata represents token usage

pkg/aiusechat/tools.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strings"
1111

1212
"github.com/google/uuid"
13+
"github.com/wavetermdev/waveterm/pkg/aiusechat/aiutil"
1314
"github.com/wavetermdev/waveterm/pkg/aiusechat/uctypes"
1415
"github.com/wavetermdev/waveterm/pkg/blockcontroller"
1516
"github.com/wavetermdev/waveterm/pkg/util/utilfn"
@@ -132,7 +133,7 @@ func MakeBlockShortDesc(block *waveobj.Block) string {
132133
}
133134
}
134135

135-
func GenerateTabStateAndTools(ctx context.Context, tabid string, widgetAccess bool) (string, []uctypes.ToolDefinition, error) {
136+
func GenerateTabStateAndTools(ctx context.Context, tabid string, widgetAccess bool, chatOpts *uctypes.WaveChatOpts) (string, []uctypes.ToolDefinition, error) {
136137
if tabid == "" {
137138
return "", nil, nil
138139
}
@@ -160,7 +161,13 @@ func GenerateTabStateAndTools(ctx context.Context, tabid string, widgetAccess bo
160161
// log.Printf("TABPROMPT %s\n", tabState)
161162
var tools []uctypes.ToolDefinition
162163
if widgetAccess {
163-
tools = append(tools, GetCaptureScreenshotToolDefinition(tabid))
164+
// Only add screenshot tool for:
165+
// - openai-responses API type
166+
// - google-gemini API type with Gemini 3+ models
167+
if chatOpts.Config.APIType == uctypes.APIType_OpenAIResponses ||
168+
(chatOpts.Config.APIType == uctypes.APIType_GoogleGemini && aiutil.GeminiSupportsImageToolResults(chatOpts.Config.Model)) {
169+
tools = append(tools, GetCaptureScreenshotToolDefinition(tabid))
170+
}
164171
tools = append(tools, GetReadTextFileToolDefinition())
165172
tools = append(tools, GetReadDirToolDefinition())
166173
tools = append(tools, GetWriteTextFileToolDefinition())

pkg/aiusechat/usechat.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,7 @@ func WaveAIPostMessageHandler(w http.ResponseWriter, r *http.Request) {
645645

646646
if req.TabId != "" {
647647
chatOpts.TabStateGenerator = func() (string, []uctypes.ToolDefinition, string, error) {
648-
tabState, tabTools, err := GenerateTabStateAndTools(r.Context(), req.TabId, req.WidgetAccess)
648+
tabState, tabTools, err := GenerateTabStateAndTools(r.Context(), req.TabId, req.WidgetAccess, &chatOpts)
649649
return tabState, tabTools, req.TabId, err
650650
}
651651
}

0 commit comments

Comments
 (0)