Skip to content

Commit 5d6cb4b

Browse files
committed
feat(agui): Add multimodal input support (image/video/audio) to AguiMessage and AguiMessageConverter
Adds support for multimodal input (image, video, audio, document) in the AG-UI extension, aligning with the AG-UI Protocol InputContent specification. Changes: (1) AguiMessage.content changes from String to Object (backward compatible); (2) AguiMessageConverter converts InputContent[] to ContentBlock; (3) both url and data (base64) source types are supported.
1 parent c3f302c commit 5d6cb4b

4 files changed

Lines changed: 465 additions & 6 deletions

File tree

agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/converter/AguiMessageConverter.java

Lines changed: 116 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,18 @@
1919
import io.agentscope.core.agui.model.AguiFunctionCall;
2020
import io.agentscope.core.agui.model.AguiMessage;
2121
import io.agentscope.core.agui.model.AguiToolCall;
22+
import io.agentscope.core.message.AudioBlock;
23+
import io.agentscope.core.message.Base64Source;
2224
import io.agentscope.core.message.ContentBlock;
25+
import io.agentscope.core.message.ImageBlock;
2326
import io.agentscope.core.message.Msg;
2427
import io.agentscope.core.message.MsgRole;
28+
import io.agentscope.core.message.Source;
2529
import io.agentscope.core.message.TextBlock;
2630
import io.agentscope.core.message.ToolResultBlock;
2731
import io.agentscope.core.message.ToolUseBlock;
32+
import io.agentscope.core.message.URLSource;
33+
import io.agentscope.core.message.VideoBlock;
2834
import io.agentscope.core.util.JsonException;
2935
import io.agentscope.core.util.JsonUtils;
3036
import java.util.ArrayList;
@@ -37,6 +43,17 @@
3743
*
3844
* <p>This class handles the bidirectional conversion between the AG-UI protocol's
3945
* message format and AgentScope's internal message format.
46+
*
47+
* <p>Supports multimodal input per AG-UI protocol:
48+
* <ul>
49+
* <li>{@code text} → {@link TextBlock}</li>
50+
* <li>{@code image} → {@link ImageBlock}</li>
51+
* <li>{@code video} → {@link VideoBlock}</li>
52+
* <li>{@code audio} → {@link AudioBlock}</li>
53+
* <li>{@code document} → {@link TextBlock} (with description)</li>
54+
* </ul>
55+
*
56+
* <p>See https://docs.ag-ui.com/concepts/messages.md for AG-UI InputContent spec.
4057
*/
4158
public class AguiMessageConverter {
4259
/**
@@ -54,8 +71,20 @@ public Msg toMsg(AguiMessage aguiMessage) {
5471
MsgRole role = convertRole(aguiMessage.getRole());
5572
List<ContentBlock> blocks = new ArrayList<>();
5673

57-
// Add text content if present
58-
if (aguiMessage.getContent() != null && !aguiMessage.getContent().isEmpty()) {
74+
// Handle multimodal content (InputContent array per AG-UI protocol)
75+
if (aguiMessage.isMultimodalContent()) {
76+
List<Map<String, Object>> parts = aguiMessage.getMultimodalContent();
77+
if (parts != null) {
78+
for (Map<String, Object> part : parts) {
79+
ContentBlock block = convertInputContent(part);
80+
if (block != null) {
81+
blocks.add(block);
82+
}
83+
}
84+
}
85+
}
86+
// Handle simple text content (backward compatible)
87+
else if (aguiMessage.getContent() != null && !aguiMessage.getContent().isEmpty()) {
5988
if (aguiMessage.isToolMessage() && aguiMessage.getToolCallId() != null) {
6089
// For tool messages, wrap content in ToolResultBlock
6190
blocks.add(
@@ -78,6 +107,91 @@ public Msg toMsg(AguiMessage aguiMessage) {
78107
return Msg.builder().id(aguiMessage.getId()).role(role).content(blocks).build();
79108
}
80109

110+
/**
111+
* Convert a single AG-UI InputContent part to an AgentScope ContentBlock.
112+
*
113+
* @param part The InputContent map from AG-UI protocol
114+
* @return The converted ContentBlock, or null if type is unrecognized
115+
*/
116+
@SuppressWarnings("unchecked")
117+
private ContentBlock convertInputContent(Map<String, Object> part) {
118+
String type = (String) part.get("type");
119+
if (type == null) {
120+
return null;
121+
}
122+
123+
switch (type) {
124+
case "text":
125+
String text = (String) part.get("text");
126+
return text != null ? TextBlock.builder().text(text).build() : null;
127+
128+
case "image":
129+
Source source = extractSource(part);
130+
return source != null ? ImageBlock.builder().source(source).build() : null;
131+
132+
case "video":
133+
Source videoSource = extractSource(part);
134+
return videoSource != null
135+
? VideoBlock.builder().source(videoSource).build()
136+
: null;
137+
138+
case "audio":
139+
Source audioSource = extractSource(part);
140+
return audioSource != null
141+
? AudioBlock.builder().source(audioSource).build()
142+
: null;
143+
144+
case "document":
145+
// Convert document to TextBlock with description
146+
Source docSource = extractSource(part);
147+
if (docSource != null) {
148+
String docDesc = "[Document: " + extractMimeType(part) + "]";
149+
return TextBlock.builder().text(docDesc).build();
150+
}
151+
return null;
152+
153+
default:
154+
return null;
155+
}
156+
}
157+
158+
/**
159+
* Extract Source from an InputContent part.
160+
* Supports both 'url' and 'data' (base64) source types.
161+
*/
162+
@SuppressWarnings("unchecked")
163+
private Source extractSource(Map<String, Object> part) {
164+
Map<String, Object> sourceMap = (Map<String, Object>) part.get("source");
165+
if (sourceMap == null) {
166+
return null;
167+
}
168+
169+
String sourceType = (String) sourceMap.get("type");
170+
if ("url".equals(sourceType)) {
171+
String url = (String) sourceMap.get("value");
172+
return url != null ? new URLSource(url) : null;
173+
} else if ("data".equals(sourceType)) {
174+
String data = (String) sourceMap.get("value");
175+
String mimeType = (String) sourceMap.get("mimeType");
176+
if (data != null && mimeType != null) {
177+
return new Base64Source(data, mimeType);
178+
}
179+
}
180+
return null;
181+
}
182+
183+
/**
184+
* Extract mimeType from an InputContent part (for document type).
185+
*/
186+
private String extractMimeType(Map<String, Object> part) {
187+
@SuppressWarnings("unchecked")
188+
Map<String, Object> sourceMap = (Map<String, Object>) part.get("source");
189+
if (sourceMap != null) {
190+
return (String) sourceMap.get("mimeType");
191+
}
192+
return null;
193+
}
194+
81195
/**
82196
* Convert an AgentScope message to an AG-UI message.
83197
*

agentscope-extensions/agentscope-extensions-agui/src/main/java/io/agentscope/core/agui/model/AguiMessage.java

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import com.fasterxml.jackson.annotation.JsonProperty;
2020
import java.util.Collections;
2121
import java.util.List;
22+
import java.util.Map;
2223
import java.util.Objects;
2324

2425
/**
@@ -34,12 +35,24 @@
3435
* <li>system - System instructions</li>
3536
* <li>tool - Tool execution results</li>
3637
* </ul>
38+
*
39+
* <p>Content can be a simple string or a multimodal array of
40+
* {@code InputContent} objects (per AG-UI protocol).
41+
* See https://docs.ag-ui.com/concepts/messages.md for details.
42+
*
43+
* <p>InputContent array element structure:
44+
* <pre>{@code
45+
* { "type": "text", "text": "Hello" }
46+
* { "type": "image", "source": { "type": "url", "value": "https://...", "mimeType": "image/png" } }
47+
* { "type": "video", "source": { "type": "url", "value": "https://...", "mimeType": "video/mp4" } }
48+
* { "type": "audio", "source": { "type": "url", "value": "https://...", "mimeType": "audio/wav" } }
49+
* }</pre>
3750
*/
3851
public class AguiMessage {
3952

4053
private final String id;
4154
private final String role;
42-
private final String content;
55+
private final Object content; // String or List<Map<String, Object>> for multimodal
4356
private final List<AguiToolCall> toolCalls;
4457
private final String toolCallId;
4558

@@ -48,15 +61,16 @@ public class AguiMessage {
4861
*
4962
* @param id The unique message ID
5063
* @param role The message role (user, assistant, system, tool)
51-
* @param content The message content
64+
* @param content The message content - may be a String or a List of InputContent objects
65+
* (multimodal input per AG-UI protocol)
5266
* @param toolCalls Tool calls for assistant messages (optional)
5367
* @param toolCallId Tool call ID for tool messages (optional)
5468
*/
5569
@JsonCreator
5670
public AguiMessage(
5771
@JsonProperty("id") String id,
5872
@JsonProperty("role") String role,
59-
@JsonProperty("content") String content,
73+
@JsonProperty("content") Object content,
6074
@JsonProperty("toolCalls") List<AguiToolCall> toolCalls,
6175
@JsonProperty("toolCallId") String toolCallId) {
6276
this.id = Objects.requireNonNull(id, "id cannot be null");
@@ -135,12 +149,44 @@ public String getRole() {
135149
/**
136150
* Get the message content.
137151
*
138-
* @return The content, may be null
152+
* @return The content as a String if it is a simple text message, or null if
153+
* the content is multimodal (InputContent array). Use {@link #getContentObject()}
154+
* for full multimodal support.
139155
*/
140156
public String getContent() {
157+
return content instanceof String ? (String) content : null;
158+
}
159+
160+
/**
161+
* Get the raw content object.
162+
*
163+
* @return The content as an Object - either a String for simple text messages
164+
* or a List of InputContent maps for multimodal messages.
165+
*/
166+
public Object getContentObject() {
141167
return content;
142168
}
143169

170+
/**
171+
* Check if this message contains multimodal content (InputContent array).
172+
*
173+
* @return true if content is a List (multimodal), false if it's a String or null
174+
*/
175+
public boolean isMultimodalContent() {
176+
return content instanceof List;
177+
}
178+
179+
/**
180+
* Get the multimodal content as a list of InputContent objects.
181+
* Each item is a Map with keys: type, text/source/etc.
182+
*
183+
* @return The content as a List if it is multimodal, or null if it's a simple String
184+
*/
185+
@SuppressWarnings("unchecked")
186+
public List<Map<String, Object>> getMultimodalContent() {
187+
return content instanceof List ? (List<Map<String, Object>>) content : null;
188+
}
189+
144190
/**
145191
* Get the tool calls (for assistant messages).
146192
*

0 commit comments

Comments
 (0)