Skip to content

Commit 2ec2f34

Browse files
committed
feat(cli): 支持多模态消息内容及图片URL数组
- 扩展聊天消息内容类型,支持文本和图片URL的数组形式
- 处理 --image 参数,将图片URL作为多模态内容附加到最后一条用户消息
- 若无用户消息且指定图片URL,自动创建空用户消息以承载图片内容
- 禁止同时使用内嵌图片内容和 --image 参数,避免冲突
- 将知识搜索接口请求的图片参数字段 image_list 重命名为 images
- 单元测试覆盖多模态内容及图片数组行为验证
- 优化消息解析,支持JSON结构化消息和 role:content 格式
- 更新API类型声明,明确多模态消息结构与字段类型
1 parent d2aa8ca commit 2ec2f34

7 files changed

Lines changed: 170 additions & 29 deletions

File tree

packages/cli/tests/e2e/knowledge-chat.e2e.test.ts

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,21 @@ import { tmpdir } from "os";
22
import { describe, expect, test } from "vite-plus/test";
33
import { parseStdoutJson, runCli } from "./helpers.ts";
44

5+
/**
 * One entry of an array-form message `content` in the dry-run request body.
 * The tests in this file expect text entries shaped as `{ type: "text", text }`
 * and image entries shaped as `{ type: "image_url", image_url: { url } }`.
 */
interface ContentPart {
6+
type: string;
7+
text?: string;
8+
image_url?: { url: string };
9+
}
10+
511
interface DryRunBody {
612
endpoint?: string;
713
request?: {
814
input?: {
9-
messages?: Array<{ role: string; content: string }>;
15+
messages?: Array<{ role: string; content: string | ContentPart[] }>;
1016
};
1117
parameters?: {
1218
agent_options?: {
1319
agent_id?: string;
14-
image_list?: string[];
1520
};
1621
};
1722
stream?: boolean;
@@ -135,7 +140,7 @@ describe("e2e: knowledge chat", () => {
135140
expect(msgs[2]?.content).toBe("它怎么工作");
136141
});
137142

138-
test("--dry-run + --image 输出 image_list", async () => {
143+
test("--dry-run + --image 输出多模态 content 数组", async () => {
139144
const { stdout, stderr, exitCode } = await runCli(
140145
[
141146
"knowledge",
@@ -157,8 +162,50 @@ describe("e2e: knowledge chat", () => {
157162
);
158163
expect(exitCode, stderr).toBe(0);
159164
const data = parseStdoutJson<DryRunBody>(stdout);
160-
expect(data.request?.parameters?.agent_options?.image_list).toEqual([
161-
"https://example.com/img.jpg",
162-
]);
165+
const lastMsg = data.request?.input?.messages?.[0];
166+
expect(lastMsg?.role).toBe("user");
167+
expect(Array.isArray(lastMsg?.content)).toBe(true);
168+
const parts = lastMsg?.content as ContentPart[];
169+
expect(parts[0]).toEqual({ type: "text", text: "描述这张图" });
170+
expect(parts[1]).toEqual({
171+
type: "image_url",
172+
image_url: { url: "https://example.com/img.jpg" },
173+
});
174+
});
175+
176+
test("--dry-run + --image 无 --message 自动创建空 user message", async () => {
177+
const { stdout, stderr, exitCode } = await runCli(
178+
[
179+
"knowledge",
180+
"chat",
181+
"--dry-run",
182+
"--agent-id",
183+
"aid_test",
184+
"--workspace-id",
185+
"ws_test",
186+
"--image",
187+
"https://example.com/a.png",
188+
"--image",
189+
"https://example.com/b.png",
190+
"--non-interactive",
191+
"--output",
192+
"json",
193+
],
194+
{ DASHSCOPE_API_KEY: "sk-fake-for-dryrun" },
195+
);
196+
expect(exitCode, stderr).toBe(0);
197+
const data = parseStdoutJson<DryRunBody>(stdout);
198+
const lastMsg = data.request?.input?.messages?.[0];
199+
expect(lastMsg?.role).toBe("user");
200+
const parts = lastMsg?.content as ContentPart[];
201+
expect(parts[0]).toEqual({ type: "text", text: "" });
202+
expect(parts[1]).toEqual({
203+
type: "image_url",
204+
image_url: { url: "https://example.com/a.png" },
205+
});
206+
expect(parts[2]).toEqual({
207+
type: "image_url",
208+
image_url: { url: "https://example.com/b.png" },
209+
});
163210
});
164211
});

packages/cli/tests/e2e/knowledge-search.e2e.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ interface DryRunBody {
77
request?: {
88
query?: string;
99
agent_id?: string;
10-
image_list?: string[];
10+
images?: string[];
1111
query_history?: Array<{ role: string; content: string }>;
1212
};
1313
}
@@ -96,7 +96,7 @@ describe("e2e: knowledge search", () => {
9696
expect(data.request?.agent_id).toBe("aid_test");
9797
});
9898

99-
test("--dry-run + --image 输出 image_list", async () => {
99+
test("--dry-run + --image 输出 images", async () => {
100100
const { stdout, stderr, exitCode } = await runCli(
101101
[
102102
"knowledge",
@@ -120,7 +120,7 @@ describe("e2e: knowledge search", () => {
120120
);
121121
expect(exitCode, stderr).toBe(0);
122122
const data = parseStdoutJson<DryRunBody>(stdout);
123-
expect(data.request?.image_list).toEqual([
123+
expect(data.request?.images).toEqual([
124124
"https://example.com/a.jpg",
125125
"https://example.com/b.jpg",
126126
]);

packages/commands/src/commands/knowledge/chat.ts

Lines changed: 95 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,22 +9,40 @@ import {
99
isInteractive,
1010
type Config,
1111
type GlobalFlags,
12+
type KnowledgeChatContentPart,
13+
type KnowledgeChatMessage,
1214
type KnowledgeChatRequest,
1315
type KnowledgeChatStreamChunk,
1416
} from "bailian-cli-core";
1517
import { failIfMissing, cmdUsage, emitResult, emitBare, promptText } from "bailian-cli-runtime";
1618

17-
interface ParsedMessage {
18-
role: "user" | "assistant";
19-
content: string;
20-
}
21-
22-
function parseMessages(flags: GlobalFlags): ParsedMessage[] {
23-
const messages: ParsedMessage[] = [];
19+
/**
20+
* Parse --message flags into KnowledgeChatMessage[].
21+
* Supports:
22+
* 1. Simple text: "hello" → {role:"user", content:"hello"}
23+
* 2. Role prefix: "user:hello" / "assistant:hi" → {role, content}
24+
* 3. JSON object: '{"role":"user","content":[...]}' → structured message (advanced)
25+
*/
26+
function parseMessages(flags: GlobalFlags): KnowledgeChatMessage[] {
27+
const messages: KnowledgeChatMessage[] = [];
2428
if (flags.message) {
2529
const validRoles = new Set(["user", "assistant"]);
2630
const msgs = flags.message as string[];
2731
for (const m of msgs) {
32+
// Try JSON object first (advanced usage)
33+
if (m.startsWith("{")) {
34+
try {
35+
const parsed = JSON.parse(m) as { role?: string; content?: unknown };
36+
if (parsed.role && validRoles.has(parsed.role) && parsed.content !== undefined) {
37+
messages.push(parsed as KnowledgeChatMessage);
38+
continue;
39+
}
40+
} catch {
41+
// Not valid JSON, fall through to simple parsing
42+
}
43+
}
44+
45+
// Simple role:content or plain text
2846
const colonIdx = m.indexOf(":");
2947
const maybeRole = colonIdx !== -1 ? m.slice(0, colonIdx) : "";
3048

@@ -38,6 +56,55 @@ function parseMessages(flags: GlobalFlags): ParsedMessage[] {
3856
return messages;
3957
}
4058

59+
/**
 * Report whether any message already carries an `image_url` content part.
 *
 * Messages with string content never count as containing images. Array
 * content is inspected defensively: `parseMessages` pushes JSON supplied via
 * `--message` after only a type cast, so entries may be `null` or
 * non-objects. Such entries are skipped instead of raising a `TypeError`.
 *
 * @param messages - Chat messages to inspect.
 * @returns `true` when at least one array-form content holds an entry whose
 *   `type` is `"image_url"`, otherwise `false`.
 */
function hasEmbeddedImages(messages: KnowledgeChatMessage[]): boolean {
  return messages.some(
    (msg) =>
      Array.isArray(msg.content) &&
      msg.content.some(
        (part: unknown) =>
          typeof part === "object" &&
          part !== null &&
          (part as { type?: unknown }).type === "image_url",
      ),
  );
}
68+
69+
/** Attach --image URLs to the last user message's content (as multimodal array) */
70+
function attachImagesToLastUserMessage(
71+
messages: KnowledgeChatMessage[],
72+
imageUrls: string[],
73+
): void {
74+
// Find last user message index
75+
let lastUserIdx = -1;
76+
for (let i = messages.length - 1; i >= 0; i--) {
77+
if (messages[i]!.role === "user") {
78+
lastUserIdx = i;
79+
break;
80+
}
81+
}
82+
83+
// If no user message exists, append an empty one
84+
if (lastUserIdx === -1) {
85+
messages.push({ role: "user", content: "" });
86+
lastUserIdx = messages.length - 1;
87+
}
88+
89+
const target = messages[lastUserIdx]!;
90+
const contentParts: KnowledgeChatContentPart[] = [];
91+
92+
// Preserve existing text content (always include a text part, even if empty)
93+
if (typeof target.content === "string") {
94+
contentParts.push({ type: "text", text: target.content });
95+
} else {
96+
// Already an array, extend it
97+
contentParts.push(...target.content);
98+
}
99+
100+
// Append image parts
101+
for (const url of imageUrls) {
102+
contentParts.push({ type: "image_url", image_url: { url } });
103+
}
104+
105+
target.content = contentParts;
106+
}
107+
41108
/** SSE step_change → human-friendly progress label (TTY only) */
42109
const STEP_LABELS: Record<string, string> = {
43110
tool_calling: "🔍 Retrieving...",
@@ -67,7 +134,8 @@ export default defineCommand({
67134
},
68135
{
69136
flag: "--image <url>",
70-
description: "Image URL(s) (repeatable)",
137+
description:
138+
"Image URL (repeatable). Attached to the last user message as multimodal content",
71139
type: "array",
72140
},
73141
],
@@ -80,12 +148,19 @@ export default defineCommand({
80148
exampleArgs: [
81149
'--message "What is RAG?" --agent-id aid-xxx --workspace-id ws-xxx',
82150
'--message "user:What is RAG?" --message "assistant:RAG is..." --message "How does it work?" --agent-id aid-xxx --workspace-id ws-xxx',
151+
'--message "Describe these images" --image https://example.com/a.png --image https://example.com/b.png --agent-id aid-xxx --workspace-id ws-xxx',
83152
],
84153
async run(config: Config, flags: GlobalFlags) {
85154
let messages = parseMessages(flags);
86155

156+
const imageUrls = flags.image as string[] | undefined;
157+
const hasImages = imageUrls && imageUrls.length > 0;
158+
87159
if (messages.length === 0) {
88-
if (isInteractive({ nonInteractive: config.nonInteractive })) {
160+
if (hasImages) {
161+
// --image without --message: create an empty user message to hold images
162+
messages = [{ role: "user", content: "" }];
163+
} else if (isInteractive({ nonInteractive: config.nonInteractive })) {
89164
const hint = await promptText({ message: "Enter your message:" });
90165
if (!hint) {
91166
process.stderr.write("Chat cancelled.\n");
@@ -113,6 +188,17 @@ export default defineCommand({
113188
// API only supports SSE; streamOutput controls whether to print tokens in real-time
114189
const streamOutput = format === "text" && !!process.stdout.isTTY;
115190

191+
// Attach --image URLs to messages (multimodal content array)
192+
if (hasImages) {
193+
if (hasEmbeddedImages(messages)) {
194+
throw new BailianError(
195+
"Cannot use --image when messages already contain embedded image_url content parts. Use one approach or the other.",
196+
ExitCode.USAGE,
197+
);
198+
}
199+
attachImagesToLastUserMessage(messages, imageUrls!);
200+
}
201+
116202
const body: KnowledgeChatRequest = {
117203
input: {
118204
messages,
@@ -125,11 +211,6 @@ export default defineCommand({
125211
stream: true,
126212
};
127213

128-
const imageUrls = flags.image as string[] | undefined;
129-
if (imageUrls && imageUrls.length > 0) {
130-
body.parameters.agent_options.image_list = imageUrls;
131-
}
132-
133214
const url = knowledgeChatEndpoint(workspaceId);
134215

135216
if (config.dryRun) {

packages/commands/src/commands/knowledge/search.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ export default defineCommand({
8989

9090
const imageUrls = flags.image as string[] | undefined;
9191
if (imageUrls && imageUrls.length > 0) {
92-
body.image_list = imageUrls;
92+
body.images = imageUrls;
9393
}
9494

9595
// Parse query_history JSON for multi-turn context

packages/core/src/types/api.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ export interface DashScopeKnowledgeRetrieveResponse {
422422
export interface KnowledgeSearchRequest {
423423
query: string;
424424
agent_id: string;
425-
image_list?: string[];
425+
images?: string[];
426426
query_history?: Array<{ role: "user" | "assistant"; content: string }>;
427427
}
428428

@@ -456,15 +456,22 @@ export interface KnowledgeSearchResponse {
456456

457457
// ---- Knowledge Chat (新版 RAG 问答 SSE API, agent_id-based) ----
458458

459+
/**
 * One element of an array-form chat message `content`, discriminated by `type`:
 * a `text` part carries a string, an `image_url` part carries an object with
 * the image `url`.
 */
export type KnowledgeChatContentPart =
460+
| { type: "text"; text: string }
461+
| { type: "image_url"; image_url: { url: string } };
462+
463+
/**
 * A single message in a knowledge chat request. `content` is either plain
 * text or an array of content parts (text and/or image URLs).
 */
export interface KnowledgeChatMessage {
464+
role: "user" | "assistant";
465+
content: string | KnowledgeChatContentPart[];
466+
}
467+
459468
export interface KnowledgeChatRequest {
460469
input: {
461-
messages: Array<{ role: "user" | "assistant"; content: string }>;
462-
request_id?: string;
470+
messages: KnowledgeChatMessage[];
463471
};
464472
parameters: {
465473
agent_options: {
466474
agent_id: string;
467-
image_list?: string[];
468475
user?: {
469476
user_id?: string;
470477
workspace_id?: string;

packages/core/src/types/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ export type {
2323
DashScopeVideoEditRequest,
2424
DashScopeVideoRefRequest,
2525
DashScopeVideoRequest,
26+
KnowledgeChatContentPart,
27+
KnowledgeChatMessage,
2628
KnowledgeChatRequest,
2729
KnowledgeChatStreamChunk,
2830
KnowledgeRetrieveRequest,

skills/bailian-cli/reference/knowledge.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ Index: [index.md](index.md)
3030
| `--message <text>` | array | yes | Message text (repeatable). Supports role:content prefix to set role (e.g. user:hello), defaults to user. Follows OpenAI message format |
3131
| `--agent-id <id>` | string | yes | Q&A service ID (find in console knowledge Q&A page) |
3232
| `--workspace-id <id>` | string | no | Workspace ID for API endpoint URL (or set BAILIAN_WORKSPACE_ID) |
33-
| `--image <url>` | array | no | Image URL(s) (repeatable) |
33+
| `--image <url>` | array | no | Image URL (repeatable). Attached to the last user message as multimodal content |
3434

3535
#### Notes
3636

@@ -49,6 +49,10 @@ bl knowledge chat --message "What is RAG?" --agent-id aid-xxx --workspace-id ws-
4949
bl knowledge chat --message "user:What is RAG?" --message "assistant:RAG is..." --message "How does it work?" --agent-id aid-xxx --workspace-id ws-xxx
5050
```
5151

52+
```bash
53+
bl knowledge chat --message "Describe these images" --image https://example.com/a.png --image https://example.com/b.png --agent-id aid-xxx --workspace-id ws-xxx
54+
```
55+
5256
### `bl knowledge retrieve`
5357

5458
| Field | Value |

0 commit comments

Comments
 (0)