Skip to content

Commit 094ff0d

Browse files
feat(deepseek): add vision mode support
* feat(deepseek): add vision mode support. DeepSeek added a third model, "识图模式" (Vision Mode), that accepts image uploads for visual understanding. This change adds vision to the --model choices, updates selectModel to use an explicit index mapping for all three models, skips the search toggle in vision mode (where it is not available), and extends waitForFilePreview to detect image thumbnails via send button state, since vision mode shows a preview image instead of a filename label. It also catches "Not allowed" errors from setFileInput (Cloudflare may block CDP file operations) so that the DataTransfer fallback can run. Closes #1215 * fix(deepseek): harden vision upload mode (waitForFilePreview now requires a preview-like node for thumbnails; send-button readiness alone is no longer treated as proof of upload and is checked later) --------- Co-authored-by: jackwener <jakevingoo@gmail.com>
1 parent 25e8653 commit 094ff0d

6 files changed

Lines changed: 187 additions & 15 deletions

File tree

cli-manifest.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4380,10 +4380,11 @@
43804380
"type": "str",
43814381
"default": "instant",
43824382
"required": false,
4383-
"help": "Model to use: instant or expert",
4383+
"help": "Model to use: instant, expert, or vision",
43844384
"choices": [
43854385
"instant",
4386-
"expert"
4386+
"expert",
4387+
"vision"
43874388
]
43884389
},
43894390
{

clis/deepseek/ask.js

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ export const askCommand = cli({
1818
{ name: 'prompt', positional: true, required: true, help: 'Prompt to send' },
1919
{ name: 'timeout', type: 'int', default: 120, help: 'Max seconds to wait for response' },
2020
{ name: 'new', type: 'boolean', default: false, help: 'Start a new chat before sending' },
21-
{ name: 'model', default: 'instant', choices: ['instant', 'expert'], help: 'Model to use: instant or expert' },
21+
{ name: 'model', default: 'instant', choices: ['instant', 'expert', 'vision'], help: 'Model to use: instant, expert, or vision' },
2222
{ name: 'think', type: 'boolean', default: false, help: 'Enable DeepThink mode' },
2323
{ name: 'search', type: 'boolean', default: false, help: 'Enable web search' },
2424
{ name: 'file', help: 'Attach a file (PDF, image, text) with the prompt' },
@@ -78,9 +78,22 @@ export const askCommand = cli({
7878
throw new CommandExecutionError('Could not enable DeepThink');
7979
}
8080

81-
const searchResult = await withRetry(() => setFeature(page, 'Search', wantSearch));
82-
if (!searchResult?.ok && wantSearch) {
83-
throw new CommandExecutionError('Could not enable Search');
81+
if (wantModel === 'vision' && wantSearch) {
82+
throw new CliError(
83+
'ARGUMENT',
84+
'DeepSeek vision mode does not support --search.',
85+
'Run without --search, or use --model instant/expert for web search.',
86+
EXIT_CODES.USAGE_ERROR,
87+
);
88+
}
89+
90+
// Vision mode does not have the search toggle.
91+
let searchResult;
92+
if (wantModel !== 'vision') {
93+
searchResult = await withRetry(() => setFeature(page, 'Search', wantSearch));
94+
if (!searchResult?.ok && wantSearch) {
95+
throw new CommandExecutionError('Could not enable Search');
96+
}
8497
}
8598

8699
if (thinkResult?.toggled || searchResult?.toggled) await page.wait(0.5);

clis/deepseek/ask.test.js

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,4 +263,50 @@ describe('deepseek ask conversation resume', () => {
263263
expect(rows).toEqual([{ response: 'follow-up reply' }]);
264264
expect(mockSelectModel).toHaveBeenCalled();
265265
});
266+
267+
it('skips search toggle in vision mode when search is not requested', async () => {
268+
mockEnsureOnDeepSeek.mockResolvedValue(false);
269+
mockSelectModel.mockResolvedValue({ ok: true, toggled: false });
270+
mockSetFeature.mockResolvedValue({ ok: true, toggled: false });
271+
mockSendMessage.mockResolvedValue({ ok: true });
272+
mockGetBubbleCount.mockResolvedValue(0);
273+
mockWaitForResponse.mockResolvedValue('vision reply');
274+
page.evaluate.mockResolvedValue('https://chat.deepseek.com/');
275+
276+
const rows = await askCommand.func(page, {
277+
prompt: 'describe',
278+
timeout: 120,
279+
new: false,
280+
model: 'vision',
281+
think: false,
282+
search: false,
283+
});
284+
285+
expect(rows).toEqual([{ response: 'vision reply' }]);
286+
expect(mockSetFeature).toHaveBeenCalledTimes(1);
287+
expect(mockSetFeature).toHaveBeenCalledWith(expect.anything(), 'DeepThink', false);
288+
});
289+
290+
it('fails fast instead of silently ignoring --search in vision mode', async () => {
291+
mockEnsureOnDeepSeek.mockResolvedValue(false);
292+
mockSelectModel.mockResolvedValue({ ok: true, toggled: false });
293+
page.evaluate.mockResolvedValue('https://chat.deepseek.com/');
294+
295+
await expect(askCommand.func(page, {
296+
prompt: 'describe',
297+
timeout: 120,
298+
new: false,
299+
model: 'vision',
300+
think: false,
301+
search: true,
302+
})).rejects.toMatchObject(new CliError(
303+
'ARGUMENT',
304+
'DeepSeek vision mode does not support --search.',
305+
'Run without --search, or use --model instant/expert for web search.',
306+
EXIT_CODES.USAGE_ERROR,
307+
));
308+
309+
expect(mockSendMessage).not.toHaveBeenCalled();
310+
expect(mockSendWithFile).not.toHaveBeenCalled();
311+
});
266312
});

clis/deepseek/utils.js

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,10 @@ export async function selectModel(page, modelName) {
4040
return page.evaluate(`(() => {
4141
var radios = document.querySelectorAll('div[role="radio"]');
4242
if (radios.length === 0) return { ok: false };
43-
var isFirst = '${modelName}'.toLowerCase() === 'instant';
44-
if (!isFirst && radios.length < 2) return { ok: false };
45-
var target = isFirst ? radios[0] : radios[radios.length - 1];
43+
var name = '${modelName}'.toLowerCase();
44+
var index = name === 'instant' ? 0 : name === 'expert' ? 1 : name === 'vision' ? 2 : -1;
45+
if (index < 0 || index >= radios.length) return { ok: false };
46+
var target = radios[index];
4647
var alreadySelected = target.getAttribute('aria-checked') === 'true';
4748
if (!alreadySelected) target.click();
4849
return { ok: true, toggled: !alreadySelected };
@@ -277,9 +278,18 @@ async function waitForFilePreview(page, fileName) {
277278
for (let attempt = 0; attempt < 8; attempt++) {
278279
await page.wait(2);
279280
const ready = await page.evaluate(`(() => {
280-
const name = ${JSON.stringify(fileName)};
281-
return Array.from(document.querySelectorAll('div'))
282-
.some((el) => el.children.length === 0 && (el.textContent || '').trim() === name);
281+
var name = ${JSON.stringify(fileName)};
282+
var hasFileName = Array.from(document.querySelectorAll('div'))
283+
.some(function(el) { return el.children.length === 0 && (el.textContent || '').trim() === name; });
284+
if (hasFileName) return true;
285+
// Vision mode shows an image thumbnail, not filename text. Require
286+
// a preview-like node here; send-button readiness is checked later.
287+
var box = document.querySelector('${TEXTAREA_SELECTOR}');
288+
if (!box) return false;
289+
var c = box.parentElement;
290+
while (c && !c.querySelector('div[role="button"]')) c = c.parentElement;
291+
if (!c) return false;
292+
return !!c.querySelector('img[src], canvas, video, [style*="background-image"], [class*="preview"], [class*="upload"]');
283293
})()`);
284294
if (ready) return true;
285295
}
@@ -318,7 +328,7 @@ export async function sendWithFile(page, filePath, prompt) {
318328
uploaded = true;
319329
} catch (err) {
320330
const msg = String(err?.message || err);
321-
if (!msg.includes('Unknown action') && !msg.includes('not supported')) {
331+
if (!msg.includes('Unknown action') && !msg.includes('not supported') && !msg.includes('Not allowed')) {
322332
throw err;
323333
}
324334
}

clis/deepseek/utils.test.js

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,102 @@ describe('deepseek selectModel', () => {
163163
expect(result).toEqual({ ok: false });
164164
expect(instantRadio.click).not.toHaveBeenCalled();
165165
});
166+
167+
it('selects the correct radio for each model', async () => {
168+
const radios = [0, 1, 2].map(() => ({
169+
getAttribute: vi.fn(() => 'false'),
170+
click: vi.fn(),
171+
}));
172+
global.document = {
173+
querySelectorAll: vi.fn(() => radios),
174+
};
175+
const page = {
176+
evaluate: vi.fn(async (script) => eval(script)),
177+
};
178+
179+
await selectModel(page, 'instant');
180+
expect(radios[0].click).toHaveBeenCalled();
181+
expect(radios[1].click).not.toHaveBeenCalled();
182+
expect(radios[2].click).not.toHaveBeenCalled();
183+
184+
radios.forEach(r => r.click.mockClear());
185+
await selectModel(page, 'expert');
186+
expect(radios[1].click).toHaveBeenCalled();
187+
188+
radios.forEach(r => r.click.mockClear());
189+
await selectModel(page, 'vision');
190+
expect(radios[2].click).toHaveBeenCalled();
191+
});
192+
193+
it('rejects unknown model names', async () => {
194+
const radios = [0, 1, 2].map(() => ({
195+
getAttribute: vi.fn(() => 'false'),
196+
click: vi.fn(),
197+
}));
198+
global.document = {
199+
querySelectorAll: vi.fn(() => radios),
200+
};
201+
const page = {
202+
evaluate: vi.fn(async (script) => eval(script)),
203+
};
204+
205+
const result = await selectModel(page, 'turbo');
206+
expect(result).toEqual({ ok: false });
207+
});
208+
});
209+
210+
describe('deepseek sendWithFile Not allowed fallback', () => {
211+
const tempDirs = [];
212+
213+
afterEach(() => {
214+
vi.restoreAllMocks();
215+
while (tempDirs.length) {
216+
fs.rmSync(tempDirs.pop(), { recursive: true, force: true });
217+
}
218+
});
219+
220+
it('falls back to DataTransfer when setFileInput throws Not allowed', async () => {
221+
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-deepseek-'));
222+
tempDirs.push(dir);
223+
const filePath = path.join(dir, 'image.png');
224+
fs.writeFileSync(filePath, 'fake-png');
225+
226+
const page = {
227+
setFileInput: vi.fn().mockRejectedValue(new Error('Not allowed')),
228+
wait: vi.fn().mockResolvedValue(undefined),
229+
evaluate: vi.fn()
230+
.mockResolvedValueOnce(undefined) // sidebar collapse
231+
.mockResolvedValueOnce({ ok: true }) // DataTransfer fallback
232+
.mockResolvedValueOnce(true) // waitForFilePreview
233+
.mockResolvedValueOnce(true) // send button enabled
234+
.mockResolvedValueOnce({ ok: true }),// sendMessage
235+
};
236+
237+
const result = await sendWithFile(page, filePath, 'describe');
238+
239+
expect(page.setFileInput).toHaveBeenCalled();
240+
expect(page.evaluate).toHaveBeenCalledTimes(5);
241+
expect(result).toEqual({ ok: true });
242+
});
243+
244+
it('does not treat send-button enablement alone as image upload proof', async () => {
245+
const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'opencli-deepseek-'));
246+
tempDirs.push(dir);
247+
const filePath = path.join(dir, 'image.png');
248+
fs.writeFileSync(filePath, 'fake-png');
249+
250+
const page = {
251+
setFileInput: vi.fn().mockResolvedValue(undefined),
252+
wait: vi.fn().mockResolvedValue(undefined),
253+
evaluate: vi.fn()
254+
.mockResolvedValueOnce(undefined) // sidebar collapse
255+
.mockResolvedValue(false), // no filename / thumbnail preview
256+
};
257+
258+
const result = await sendWithFile(page, filePath, 'describe');
259+
260+
expect(result).toEqual({ ok: false, reason: 'file preview did not appear' });
261+
expect(page.evaluate.mock.calls[1][0]).toContain('img[src], canvas, video');
262+
expect(page.evaluate.mock.calls[1][0]).not.toContain("aria-disabled') === 'false'");
263+
});
166264
});

docs/adapters/browser/deepseek.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ opencli deepseek ask "hello" --new
2424
# Use Expert model instead of Instant
2525
opencli deepseek ask "prove that sqrt(2) is irrational" --model expert
2626

27+
# Use Vision model with an image
28+
opencli deepseek ask "describe this image" --model vision --file ./image.png
29+
2730
# Enable DeepThink mode
2831
opencli deepseek ask "prove that sqrt(2) is irrational" --think
2932

@@ -62,7 +65,7 @@ opencli deepseek history --limit 10
6265
| `<prompt>` | The message to send (required, positional) |
6366
| `--timeout` | Wait timeout in seconds (default: 120) |
6467
| `--new` | Start a new chat before sending (default: false) |
65-
| `--model` | Model to use: `instant` or `expert` (default: instant) |
68+
| `--model` | Model to use: `instant`, `expert`, or `vision` (default: instant) |
6669
| `--think` | Enable DeepThink mode (default: false) |
6770
| `--search` | Enable web search (default: false) |
6871
| `--file` | Attach a file (PDF, image, text) with the prompt (max 100 MB) |
@@ -76,5 +79,6 @@ opencli deepseek history --limit 10
7679

7780
- This adapter drives the DeepSeek web UI in the browser, not an API
7881
- Default mode is Instant with DeepThink and Search disabled; each flag (`--model`, `--think`, `--search`) is synced on every invocation so omitting a flag resets it
82+
- Vision mode does not support `--search`; use `--model instant` or `--model expert` for web search
7983
- Long responses (code, essays) may need a higher `--timeout`
80-
- File upload reads the file into memory and passes it via base64 to the browser; files over 100 MB are rejected
84+
- File upload prefers the browser file-input path, falls back to base64 injection when needed, and rejects files over 100 MB

0 commit comments

Comments
 (0)