Skip to content

Commit 4fc9513

Browse files
authored
feat(pdf-server): get_viewer_state interact action (#590)
* feat(pdf-server): get_viewer_state interact action New interact action that returns a JSON snapshot of the live viewer: {currentPage, pageCount, zoom, displayMode, selectedAnnotationIds, selection: {text, contextBefore, contextAfter, boundingRect} | null}. The viewer already pushes selection passively via setModelContext as <pdf-selection> tags, but not all hosts surface model-context. This gives the model an explicit pull. selection.boundingRect is a single bbox in PDF points (top-left origin, y-down) so it can be fed straight back into add_annotations. selection is null when nothing is selected or the selection is outside the text-layer. Wiring: new PdfCommand variant -> processCommands case -> handleGetViewerState -> submit_viewer_state (new app-only tool, mirrors submit_save_data) -> waitForViewerState -> text content block. Also fills a gap in the display_pdf description: it listed interact actions but was missing save_as; added that and get_viewer_state. e2e: two tests covering selection:null and a programmatic text-layer selection. * test(pdf-server): fix get_viewer_state e2e race + assertions readLastToolResult clicked .last() before the interact result panel existed (callInteract doesn't block), so it expanded the display_pdf panel instead. Wait for the expected panel count first. Also: basic-host renders the full CallToolResult JSON, with the state double-escaped inside content[0].text. Parse instead of regex-matching. playwright.config.ts: honor PW_CHANNEL env to use system Chrome locally when the bundled chromium_headless_shell is broken.
1 parent 2e55c84 commit 4fc9513

File tree

5 files changed

+281
-1
lines changed

5 files changed

+281
-1
lines changed

examples/pdf-server/server.ts

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,33 @@ function waitForSaveData(
305305
});
306306
}
307307

308+
/**
 * Settle callbacks for in-flight viewer-state requests, keyed by request ID.
 * A callback is invoked with the JSON string on success or an Error on failure.
 */
const pendingStateRequests = new Map<string, (v: string | Error) => void>();

/**
 * Wait for the viewer to report its current state (page, zoom, selection, …)
 * as a JSON string.
 *
 * The promise settles exactly once, on whichever happens first:
 * - the registered callback in `pendingStateRequests` is invoked (resolves
 *   with a string, rejects with an Error),
 * - `signal` aborts (rejects with "interact request cancelled"),
 * - `GET_PAGES_TIMEOUT_MS` elapses (rejects with a timeout error).
 *
 * Settling always clears the timer, detaches the abort listener and removes
 * the map entry.
 *
 * @param requestId - ID under which the settle callback is registered.
 * @param signal - Optional cancellation signal for the surrounding request.
 * @returns The JSON-encoded viewer state.
 */
function waitForViewerState(
  requestId: string,
  signal?: AbortSignal,
): Promise<string> {
  // An already-aborted signal never dispatches another "abort" event, so a
  // listener added now would not run and the caller would sit out the full
  // timeout. Reject up front, before registering anything.
  if (signal?.aborted) {
    return Promise.reject(new Error("interact request cancelled"));
  }

  return new Promise<string>((resolve, reject) => {
    const settle = (v: string | Error) => {
      clearTimeout(timer);
      signal?.removeEventListener("abort", onAbort);
      pendingStateRequests.delete(requestId);
      if (v instanceof Error) {
        reject(v);
      } else {
        resolve(v);
      }
    };
    const onAbort = () => settle(new Error("interact request cancelled"));
    const timer = setTimeout(
      () => settle(new Error("Timeout waiting for viewer state")),
      GET_PAGES_TIMEOUT_MS,
    );
    signal?.addEventListener("abort", onAbort, { once: true });
    pendingStateRequests.set(requestId, settle);
  });
}
334+
308335
interface QueueEntry {
309336
commands: PdfCommand[];
310337
/** Timestamp of the most recent enqueue or dequeue */
@@ -1350,7 +1377,8 @@ Returns a viewUUID in structuredContent. Pass it to \`interact\`:
13501377
- add_annotations, update_annotations, remove_annotations, highlight_text
13511378
- fill_form (fill PDF form fields)
13521379
- navigate, search, find, search_navigate, zoom
1353-
- get_text, get_screenshot (extract content)
1380+
- get_text, get_screenshot, get_viewer_state (extract content / read selection & current page)
1381+
- save_as (write annotated PDF to disk)
13541382
13551383
Accepts local files (use list_pdfs), client MCP root directories, or any HTTPS URL.
13561384
Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before display.`,
@@ -1650,6 +1678,7 @@ URL: ${normalized}`,
16501678
"fill_form",
16511679
"get_text",
16521680
"get_screenshot",
1681+
"get_viewer_state",
16531682
"save_as",
16541683
])
16551684
.describe("Action to perform"),
@@ -2238,6 +2267,26 @@ URL: ${normalized}`,
22382267
);
22392268
}
22402269
}
2270+
case "get_viewer_state": {
2271+
const requestId = randomUUID();
2272+
enqueueCommand(uuid, { type: "get_viewer_state", requestId });
2273+
let state: string;
2274+
try {
2275+
await ensureViewerIsPolling(uuid);
2276+
state = await waitForViewerState(requestId, signal);
2277+
} catch (err) {
2278+
return {
2279+
content: [
2280+
{
2281+
type: "text",
2282+
text: `Error: ${err instanceof Error ? err.message : String(err)}`,
2283+
},
2284+
],
2285+
isError: true,
2286+
};
2287+
}
2288+
return { content: [{ type: "text", text: state }] };
2289+
}
22412290
default:
22422291
return {
22432292
content: [{ type: "text", text: `Unknown action: ${action}` }],
@@ -2295,6 +2344,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
22952344
**TEXT/SCREENSHOTS**:
22962345
• get_text: extract text from pages. Optional \`page\` for single page, or \`intervals\` for ranges [{start?,end?}]. Max 20 pages.
22972346
• get_screenshot: capture a single page as PNG image. Requires \`page\`.
2347+
• get_viewer_state: snapshot of the live viewer — JSON {currentPage, pageCount, zoom, displayMode, selectedAnnotationIds, selection:{text,contextBefore,contextAfter,boundingRect}|null}. Use this to read what the user has selected or which page they're on.
22982348
22992349
**FORMS** — fill_form: fill fields with \`fields\` array of {name, value}.
23002350
@@ -2320,6 +2370,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
23202370
"fill_form",
23212371
"get_text",
23222372
"get_screenshot",
2373+
"get_viewer_state",
23232374
"save_as",
23242375
])
23252376
.optional()
@@ -2603,6 +2654,48 @@ Example — add a signature image and a stamp, then screenshot to verify:
26032654
},
26042655
);
26052656

2657+
// Tool: submit_viewer_state (app-only) - the viewer's reply channel for a
// get_viewer_state command. Looks up the settle callback registered under
// the request ID and hands it either the snapshot or an Error.
registerAppTool(
  server,
  "submit_viewer_state",
  {
    title: "Submit Viewer State",
    description:
      "Submit a viewer-state snapshot for a get_viewer_state request (used by viewer). The model should NOT call this tool directly.",
    inputSchema: {
      requestId: z
        .string()
        .describe("The request ID from the get_viewer_state command"),
      state: z
        .string()
        .optional()
        .describe("JSON-encoded viewer state snapshot"),
      error: z
        .string()
        .optional()
        .describe("Error message if the viewer failed to read state"),
    },
    _meta: { ui: { visibility: ["app"] } },
  },
  async ({ requestId, state, error }): Promise<CallToolResult> => {
    const deliver = pendingStateRequests.get(requestId);

    // No callback registered under this ID: report it back as a tool error.
    if (!deliver) {
      const message = `No pending request for ${requestId}`;
      return { content: [{ type: "text", text: message }], isError: true };
    }

    // An explicit error, or a missing/empty state, both count as failure.
    const failed = Boolean(error) || !state;
    const outcome: string | Error = failed
      ? new Error(error || "Viewer returned no state")
      : (state as string);
    deliver(outcome);

    return { content: [{ type: "text", text: "Submitted" }] };
  },
);
2698+
26062699
// Tool: poll_pdf_commands (app-only) - Poll for pending commands
26072700
registerAppTool(
26082701
server,

examples/pdf-server/src/commands.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,4 +66,5 @@ export type PdfCommand =
6666
getScreenshots: boolean;
6767
}
6868
| { type: "save_as"; requestId: string }
69+
| { type: "get_viewer_state"; requestId: string }
6970
| { type: "file_changed"; mtimeMs: number };

examples/pdf-server/src/mcp-app.ts

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2428,6 +2428,84 @@ async function renderPageOffscreen(pageNum: number): Promise<string> {
24282428
return dataUrl.split(",")[1];
24292429
}
24302430

2431+
/**
 * Answer an `interact({action:"get_viewer_state"})` request: collect the
 * viewer's current page, page count, zoom, display mode, selected annotation
 * IDs and text selection, then submit the JSON via `submit_viewer_state`.
 *
 * The selection is taken from `window.getSelection()` at the moment of the
 * call. It is reported as `null` when the selection is empty or its common
 * ancestor is not inside the text layer element. `boundingRect` is the
 * selection's client rect, offset by the page wrapper's origin and divided
 * by `scale`, rounded to two decimals.
 */
async function handleGetViewerState(requestId: string): Promise<void> {
  const CONTEXT_CHARS = 200;

  type SelectionSnapshot = {
    text: string;
    contextBefore: string;
    contextAfter: string;
    boundingRect: { x: number; y: number; width: number; height: number };
  };

  const toTwoDecimals = (n: number) => Math.round(n * 100) / 100;

  const readSelection = (): SelectionSnapshot | null => {
    const domSelection = window.getSelection();
    const text = domSelection?.toString().replace(/\s+/g, " ").trim();
    if (!domSelection || !text || domSelection.rangeCount <= 0) {
      return null;
    }

    // Ignore selections that live outside the rendered page's text layer
    // (toolbar, search box, etc.).
    const range = domSelection.getRangeAt(0);
    const host =
      range.commonAncestorContainer.nodeType === Node.ELEMENT_NODE
        ? (range.commonAncestorContainer as Element)
        : range.commonAncestorContainer.parentElement;
    if (!host || !textLayerEl.contains(host)) {
      return null;
    }

    // Surrounding context: find the selection inside the cached page text
    // and take up to CONTEXT_CHARS on each side. If it cannot be located,
    // both context strings stay empty; text and rect are still reported.
    const pageText = pageTextCache.get(currentPage) ?? "";
    const match = findSelectionInText(pageText, text);
    let contextBefore = "";
    let contextAfter = "";
    if (match) {
      const from = Math.max(0, match.start - CONTEXT_CHARS);
      contextBefore = pageText.slice(from, match.start);
      contextAfter = pageText.slice(match.end, match.end + CONTEXT_CHARS);
    }

    // Client rects are viewport-relative: shift by the page wrapper's
    // origin, then divide by the current scale.
    const selRect = range.getBoundingClientRect();
    const pageRect = pageWrapperEl.getBoundingClientRect();
    return {
      text,
      contextBefore,
      contextAfter,
      boundingRect: {
        x: toTwoDecimals((selRect.left - pageRect.left) / scale),
        y: toTwoDecimals((selRect.top - pageRect.top) / scale),
        width: toTwoDecimals(selRect.width / scale),
        height: toTwoDecimals(selRect.height / scale),
      },
    };
  };

  const selection = readSelection();

  const snapshot = {
    currentPage,
    pageCount: totalPages,
    zoom: Math.round(scale * 100),
    displayMode: currentDisplayMode,
    selectedAnnotationIds: [...selectedAnnotationIds],
    selection,
  };

  await app.callServerTool({
    name: "submit_viewer_state",
    arguments: { requestId, state: JSON.stringify(snapshot, null, 2) },
  });
}
2508+
24312509
async function handleGetPages(cmd: {
24322510
requestId: string;
24332511
intervals: Array<{ start?: number; end?: number }>;
@@ -4678,6 +4756,23 @@ async function processCommands(commands: PdfCommand[]): Promise<void> {
46784756
.catch(() => {});
46794757
}
46804758
break;
4759+
case "get_viewer_state":
4760+
// Same await-before-next-poll discipline as get_pages/save_as.
4761+
try {
4762+
await handleGetViewerState(cmd.requestId);
4763+
} catch (err) {
4764+
log.error("get_viewer_state failed — submitting error:", err);
4765+
await app
4766+
.callServerTool({
4767+
name: "submit_viewer_state",
4768+
arguments: {
4769+
requestId: cmd.requestId,
4770+
error: err instanceof Error ? err.message : String(err),
4771+
},
4772+
})
4773+
.catch(() => {});
4774+
}
4775+
break;
46814776
case "file_changed": {
46824777
// Skip our own save_pdf echo: either save is still in flight, or the
46834778
// event's mtime matches what save_pdf just returned.

playwright.config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ export default defineConfig({
3030
...devices["Desktop Chrome"],
3131
// Use default Chromium everywhere for consistent screenshot rendering
3232
// Run `npm run test:e2e:docker` locally for CI-identical results
33+
...(process.env.PW_CHANNEL ? { channel: process.env.PW_CHANNEL } : {}),
3334
},
3435
},
3536
],

tests/e2e/pdf-annotations.spec.ts

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,3 +314,93 @@ test.describe("PDF Server - Annotations", () => {
314314
);
315315
});
316316
});
317+
318+
/**
 * Return the text of the newest tool-result panel shown by basic-host.
 *
 * `callInteract` returns before its result panel is rendered, so this first
 * waits until exactly `expectedCount` "Tool Result" panels exist. Without
 * that wait, `.last()` could still point at the earlier display_pdf panel.
 * It then expands the last panel and reads the last `<pre>` on the page.
 */
async function readLastToolResult(
  page: Page,
  expectedCount: number,
): Promise<string> {
  const resultHeaders = page.locator('text="📤 Tool Result"');
  await expect(resultHeaders).toHaveCount(expectedCount, { timeout: 30000 });
  await resultHeaders.last().click();

  const output = page.locator("pre").last();
  await expect(output).toBeVisible({ timeout: 5000 });

  const text = await output.textContent();
  return text ?? "";
}
335+
336+
/**
 * Parse the `CallToolResult` JSON rendered by basic-host and return the text
 * of its first `type: "text"` content block.
 *
 * Throws when no such block exists or its text is empty; the error message
 * includes the first 200 characters of `raw`.
 */
function unwrapTextResult(raw: string): string {
  const envelope = JSON.parse(raw) as {
    content?: { type: string; text?: string }[];
  };
  const firstText = envelope.content?.find((entry) => entry.type === "text");
  const text = firstText?.text;
  if (!text) {
    throw new Error(`No text block in: ${raw.slice(0, 200)}`);
  }
  return text;
}
345+
346+
// End-to-end coverage for the get_viewer_state interact action: one run with
// no selection, one with a programmatic selection inside the text layer.
test.describe("PDF Server - get_viewer_state", () => {
  test("returns page/zoom/mode and selection:null when nothing is selected", async ({
    page,
  }) => {
    await loadPdfServer(page);
    await waitForPdfCanvas(page);
    const viewUUID = await extractViewUUID(page);

    await callInteract(page, { viewUUID, action: "get_viewer_state" });
    // Two panels expected: display_pdf's result and the interact result.
    const resultJson = await readLastToolResult(page, 2);
    const snapshot = JSON.parse(unwrapTextResult(resultJson));

    expect(snapshot.currentPage).toBe(1);
    expect(snapshot.pageCount).toBeGreaterThan(1);
    expect(typeof snapshot.zoom).toBe("number");
    expect(snapshot.displayMode).toBe("inline");
    expect(snapshot.selection).toBeNull();
    expect(Array.isArray(snapshot.selectedAnnotationIds)).toBe(true);
  });

  test("returns selected text and bounding rect when text-layer text is selected", async ({
    page,
  }) => {
    await loadPdfServer(page);
    await waitForPdfCanvas(page);
    const viewUUID = await extractViewUUID(page);
    const appFrame = getAppFrame(page);

    // Select the whole first text-layer span from inside the app frame and
    // return the selection text with whitespace collapsed and trimmed.
    const firstSpan = appFrame.locator("#text-layer span").first();
    const selectedText = await firstSpan.evaluate((el) => {
      const doc = el.ownerDocument;
      const spanRange = doc.createRange();
      spanRange.selectNodeContents(el);
      const domSelection = doc.defaultView!.getSelection()!;
      domSelection.removeAllRanges();
      domSelection.addRange(spanRange);
      return domSelection.toString().replace(/\s+/g, " ").trim();
    });
    expect(selectedText.length).toBeGreaterThan(0);

    await callInteract(page, { viewUUID, action: "get_viewer_state" });
    const resultJson = await readLastToolResult(page, 2);
    const snapshot = JSON.parse(unwrapTextResult(resultJson));

    expect(snapshot.currentPage).toBe(1);
    expect(snapshot.selection).not.toBeNull();
    expect(snapshot.selection.text).toContain(selectedText);

    // Only the shape of the rect is asserted: four numeric fields.
    const numericRect = expect.objectContaining({
      x: expect.any(Number),
      y: expect.any(Number),
      width: expect.any(Number),
      height: expect.any(Number),
    });
    expect(snapshot.selection.boundingRect).toEqual(numericRect);
  });
});

0 commit comments

Comments
 (0)