feat(pdf-server): get_viewer_state interact action (#590)

ochafik · web-flow · commit 4fc95131bf3f · 2026-04-02T08:47:48.000+01:00
* feat(pdf-server): get_viewer_state interact action

New interact action that returns a JSON snapshot of the live viewer:
{currentPage, pageCount, zoom, displayMode, selectedAnnotationIds,
 selection: {text, contextBefore, contextAfter, boundingRect} | null}.

The viewer already pushes selection passively via setModelContext as
<pdf-selection> tags, but not all hosts surface model-context. This gives
the model an explicit pull.

selection.boundingRect is a single bbox in PDF points (top-left origin,
y-down) so it can be fed straight back into add_annotations. selection is
null when nothing is selected or the selection is outside the text-layer.

Wiring: new PdfCommand variant -> processCommands case ->
handleGetViewerState -> submit_viewer_state (new app-only tool, mirrors
submit_save_data) -> waitForViewerState -> text content block.

Also fills a gap in the display_pdf description: it listed interact
actions but was missing save_as; added that and get_viewer_state.

e2e: two tests covering selection:null and a programmatic text-layer
selection.

* test(pdf-server): fix get_viewer_state e2e race + assertions

readLastToolResult clicked .last() before the interact result panel
existed (callInteract doesn't block), so it expanded the display_pdf
panel instead. Wait for the expected panel count first.

Also: basic-host renders the full CallToolResult JSON, with the state
double-escaped inside content[0].text. Parse instead of regex-matching.

playwright.config.ts: honor PW_CHANNEL env to use system Chrome locally
when the bundled chromium_headless_shell is broken.
diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts
@@ -305,6 +305,33 @@ function waitForSaveData(
   });
 }
 
+/**
+ * Settle callbacks for in-flight get_viewer_state requests, keyed by
+ * requestId. The submit_viewer_state tool looks the callback up here.
+ */
+const pendingStateRequests = new Map<string, (v: string | Error) => void>();
+
+/**
+ * Wait for the viewer to report its current state (page, zoom, selection, …)
+ * as a JSON string. Timeout/abort handling follows waitForSaveData, plus an
+ * up-front check for a signal that has already aborted.
+ *
+ * @param requestId - Id the viewer echoes back via submit_viewer_state.
+ * @param signal - Optional abort signal; aborting rejects the promise.
+ * @returns The JSON-encoded state handed to the settle callback.
+ * @throws Rejects with an Error on abort, after GET_PAGES_TIMEOUT_MS without
+ *   a reply, or when the settle callback is invoked with an Error.
+ */
+function waitForViewerState(
+  requestId: string,
+  signal?: AbortSignal,
+): Promise<string> {
+  return new Promise<string>((resolve, reject) => {
+    // A listener added after the signal has fired is never called, so without
+    // this check a request cancelled before we got here (the caller awaits
+    // ensureViewerIsPolling first) would hang until the timeout.
+    if (signal?.aborted) {
+      reject(new Error("interact request cancelled"));
+      return;
+    }
+    // Single exit point: tears down the timer, the abort listener and the map
+    // entry before resolving or rejecting, so a late reply finds no entry.
+    const settle = (v: string | Error) => {
+      clearTimeout(timer);
+      signal?.removeEventListener("abort", onAbort);
+      pendingStateRequests.delete(requestId);
+      v instanceof Error ? reject(v) : resolve(v);
+    };
+    const onAbort = () => settle(new Error("interact request cancelled"));
+    const timer = setTimeout(
+      () => settle(new Error("Timeout waiting for viewer state")),
+      GET_PAGES_TIMEOUT_MS,
+    );
+    signal?.addEventListener("abort", onAbort);
+    pendingStateRequests.set(requestId, settle);
+  });
+}
+
 interface QueueEntry {
   commands: PdfCommand[];
   /** Timestamp of the most recent enqueue or dequeue */
@@ -1350,7 +1377,8 @@ Returns a viewUUID in structuredContent. Pass it to \`interact\`:
 - add_annotations, update_annotations, remove_annotations, highlight_text
 - fill_form (fill PDF form fields)
 - navigate, search, find, search_navigate, zoom
-- get_text, get_screenshot (extract content)
+- get_text, get_screenshot, get_viewer_state (extract content / read selection & current page)
+- save_as (write annotated PDF to disk)
 
 Accepts local files (use list_pdfs), client MCP root directories, or any HTTPS URL.
 Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before display.`,
@@ -1650,6 +1678,7 @@ URL: ${normalized}`,
           "fill_form",
           "get_text",
           "get_screenshot",
+          "get_viewer_state",
           "save_as",
         ])
         .describe("Action to perform"),
@@ -2238,6 +2267,26 @@ URL: ${normalized}`,
             );
           }
         }
+        case "get_viewer_state": { // pull a JSON snapshot of the live viewer
+          const requestId = randomUUID(); // correlates the submit_viewer_state reply with this call
+          enqueueCommand(uuid, { type: "get_viewer_state", requestId }); // queued for the viewer to pick up
+          let state: string;
+          try {
+            await ensureViewerIsPolling(uuid);
+            state = await waitForViewerState(requestId, signal); // rejects on timeout or when signal aborts
+          } catch (err) { // failures become an error result (isError: true) instead of propagating
+            return {
+              content: [
+                {
+                  type: "text",
+                  text: `Error: ${err instanceof Error ? err.message : String(err)}`,
+                },
+              ],
+              isError: true,
+            };
+          }
+          return { content: [{ type: "text", text: state }] }; // state is the viewer JSON string, passed through unparsed
+        }
         default:
           return {
             content: [{ type: "text", text: `Unknown action: ${action}` }],
@@ -2295,6 +2344,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
 **TEXT/SCREENSHOTS**:
 • get_text: extract text from pages. Optional \`page\` for single page, or \`intervals\` for ranges [{start?,end?}]. Max 20 pages.
 • get_screenshot: capture a single page as PNG image. Requires \`page\`.
+• get_viewer_state: snapshot of the live viewer — JSON {currentPage, pageCount, zoom, displayMode, selectedAnnotationIds, selection:{text,contextBefore,contextAfter,boundingRect}|null}. Use this to read what the user has selected or which page they're on.
 
 **FORMS** — fill_form: fill fields with \`fields\` array of {name, value}.
 
@@ -2320,6 +2370,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
               "fill_form",
               "get_text",
               "get_screenshot",
+              "get_viewer_state",
               "save_as",
             ])
             .optional()
@@ -2603,6 +2654,48 @@ Example — add a signature image and a stamp, then screenshot to verify:
       },
     );
 
+    // Tool: submit_viewer_state (app-only) - viewer answers a pending get_viewer_state request with its state or an error
+    registerAppTool(
+      server,
+      "submit_viewer_state",
+      {
+        title: "Submit Viewer State",
+        description:
+          "Submit a viewer-state snapshot for a get_viewer_state request (used by viewer). The model should NOT call this tool directly.",
+        inputSchema: {
+          requestId: z
+            .string()
+            .describe("The request ID from the get_viewer_state command"),
+          state: z
+            .string()
+            .optional()
+            .describe("JSON-encoded viewer state snapshot"),
+          error: z
+            .string()
+            .optional()
+            .describe("Error message if the viewer failed to read state"),
+        },
+        _meta: { ui: { visibility: ["app"] } }, // app-only, matching the description above
+      },
+      async ({ requestId, state, error }): Promise<CallToolResult> => {
+        const settle = pendingStateRequests.get(requestId); // callback registered by waitForViewerState
+        if (!settle) { // unknown id, or the waiter already settled and removed its own entry
+          return {
+            content: [
+              { type: "text", text: `No pending request for ${requestId}` },
+            ],
+            isError: true,
+          };
+        }
+        if (error || !state) { // explicit error, or missing/empty state: reject the waiting promise
+          settle(new Error(error || "Viewer returned no state"));
+        } else {
+          settle(state); // resolve the waiting promise with the raw JSON string
+        }
+        return { content: [{ type: "text", text: "Submitted" }] }; // acknowledged whether the waiter resolved or rejected
+      },
+    );
+
     // Tool: poll_pdf_commands (app-only) - Poll for pending commands
     registerAppTool(
       server,
diff --git a/examples/pdf-server/src/commands.ts b/examples/pdf-server/src/commands.ts
@@ -66,4 +66,5 @@ export type PdfCommand =
       getScreenshots: boolean;
     }
   | { type: "save_as"; requestId: string }
+  | { type: "get_viewer_state"; requestId: string }
   | { type: "file_changed"; mtimeMs: number };
diff --git a/examples/pdf-server/src/mcp-app.ts b/examples/pdf-server/src/mcp-app.ts
@@ -2428,6 +2428,84 @@ async function renderPageOffscreen(pageNum: number): Promise<string> {
   return dataUrl.split(",")[1];
 }
 
+/**
+ * Snapshot the live viewer for `interact({action:"get_viewer_state"})`.
+ *
+ * Selection is read from `window.getSelection()` at call time — no caching;
+ * if the user navigated away or nothing is selected, `selection` is `null`.
+ * `boundingRect` is in model coords (PDF points, origin top-left, y-down) so
+ * it can be fed straight back into `add_annotations`.
+ *
+ * Only the first range of the selection is inspected, and the selected text
+ * is whitespace-collapsed and trimmed before it is reported or matched
+ * against the cached page text. The snapshot is serialized with
+ * JSON.stringify and delivered through the submit_viewer_state server tool.
+ *
+ * @param requestId - Id from the get_viewer_state command; sent back
+ *   unchanged in the submit_viewer_state call.
+ * @returns Resolves once the submit_viewer_state call has completed; a
+ *   rejection from that call propagates to the caller.
+ */
+async function handleGetViewerState(requestId: string): Promise<void> {
+  const CONTEXT_CHARS = 200; // max characters of surrounding page text kept on each side
+
+  let selection: {
+    text: string;
+    contextBefore: string;
+    contextAfter: string;
+    boundingRect: { x: number; y: number; width: number; height: number };
+  } | null = null;
+
+  const sel = window.getSelection();
+  const selectedText = sel?.toString().replace(/\s+/g, " ").trim(); // whitespace runs collapse to one space
+  if (sel && selectedText && sel.rangeCount > 0) {
+    // Only treat it as a PDF selection if it lives inside the text layer of
+    // the rendered page (not the toolbar, search box, etc.).
+    const range = sel.getRangeAt(0); // first range only
+    const anchor =
+      range.commonAncestorContainer.nodeType === Node.ELEMENT_NODE
+        ? (range.commonAncestorContainer as Element)
+        : range.commonAncestorContainer.parentElement;
+    if (anchor && textLayerEl.contains(anchor)) {
+      // Context: locate selection in the page's extracted text and slice
+      // ±CONTEXT_CHARS around it. Falls back to empty strings if fuzzy
+      // match fails (still return text + rect — they're the load-bearing
+      // bits).
+      const pageText = pageTextCache.get(currentPage) ?? ""; // empty when the current page has no cached text
+      const loc = findSelectionInText(pageText, selectedText);
+      const contextBefore = loc
+        ? pageText.slice(Math.max(0, loc.start - CONTEXT_CHARS), loc.start)
+        : "";
+      const contextAfter = loc
+        ? pageText.slice(loc.end, loc.end + CONTEXT_CHARS)
+        : "";
+
+      // Single bounding box, page-relative model coords. getBoundingClientRect
+      // is viewport-relative; subtract the page-wrapper origin then divide by
+      // scale → PDF points (top-left origin, y-down — matches the coord
+      // system documented in the interact tool description).
+      const r = range.getBoundingClientRect();
+      const origin = pageWrapperEl.getBoundingClientRect();
+      const round = (n: number) => Math.round(n * 100) / 100; // two decimal places
+      selection = {
+        text: selectedText,
+        contextBefore,
+        contextAfter,
+        boundingRect: {
+          x: round((r.left - origin.left) / scale),
+          y: round((r.top - origin.top) / scale),
+          width: round(r.width / scale),
+          height: round(r.height / scale),
+        },
+      };
+    }
+  }
+
+  const state = {
+    currentPage,
+    pageCount: totalPages,
+    zoom: Math.round(scale * 100), // scale as a rounded percentage (scale 1 gives 100)
+    displayMode: currentDisplayMode,
+    selectedAnnotationIds: [...selectedAnnotationIds], // copied into a new array
+    selection,
+  };
+
+  await app.callServerTool({
+    name: "submit_viewer_state",
+    arguments: { requestId, state: JSON.stringify(state, null, 2) }, // pretty-printed, 2-space indent
+  });
+}
+
 async function handleGetPages(cmd: {
   requestId: string;
   intervals: Array<{ start?: number; end?: number }>;
@@ -4678,6 +4756,23 @@ async function processCommands(commands: PdfCommand[]): Promise<void> {
             .catch(() => {});
         }
         break;
+      case "get_viewer_state":
+        // Same await-before-next-poll discipline as get_pages/save_as: the handler is awaited, not fired and forgotten.
+        try {
+          await handleGetViewerState(cmd.requestId);
+        } catch (err) { // report the failure under the same requestId
+          log.error("get_viewer_state failed — submitting error:", err);
+          await app
+            .callServerTool({
+              name: "submit_viewer_state",
+              arguments: {
+                requestId: cmd.requestId,
+                error: err instanceof Error ? err.message : String(err),
+              },
+            })
+            .catch(() => {}); // best-effort: a failure to deliver the error report is dropped
+        }
+        break;
       case "file_changed": {
         // Skip our own save_pdf echo: either save is still in flight, or the
         // event's mtime matches what save_pdf just returned.
diff --git a/playwright.config.ts b/playwright.config.ts
@@ -30,6 +30,7 @@ export default defineConfig({
         ...devices["Desktop Chrome"],
         // Use default Chromium everywhere for consistent screenshot rendering
         // Run `npm run test:e2e:docker` locally for CI-identical results
+        ...(process.env.PW_CHANNEL ? { channel: process.env.PW_CHANNEL } : {}),
       },
     },
   ],
diff --git a/tests/e2e/pdf-annotations.spec.ts b/tests/e2e/pdf-annotations.spec.ts
@@ -314,3 +314,93 @@ test.describe("PDF Server - Annotations", () => {
     );
   });
 });
+
+/**
+ * Read the most recent interact result text from the basic-host UI.
+ * Waits for the result-panel count to reach `expectedCount` first —
+ * `callInteract` doesn't block, so `.last()` would otherwise race to the
+ * previous (display_pdf) panel.
+ *
+ * @param page - Playwright page showing the basic-host UI.
+ * @param expectedCount - Exact number of "📤 Tool Result" panels to wait for
+ *   (up to 30 s) before clicking the last one.
+ * @returns Text content of the last `pre` element on the page after the
+ *   click, or an empty string when it has none.
+ */
+async function readLastToolResult(
+  page: Page,
+  expectedCount: number,
+): Promise<string> {
+  const panels = page.locator('text="📤 Tool Result"');
+  await expect(panels).toHaveCount(expectedCount, { timeout: 30000 });
+  await panels.last().click(); // open the newest panel
+  const pre = page.locator("pre").last(); // NOTE(review): assumes the last pre on the page belongs to the panel just clicked
+  await expect(pre).toBeVisible({ timeout: 5000 });
+  return (await pre.textContent()) ?? "";
+}
+
+/**
+ * Unwrap basic-host's `CallToolResult` JSON to the first text block.
+ *
+ * @param raw - JSON text of the result object as rendered by basic-host.
+ * @returns The `text` of the first `content` entry whose type is "text".
+ * @throws If `raw` is not valid JSON (from JSON.parse), or if no text entry
+ *   exists or its text is missing or empty.
+ */
+function unwrapTextResult(raw: string): string {
+  const parsed = JSON.parse(raw) as {
+    content?: { type: string; text?: string }[];
+  };
+  const block = parsed.content?.find((c) => c.type === "text");
+  if (!block?.text) throw new Error(`No text block in: ${raw.slice(0, 200)}`); // message keeps only the first 200 chars of raw
+  return block.text;
+}
+
+test.describe("PDF Server - get_viewer_state", () => {
+  test("returns page/zoom/mode and selection:null when nothing is selected", async ({
+    page,
+  }) => {
+    await loadPdfServer(page);
+    await waitForPdfCanvas(page);
+
+    const viewUUID = await extractViewUUID(page);
+
+    await callInteract(page, { viewUUID, action: "get_viewer_state" });
+    const raw = await readLastToolResult(page, 2); // presumably panel 1 is display_pdf and panel 2 is this interact call
+    const state = JSON.parse(unwrapTextResult(raw)); // the text block is itself JSON, so it is parsed a second time
+
+    expect(state.currentPage).toBe(1);
+    expect(state.pageCount).toBeGreaterThan(1); // presumably the fixture PDF has several pages
+    expect(typeof state.zoom).toBe("number");
+    expect(state.displayMode).toBe("inline");
+    expect(state.selection).toBeNull();
+    expect(Array.isArray(state.selectedAnnotationIds)).toBe(true);
+  });
+
+  test("returns selected text and bounding rect when text-layer text is selected", async ({
+    page,
+  }) => {
+    await loadPdfServer(page);
+    await waitForPdfCanvas(page);
+
+    const viewUUID = await extractViewUUID(page);
+    const app = getAppFrame(page);
+
+    // Programmatically select the contents of the first text-layer span, inside the app frame.
+    const selectedText = await app
+      .locator("#text-layer span")
+      .first()
+      .evaluate((span) => {
+        const range = span.ownerDocument.createRange();
+        range.selectNodeContents(span);
+        const sel = span.ownerDocument.defaultView!.getSelection()!;
+        sel.removeAllRanges(); // replace any existing selection
+        sel.addRange(range);
+        return sel.toString().replace(/\s+/g, " ").trim(); // same whitespace normalization the viewer applies
+      });
+    expect(selectedText.length).toBeGreaterThan(0);
+
+    await callInteract(page, { viewUUID, action: "get_viewer_state" });
+    const raw = await readLastToolResult(page, 2); // presumably panel 1 is display_pdf and panel 2 is this interact call
+    const state = JSON.parse(unwrapTextResult(raw));
+
+    expect(state.currentPage).toBe(1);
+    expect(state.selection).not.toBeNull();
+    expect(state.selection.text).toContain(selectedText);
+    expect(state.selection.boundingRect).toEqual( // only shape and types are checked, not coordinates
+      expect.objectContaining({
+        x: expect.any(Number),
+        y: expect.any(Number),
+        width: expect.any(Number),
+        height: expect.any(Number),
+      }),
+    );
+  });
+});

Original file line number	Diff line number	Diff line change
`@@ -66,4 +66,5 @@ export type PdfCommand =`
`66`	`66`	`getScreenshots: boolean;`
`67`	`67`	`}`
`68`	`68`	`\| { type: "save_as"; requestId: string }`
	`69`	`+ \| { type: "get_viewer_state"; requestId: string }`
`69`	`70`	`\| { type: "file_changed"; mtimeMs: number };`