superdoc-dev
diff --git a/‎apps/cli/src/cli/operation-hints.ts‎
Lines changed: 4 additions & 0 deletions b/‎apps/cli/src/cli/operation-hints.ts‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎apps/docs/document-api/common-workflows.mdx‎
Lines changed: 56 additions & 0 deletions b/‎apps/docs/document-api/common-workflows.mdx‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎apps/docs/guides/general/stable-navigation.mdx‎
Lines changed: 10 additions & 11 deletions b/‎apps/docs/guides/general/stable-navigation.mdx‎
Lines changed: 10 additions & 11 deletions
diff --git a/‎packages/document-api/src/contract/operation-definitions.ts‎
Lines changed: 13 additions & 0 deletions b/‎packages/document-api/src/contract/operation-definitions.ts‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎packages/document-api/src/contract/operation-registry.ts‎
Lines changed: 3 additions & 0 deletions b/‎packages/document-api/src/contract/operation-registry.ts‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎packages/document-api/src/contract/schemas.ts‎
Lines changed: 54 additions & 0 deletions b/‎packages/document-api/src/contract/schemas.ts‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎packages/document-api/src/extract/extract.test.ts‎
Lines changed: 46 additions & 0 deletions b/‎packages/document-api/src/extract/extract.test.ts‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎packages/document-api/src/extract/extract.ts‎
Lines changed: 20 additions & 0 deletions b/‎packages/document-api/src/extract/extract.ts‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎packages/document-api/src/index.ts‎
Lines changed: 12 additions & 0 deletions b/‎packages/document-api/src/index.ts‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎packages/document-api/src/invoke/invoke.ts‎
Lines changed: 1 addition & 0 deletions b/‎packages/document-api/src/invoke/invoke.ts‎
Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ export const SUCCESS_VERB: Record<CliExposedOperationId, string> = {
   getMarkdown: 'extracted markdown',
   getHtml: 'extracted html',
   info: 'retrieved info',
+  extract: 'extracted content',
   clearContent: 'cleared document content',
   insert: 'inserted text',
   replace: 'replaced text',
@@ -255,6 +256,7 @@ export const OUTPUT_FORMAT: Record<CliExposedOperationId, OutputFormat> = {
   getMarkdown: 'plain',
   getHtml: 'plain',
   info: 'documentInfo',
+  extract: 'plain',
   clearContent: 'receipt',
   insert: 'mutationReceipt',
   replace: 'mutationReceipt',
@@ -411,6 +413,7 @@ export const RESPONSE_ENVELOPE_KEY: Record<CliExposedOperationId, string | null>
   getMarkdown: 'markdown',
   getHtml: 'html',
   info: null,
+  extract: null,
   clearContent: 'receipt',
   insert: null,
   replace: null,
@@ -608,6 +611,7 @@ export const OPERATION_FAMILY: Record<CliExposedOperationId, OperationFamily> =
   getMarkdown: 'query',
   getHtml: 'query',
   info: 'general',
+  extract: 'general',
   clearContent: 'general',
   insert: 'textMutation',
   replace: 'textMutation',
 
@@ -308,6 +308,62 @@ await superdoc.scrollToElement(trackedChangeEntityId);
 No ID is guaranteed to survive all Microsoft Word round-trips. Re-extract addresses after major external edits or transformations, since Word (or other tools) may rewrite paragraph IDs and SuperDoc may rewrite duplicate IDs on import.
 </Warning>
 
+## Content extraction for RAG
+
+`doc.extract()` returns all document content in one call — blocks with full text, comments, and tracked changes. Each item has a stable ID (`nodeId` for blocks, `entityId` for comments and tracked changes) that works directly with [`scrollToElement`](/core/superdoc/methods#scrolltoelement).
+
+```ts
+const content = editor.doc.extract();
+
+// Every block in document order, with full text
+for (const block of content.blocks) {
+  console.log(block.nodeId, block.type, block.text);
+  // → '5AF80E61', 'heading', 'Chapter 1: Introduction'
+  // → '17FBFA43', 'paragraph', 'This is the opening paragraph...'
+}
+
+// Comments anchored to blocks
+for (const comment of content.comments) {
+  console.log(comment.entityId, comment.blockId, comment.text);
+}
+
+// Tracked changes
+for (const tc of content.trackedChanges) {
+  console.log(tc.entityId, tc.type, tc.excerpt);
+}
+```
+
+### RAG pipeline pattern
+
+Extract content, chunk it, store the IDs, and navigate back on click:
+
+```ts
+// 1. Extract all content
+const { blocks } = editor.doc.extract();
+
+// 2. Chunk and embed (your pipeline)
+const chunks = blocks
+  .filter((b) => b.text.length > 0)
+  .map((b) => ({
+    id: b.nodeId,
+    text: b.text,
+    type: b.type,
+    headingLevel: b.headingLevel,
+  }));
+const embeddings = await embedChunks(chunks);
+
+// 3. Store embeddings with nodeIds
+await vectorStore.upsert(embeddings);
+
+// 4. Later — user clicks a citation
+const citation = await vectorStore.query(userQuestion);
+await superdoc.scrollToElement(citation.id);
+```
+
+<Info>
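+All IDs from `doc.extract()` work directly with `scrollToElement()` — no conversion needed. For DOCX-imported content, block `nodeId` values are stable across sessions, but call `doc.extract()` again after the file has been edited in Microsoft Word or another external tool, since those edits may rewrite IDs.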
+</Info>
+
 ## Read document counts
 
 `doc.info()` returns a snapshot of current document statistics including word, character, paragraph, heading, table, image, comment, tracked-change, SDT-field, and list counts.
 
@@ -13,18 +13,17 @@ SuperDoc has two navigation approaches depending on your use case:
 
 ## Navigate by element ID
 
-`scrollToElement` takes any element ID — paragraph, comment, or tracked change — and scrolls to it. The ID comes from the Document API.
+`scrollToElement` takes any element ID — paragraph, comment, or tracked change — and scrolls to it. Use `doc.extract()` to get all IDs at once, or `query.match` for targeted lookups.
 
 ```javascript
-// Get an element's ID
-const match = editor.doc.query.match({
-  select: { type: 'text', pattern: 'Introduction', mode: 'contains' },
-  require: 'first',
-});
-const nodeId = match.items[0].address.nodeId;
+// Extract all content with stable IDs
+const { blocks, comments } = editor.doc.extract();
+
+// Navigate to any block
+await superdoc.scrollToElement(blocks[0].nodeId);
 
-// Navigate to it — works for paragraphs, comments, tracked changes
-await superdoc.scrollToElement(nodeId);
+// Navigate to a comment
+await superdoc.scrollToElement(comments[0].entityId);
 ```
 
 This is the approach to use for:
@@ -33,7 +32,7 @@ This is the approach to use for:
 - **Search results** — scroll to the matching paragraph
 - **Cross-session addressing** — IDs from DOCX-imported content survive reloads
 
-For the full cross-session pattern, see [cross-session block addressing](/document-api/common-workflows#cross-session-block-addressing).
+For the full extraction pattern, see [content extraction for RAG](/document-api/common-workflows#content-extraction-for-rag). For the cross-session pattern, see [cross-session block addressing](/document-api/common-workflows#cross-session-block-addressing).
 
 ## Track nodes during edits
 
@@ -62,7 +61,7 @@ function goToLink(link) {
 
 ## Best practices
 
-- Use `scrollToElement` when you have an element ID from the Document API.
+- Use `scrollToElement` when you have an element ID from the Document API — `doc.extract()` for all IDs at once, or `query.match` for a targeted lookup.
 - Use `PositionTracker` when you need to follow nodes that move during edits.
 - For cross-session use, store `nodeId` values (not `sdBlockId` — those regenerate on each open).
 - Handle missing targets gracefully — both APIs return `false` if the element no longer exists.
@@ -648,6 +648,19 @@ export const OPERATION_DEFINITIONS = {
     intentGroup: 'get_content',
     intentAction: 'info',
   },
+  extract: {
+    memberPath: 'extract',
+    description:
+      'Extract all document content with stable IDs for RAG pipelines. Returns blocks with full text, comments, and tracked changes — each with an ID compatible with scrollToElement().',
+    expectedResult:
+      'Returns an ExtractResult with blocks (nodeId, type, text, headingLevel), comments (entityId, text, anchoredText, blockId, status, author), tracked changes (entityId, type, excerpt, author, date), and revision.',
+    requiresDocumentContext: true,
+    metadata: readOperation(),
+    referenceDocPath: 'extract.mdx',
+    referenceGroup: 'core',
+    intentGroup: 'get_content',
+    intentAction: 'extract',
+  },
 
   clearContent: {
     memberPath: 'clearContent',
 
@@ -35,6 +35,8 @@ import type { GetMarkdownInput } from '../get-markdown/get-markdown.js';
 import type { GetHtmlInput } from '../get-html/get-html.js';
 import type { MarkdownToFragmentInput } from '../markdown-to-fragment/markdown-to-fragment.js';
 import type { InfoInput } from '../info/info.js';
+import type { ExtractInput } from '../extract/extract.js';
+import type { ExtractResult } from '../types/extract.types.js';
 import type { ClearContentInput } from '../clear-content/clear-content.js';
 import type { InsertInput } from '../insert/insert.js';
 import type { ReplaceInput } from '../replace/replace.js';
@@ -527,6 +529,7 @@ export interface OperationRegistry extends FormatInlineAliasOperationRegistry {
   getHtml: { input: GetHtmlInput; options: never; output: string };
   markdownToFragment: { input: MarkdownToFragmentInput; options: never; output: SDMarkdownToFragmentResult };
   info: { input: InfoInput; options: never; output: DocumentInfo };
+  extract: { input: ExtractInput; options: never; output: ExtractResult };
 
   // --- Singleton mutations ---
   clearContent: { input: ClearContentInput; options: RevisionGuardOptions; output: Receipt };
 
@@ -2952,6 +2952,60 @@ const operationSchemas: Record<OperationId, OperationSchemaSet> = {
     input: strictEmptyObjectSchema,
     output: documentInfoSchema,
   },
+  extract: {
+    input: strictEmptyObjectSchema,
+    output: objectSchema(
+      {
+        blocks: {
+          type: 'array',
+          items: objectSchema(
+            {
+              nodeId: { type: 'string', description: 'Stable block ID — pass to scrollToElement() for navigation.' },
+              type: { type: 'string', description: 'Block type: paragraph, heading, listItem, table, image, etc.' },
+              text: { type: 'string', description: 'Full plain text content of the block.' },
+              headingLevel: { type: 'integer', description: 'Heading level (1–6). Only present for headings.' },
+            },
+            ['nodeId', 'type', 'text'],
+          ),
+        },
+        comments: {
+          type: 'array',
+          items: objectSchema(
+            {
+              entityId: {
+                type: 'string',
+                description: 'Comment entity ID — pass to scrollToElement() for navigation.',
+              },
+              text: { type: 'string', description: 'Comment body text.' },
+              anchoredText: { type: 'string', description: 'The document text the comment is anchored to.' },
+              blockId: { type: 'string', description: 'Block ID the comment is anchored to.' },
+              status: { type: 'string', enum: ['open', 'resolved'] },
+              author: { type: 'string', description: 'Comment author name.' },
+            },
+            ['entityId', 'status'],
+          ),
+        },
+        trackedChanges: {
+          type: 'array',
+          items: objectSchema(
+            {
+              entityId: {
+                type: 'string',
+                description: 'Tracked change entity ID — pass to scrollToElement() for navigation.',
+              },
+              type: { type: 'string', enum: ['insert', 'delete', 'format'] },
+              excerpt: { type: 'string', description: 'Short text excerpt of the changed content.' },
+              author: { type: 'string', description: 'Change author name.' },
+              date: { type: 'string', description: 'Change date (ISO string).' },
+            },
+            ['entityId', 'type'],
+          ),
+        },
+        revision: { type: 'string', description: 'Document revision at the time of extraction.' },
+      },
+      ['blocks', 'comments', 'trackedChanges', 'revision'],
+    ),
+  },
   clearContent: {
     input: strictEmptyObjectSchema,
     output: receiptResultSchemaFor('clearContent'),
 
@@ -0,0 +1,46 @@
+import { describe, expect, it, mock } from 'bun:test';
+import type { ExtractResult } from '../types/extract.types.js';
+import { executeExtract } from './extract.js';
+import type { ExtractAdapter } from './extract.js';
+
+const DEFAULT_EXTRACT: ExtractResult = { // shared fixture: blocks, one comment, one tracked change, and a revision
+  blocks: [
+    { nodeId: 'h1', type: 'heading', text: 'Introduction', headingLevel: 1 },
+    { nodeId: 'p1', type: 'paragraph', text: 'First paragraph content.' },
+    { nodeId: 'p2', type: 'paragraph', text: '' }, // block with empty text
+  ],
+  comments: [
+    { entityId: 'c1', text: 'Fix this', anchoredText: 'content', blockId: 'p1', status: 'open', author: 'Alice' }, // blockId refers to the 'p1' block above
+  ],
+  trackedChanges: [{ entityId: 'tc1', type: 'insert', excerpt: 'new text', author: 'Bob', date: '2026-01-01' }],
+  revision: '5',
+};
+
+describe('executeExtract', () => {
+  it('delegates to adapter.extract with the input', () => {
+    const adapter: ExtractAdapter = {
+      extract: mock(() => DEFAULT_EXTRACT), // stub adapter that always returns the shared fixture
+    };
+
+    const result = executeExtract(adapter, {});
+
+    expect(result).toBe(DEFAULT_EXTRACT); // the adapter's return value comes back as-is
+    expect(adapter.extract).toHaveBeenCalledWith({}); // the input object is forwarded to the adapter
+  });
+
+  it('passes through full text without truncation', () => {
+    const longText = 'A'.repeat(200); // 200-character block text
+    const extractResult: ExtractResult = {
+      ...DEFAULT_EXTRACT, // keep the fixture's comments, trackedChanges and revision
+      blocks: [{ nodeId: 'p1', type: 'paragraph', text: longText }], // override blocks with the single long block
+    };
+    const adapter: ExtractAdapter = {
+      extract: mock(() => extractResult),
+    };
+
+    const result = executeExtract(adapter, {});
+
+    expect(result.blocks[0].text).toBe(longText);
+    expect(result.blocks[0].text.length).toBe(200); // length matches the 200 characters generated above
+  });
+});
@@ -0,0 +1,20 @@
+import type { ExtractResult } from '../types/extract.types.js';
+
+export type ExtractInput = Record<string, never>; // extract takes no parameters: the input is an object with no properties
+
+/**
+ * Engine-specific adapter that provides document content extraction.
+ *
+ * `executeExtract` forwards to this interface's single `extract` method.
+ */
+export interface ExtractAdapter {
+  /**
+   * Extract all document content with stable IDs for RAG pipelines.
+   *
+   * @param input - The extract input; `ExtractInput` is an empty object.
+   * @returns The extraction result, returned synchronously.
+   */
+  extract(input: ExtractInput): ExtractResult;
+}
+
+/**
+ * Execute an extract operation through the provided adapter.
+ *
+ * This is a pure pass-through: it performs no validation or transformation of its own.
+ *
+ * @param adapter - Adapter whose `extract` method performs the extraction.
+ * @param input - Input forwarded unchanged to `adapter.extract`.
+ * @returns Whatever `adapter.extract(input)` returns, unmodified.
+ */
+export function executeExtract(adapter: ExtractAdapter, input: ExtractInput): ExtractResult {
+  return adapter.extract(input);
+}
@@ -63,6 +63,7 @@ import type {
   SDMutationReceipt,
   TrackChangeInfo,
   TrackChangesListResult,
+  ExtractResult,
 } from './types/index.js';
 import type { CommentInfo, CommentsListQuery, CommentsListResult } from './comments/comments.types.js';
 import type {
@@ -115,6 +116,7 @@ import {
 } from './markdown-to-fragment/markdown-to-fragment.js';
 import type { SDMarkdownToFragmentResult } from './types/sd-contract.js';
 import { executeInfo, type InfoAdapter, type InfoInput } from './info/info.js';
+import { executeExtract, type ExtractAdapter, type ExtractInput } from './extract/extract.js';
 import {
   executeClearContent,
   type ClearContentAdapter,
@@ -889,6 +891,7 @@ export type { GetTextAdapter, GetTextInput } from './get-text/get-text.js';
 export type { GetMarkdownAdapter, GetMarkdownInput } from './get-markdown/get-markdown.js';
 export type { GetHtmlAdapter, GetHtmlInput } from './get-html/get-html.js';
 export type { InfoAdapter, InfoInput } from './info/info.js';
+export type { ExtractAdapter, ExtractInput } from './extract/extract.js';
 export type { WriteAdapter, WriteRequest } from './write/write.js';
 export type {
   FormatInlineAliasApi,
@@ -1531,6 +1534,11 @@ export interface DocumentApi {
    * Return document summary info including document counts and capabilities.
    */
   info(input: InfoInput): DocumentInfo;
+  /**
+   * Extract all document content with stable IDs for RAG pipelines.
+   * Returns blocks with full text, comments, and tracked changes.
+   */
+  extract(input: ExtractInput): ExtractResult;
   /**
    * Clear all document body content, leaving a single empty paragraph.
    */
@@ -1695,6 +1703,7 @@ export interface DocumentApiAdapters {
   getHtml: GetHtmlAdapter;
   markdownToFragment: MarkdownToFragmentAdapter;
   info: InfoAdapter;
+  extract: ExtractAdapter;
   clearContent: ClearContentAdapter;
   capabilities: CapabilitiesAdapter;
   comments: CommentsAdapter;
@@ -1894,6 +1903,9 @@ export function createDocumentApi(adapters: DocumentApiAdapters): DocumentApi {
     info(input: InfoInput): DocumentInfo {
       return executeInfo(adapters.info, input);
     },
+    extract(input: ExtractInput): ExtractResult {
+      return executeExtract(adapters.extract, input);
+    },
     clearContent(input: ClearContentInput, options?: RevisionGuardOptions): Receipt {
       return executeClearContent(adapters.clearContent, input, options);
     },
 
@@ -67,6 +67,7 @@ export function buildDispatchTable(api: DocumentApi): TypedDispatchTable {
     getHtml: (input) => api.getHtml(input),
     markdownToFragment: (input) => api.markdownToFragment(input),
     info: (input) => api.info(input),
+    extract: (input) => api.extract(input),
 
     // --- Singleton mutations ---
     clearContent: (input, options) => api.clearContent(input, options),