raphaelmansuy · raphaelmansuy · Mar 24, 2026 · Mar 24, 2026
diff --git a/README.md b/README.md
@@ -519,11 +519,11 @@ Combine formats: `--format markdown,json`
   "type": "heading",
   "id": 42,
   "level": "Title",
-  "page_number": 1,
-  "bounding_box": [72.0, 700.0, 540.0, 730.0],
-  "heading_level": 1,
+  "page number": 1,
+  "bounding box": [72.0, 700.0, 540.0, 730.0],
+  "heading level": 1,
   "font": "Helvetica-Bold",
-  "font_size": 24.0,
+  "font size": 24.0,
   "content": "Introduction"
 }
 ```
@@ -532,16 +532,16 @@ Combine formats: `--format markdown,json`
 |-------|-------------|
 | `type` | Element type: `heading`, `paragraph`, `table`, `list`, `image`, `caption` |
 | `id` | Unique identifier for cross-referencing |
-| `page_number` | 1-indexed page reference |
-| `bounding_box` | `[left, bottom, right, top]` in PDF points (72 pt = 1 inch) |
-| `heading_level` | Heading depth (1+) |
+| `page number` | 1-indexed page reference |
+| `bounding box` | `[left, bottom, right, top]` in PDF points (72 pt = 1 inch) |
+| `heading level` | Heading depth (1+) |
 | `content` | Extracted text |
 
 ---
 
 ## RAG / LLM Integration
 
-EdgeParse is designed for AI pipelines. Every element has a `bounding_box` and `page_number`, so you can cite exact sources in answers.
+EdgeParse is designed for AI pipelines. Every element has a `bounding box` and `page number`, so you can cite exact sources in answers.
 
 ### Extract Markdown for chunking
 
@@ -561,11 +561,11 @@ import json, edgeparse
 
 data = json.loads(edgeparse.convert("report.pdf", format="json"))
 
-for element in data["elements"]:
+for element in data["kids"]:
     if element["type"] == "paragraph":
-        # element["bounding_box"] → highlight location in original PDF
-        # element["page_number"] → link back to source page
-        print(f"p.{element['page_number']}: {element['content'][:80]}")
+        # element["bounding box"] → highlight location in original PDF
+        # element["page number"] → link back to source page
+        print(f"p.{element['page number']}: {element['content'][:80]}")
 ```
 
 ### LangChain integration
@@ -579,14 +579,14 @@ import edgeparse, json
 def load_pdf(path: str) -> list[Document]:
     data = json.loads(edgeparse.convert(path, format="json"))
     docs = []
-    for el in data["elements"]:
-        if el["type"] in ("paragraph", "heading", "table"):
+    for el in data["kids"]:
+        if el["type"] in ("paragraph", "heading"):
             docs.append(Document(
                 page_content=el["content"],
                 metadata={
                     "source": path,
-                    "page": el["page_number"],
-                    "bbox": el["bounding_box"],
+                    "page": el["page number"],
+                    "bbox": el["bounding box"],
                     "type": el["type"],
                 }
             ))
@@ -604,9 +604,9 @@ def edgeparse_reader(path: str) -> list[Document]:
     return [
         Document(
             text=el["content"],
-            metadata={"page": el["page_number"], "source": path}
+            metadata={"page": el["page number"], "source": path}
         )
-        for el in data["elements"]
+        for el in data["kids"]
         if el.get("content")
     ]
 ```
@@ -618,7 +618,7 @@ def edgeparse_reader(path: str) -> list[Document]:
 | Feed PDF to LLM | `markdown` | Clean structure, fits in context window |
 | RAG with source citations | `json` | Bounding boxes enable "click-to-source" UX |
 | Semantic chunking by section | `markdown` | Headings make natural chunk boundaries |
-| Element-level filtering | `json` | Filter by `type`, `page_number`, `heading_level` |
+| Element-level filtering | `json` | Filter by `type`, `page number`, `heading level` |
 | Web display | `html` | Styled output with semantic elements |
 
 ---
@@ -716,7 +716,7 @@ For RAG pipelines, you need a parser that preserves document structure, maintain
 
 ### How do I cite PDF sources in RAG answers?
 
-Every element in JSON output includes a `bounding_box` (`[left, bottom, right, top]` in PDF points, 72 pt = 1 inch) and `page_number`. Map the source chunk back to its bounding box to highlight the exact location in the original PDF — enabling "click-to-source" UX. No other non-OCR open-source parser provides bounding boxes for every element by default.
+Every element in JSON output includes a `bounding box` (`[left, bottom, right, top]` in PDF points, 72 pt = 1 inch) and `page number`. Map the source chunk back to its bounding box to highlight the exact location in the original PDF — enabling "click-to-source" UX. No other non-OCR open-source parser provides bounding boxes for every element by default.
 
 ### How do I extract tables from PDF?
 
@@ -753,7 +753,7 @@ Same input PDF → same output, every time. No stochastic models, no floating-po
 
 ### How do I chunk PDFs for semantic search?
 
-Use `format="markdown"`. EdgeParse preserves heading hierarchy and table structure in Markdown output — headings make natural chunk boundaries for `RecursiveCharacterTextSplitter` (LangChain) or heading-based splitters. For element-level control, use `format="json"` and split on `heading_level` boundaries or `page_number` changes.
+Use `format="markdown"`. EdgeParse preserves heading hierarchy and table structure in Markdown output — headings make natural chunk boundaries for `RecursiveCharacterTextSplitter` (LangChain) or heading-based splitters. For element-level control, use `format="json"` and split on `heading level` boundaries or `page number` changes.
 
 ### Does the Python SDK run on Windows?
 

diff --git a/docs/05-output-formats.md b/docs/05-output-formats.md
@@ -50,7 +50,7 @@ This is the **default output format** (used when `--format json` or no format sp
   "author": "Alice Smith",
   "creation date": "D:20240101",
   "modification date": "D:20240201",
-  "elements": [ ... ]
+  "kids": [ ... ]
 }
 ```
 
@@ -72,11 +72,10 @@ Type-specific fields:
 ```json
 {
   "type": "heading",
-  "level": "h1",
-  "value": "Introduction",
-  "font name": "Helvetica-Bold",
-  "font size": "14.0",
-  "font weight": "700.0",
+  "level": "Title",
+  "content": "Introduction",
+  "font": "Helvetica-Bold",
+  "font size": 14.0,
   "text color": "[0.0, 0.0, 0.0]"
 }
 ```
@@ -88,9 +87,10 @@ Type-specific fields:
   "rows": [
     {
       "type": "table row",
+      "row number": 1,
       "cells": [
-        { "type": "table header cell", "value": "Name" },
-        { "type": "table data cell",   "value": "Alice" }
+        { "type": "table cell", "row number": 1, "column number": 1, "row span": 1, "column span": 1, "kids": [] },
+        { "type": "table cell", "row number": 1, "column number": 2, "row span": 1, "column span": 1, "kids": [] }
       ]
     }
   ]
@@ -102,7 +102,8 @@ Type-specific fields:
 {
   "type": "list",
   "list items": [
-    { "type": "list item", "label value": "•", "body value": "First item" }
+    { "type": "list item", "content": "First item", "kids": [] },
+    { "type": "list item", "content": "Second item", "kids": [] }
   ]
 }
 ```

diff --git a/site/src/content/docs/api/wasm.mdx b/site/src/content/docs/api/wasm.mdx
@@ -46,50 +46,36 @@ function convert(
 
 ### Return value
 
-A JavaScript object matching the `PdfDocument` structure:
+A JavaScript object representing the Rust `PdfDocument` struct, serialized via `serde_wasm_bindgen`. The top-level document has a `kids` array containing externally tagged `ContentElement` enum variants:
 
 ```typescript
 {
-  pages: [
-    {
-      page_number: 1,
-      width: 612.0,
-      height: 792.0,
-      elements: [
-        {
-          type: "heading",       // "heading" | "paragraph" | "table" | "list" | "image" | ...
-          text: "Introduction",
-          level: 1,              // heading level (1-6)
-          bbox: {
-            x0: 72.0,
-            y0: 700.0,
-            x1: 300.0,
-            y1: 720.0,
-          },
-        },
-        // ... more elements
-      ],
-    },
-  ],
+  file_name: string,
+  number_of_pages: number,
+  author: string | null,
+  title: string | null,
+  kids: Array<Record<string, any>>  // externally-tagged Rust enum variants
 }
 ```
 
+Each element in `kids` is an externally tagged enum object such as `{ "Paragraph": { ... } }` or `{ "Heading": { ... } }`. For most use cases, calling `convert_to_string(bytes, 'json')` and passing the result to `JSON.parse()` is simpler and gives the same structured schema as the Python/Node.js SDKs.
+
 ### Example
 
 ```typescript
-import init, { convert } from '@edgeparse/edgeparse-wasm';
+import init, { convert_to_string } from '@edgeparse/edgeparse-wasm';
 
 await init();
 
 const bytes = new Uint8Array(await file.arrayBuffer());
-const doc = convert(bytes, 'json');
-
-// Iterate pages and elements
-for (const page of doc.pages) {
-  for (const el of page.elements) {
-    if (el.type === 'table') {
-      console.log('Table found on page', page.page_number);
-    }
+
+// Easiest: parse JSON string — same schema as Python/Node.js SDK
+const doc = JSON.parse(convert_to_string(bytes, 'json'));
+
+// Iterate elements (uses same keys as Python/Node.js JSON output)
+for (const el of doc.kids) {
+  if (el.type === 'table') {
+    console.log('Table found on page', el['page number']);
   }
 }
 ```

diff --git a/site/src/content/docs/getting-started/quick-start-wasm.mdx b/site/src/content/docs/getting-started/quick-start-wasm.mdx
@@ -84,22 +84,21 @@ fileInput.addEventListener('change', async () => {
 
 ## Get structured document data
 
-Use `convert()` instead of `convert_to_string()` to get a full JavaScript object with pages, elements, and bounding boxes:
+Use `convert_to_string()` with the `'json'` format and pass the result to `JSON.parse()` to get the same structured schema as the Python/Node.js SDKs — a flat `kids` array of element objects:
 
 ```typescript
-import init, { convert } from '@edgeparse/edgeparse-wasm';
+import init, { convert_to_string } from '@edgeparse/edgeparse-wasm';
 
 await init();
 
 const bytes = new Uint8Array(await file.arrayBuffer());
-const doc = convert(bytes, 'json');
+
+// Parse JSON string — same schema as Python/Node.js SDK
+const doc = JSON.parse(convert_to_string(bytes, 'json'));
 
 // Access structured data
-for (const page of doc.pages) {
-  console.log(`Page ${page.page_number}:`);
-  for (const element of page.elements) {
-    console.log(`  [${element.type}] ${element.text}`);
-  }
+for (const el of doc.kids) {
+  console.log(`[${el.type}] page ${el['page number']}: ${el.content ?? ''}`);
 }
 ```
 

diff --git a/site/src/content/docs/guides/wasm-use-cases.mdx b/site/src/content/docs/guides/wasm-use-cases.mdx
@@ -10,31 +10,30 @@ EdgeParse WASM runs the full Rust PDF extraction engine directly in the browser.
 Extract structured chunks from PDFs in the browser, then send only the text to your embedding API. The full PDF never leaves the user's device.
 
 ```typescript
-import init, { convert } from '@edgeparse/edgeparse-wasm';
+import init, { convert_to_string } from '@edgeparse/edgeparse-wasm';
 
 await init();
 
 async function extractChunksForRAG(file: File) {
   const bytes = new Uint8Array(await file.arrayBuffer());
-  const doc = convert(bytes, 'json');
+  // Use JSON string — same schema as Python/Node.js SDK
+  const doc = JSON.parse(convert_to_string(bytes, 'json'));
 
   // Build chunks with metadata
-  const chunks = doc.pages.flatMap(page =>
-    page.elements
-      .filter(el => ['paragraph', 'heading', 'list_item'].includes(el.type))
-      .map(el => ({
-        text: el.text,
-        page: page.page_number,
-        type: el.type,
-        bbox: el.bbox,
-      }))
-  );
+  const chunks = doc.kids
+    .filter((el: any) => ['paragraph', 'heading'].includes(el.type))
+    .map((el: any) => ({
+      text: el.content,
+      page: el['page number'],
+      type: el.type,
+      bbox: el['bounding box'],
+    }));
 
   // Only text leaves the browser — not the PDF
   const response = await fetch('/api/embed', {
     method: 'POST',
     headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify({ chunks: chunks.map(c => c.text) }),
+    body: JSON.stringify({ chunks: chunks.map((c: any) => c.text) }),
   });
 
   return { chunks, embeddings: await response.json() };
@@ -159,27 +158,21 @@ chrome.runtime.onMessage.addListener(async (msg) => {
 Extract tables from PDFs and convert them to CSV or array data for spreadsheet applications or data analysis tools.
 
 ```typescript
-import init, { convert } from '@edgeparse/edgeparse-wasm';
+import init, { convert_to_string } from '@edgeparse/edgeparse-wasm';
 
 await init();
 
 function extractTables(bytes: Uint8Array) {
-  const doc = convert(bytes, 'json');
-  const tables = [];
-
-  for (const page of doc.pages) {
-    for (const el of page.elements) {
-      if (el.type === 'table') {
-        tables.push({
-          page: page.page_number,
-          rows: el.rows, // array of arrays
-          bbox: el.bbox,
-        });
-      }
-    }
-  }
-
-  return tables;
+  // Use JSON string — same schema as Python/Node.js SDK
+  const doc = JSON.parse(convert_to_string(bytes, 'json'));
+
+  return doc.kids
+    .filter((el: any) => el.type === 'table')
+    .map((el: any) => ({
+      page: el['page number'],
+      rows: el.rows, // array of row objects with cells
+      bbox: el['bounding box'],
+    }));
 }
 ```
 

diff --git a/site/src/content/docs/index.mdx b/site/src/content/docs/index.mdx
@@ -76,8 +76,8 @@ print(md[:500])
 
 # Parse structured JSON with bounding boxes
 doc = json.loads(edgeparse.convert("report.pdf", format="json"))
-for el in doc["elements"][:3]:
-    print(el["type"], el["text"][:60])
+for el in doc["kids"][:3]:
+    print(el["type"], el.get("content", "")[:60])
 
 # Save to output file
 path = edgeparse.convert_file("report.pdf", output_dir="out/", format="markdown")
@@ -92,7 +92,7 @@ console.log(md.slice(0, 500));
 
 // Parse structured JSON output
 const doc = JSON.parse(convert("invoice.pdf", { format: "json" }));
-doc.elements.slice(0, 3).forEach(el => console.log(el.type, el.text));
+doc.kids.slice(0, 3).forEach(el => console.log(el.type, el.content ?? ''));
 
 // Extract specific pages
 const pages = convert("report.pdf", { format: "markdown", pages: "1-5" });

diff --git a/site/src/content/docs/output/json-schema.mdx b/site/src/content/docs/output/json-schema.mdx
@@ -29,15 +29,15 @@ Each element in `kids` has a `type` field identifying its kind:
 | `id` | `int` | Globally unique sequential ID |
 | `page number` | `int` | 1-based page index |
 | `bounding box` | `[left, bottom, right, top]` | Coordinates in PDF points |
-| `content` | `string` | Extracted text |
-| `font` | `string` | Font name |
-| `font size` | `float` | Font size in points |
+| `content` | `string` | Extracted text (present on `paragraph`, `heading`, `caption`, `list item`; absent on `table`, `image`, `list`, `header`, `footer`) |
+| `font` | `string` | Font name (text elements only) |
+| `font size` | `float` | Font size in points (text elements only) |
 
 ## Heading-Specific Fields
 
 | Field | Type | Description |
 |-------|------|------------|
-| `level` | `string` | `"title"`, `"section"`, `"subsection"` |
+| `level` | `string` | Semantic label: `"Title"`, `"Subtitle"`, `"Heading1"`, `"Heading2"`, `"Heading3"`, `"Heading4"` |
 | `heading level` | `int` | Numeric heading level (1–6) |
 
 ## Table-Specific Fields
@@ -57,11 +57,12 @@ Each element in `kids` has a `type` field identifying its kind:
 
 | Field | Type | Description |
 |-------|------|------------|
-| `col` | `int` | 0-based column index |
-| `content` | `string` | Cell text |
-| `is header` | `bool?` | Whether in a header row |
-| `row span` | `int?` | Rows spanned (default: 1) |
-| `col span` | `int?` | Columns spanned (default: 1) |
+| `type` | `string` | Always `"table cell"` |
+| `row number` | `int` | 1-based row index |
+| `column number` | `int` | 1-based column index |
+| `row span` | `int` | Number of rows spanned (default: 1) |
+| `column span` | `int` | Number of columns spanned (default: 1) |
+| `kids` | `Element[]` | Nested child elements (typically empty) |
 
 ## Full Example