From 830268580edcdf262a1c0e2f83e3a6db78fb0afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?= Date: Tue, 24 Mar 2026 21:14:40 +0800 Subject: [PATCH] fix(docs): correct JSON schema key names and field values across all docs - Replace all underscore keys with space-separated keys (page_number -> 'page number', bounding_box -> 'bounding box', heading_level -> 'heading level', font_size -> 'font size') - Replace 'elements' with 'kids' as the top-level document array field - Fix heading level values: 'h1'/'title'/'section' -> 'Title'/'Subtitle'/'Heading1-4' - Fix image field: 'image path' -> 'source' - Fix list schema: 'items':[strings] -> 'list items':[{type,content,kids}] - Fix table cell schema: correct type/row number/column number/row span/column span/kids - Remove 'table' from content filters (tables have no 'content' key) - Fix WASM docs: replace fictional pages[] structure with convert_to_string()+JSON.parse() pattern - Safe-access content with .get('content','') for elements that may lack it --- README.md | 42 +++++++-------- docs/05-output-formats.md | 19 +++---- site/src/content/docs/api/wasm.mdx | 48 ++++++----------- .../docs/getting-started/quick-start-wasm.mdx | 15 +++--- .../content/docs/guides/wasm-use-cases.mdx | 53 ++++++++----------- site/src/content/docs/index.mdx | 6 +-- site/src/content/docs/output/json-schema.mdx | 19 +++---- tutorials/02-python-sdk.md | 2 +- tutorials/05-output-formats.md | 34 +++++++----- 9 files changed, 114 insertions(+), 124 deletions(-) diff --git a/README.md b/README.md index 77f8ffa..eba705e 100644 --- a/README.md +++ b/README.md @@ -519,11 +519,11 @@ Combine formats: `--format markdown,json` "type": "heading", "id": 42, "level": "Title", - "page_number": 1, - "bounding_box": [72.0, 700.0, 540.0, 730.0], - "heading_level": 1, + "page number": 1, + "bounding box": [72.0, 700.0, 540.0, 730.0], + "heading level": 1, "font": "Helvetica-Bold", - "font_size": 24.0, + "font size": 24.0, "content": "Introduction" 
} ``` @@ -532,16 +532,16 @@ Combine formats: `--format markdown,json` |-------|-------------| | `type` | Element type: `heading`, `paragraph`, `table`, `list`, `image`, `caption` | | `id` | Unique identifier for cross-referencing | -| `page_number` | 1-indexed page reference | -| `bounding_box` | `[left, bottom, right, top]` in PDF points (72 pt = 1 inch) | -| `heading_level` | Heading depth (1+) | +| `page number` | 1-indexed page reference | +| `bounding box` | `[left, bottom, right, top]` in PDF points (72 pt = 1 inch) | +| `heading level` | Heading depth (1+) | | `content` | Extracted text | --- ## RAG / LLM Integration -EdgeParse is designed for AI pipelines. Every element has a `bounding_box` and `page_number`, so you can cite exact sources in answers. +EdgeParse is designed for AI pipelines. Every element has a `bounding box` and `page number`, so you can cite exact sources in answers. ### Extract Markdown for chunking @@ -561,11 +561,11 @@ import json, edgeparse data = json.loads(edgeparse.convert("report.pdf", format="json")) -for element in data["elements"]: +for element in data["kids"]: if element["type"] == "paragraph": - # element["bounding_box"] → highlight location in original PDF - # element["page_number"] → link back to source page - print(f"p.{element['page_number']}: {element['content'][:80]}") + # element["bounding box"] → highlight location in original PDF + # element["page number"] → link back to source page + print(f"p.{element['page number']}: {element['content'][:80]}") ``` ### LangChain integration @@ -579,14 +579,14 @@ import edgeparse, json def load_pdf(path: str) -> list[Document]: data = json.loads(edgeparse.convert(path, format="json")) docs = [] - for el in data["elements"]: - if el["type"] in ("paragraph", "heading", "table"): + for el in data["kids"]: + if el["type"] in ("paragraph", "heading"): docs.append(Document( page_content=el["content"], metadata={ "source": path, - "page": el["page_number"], - "bbox": el["bounding_box"], + 
"page": el["page number"], + "bbox": el["bounding box"], "type": el["type"], } )) @@ -604,9 +604,9 @@ def edgeparse_reader(path: str) -> list[Document]: return [ Document( text=el["content"], - metadata={"page": el["page_number"], "source": path} + metadata={"page": el["page number"], "source": path} ) - for el in data["elements"] + for el in data["kids"] if el.get("content") ] ``` @@ -618,7 +618,7 @@ def edgeparse_reader(path: str) -> list[Document]: | Feed PDF to LLM | `markdown` | Clean structure, fits in context window | | RAG with source citations | `json` | Bounding boxes enable "click-to-source" UX | | Semantic chunking by section | `markdown` | Headings make natural chunk boundaries | -| Element-level filtering | `json` | Filter by `type`, `page_number`, `heading_level` | +| Element-level filtering | `json` | Filter by `type`, `page number`, `heading level` | | Web display | `html` | Styled output with semantic elements | --- @@ -716,7 +716,7 @@ For RAG pipelines, you need a parser that preserves document structure, maintain ### How do I cite PDF sources in RAG answers? -Every element in JSON output includes a `bounding_box` (`[left, bottom, right, top]` in PDF points, 72 pt = 1 inch) and `page_number`. Map the source chunk back to its bounding box to highlight the exact location in the original PDF — enabling "click-to-source" UX. No other non-OCR open-source parser provides bounding boxes for every element by default. +Every element in JSON output includes a `bounding box` (`[left, bottom, right, top]` in PDF points, 72 pt = 1 inch) and `page number`. Map the source chunk back to its bounding box to highlight the exact location in the original PDF — enabling "click-to-source" UX. No other non-OCR open-source parser provides bounding boxes for every element by default. ### How do I extract tables from PDF? @@ -753,7 +753,7 @@ Same input PDF → same output, every time. No stochastic models, no floating-po ### How do I chunk PDFs for semantic search? 
-Use `format="markdown"`. EdgeParse preserves heading hierarchy and table structure in Markdown output — headings make natural chunk boundaries for `RecursiveCharacterTextSplitter` (LangChain) or heading-based splitters. For element-level control, use `format="json"` and split on `heading_level` boundaries or `page_number` changes. +Use `format="markdown"`. EdgeParse preserves heading hierarchy and table structure in Markdown output — headings make natural chunk boundaries for `RecursiveCharacterTextSplitter` (LangChain) or heading-based splitters. For element-level control, use `format="json"` and split on `heading level` boundaries or `page number` changes. ### Does the Python SDK run on Windows? diff --git a/docs/05-output-formats.md b/docs/05-output-formats.md index 99abb1d..90b7a9b 100644 --- a/docs/05-output-formats.md +++ b/docs/05-output-formats.md @@ -50,7 +50,7 @@ This is the **default output format** (used when `--format json` or no format sp "author": "Alice Smith", "creation date": "D:20240101", "modification date": "D:20240201", - "elements": [ ... ] + "kids": [ ... 
] } ``` @@ -72,11 +72,10 @@ Type-specific fields: ```json { "type": "heading", - "level": "h1", - "value": "Introduction", - "font name": "Helvetica-Bold", - "font size": "14.0", - "font weight": "700.0", + "level": "Title", + "content": "Introduction", + "font": "Helvetica-Bold", + "font size": 14.0, "text color": "[0.0, 0.0, 0.0]" } ``` @@ -88,9 +87,10 @@ Type-specific fields: "rows": [ { "type": "table row", + "row number": 1, "cells": [ - { "type": "table header cell", "value": "Name" }, - { "type": "table data cell", "value": "Alice" } + { "type": "table cell", "row number": 1, "column number": 1, "row span": 1, "column span": 1, "kids": [] }, + { "type": "table cell", "row number": 1, "column number": 2, "row span": 1, "column span": 1, "kids": [] } ] } ] @@ -102,7 +102,8 @@ Type-specific fields: { "type": "list", "list items": [ - { "type": "list item", "label value": "•", "body value": "First item" } + { "type": "list item", "content": "First item", "kids": [] }, + { "type": "list item", "content": "Second item", "kids": [] } ] } ``` diff --git a/site/src/content/docs/api/wasm.mdx b/site/src/content/docs/api/wasm.mdx index 8fdec78..2423149 100644 --- a/site/src/content/docs/api/wasm.mdx +++ b/site/src/content/docs/api/wasm.mdx @@ -46,50 +46,36 @@ function convert( ### Return value -A JavaScript object matching the `PdfDocument` structure: +A JavaScript object representing the Rust `PdfDocument` struct, serialized via `serde_wasm_bindgen`. The top-level document has a `kids` array containing `ContentElement` enum variants (externally-tagged): ```typescript { - pages: [ - { - page_number: 1, - width: 612.0, - height: 792.0, - elements: [ - { - type: "heading", // "heading" | "paragraph" | "table" | "list" | "image" | ... - text: "Introduction", - level: 1, // heading level (1-6) - bbox: { - x0: 72.0, - y0: 700.0, - x1: 300.0, - y1: 720.0, - }, - }, - // ... 
more elements - ], - }, - ], + file_name: string, + number_of_pages: number, + author: string | null, + title: string | null, + kids: Array<Record<string, unknown>> // externally-tagged Rust enum variants } ``` +Each element in `kids` is an externally-tagged enum object like `{ "Paragraph": { ... } }` or `{ "Heading": { ... } }`. For most use cases, `convert_to_string(bytes, 'json')` + `JSON.parse()` is simpler and gives the same structured schema as the Python/Node.js SDK. + ### Example ```typescript -import init, { convert } from '@edgeparse/edgeparse-wasm'; +import init, { convert_to_string } from '@edgeparse/edgeparse-wasm'; await init(); const bytes = new Uint8Array(await file.arrayBuffer()); -const doc = convert(bytes, 'json'); - -// Iterate pages and elements -for (const page of doc.pages) { - for (const el of page.elements) { - if (el.type === 'table') { - console.log('Table found on page', page.page_number); - } + +// Easiest: parse JSON string — same schema as Python/Node.js SDK +const doc = JSON.parse(convert_to_string(bytes, 'json')); + +// Iterate elements (uses same keys as Python/Node.js JSON output) +for (const el of doc.kids) { + if (el.type === 'table') { + console.log('Table found on page', el['page number']); } } ``` diff --git a/site/src/content/docs/getting-started/quick-start-wasm.mdx b/site/src/content/docs/getting-started/quick-start-wasm.mdx index a209b05..ddb8453 100644 --- a/site/src/content/docs/getting-started/quick-start-wasm.mdx +++ b/site/src/content/docs/getting-started/quick-start-wasm.mdx @@ -84,22 +84,21 @@ fileInput.addEventListener('change', async () => { ## Get structured document data -Use `convert()` instead of `convert_to_string()` to get a full JavaScript object with pages, elements, and bounding boxes: +Use `convert_to_string()` with `'json'` format and `JSON.parse()` to get the same structured schema as the Python/Node.js SDKs — a flat `kids` array with element objects: ```typescript -import init, { convert } from '@edgeparse/edgeparse-wasm'; 
+import init, { convert_to_string } from '@edgeparse/edgeparse-wasm'; await init(); const bytes = new Uint8Array(await file.arrayBuffer()); -const doc = convert(bytes, 'json'); + +// Parse JSON string — same schema as Python/Node.js SDK +const doc = JSON.parse(convert_to_string(bytes, 'json')); // Access structured data -for (const page of doc.pages) { - console.log(`Page ${page.page_number}:`); - for (const element of page.elements) { - console.log(` [${element.type}] ${element.text}`); - } +for (const el of doc.kids) { + console.log(`[${el.type}] page ${el['page number']}: ${el.content ?? ''}`); } ``` diff --git a/site/src/content/docs/guides/wasm-use-cases.mdx b/site/src/content/docs/guides/wasm-use-cases.mdx index f2b7479..457b5d0 100644 --- a/site/src/content/docs/guides/wasm-use-cases.mdx +++ b/site/src/content/docs/guides/wasm-use-cases.mdx @@ -10,31 +10,30 @@ EdgeParse WASM runs the full Rust PDF extraction engine directly in the browser. Extract structured chunks from PDFs in the browser, then send only the text to your embedding API. The full PDF never leaves the user's device. 
```typescript -import init, { convert } from '@edgeparse/edgeparse-wasm'; +import init, { convert_to_string } from '@edgeparse/edgeparse-wasm'; await init(); async function extractChunksForRAG(file: File) { const bytes = new Uint8Array(await file.arrayBuffer()); - const doc = convert(bytes, 'json'); + // Use JSON string — same schema as Python/Node.js SDK + const doc = JSON.parse(convert_to_string(bytes, 'json')); // Build chunks with metadata - const chunks = doc.pages.flatMap(page => - page.elements - .filter(el => ['paragraph', 'heading', 'list_item'].includes(el.type)) - .map(el => ({ - text: el.text, - page: page.page_number, - type: el.type, - bbox: el.bbox, - })) - ); + const chunks = doc.kids + .filter((el: any) => ['paragraph', 'heading'].includes(el.type)) + .map((el: any) => ({ + text: el.content, + page: el['page number'], + type: el.type, + bbox: el['bounding box'], + })); // Only text leaves the browser — not the PDF const response = await fetch('/api/embed', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ chunks: chunks.map(c => c.text) }), + body: JSON.stringify({ chunks: chunks.map((c: any) => c.text) }), }); return { chunks, embeddings: await response.json() }; @@ -159,27 +158,21 @@ chrome.runtime.onMessage.addListener(async (msg) => { Extract tables from PDFs and convert them to CSV or array data for spreadsheet applications or data analysis tools. 
```typescript -import init, { convert } from '@edgeparse/edgeparse-wasm'; +import init, { convert_to_string } from '@edgeparse/edgeparse-wasm'; await init(); function extractTables(bytes: Uint8Array) { - const doc = convert(bytes, 'json'); - const tables = []; - - for (const page of doc.pages) { - for (const el of page.elements) { - if (el.type === 'table') { - tables.push({ - page: page.page_number, - rows: el.rows, // array of arrays - bbox: el.bbox, - }); - } - } - } - - return tables; + // Use JSON string — same schema as Python/Node.js SDK + const doc = JSON.parse(convert_to_string(bytes, 'json')); + + return doc.kids + .filter((el: any) => el.type === 'table') + .map((el: any) => ({ + page: el['page number'], + rows: el.rows, // array of row objects with cells + bbox: el['bounding box'], + })); } ``` diff --git a/site/src/content/docs/index.mdx b/site/src/content/docs/index.mdx index 6cff027..0b1c1f1 100644 --- a/site/src/content/docs/index.mdx +++ b/site/src/content/docs/index.mdx @@ -76,8 +76,8 @@ print(md[:500]) # Parse structured JSON with bounding boxes doc = json.loads(edgeparse.convert("report.pdf", format="json")) -for el in doc["elements"][:3]: - print(el["type"], el["text"][:60]) +for el in doc["kids"][:3]: + print(el["type"], el.get("content", "")[:60]) # Save to output file path = edgeparse.convert_file("report.pdf", output_dir="out/", format="markdown") @@ -92,7 +92,7 @@ console.log(md.slice(0, 500)); // Parse structured JSON output const doc = JSON.parse(convert("invoice.pdf", { format: "json" })); -doc.elements.slice(0, 3).forEach(el => console.log(el.type, el.text)); +doc.kids.slice(0, 3).forEach(el => console.log(el.type, el.content ?? 
'')); // Extract specific pages const pages = convert("report.pdf", { format: "markdown", pages: "1-5" }); diff --git a/site/src/content/docs/output/json-schema.mdx b/site/src/content/docs/output/json-schema.mdx index f78770c..f6faa6d 100644 --- a/site/src/content/docs/output/json-schema.mdx +++ b/site/src/content/docs/output/json-schema.mdx @@ -29,15 +29,15 @@ Each element in `kids` has a `type` field identifying its kind: | `id` | `int` | Globally unique sequential ID | | `page number` | `int` | 1-based page index | | `bounding box` | `[left, bottom, right, top]` | Coordinates in PDF points | -| `content` | `string` | Extracted text | -| `font` | `string` | Font name | -| `font size` | `float` | Font size in points | +| `content` | `string` | Extracted text (present on `paragraph`, `heading`, `caption`, `list item`; absent on `table`, `image`, `list`, `header`, `footer`) | +| `font` | `string` | Font name (text elements only) | +| `font size` | `float` | Font size in points (text elements only) | ## Heading-Specific Fields | Field | Type | Description | |-------|------|------------| -| `level` | `string` | `"title"`, `"section"`, `"subsection"` | +| `level` | `string` | Semantic label: `"Title"`, `"Subtitle"`, `"Heading1"`, `"Heading2"`, `"Heading3"`, `"Heading4"` | | `heading level` | `int` | Numeric heading level (1–6) | ## Table-Specific Fields @@ -57,11 +57,12 @@ Each element in `kids` has a `type` field identifying its kind: | Field | Type | Description | |-------|------|------------| -| `col` | `int` | 0-based column index | -| `content` | `string` | Cell text | -| `is header` | `bool?` | Whether in a header row | -| `row span` | `int?` | Rows spanned (default: 1) | -| `col span` | `int?` | Columns spanned (default: 1) | +| `type` | `string` | Always `"table cell"` | +| `row number` | `int` | 1-based row index | +| `column number` | `int` | 1-based column index | +| `row span` | `int` | Number of rows spanned (default: 1) | +| `column span` | `int` | Number 
of columns spanned (default: 1) | +| `kids` | `Element[]` | Nested child elements (typically empty) | ## Full Example diff --git a/tutorials/02-python-sdk.md b/tutorials/02-python-sdk.md index 69cf691..1b44750 100644 --- a/tutorials/02-python-sdk.md +++ b/tutorials/02-python-sdk.md @@ -310,7 +310,7 @@ page1_sorted = sorted( reverse=True, # PDF coordinates start at bottom-left ) for e in page1_sorted[:10]: - print(f' [{e["bounding box"][1]:.0f}] {e["type"]}: {e["content"][:50]}') + print(f' [{e["bounding box"][1]:.0f}] {e["type"]}: {e.get("content", "")[:50]}') ``` --- diff --git a/tutorials/05-output-formats.md b/tutorials/05-output-formats.md index b248dff..fec6b36 100644 --- a/tutorials/05-output-formats.md +++ b/tutorials/05-output-formats.md @@ -109,14 +109,21 @@ Additional fields: `level` (semantic label: `"Title"`, `"H1"` … `"H6"`), `head "id": 12, "page number": 3, "bounding box": [72.0, 400.0, 540.0, 600.0], + "number of rows": 2, "rows": [ - ["Method", "Accuracy", "Speed"], - ["EdgeParse", "0.881", "0.023 s"] + { + "type": "table row", + "row number": 1, + "cells": [ + { "type": "table cell", "row number": 1, "column number": 1, "row span": 1, "column span": 1, "kids": [] }, + { "type": "table cell", "row number": 1, "column number": 2, "row span": 1, "column span": 1, "kids": [] } + ] + } ] } ``` -Additional field: `rows` — a 2D array of strings (row-major order). First row is the header row when detected. +Additional field: `rows` — array of row objects. Each row has `row number` and `cells` (array of cell objects with `row number`, `column number`, `row span`, `column span`, `kids`). #### `image` @@ -126,11 +133,11 @@ Additional field: `rows` — a 2D array of strings (row-major order). First row "id": 8, "page number": 2, "bounding box": [72.0, 300.0, 300.0, 500.0], - "image path": "output/images/page2_img1.png" + "source": "output/document_images/imageFile1.png" } ``` -`image path` is present only when `--image-output external` is set. 
+`source` is always present and contains the generated image path. #### `list` @@ -140,10 +147,9 @@ Additional field: `rows` — a 2D array of strings (row-major order). First row "id": 20, "page number": 4, "bounding box": [72.0, 200.0, 400.0, 280.0], - "items": [ - "First item", - "Second item", - "Nested list item" + "list items": [ + { "type": "list item", "content": "First item", "kids": [] }, + { "type": "list item", "content": "Second item", "kids": [] } ] } ``` @@ -176,7 +182,7 @@ Additional field: `rows` — a 2D array of strings (row-major order). First row ### Bounding Box Coordinates ``` -bounding_box = [x0, y0, x1, y1] +"bounding box" = [x0, y0, x1, y1] ``` - **Origin**: bottom-left corner of the page @@ -214,7 +220,8 @@ for e in doc["kids"]: if e["type"] == "table": print(f'\nTable on page {e["page number"]}:') for row in e["rows"]: - print(" ", " | ".join(row)) + n_cells = len(row.get("cells", [])) + print(f' Row {row["row number"]}: {n_cells} cell(s)') # --- Get bounding boxes for all paragraphs on page 1 -------------------- page1_paras = [ @@ -255,7 +262,10 @@ headings.slice(0, 5).forEach(h => { const tables = doc.kids.filter(e => e.type === 'table'); tables.forEach(t => { console.log(`\nTable on page ${t['page number']}:`); - t.rows.forEach(row => console.log(' ', row.join(' | '))); + t.rows.forEach(row => { + const nCells = row.cells?.length ?? 0; + console.log(` Row ${row['row number']}: ${nCells} cell(s)`); + }); }); // --- Prepare RAG chunks -------------------------------------------------