Commit ca48ab9

feat(compare-rendering): add Word↔SuperDoc paragraph-diff CLI (M1)
Diffs resolved paragraph state between Word (via word-mcp run_powershell on a Windows VM) and SuperDoc (via layout:export-one) for paragraph-only docx files. Emits typed Finding[] with category/severity/specRef/codeArea hints so an agent consumer can route fixes to the right SuperDoc module.

Unsupported features (tables, inline/floating shapes, tracked changes, comments) short-circuit with a skipped finding rather than producing a misleading diff. Word extraction is cached by sha256(docx) + sha256(extract-layout.ps1) so PS edits bust the cache automatically.

Scope: paragraph-only flow. Categories emitted: text, pagination, structure, unsupported. Style/indent/color/numbering deferred to M2.
1 parent 3e9b368 commit ca48ab9

17 files changed

Lines changed: 1102 additions & 0 deletions
Lines changed: 1 addition & 0 deletions

```
.cache/
```
Lines changed: 96 additions & 0 deletions
# compare-rendering

Diffs Word and SuperDoc rendering of the same `.docx` at the *resolved schema* level — text, page assignment, and (in later milestones) font/indent/color/numbering. Emits typed `Finding[]` so an agent can route fixes to specific SuperDoc modules.

This is a dev tool, not a pass/fail test. It surfaces concrete divergences so you don't have to compare screenshots by eye.

## Scope (M1)

- **Supported:** paragraph-only documents (text-heavy memos, letters, policies).
- **Short-circuited with a reason:** docs containing tables, inline/floating shapes, or tracked changes. The report emits an `unsupported` finding and skips the diff — an honest boundary rather than a misleading "everything looks fine."
- **Categories emitted in M1:** `text`, `pagination`, `structure`, `unsupported`. Style/indent/color/numbering come in M2, once the SuperDoc-side normalizer pulls resolved values out of `measures[]` and `runs[]`.
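For orientation, here is a sketch of the `Finding` shape, inferred from the fields the CLI populates when building a report (`category`, `severity`, `paragraphOrdinal`, `word`, `superdoc`, `message`) plus the `specRef`/`codeArea` hints mentioned in the commit message. The real definition lives in `src/types.ts`; treat this as illustrative, not authoritative:

```typescript
// Illustrative sketch of the Finding shape — src/types.ts is authoritative.
type Category = 'text' | 'pagination' | 'structure' | 'unsupported';
type Severity = 'blocking' | 'visible' | 'cosmetic';

interface Finding {
  category: Category;
  severity: Severity;
  paragraphOrdinal: number;  // paragraph index in the document
  word: string | null;       // what Word resolved for this paragraph
  superdoc: string | null;   // what SuperDoc resolved
  message: string;           // human-readable summary for the report
  specRef?: string;          // e.g. "ECMA-376 §17.3.1.16"
  codeArea?: string;         // SuperDoc module hint for routing a fix
}

const example: Finding = {
  category: 'pagination',
  severity: 'visible',
  paragraphOrdinal: 39,
  word: 'page 2',
  superdoc: 'page 1',
  message: 'Paragraph #39 landed on page 1 in SuperDoc but page 2 in Word',
};
```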

## Quick start

```bash
export WORD_MCP_URL="https://word-mcp.superdoc.workers.dev/mcp"
export WORD_MCP_TOKEN="<your-bearer-token>"

pnpm compare-rendering -- \
  --input evals/fixtures/docs/memorandum.docx \
  --format md
```

Run directly without the wrapper:

```bash
bun devtools/compare-rendering/src/cli.ts --input <path> --format md
```

Example output (truncated):

```markdown
# compare-rendering: memorandum.docx

- Word pages: 3, SuperDoc pages: 3
- Word paragraphs: 94, SuperDoc paragraphs: 94

## Findings (2)

### pagination (2)
- **[visible]** Paragraph #39 landed on page 1 in SuperDoc but page 2 in Word (empty line)
  - spec: ECMA-376 §17.3.1.16 (keepNext/keepLines/pageBreakBefore)
  - code: `layout-engine/layout-engine/src/pagination`
- **[visible]** Paragraph #80 landed on page 2 in SuperDoc but page 3 in Word (" - Any press releases…")
  - spec: ECMA-376 §17.3.1.16 (keepNext/keepLines/pageBreakBefore)
  - code: `layout-engine/layout-engine/src/pagination`
```

## How it works

```
docx
 ├── word adapter (POST run_powershell to word-mcp worker) ─► word.json (cached)
 └── superdoc adapter (spawn pnpm layout:export-one)       ─► sd.layout.json

normalize both sides

NormalizedParagraph[] × 2

differ + taxonomy

Finding[] report
```

- Word extraction is **cached** by `sha256(docx) + sha256(extract-layout.ps1)`. Editing SuperDoc code and re-running the tool only re-runs the SuperDoc side — no re-hit to the VM (~25s saved per iteration). Editing the PowerShell script busts the cache automatically.
- Bypass the cache for a single run with `--no-cache`.
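To make the normalize-then-diff step concrete, here is an illustrative sketch of how paragraph pairs could be compared. This is *not* the actual `src/differ.ts`; the `NormalizedParagraph` fields (`ordinal`, `text`, `page`) are assumptions based on the categories this tool emits:

```typescript
// Sketch only — the real differ lives in src/differ.ts and carries
// spec/codeArea routing metadata. Field names here are assumed.
interface NormalizedParagraph { ordinal: number; text: string; page: number; }
interface Finding {
  category: 'text' | 'pagination' | 'structure';
  severity: 'blocking' | 'visible' | 'cosmetic';
  paragraphOrdinal: number;
  message: string;
}

function diffParagraphsSketch(word: NormalizedParagraph[], sd: NormalizedParagraph[]): Finding[] {
  const findings: Finding[] = [];
  // Paragraph-count mismatch is a structural divergence.
  if (word.length !== sd.length) {
    findings.push({
      category: 'structure',
      severity: 'blocking',
      paragraphOrdinal: Math.min(word.length, sd.length),
      message: `paragraph count differs: Word ${word.length} vs SuperDoc ${sd.length}`,
    });
  }
  // Walk the common prefix: text mismatches first, then page assignment.
  const n = Math.min(word.length, sd.length);
  for (let i = 0; i < n; i++) {
    if (word[i].text !== sd[i].text) {
      findings.push({
        category: 'text', severity: 'blocking', paragraphOrdinal: i,
        message: `text differs at paragraph #${i}`,
      });
    } else if (word[i].page !== sd[i].page) {
      findings.push({
        category: 'pagination', severity: 'visible', paragraphOrdinal: i,
        message: `paragraph #${i} on page ${sd[i].page} in SuperDoc but page ${word[i].page} in Word`,
      });
    }
  }
  return findings;
}
```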

## Env

| Variable         | Purpose                                             |
|------------------|-----------------------------------------------------|
| `WORD_MCP_URL`   | HTTP endpoint of the word-mcp MCP worker            |
| `WORD_MCP_TOKEN` | Bearer token (same one you use in your `.mcp.json`) |

## Exit codes

- `0` — ran successfully; findings are at most `visible`/`cosmetic` (or no findings at all)
- `1` — tool error (network, missing input, bad args)
- `2` — ran successfully but emitted at least one `blocking` finding

This split makes the tool CI-usable later without rework.
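A sketch of how a CI step could consume those exit codes. The `classify_exit` helper is hypothetical, not part of the tool:

```shell
# Maps compare-rendering exit codes to a CI verdict (hypothetical helper).
classify_exit() {
  case "$1" in
    0) echo "pass"  ;;  # clean, or only visible/cosmetic findings
    2) echo "fail"  ;;  # at least one blocking finding
    *) echo "error" ;;  # tool error: network, missing input, bad args
  esac
}

# Hypothetical CI usage:
#   pnpm compare-rendering -- --input "$DOC" --format json --output report.json
#   verdict=$(classify_exit $?)
#   [ "$verdict" = "fail" ] && exit 1
```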

## Non-goals

- Pixel diffing (see `tests/visual/`).
- Tables, images, shapes, track changes, headers/footers, comments, TOC — deferred past M5.
- Auto-fix generation.
- Publishing as a package.

## Milestones

- **M1** (this): CLI works end-to-end on paragraph-only docs. Four categories (`text`, `pagination`, `structure`, `unsupported`). JSON + markdown output. Caching.
- **M2**: Pull resolved style fields out of SuperDoc's block schema. Taxonomy extends to `style`, `indent`, `font`, `color`, `alignment`, `spacing`, `numbering`.
- **M3**: Batch mode (`--input-dir`), nightly run against the paragraph-only subset of the corpus, per-category dashboard.
- **M4**: MCP wrapper `compare_rendering(docx_path)`. Agent dogfood with the ECMA-spec MCP in context.
- **M5**: Table support. Non-trivial — needs parallel table walks on both sides.
Lines changed: 10 additions & 0 deletions

```json
{
  "private": true,
  "type": "module",
  "name": "compare-rendering",
  "scripts": {
    "start": "bun src/cli.ts",
    "typecheck": "tsc --noEmit",
    "test": "vitest run"
  }
}
```
Lines changed: 36 additions & 0 deletions

```ts
import { createHash } from 'node:crypto';
import { mkdir, readFile, stat, writeFile } from 'node:fs/promises';
import { fileURLToPath } from 'node:url';
import { dirname, join } from 'node:path';

const CACHE_DIR = fileURLToPath(new URL('../.cache/word', import.meta.url));

export function sha256(bytes: Uint8Array | string): string {
  const h = createHash('sha256');
  h.update(bytes);
  return h.digest('hex');
}

export async function hashFile(path: string): Promise<string> {
  return sha256(await readFile(path));
}

function cachePath(sha: string, keySuffix: string): string {
  return join(CACHE_DIR, `${sha}-${keySuffix}.json`);
}

export async function readCache<T>(sha: string, keySuffix: string): Promise<T | null> {
  const p = cachePath(sha, keySuffix);
  try {
    await stat(p);
  } catch {
    return null;
  }
  return JSON.parse(await readFile(p, 'utf8')) as T;
}

export async function writeCache<T>(sha: string, keySuffix: string, value: T): Promise<void> {
  const p = cachePath(sha, keySuffix);
  await mkdir(dirname(p), { recursive: true });
  await writeFile(p, JSON.stringify(value), 'utf8');
}
```
Lines changed: 161 additions & 0 deletions

```ts
#!/usr/bin/env bun
import { parseArgs as nodeParseArgs } from 'node:util';
import { resolve } from 'node:path';
import { writeFile } from 'node:fs/promises';
import { extractWord } from './word.ts';
import { extractSuperDoc } from './superdoc.ts';
import { normalizeSuperDoc, normalizeWord } from './normalize.ts';
import { diffParagraphs } from './differ.ts';
import { formatJson, formatMarkdown } from './format.ts';
import type { CompareReport, Finding } from './types.ts';

type Args = {
  input: string;
  output?: string;
  format: 'json' | 'md';
  pipeline: 'presentation' | 'headless';
  cache: boolean;
};

const USAGE = `compare-rendering — diff Word vs SuperDoc rendering (paragraph-only scope)

Usage:
  pnpm compare-rendering -- --input <docx> [options]

Options:
  --input <path>                    Required. Path to a .docx file.
  --output <path>                   Write the report to a file (default: stdout).
  --format json|md                  Output format (default: json).
  --pipeline presentation|headless  SuperDoc layout pipeline (default: presentation).
  --no-cache                        Bypass the Word extraction cache.
  -h, --help                        Show this help.

Env:
  WORD_MCP_URL    HTTP endpoint of the word-mcp worker.
  WORD_MCP_TOKEN  Bearer token for the worker.

Exit codes:
  0 — ran; findings are at most visible/cosmetic.
  1 — tool error (network, missing file, bad args).
  2 — ran; emitted at least one blocking finding.`;

function parseArgs(argv: string[]): Args {
  const { values } = nodeParseArgs({
    args: argv,
    options: {
      input: { type: 'string' },
      output: { type: 'string' },
      format: { type: 'string', default: 'json' },
      pipeline: { type: 'string', default: 'presentation' },
      'no-cache': { type: 'boolean', default: false },
      help: { type: 'boolean', short: 'h', default: false },
    },
    strict: true,
    allowPositionals: false,
  });

  if (values.help) {
    console.log(USAGE);
    process.exit(0);
  }

  if (!values.input) throw new Error('--input <docx> is required');
  if (values.format !== 'json' && values.format !== 'md') {
    throw new Error(`--format must be json or md, got "${values.format}"`);
  }
  if (values.pipeline !== 'presentation' && values.pipeline !== 'headless') {
    throw new Error(`--pipeline must be presentation or headless, got "${values.pipeline}"`);
  }

  return {
    input: values.input,
    output: values.output,
    format: values.format,
    pipeline: values.pipeline,
    cache: !values['no-cache'],
  };
}

function hasBlocking(findings: Finding[]): boolean {
  return findings.some((f) => f.severity === 'blocking');
}

const log = (msg: string) => console.error(`[compare-rendering] ${msg}`);

async function main(): Promise<void> {
  const args = parseArgs(process.argv.slice(2));
  const docxPath = resolve(args.input);

  log(`word: extracting ${docxPath}`);
  const wordStart = Date.now();
  const { extraction: wordExtraction, sha, cached } = await extractWord(docxPath, { cache: args.cache });
  log(`word: ${cached ? 'cached' : 'fresh'} extraction in ${Date.now() - wordStart}ms (sha=${sha.slice(0, 12)})`);

  if (!wordExtraction.supported) {
    const report: CompareReport = {
      docxPath,
      docxSha: sha,
      wordSupported: false,
      unsupportedReason: wordExtraction.unsupportedReason,
      counts: {
        wordParagraphs: 0,
        superdocParagraphs: 0,
        wordPages: wordExtraction.pageCount,
        superdocPages: 0,
      },
      findings: [
        {
          category: 'unsupported',
          severity: 'cosmetic',
          paragraphOrdinal: 0,
          word: wordExtraction.unsupportedReason,
          superdoc: null,
          message: `Document skipped: ${wordExtraction.unsupportedReason ?? 'unsupported'}`,
        },
      ],
    };
    await emit(report, args);
    return;
  }

  log('superdoc: running layout:export-one');
  const sdStart = Date.now();
  const sdExtraction = await extractSuperDoc(docxPath, { pipeline: args.pipeline });
  log(`superdoc: extracted in ${Date.now() - sdStart}ms`);

  const wordParas = normalizeWord(wordExtraction);
  const sdParas = normalizeSuperDoc(sdExtraction);

  const findings = diffParagraphs(wordParas, sdParas);

  const report: CompareReport = {
    docxPath,
    docxSha: sha,
    wordSupported: true,
    counts: {
      wordParagraphs: wordParas.length,
      superdocParagraphs: sdParas.length,
      wordPages: wordExtraction.pageCount,
      superdocPages: sdExtraction.pageCount,
    },
    findings,
  };

  await emit(report, args);
  if (hasBlocking(findings)) process.exitCode = 2;
}

async function emit(report: CompareReport, args: Args): Promise<void> {
  const out = args.format === 'md' ? formatMarkdown(report) : formatJson(report);
  if (args.output) {
    await writeFile(resolve(args.output), out, 'utf8');
    log(`wrote ${resolve(args.output)}`);
  } else {
    process.stdout.write(out);
  }
}

main().catch((e) => {
  console.error(`[compare-rendering] error: ${(e as Error).message}`);
  process.exit(1);
});
```
