From 470917c866995624a8767093bca076c8a7733df3 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sat, 23 May 2026 23:48:44 +0200 Subject: [PATCH 01/44] Use node's built-in zlib instead of pako, saves >1s from the PDF build. --- docs/lib/fast-deflate.mjs | 37 +++++ docs/render-book.mjs | 4 + perf/README.md | 29 +++- perf/measure.mjs | 64 ++++++++- perf/notes/08-pdf-lib.md | 293 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 423 insertions(+), 4 deletions(-) create mode 100644 docs/lib/fast-deflate.mjs create mode 100644 perf/notes/08-pdf-lib.md diff --git a/docs/lib/fast-deflate.mjs b/docs/lib/fast-deflate.mjs new file mode 100644 index 00000000..08725ac3 --- /dev/null +++ b/docs/lib/fast-deflate.mjs @@ -0,0 +1,37 @@ +// Replace pako's pure-JS deflate with Node's zlib for the one path +// pdf-lib actually uses it on: PDFFlateStream.computeContents in +// node_modules/pdf-lib/cjs/core/structures/PDFFlateStream.js, which +// calls `pako.deflate(unencodedContents)` once per FlateStream during +// PDFDocument.save(). +// +// PDF /FlateDecode (ISO 32000-1 §7.4.4) is the zlib format (RFC 1950): +// a 2-byte zlib header + a raw deflate body (RFC 1951) + a 4-byte +// Adler-32 trailer. Both pako.deflate and zlib.deflateSync produce that +// format with default level 6, so the swap is wire-compatible -- output +// bytes may differ by a small amount (different match choices in the +// compressor's inner loop) but every PDF viewer reads either. +// +// Mechanism: pdf-lib is CJS in node_modules and calls +// `require("pako").deflate(...)` at the call site, not at import time. +// Mutating the live pako exports object is enough; no fork required. +// +// Side-effecting import. Import once before PDFDocument.save() runs: +// +// import "./lib/fast-deflate.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { deflateSync } from "node:zlib"; +import pako from "pako"; + +if (!pako.__fastDeflateInstalled) { + const original = pako.deflate; + pako.deflate = function fastDeflate(data, options) { + // pdf-lib's only caller passes no options. Anything fancier (dictionary, + // raw, custom level) goes back to pako so we don't change behaviour + // outside the one hot path we care about. + if (options) return original.call(pako, data, options); + return deflateSync(data); + }; + pako.__fastDeflateInstalled = true; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index e7ad9bfc..942fee5c 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -32,6 +32,10 @@ import { dirname, resolve } from 'node:path'; import { writeFileSync, existsSync } from 'node:fs'; import puppeteer from 'puppeteer'; import { PDFDocument, ParseSpeeds } from 'pdf-lib'; +// Side-effecting import: swaps pdf-lib's pako.deflate (pure JS) for +// node:zlib.deflateSync (C). Save phase only, same /FlateDecode output, +// ~1.5 s saved on the book. See perf/notes/08-pdf-lib.md. +import './lib/fast-deflate.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; diff --git a/perf/README.md b/perf/README.md index 81e67b6f..4decd14b 100644 --- a/perf/README.md +++ b/perf/README.md @@ -61,6 +61,26 @@ Drop `--render-only` whenever you need to also measure generate / process (e.g. confirming a fix doesn't shift cost into `page.pdf()` or pdf-lib), or to write `book.pdf` for behavioural verification. +## Profiling pdf-lib (process phase): canonical command + +The mirror command for CPU-profiling the pdf-lib roundtrip: + +``` +node measure.mjs --cpu-profile-process --cpu-sampling 100 +``` + +`--cpu-profile-process` is the symmetric counterpart of +`--cpu-profile`. The render-side profile attaches to Chromium's V8 +via CDP because paged.js runs there; the process-side profile +attaches to Node's V8 via `node:inspector/promises` because pdf-lib +runs locally. Both produce the same `.cpuprofile` JSON shape, so +`analyze-profile.mjs` / `find-callers.mjs` / `find-callees.mjs` / +`grep-profile.mjs` work against either one. The two flags compose +when you want both phases captured in a single run. + +See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase +investigations the flag enabled. + ## What's in this folder The harness and core probes: @@ -172,7 +192,8 @@ run.bat path\to\some-other.html # explicit input run.bat --out my-run # explicit output directory run.bat --no-detach-pages # opt out of the detach-pages fix (measure pre-fix O(n²) baseline) run.bat --timing # collect per-page wall time + heap (writes timing.csv + quartile summary) -run.bat --cpu-profile # CPU-profile the render phase +run.bat --cpu-profile # CPU-profile the render phase (CDP, Chromium-side) +run.bat --cpu-profile-process # CPU-profile the process phase (Node inspector, Node-side) run.bat --render-only # bail out after render (skip generate + process, ~47s saved) run.bat --clone-count # report Layout.append clones appended vs survivors per page run.bat --instrument # count + time DOM-accessor calls @@ -180,10 +201,12 @@ run.bat --time-hooks # per-task timing of every chunker/pol run.bat --incremental # process via incremental update instead of pdf-lib roundtrip run.bat --chrome-outline # let Chrome emit /Outlines (skip parseOutline + setOutline) run.bat --tracing # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples) +run.bat --fast-deflate # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` (loadable in Chrome DevTools -> Performance -> "Load profile..."); +`--cpu-profile-process` writes `process.cpuprofile` alongside it; `--instrument` prints a per-op table at end-of-render. You need `_site-pdf\book.html` to exist first -- run `docs\build.bat` @@ -285,6 +308,7 @@ file documenting each: | Disable WhiteSpaceFilter | [05](notes/05-blink-trace.md) | ~0.7 s render | | Full sync chain (RunMicrotasks → 0) | [06](notes/06-microtasks-pageranges-css.md) | re-attribution | | `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory | +| `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) | What was tried and didn't ship: @@ -298,7 +322,7 @@ What was tried and didn't ship: ## Investigation log -The seven phase files in [`notes/`](notes/) cover the full investigation +The phase files in [`notes/`](notes/) cover the full investigation narrative. Each is self-contained but they're written in chronological order; later ones reference earlier ones for context. @@ -311,3 +335,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; finding pako's per-stream init dominates with ~4 500 small streams; routing `pako.deflate` through `node:zlib` (save -58 %, GC -383 ms). | diff --git a/perf/measure.mjs b/perf/measure.mjs index c3ec049a..48b2aa8f 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -21,12 +21,13 @@ // // Usage: // node measure.mjs [path/to/book.html] [--out ] [--keep-open] -// [--cpu-profile] [--cpu-sampling ] +// [--cpu-profile] [--cpu-profile-process] +// [--cpu-sampling ] // [--heap-profile] [--heap-sampling ] // [--tracing] // [--no-detach-pages] [--instrument] [--time-hooks] // [--incremental] [--chrome-outline] [--timing] -// [--clone-count] [--render-only] +// [--clone-count] [--render-only] [--fast-deflate] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -82,10 +83,24 @@ // chrome://tracing or perfetto.dev, or run analyze-trace.mjs against it // for a top-N self-time table grouped by event name. Composable with // --cpu-profile; uses an independent CDP domain. +// +// --cpu-profile-process wraps the process phase only (pdf-lib roundtrip +// or incremental writer) in a V8 Profiler trace via Node's inspector +// module -- the process phase runs in Node, not Chromium, so CDP's +// Profiler can't see it. Writes process.cpuprofile alongside render's. +// Honours --cpu-sampling. Composable with --cpu-profile when you want +// both phases captured in one run. +// +// --fast-deflate routes pdf-lib's PDFFlateStream compression through +// Node's zlib (C++) instead of pako (pure JS). Same wire format +// (PDF /FlateDecode = RFC 1950 zlib), ~5-10x faster on big inputs. +// Save phase only -- load uses pako.inflate, which the profile shows +// isn't a hot path for our content. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; import { mkdirSync, writeFileSync, existsSync } from 'node:fs'; +import { Session } from 'node:inspector/promises'; import puppeteer from 'puppeteer'; import { PDFDocument, ParseSpeeds } from 'pdf-lib'; // Shared with docs/render-book.mjs -- the helpers and the paged.js @@ -109,6 +124,7 @@ let inputArg = null; let outArg = null; let keepOpen = false; let cpuProfile = false; +let cpuProfileProcess = false; let cpuSampling = 1000; // microseconds let heapProfile = false; let heapSampling = 32768; // bytes between samples (CDP default) @@ -121,11 +137,13 @@ let timing = false; let cloneCount = false; let renderOnly = false; let tracing = false; +let fastDeflate = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; else if (a === '--keep-open') keepOpen = true; else if (a === '--cpu-profile') cpuProfile = true; + else if (a === '--cpu-profile-process') cpuProfileProcess = true; else if (a === '--cpu-sampling') cpuSampling = parseInt(args[++i], 10); else if (a === '--heap-profile') heapProfile = true; else if (a === '--heap-sampling') heapSampling = parseInt(args[++i], 10); @@ -141,6 +159,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--render-only') renderOnly = true; else if (a === '--tracing') tracing = true; else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ } + else if (a === '--fast-deflate') fastDeflate = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -175,6 +194,20 @@ for (const p of required) { } } +if (cpuProfileProcess && renderOnly) { + console.error('--cpu-profile-process is incompatible with --render-only (the process phase is skipped).'); + process.exit(2); +} + +// Install the Node-zlib override for pdf-lib's PDFFlateStream compression +// before any pdf-lib operation. Side-effecting import; idempotent. The +// override only kicks in on pako.deflate calls (i.e. save()), so render- +// only runs that never reach the pdf-lib path are unaffected either way. +if (fastDeflate) { + await import('../docs/lib/fast-deflate.mjs'); + console.log('[harness] fast-deflate: pako.deflate -> node:zlib.deflateSync'); +} + const stamp = new Date().toISOString().replace(/[:.]/g, '-'); const outDir = outArg ? resolve(process.cwd(), outArg) @@ -370,6 +403,7 @@ try { let rawPdfBytes = null; let processMs = null; let processBreakdown = null; + let processProfilePath = null; let finalPdf = null; if (!renderOnly) { @@ -430,6 +464,22 @@ try { // // Either way we time the full phase plus the meaningful sub-steps so the // breakdown matches across runs. + // + // --cpu-profile-process attaches Node's inspector Profiler around this + // block. The render phase profiles via CDP because the work happens in + // Chromium; the process phase profiles via Node's inspector because + // pdf-lib runs locally. Output file shape (V8 .cpuprofile JSON) is the + // same either way. + let inspectorSession = null; + if (cpuProfileProcess) { + inspectorSession = new Session(); + inspectorSession.connect(); + await inspectorSession.post('Profiler.enable'); + await inspectorSession.post('Profiler.setSamplingInterval', { interval: cpuSampling }); + await inspectorSession.post('Profiler.start'); + console.log(`[harness] process cpu profile: sampling every ${cpuSampling}us`); + } + const tProcStart = Date.now(); if (incremental) { const tIncStart = Date.now(); @@ -463,6 +513,15 @@ try { } const tProcEnd = Date.now(); processMs = tProcEnd - tProcStart; + if (inspectorSession) { + const { profile } = await inspectorSession.post('Profiler.stop'); + await inspectorSession.post('Profiler.disable'); + inspectorSession.disconnect(); + processProfilePath = join(outDir, 'process.cpuprofile'); + const profileJson = JSON.stringify(profile); + writeFileSync(processProfilePath, profileJson); + console.log(`[harness] process cpu profile: ${processProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`); + } if (incremental) { console.log(`[harness] process ${fmtMs(processMs)} (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`); } else { @@ -506,6 +565,7 @@ try { record.phases.process = { ms: processMs, mode: incremental ? 'incremental' : 'pdf-lib-roundtrip', + cpuProfile: processProfilePath, ...processBreakdown, }; } diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md new file mode 100644 index 00000000..9abc127b --- /dev/null +++ b/perf/notes/08-pdf-lib.md @@ -0,0 +1,293 @@ +# pdf-lib: profiling the process phase + +Wiring `--cpu-profile-process` so the pdf-lib roundtrip becomes visible to the same `analyze-profile.mjs` toolchain we already use on the render phase, then following the bottom-up table -- pako dominates with per-stream init overhead, routing `pako.deflate` through `node:zlib` saves ~1.5 s of process wall (save -58 %). + +The render-side investigations (notes [01](01-baseline-and-detach.md) +through [07](07-memory.md)) brought render down from ~104 s to ~8 s +and process from ~40 s to ~5 s. By [`pdf-lib parseSpeed: Fastest`](01-baseline-and-detach.md) +the process phase was a flat ~5 s of `load + setOutline + save`, the +sub-step numbers were the only thing we knew about it, and there was +no bottom-up table to point at: CDP's `Profiler` attaches to Chromium +and the process phase runs in Node, so `--cpu-profile` couldn't see +it. + +## `--cpu-profile-process` + +Added to `measure.mjs`: opens an in-process V8 Profiler via +`node:inspector/promises`, brackets the process phase the same way +`--cpu-profile` brackets render, and writes `process.cpuprofile` +alongside `render.cpuprofile`. Same `.cpuprofile` JSON shape, so the +existing `analyze-profile.mjs` / `find-callers.mjs` / +`find-callees.mjs` work unchanged. See the *Profiling pdf-lib +(process phase): canonical command* section in [the README](../README.md) +for the operational form. + +First run on the 1638-page book (`--detach-pages --no-timing +--cpu-profile-process --cpu-sampling 100`), process 4.66 s (load +1.88 s, setOutline 0.01 s, save 2.77 s). Top of the bottom-up table: + +``` +samples: 8560 duration: 4.68s us/sample: 547 + + self_ms self_% function @ source + ------- ------ ---------------------------------------------- + 645.24 13.85% (garbage collector) + 460.42 9.88% longest_match pako/lib/zlib/deflate.js:231 + 428.15 9.19% deflateInit2 pako/lib/zlib/deflate.js:1327 + 374.02 8.03% PDFRef.of pdf-lib/.../PDFRef.js:34 + 218.73 4.69% decodeName pdf-lib/.../PDFName.js:9 + 218.73 4.69% PDFDict.entries pdf-lib/.../PDFDict.js:22 + 182.64 3.92% deflate_slow pako/lib/zlib/deflate.js:726 + 119.75 2.57% parseRawNumber pdf-lib/.../BaseParser.js:33 + 114.28 2.45% DeflateState pako/lib/zlib/deflate.js:1092 + 113.19 2.43% parseName pdf-lib/.../PDFObjectParser.js:117 + ... pako rows and parser rows continue down the table ... +``` + +Adding up pako frames (`longest_match` + `deflateInit2` + +`deflate_slow` + `DeflateState` + `lm_init` + `compress_block` + +`build_tree` + `Deflate.push` + `adler32`) lands at **~1.42 s, ~30 % +of the process phase**. Of that, the *initialization* group +(`deflateInit2` + `DeflateState` + `lm_init`) was **~628 ms** -- so +~44 % of pako's time was spent setting up Deflate state, not +compressing bytes. That number per call doesn't explain itself +unless the call count is high. + +## Are we compressing Chrome's already-compressed streams? + +Reasonable hypothesis: pdf-lib loads, decompresses Chrome's content +streams, and then re-compresses them on save. That would put Chrome's +~52 MB of content through deflate twice, and explain the heavy +pako time as wasted work. + +Walking the code: + +- `PDFObjectParser.parseDictOrStream` (`pdf-lib/.../parser/PDFObjectParser.js:171`) + always ends with `return PDFRawStream.of(dict, contents)`. Every + stream pdf-lib parses out of the input is a `PDFRawStream` holding + the verbatim bytes between `stream` / `endstream`. No decompression. +- `PDFRawStream.getContents` (`pdf-lib/.../objects/PDFRawStream.js:22`) + returns those bytes unchanged. +- `PDFStreamWriter.computeBufferSize` (`pdf-lib/.../writers/PDFStreamWriter.js:43-46`) + marks `shouldNotCompress = true` for anything that's `instanceof + PDFStream` (which includes `PDFRawStream`). Those go out verbatim + with the original `/Filter` preserved. + +`pako.deflate` lives in `PDFFlateStream.computeContents` +(`pdf-lib/.../structures/PDFFlateStream.js:15`); the only subclasses +are `PDFContentStream`, `PDFCrossRefStream`, and `PDFObjectStream`. +None of those are instantiated by the parser. So **Chrome's content +streams ride through as `PDFRawStream` and never see pako**. + +Confirmed by instrumenting `pako.deflate` and re-running the save +on the produced book.pdf: + +``` +deflate calls during save : 4524 +bytes fed to deflate : 24.28 MB +bytes produced : 4.39 MB +final pdf size : 16.08 MB +``` + +The 4,524 deflate calls are pdf-lib's **own** new streams: + +- ~4,523 `PDFObjectStream` chunks. `PDFStreamWriter.forContext` + defaults to `objectsPerStream = 50`; the book has **228,191 + indirect objects**, so pdf-lib packs ~4,564 chunks of 50 each. +- 1 `PDFCrossRefStream` for the xref. + +## Wait -- the pdf-lib output is *smaller* than Chrome's. What's going on? + +Chrome's raw PDF is 39.3 MB, pdf-lib's final PDF is 16.1 MB. That +23 MB shrink isn't pdf-lib throwing anything away -- it's compressing +something Chrome chose to emit verbatim. + +Tallying the 228,191 indirect objects pdf-lib sees by type: + +``` +130,787 StructElem /S=/NonStruct (a11y wrapper around content w/o structural role) + 22,193 StructElem /S=/Strong (bold) + 11,003 Dict /Type=/Annot (mostly hyperlinks) + 10,054 StructElem /S=/Link + 9,164 StructElem /S=/P (paragraph) + 8,417 StructElem /S=/Em (emphasis) + 5,270 StructElem /S=/TD (table cell) + 4,822 StructElem /S=/Code + 3,392 StructElem /S=/LI (list item) + 3,040 StructElem /S=/H5 + ... another ~15 k StructElems in long tail (H1-H6, L, TR, Art, ...) + 2,061 PDFRawStream (Chrome's content + font + image streams) + 1,651 Dict /Type=/Page + ... ~3.5 k misc dicts ... +``` + +**Over 225,000 are tiny `<<...>>` StructElem dicts** -- the +tagged-PDF structure tree, which Chrome emits because we pass +`tagged: true` to `page.pdf()`. Each `StructElem` is something like +`<>` -- a +few hundred bytes of mostly boilerplate. + +Chrome writes them as plain text indirect objects -- 225k × a few +hundred bytes ≈ 28 MB of `<<...>>` source. pdf-lib's +`PDFStreamWriter` packs those 50 at a time into PDFObjectStreams, +each of which is then deflate-compressed. The dict syntax is wildly +repetitive across siblings (`/Type /StructElem` literally appears +225k times), so deflate compresses the packed text ~5.5x. The +24.28 MB of small-dict text fed to deflate above comes out the +other side at 4.39 MB. Add the ~11 MB of `PDFRawStream` bytes that +pass through verbatim, plus a few KB of misc, and the 16.1 MB total +checks out. + +The pdf-lib roundtrip's win over Chrome's raw output is **encoding +the same information** in PDF 1.5's compressed-object-streams +feature instead of as plain `<<...>>` text. Skia's PDF writer +chooses not to use that feature. + +This also explains the pako profile shape. The workload is *many +small streams* (~4,500 of them at ~5.4 KB input each), which is +exactly where per-stream initialization dominates: the 628 ms in +`deflateInit2` + `DeflateState` + `lm_init` is paid 4,500 times, +while the per-call payload is small enough that the actual +compression work (~755 ms across `longest_match` + `deflate_slow` ++ `compress_block` + `build_tree` + `adler32`) isn't proportionally +larger. + +## The shim + +PDF `/FlateDecode` (ISO 32000-1 §7.4.4) is the zlib format +(RFC 1950): 2-byte zlib header + raw deflate body (RFC 1951) + 4-byte +Adler-32 trailer. Both `pako.deflate(data)` and Node's +`zlib.deflateSync(data)` produce that format at default level 6. +Verified head-to-head: each compresses to an equivalent-size zlib +stream starting `78 9c`, and either can decompress the other's +output back to the original input bytes. + +`docs/lib/fast-deflate.mjs` is a side-effecting import that mutates +the live `pako` exports: + +```js +import { deflateSync } from "node:zlib"; +import pako from "pako"; + +if (!pako.__fastDeflateInstalled) { + const original = pako.deflate; + pako.deflate = function fastDeflate(data, options) { + if (options) return original.call(pako, data, options); + return deflateSync(data); + }; + pako.__fastDeflateInstalled = true; +} +``` + +pdf-lib's CJS code reads `require("pako").deflate` at call time +(`pako_1.default.deflate(unencodedContents)` inside +`PDFFlateStream.computeContents`), so mutating the live module +exports propagates without forking pdf-lib. The `options` +fallthrough means any caller that needs pako's non-default +behaviour (dictionaries, raw deflate, custom level) is unaffected; +pdf-lib's only call site passes no options. + +Microbenchmark on the harness machine, both unrelated to the book: + +``` +zlib.deflateSync(50 MB of ASCII) 112 ms +zlib.deflateSync(book.pdf as input, 16.1 MB) 283 ms +``` + +For comparison, pako spent ~1.42 s on the book's actual save +workload (~24 MB across 4,524 calls). Same order of magnitude as +the raw-throughput numbers above, but with more per-call overhead +-- which matches what a JS implementation is expected to lose +against C when amortised across many small calls. + +`docs/render-book.mjs` imports the shim unconditionally near its +pdf-lib import; production runs through it. `measure.mjs` adds a +`--fast-deflate` flag, opt-in in the harness so paired pre/post +A/Bs are still easy. + +## Results + +Paired A/B, four interleaved runs (`pre1 post1 pre2 post2`) with +`--detach-pages --no-timing --cpu-profile-process --cpu-sampling +100`, same 1638-page book each: + +| metric | pre1 | pre2 | pre avg | post1 | post2 | post avg | Δ | +| ------------- | ------ | ------ | ------- | ------ | ------ | -------- | ---------------- | +| **process** | 4.20 s | 4.27 s | **4.24 s** | 2.79 s | 2.74 s | **2.77 s** | **-1.47 s (-35 %)** | +| ↳ load | 1.53 s | 1.54 s | 1.54 s | 1.67 s | 1.61 s | 1.64 s | +0.10 s (noise; load goes through `pako.inflate`, untouched) | +| ↳ setOutline | 0.01 s | 0.01 s | 0.01 s | 0.01 s | 0.01 s | 0.01 s | unchanged | +| ↳ **save** | 2.66 s | 2.72 s | **2.69 s** | 1.11 s | 1.12 s | **1.12 s** | **-1.57 s (-58 %)** | +| pdf size | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | identical | + +Render and generate wall-clock numbers varied ±5 s between runs +(machine load) but the process numbers are tight to ±0.05 s. + +Post-fix bottom-up profile, same flags: + +``` +samples: 5229 duration: 2.82s us/sample: 540 + + self_ms self_% function + ------- ------ -------------------------------------------------- + 348.83 12.48% writeSync (Node libuv syscall) + 335.87 12.01% PDFRef.of pdf-lib/.../PDFRef.js:34 + 262.44 9.39% (garbage collector) + 165.24 5.91% PDFDict.entries + 159.84 5.72% decodeName + 108.00 3.86% parseName + 102.60 3.67% parseRawNumber + 88.56 3.17% parseRawInt + 72.90 2.61% PDFName.of + 71.28 2.55% parseDict + ... pako rows absent from the table ... +``` + +Two structural changes worth calling out: + +- All pako frames dropped out of the top 20. `writeSync` at 12.48 % + is libuv's syscall wrapper waiting on zlib's C++ work; that work + doesn't itself show in the JS-frame bottom-up because it runs off + the JS thread. The ~349 ms here is the total wait time across all + ~4,500 calls. +- `(garbage collector)` dropped from 645 ms to 262 ms (-383 ms). + That matches the per-call allocator pressure from creating a fresh + `Deflate` instance + `DeflateState` per pako call, now gone. + +End-to-end `book.bat` run with the shim: + +``` +render: 8.5s (1651 pages) +generate: 37.1s (raw 39.3 MB) +process: 2.5s +saved: docs\_pdf\book.pdf (16.1 MB) +total: 50.1s +``` + +Process is now under three seconds on the production path. Wall-clock +total ~50 s vs the prior ~70 s baseline. Output PDF byte size +unchanged from the pre-shim build (16.1 MB; standard `/CreationDate` +drift between runs). + +## What this didn't fix + +After the shim the bottom-up profile points at the next two +JS-attributable buckets: + +- `PDFRef.of` at 336 ms self-time (12 %). The function builds a + string key ` R` per call and Map-looks it up; the + string allocation per call is the cost. A drop-in fix would + replace the `Map` pool with a flat array for the gen=0 + case (the overwhelming majority) and a fallback Map for gen ≠ 0. + Estimated ~300 ms saved. +- `(garbage collector)` at 262 ms (9 %). Tied to `PDFRef.of` and + the per-object dict allocations in the writer; would likely + shrink along with the first item. + +Neither moves the wall-clock total meaningfully on its own -- +process is now 2.77 s of a ~50 s build -- so they're left in place +unless or until they become the bottleneck. + +The strategic note from earlier phases still stands: `pageRanges` +sharding of generate is the only remaining knob with a profile +target large enough to move total wall-clock by more than a few +seconds. From c423231047cae9fb64c2438cddfa16274fb61221 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sat, 23 May 2026 23:55:00 +0200 Subject: [PATCH 02/44] Speed up pdf loading by ~0.3s. --- docs/lib/fast-refs.mjs | 46 ++++++++ docs/render-book.mjs | 13 ++- perf/README.md | 52 ++++++++-- perf/measure.mjs | 15 ++- perf/notes/08-pdf-lib.md | 219 +++++++++++++++++++++++++++++++++++++-- 5 files changed, 319 insertions(+), 26 deletions(-) create mode 100644 docs/lib/fast-refs.mjs diff --git a/docs/lib/fast-refs.mjs b/docs/lib/fast-refs.mjs new file mode 100644 index 00000000..4212162c --- /dev/null +++ b/docs/lib/fast-refs.mjs @@ -0,0 +1,46 @@ +// Replace pdf-lib's PDFRef.of pool lookup with a dense-array cache +// for the generation=0 case (the overwhelmingly common one). +// +// The upstream implementation +// (node_modules/pdf-lib/cjs/core/objects/PDFRef.js) keys its pool by +// a freshly-built string ` R` on every call: +// +// var tag = objectNumber + " " + generationNumber + " R"; +// var instance = pool.get(tag); +// +// On the book we see ~1.2 M PDFRef.of calls per load, 82 % of them +// with gen=0; each call allocates the tag string before Map.get can +// hash it. That's ~330 ms of self-time on the process-phase profile +// plus measurable GC pressure. +// +// Shim: dense array indexed by objectNumber for the gen=0 branch. +// Plain array indexing, no string alloc, no Map hash. Cache-in-front +// of the original PDFRef.of so we don't need its module-private +// ENFORCER -- on miss we delegate, on hit we return our cached +// instance. +// +// gen != 0 calls (the other 18 %, pdf-lib's xref-stream bookkeeping +// where the "generation" field encodes an in-ObjStm index per +// PDF 1.5 spec, see PDFXRefStreamParser.js:74-80) pass through to +// the original unchanged. +// +// Side-effecting import. Import once before any pdf-lib operation. +// Idempotent. + +import { PDFRef } from "pdf-lib"; + +if (!PDFRef.__fastPoolInstalled) { + const original = PDFRef.of; + const pool0 = []; + PDFRef.of = function fastOf(objectNumber, generationNumber) { + if (generationNumber === undefined || generationNumber === 0) { + const existing = pool0[objectNumber]; + if (existing) return existing; + const fresh = original.call(PDFRef, objectNumber, 0); + pool0[objectNumber] = fresh; + return fresh; + } + return original.call(PDFRef, objectNumber, generationNumber); + }; + PDFRef.__fastPoolInstalled = true; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index 942fee5c..f219ec06 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -32,10 +32,17 @@ import { dirname, resolve } from 'node:path'; import { writeFileSync, existsSync } from 'node:fs'; import puppeteer from 'puppeteer'; import { PDFDocument, ParseSpeeds } from 'pdf-lib'; -// Side-effecting import: swaps pdf-lib's pako.deflate (pure JS) for -// node:zlib.deflateSync (C). Save phase only, same /FlateDecode output, -// ~1.5 s saved on the book. See perf/notes/08-pdf-lib.md. +// Side-effecting imports. Order doesn't matter; both mutate live +// module exports before any pdf-lib operation. See +// perf/notes/08-pdf-lib.md. +// +// fast-deflate -- swaps pdf-lib's pako.deflate (pure JS) for +// node:zlib.deflateSync (C). ~1.5 s saved on the save phase. +// fast-refs -- dense-array cache in front of PDFRef.of for the +// gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved on +// load. import './lib/fast-deflate.mjs'; +import './lib/fast-refs.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; diff --git a/perf/README.md b/perf/README.md index 4decd14b..53f0a167 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,20 +66,48 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-deflate --fast-refs --cpu-profile-process --cpu-sampling 100 ``` -`--cpu-profile-process` is the symmetric counterpart of -`--cpu-profile`. The render-side profile attaches to Chromium's V8 -via CDP because paged.js runs there; the process-side profile -attaches to Node's V8 via `node:inspector/promises` because pdf-lib -runs locally. Both produce the same `.cpuprofile` JSON shape, so -`analyze-profile.mjs` / `find-callers.mjs` / `find-callees.mjs` / -`grep-profile.mjs` work against either one. The two flags compose -when you want both phases captured in a single run. +Flag rationale: + +- `--fast-deflate` -- inject the + [docs/lib/fast-deflate.mjs](../docs/lib/fast-deflate.mjs) shipping + fix (`pako.deflate` -> `node:zlib.deflateSync`). Production runs + through it; the profile should too. +- `--fast-refs` -- inject the + [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping fix + (dense-array cache for `PDFRef.of`'s gen=0 path). Same logic. +- `--cpu-profile-process` -- attach Node's `inspector/promises` + Profiler around the process phase only (skips render and generate). + Writes `process.cpuprofile` into the timestamped `results/` folder. + The render-phase `--cpu-profile` is CDP / Chromium; this one is + Node / Node's V8 -- different runtimes, same `.cpuprofile` JSON + shape, so `analyze-profile.mjs` / `find-callers.mjs` / + `find-callees.mjs` / `grep-profile.mjs` work against either. +- `--cpu-sampling 100` -- 100 us sampling. The process phase is now + ~2.3 s; at 1 ms default sampling that's only ~2300 samples and the + bottom-up table runs noisy. 100 us is the right resolution for + this length. + +The command intentionally **does not** pass `--cpu-profile`. There's +no rule against running both at once -- they attach to different V8s +and don't interfere -- but the render profile dilutes the bottom-up +view of "what's left in pdf-lib," and the trace files are large. +Profile one phase at a time. + +Why no `--render-only`? `--cpu-profile-process` requires the process +phase to run; the harness errors out if you combine them. + +To compare against upstream pdf-lib (e.g. when proposing a change +upstream), drop `--fast-deflate` and `--fast-refs`. Caveat for +A/B work: profiler-on attribution overstates the cost of hot +functions called millions of times (`PDFRef.of` in particular). For +"did this wall-clock change," do a paired no-profile A/B as a +sanity check. See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase -investigations the flag enabled. +investigations these flags enabled. ## What's in this folder @@ -202,6 +230,7 @@ run.bat --incremental # process via incremental update inste run.bat --chrome-outline # let Chrome emit /Outlines (skip parseOutline + setOutline) run.bat --tracing # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples) run.bat --fast-deflate # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B) +run.bat --fast-refs # dense-array cache for PDFRef.of's gen=0 path (also ships; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -309,6 +338,7 @@ file documenting each: | Full sync chain (RunMicrotasks → 0) | [06](notes/06-microtasks-pageranges-css.md) | re-attribution | | `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory | | `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) | +| `PDFRef.of` dense-array cache (gen=0) | [08](notes/08-pdf-lib.md) | ~0.2 s process (load -16 %) | What was tried and didn't ship: @@ -335,4 +365,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; finding pako's per-stream init dominates with ~4 500 small streams; routing `pako.deflate` through `node:zlib` (save -58 %, GC -383 ms). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s). | diff --git a/perf/measure.mjs b/perf/measure.mjs index 48b2aa8f..9e9e0646 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -27,7 +27,8 @@ // [--tracing] // [--no-detach-pages] [--instrument] [--time-hooks] // [--incremental] [--chrome-outline] [--timing] -// [--clone-count] [--render-only] [--fast-deflate] +// [--clone-count] [--render-only] +// [--fast-deflate] [--fast-refs] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -96,6 +97,12 @@ // (PDF /FlateDecode = RFC 1950 zlib), ~5-10x faster on big inputs. // Save phase only -- load uses pako.inflate, which the profile shows // isn't a hot path for our content. +// +// --fast-refs replaces PDFRef.of's string-keyed Map lookup with a +// dense-array cache for the gen=0 case (82 % of ~1.2 M calls on the +// book). Eliminates the per-call ` R` string allocation +// and Map hash. gen != 0 calls (pdf-lib's xref-stream bookkeeping +// for compressed objects) pass through unchanged. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; @@ -138,6 +145,7 @@ let cloneCount = false; let renderOnly = false; let tracing = false; let fastDeflate = false; +let fastRefs = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -160,6 +168,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--tracing') tracing = true; else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ } else if (a === '--fast-deflate') fastDeflate = true; + else if (a === '--fast-refs') fastRefs = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -207,6 +216,10 @@ if (fastDeflate) { await import('../docs/lib/fast-deflate.mjs'); console.log('[harness] fast-deflate: pako.deflate -> node:zlib.deflateSync'); } +if (fastRefs) { + await import('../docs/lib/fast-refs.mjs'); + console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0'); +} const stamp = new Date().toISOString().replace(/[:.]/g, '-'); const outDir = outArg diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index 9abc127b..51855446 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -268,7 +268,7 @@ total ~50 s vs the prior ~70 s baseline. Output PDF byte size unchanged from the pre-shim build (16.1 MB; standard `/CreationDate` drift between runs). -## What this didn't fix +## After the shim: what's left After the shim the bottom-up profile points at the next two JS-attributable buckets: @@ -277,17 +277,214 @@ JS-attributable buckets: string key ` R` per call and Map-looks it up; the string allocation per call is the cost. A drop-in fix would replace the `Map` pool with a flat array for the gen=0 - case (the overwhelming majority) and a fallback Map for gen ≠ 0. - Estimated ~300 ms saved. + case and a fallback Map for gen ≠ 0. Followed up below. - `(garbage collector)` at 262 ms (9 %). Tied to `PDFRef.of` and - the per-object dict allocations in the writer; would likely + the per-object dict allocations in the writer; expected to shrink along with the first item. -Neither moves the wall-clock total meaningfully on its own -- -process is now 2.77 s of a ~50 s build -- so they're left in place -unless or until they become the bottleneck. +## `PDFRef.of`: dense-array cache for the gen=0 path -The strategic note from earlier phases still stands: `pageRanges` -sharding of generate is the only remaining knob with a profile -target large enough to move total wall-clock by more than a few -seconds. +The upstream implementation: + +```js +var pool = new Map(); +PDFRef.of = function (objectNumber, generationNumber) { + if (generationNumber === void 0) { generationNumber = 0; } + var tag = objectNumber + " " + generationNumber + " R"; // alloc + var instance = pool.get(tag); // hash + if (!instance) { + instance = new PDFRef(ENFORCER, objectNumber, generationNumber); + pool.set(tag, instance); + } + return instance; +}; +``` + +Per call: build a fresh ` R` string, hand it to a +`Map` lookup that has to hash it, branch on miss. The +string allocation is the cost we care about -- the dedup pool +itself works correctly, it's just paying for its key on every read. + +### Workload shape + +Instrumented `PDFRef.of` and re-ran the harness through load + save: + +``` +total PDFRef.of calls : 1,231,643 + gen=0 (or undefined) : 1,010,034 (82 %) + gen != 0 : 221,608 (18 %) +gen=N value distribution (top, 4523 calls each): + gen=1, gen=2, ... gen=50: 4523 calls/value +``` + +The 1.2 M gen=0 calls are what the parser does for every +encountered `N 0 R` reference and every per-object PDFRef +construction. The 221 k gen != 0 calls are pdf-lib's xref-stream +bookkeeping for PDF 1.5+ compressed-object entries: in a +cross-reference stream's type-2 entry, the spec uses the +"generation number" field to store the **index of the object +within its ObjStm**, and pdf-lib feeds that index straight to +`PDFRef.of` (`PDFXRefStreamParser.js:74-80`). 4,523 ObjStms × 50 +entries each ≈ the observed 221 k. + +So 82 % of calls have generationNumber=0. That's the path worth +optimising. + +### The shim + +`docs/lib/fast-refs.mjs` is the symmetric side-effecting import to +`fast-deflate`: + +```js +import { PDFRef } from "pdf-lib"; + +if (!PDFRef.__fastPoolInstalled) { + const original = PDFRef.of; + const pool0 = []; + PDFRef.of = function fastOf(objectNumber, generationNumber) { + if (generationNumber === undefined || generationNumber === 0) { + const existing = pool0[objectNumber]; + if (existing) return existing; + const fresh = original.call(PDFRef, objectNumber, 0); + pool0[objectNumber] = fresh; + return fresh; + } + return original.call(PDFRef, objectNumber, generationNumber); + }; + PDFRef.__fastPoolInstalled = true; +} +``` + +Dense-array indexed by `objectNumber` for the gen=0 case -- no +string alloc, no Map hash, just an array read. gen != 0 passes +through to the original (which still allocates the tag and runs +the Map lookup, but that's only 18 % of calls). + +The cache is **in front of** the original `PDFRef.of`, not a +replacement: on a miss we call the original to produce the PDFRef +instance, then cache it. That dodges the module-private `ENFORCER` +token the upstream constructor demands. Memory cost is a second +reference per PDFRef on top of the upstream pool's entry -- ~228 k +tiny objects, negligible. + +The interning contract is preserved: `PDFRef.of(42) === PDFRef.of(42, 0)` +and both `!== PDFRef.of(42, 1)`, as before. + +### Results: profiler-on vs profiler-off matters + +First A/B with the process-phase profiler attached (paired, +`--detach-pages --no-timing --cpu-profile-process --cpu-sampling 100 +--fast-deflate [--fast-refs]`): + +| metric | pre (no fast-refs) | post (+ fast-refs) | Δ | +| --- | --- | --- | --- | +| process | 2.94 s | 2.52 s | **-0.42 s (-14 %)** | +| ↳ load | 1.81 s | 1.42 s | -0.39 s | +| ↳ save | 1.12 s | 1.08 s | flat | +| `PDFRef.of` self in profile | 336 ms (12 %) | 148 ms (5.9 %) | -188 ms | +| `(garbage collector)` self | 262 ms (9 %) | 194 ms (7.8 %) | -68 ms | + +`PDFRef.of`'s self-time roughly halved, GC pressure dropped, and +the wall-clock saving (390 ms on load) looked like a clean win. + +But: paired A/B *without* the profiler attached told a different +story: + +| metric | pre (no fast-refs) | post (+ fast-refs) | Δ | +| --- | --- | --- | --- | +| process | 2.48 s | 2.26 s | **-0.22 s (-9 %)** | +| ↳ load | 1.51 s | 1.27 s | **-0.24 s (-16 %)** | +| ↳ save | 0.96 s | 0.98 s | flat | + +**Real wall-clock saving is ~240 ms**, not 390 ms. The remaining +~150 ms of the profiler-on delta was profiler-attribution overhead +that our shim removed by making the hot function shorter -- fewer +samples landing on `PDFRef.of`, less per-sample tax. The profiler +isn't lying about which function is expensive; it's overstating +*how much* that expense will move wall-clock once you fix it. + +The diagnostic question to tell these apart: *what's the call +rate?* At 1.2 M calls per load, even a few microseconds of +sampling overhead per call adds up to hundreds of milliseconds in +the profile. Functions called millions of times need a no-profile +A/B as a sanity check before claiming the wall-clock saving the +profile implied. Functions called a few times per page (or once +per render) don't. + +Both numbers are real -- the bottom-up profile is the right +*target* for "what's worth fixing," but a no-profile A/B is the +right *measurement* for "how big the win was." + +### Production confirmation + +`book.bat` with both shims, two consecutive runs: + +``` +render: 9.1s (1651 pages) +generate: 37.5s +process: 2.3s +saved: docs\_pdf\book.pdf (16.1 MB) +total: 50.7s +``` + +Process dropped from the prior 2.5 s (with just `fast-deflate`) to +2.3 s. `book.bat` rounds to 0.1 s and is single-run so individual +phase numbers carry some run-to-run jitter, but the harness's +2.48 → 2.26 paired-A/B confirms the ~200 ms move is real. + +### What this didn't fix + +The post-`fast-refs` bottom-up table: + +``` +samples: 4668 duration: 2.53s us/sample: 542 + + self_ms self_% function source + ------- ------ -------------------------------------------------- + 341.17 13.59% writeSync (Node libuv -- zlib's C++ work) + 194.41 7.75% (garbage collector) + 181.96 7.25% PDFDict.entries pdf-lib/.../PDFDict.js:22 + 172.21 6.86% decodeName pdf-lib/.../PDFName.js:9 + 147.84 5.89% PDFRef.of pdf-lib/.../PDFRef.js:34 (the 18 % gen != 0 residue) + 96.40 3.84% parseName + 95.31 3.80% parseRawNumber + 78.52 3.13% parseDict + ... +``` + +`PDFRef.of` is still on the list at 148 ms -- that's the 221 k +gen != 0 calls still going through the upstream string-keyed Map. +Optimising those would require either: (a) a 2D structure keyed by +gen first then objectNumber, or (b) accepting that the in-ObjStm +"index as generation" usage is short-lived bookkeeping (the parser +creates these refs once to populate xref tables, then mostly +re-resolves the actual `N 0 R` form). Neither moves the wall-clock +total enough to justify -- 150 ms of a 50 s build is the noise floor. + +Above `PDFRef.of`, the load-phase costs (`decodeName`, `parseName`, +`parseRawNumber`, `parseDict`, etc.) are pdf-lib's actual parser +work. Those are O(input size) and pretty close to fundamental -- +shrinking them would mean rewriting the parser. + +### Where this leaves the picture + +Cumulative process-phase cost, baseline → after both shims: + +| state | process | load | save | +| --- | --- | --- | --- | +| original (Slow / 50 defaults) | ~40 s | ~36 s| ~4 s | +| + parseSpeed:Fastest | ~5 s | ~2 s | ~3 s | +| + fast-deflate | ~2.5 s | ~1.5s| ~1 s | +| **+ fast-refs (this section)** | **~2.3 s** | **~1.3 s** | **~1 s** | + +The pdf-lib roundtrip path is now ~2.3 s of a ~50 s build. The +incremental writer's 0.25 s process phase (see +[01-baseline-and-detach.md](01-baseline-and-detach.md)) is still +strictly faster on process alone, but the pdf-lib path delivers a +16.1 MB output vs incremental's 53 MB, and the 2 s gap on a 50 s +build doesn't justify the file-size cost for our pipeline. + +The strategic note from earlier phases still stands: generate's +~38 s in `page.pdf()` is the remaining lever, and `pageRanges` +sharding is the only knob plausibly large enough to move the +wall-clock total by more than a few seconds. From 272413df6570db8d155480db178b46cff8699dc5 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:00:29 +0200 Subject: [PATCH 03/44] Parallelize deflate when saving the pdf. --- docs/lib/parallel-deflate.mjs | 169 ++++++++++++++++++++++++++++++++++ docs/render-book.mjs | 14 ++- perf/README.md | 21 +++-- perf/measure.mjs | 19 +++- perf/notes/08-pdf-lib.md | 167 ++++++++++++++++++++++++++++++++- 5 files changed, 373 insertions(+), 17 deletions(-) create mode 100644 docs/lib/parallel-deflate.mjs diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs new file mode 100644 index 00000000..5cc48fb1 --- /dev/null +++ b/docs/lib/parallel-deflate.mjs @@ -0,0 +1,169 @@ +// Drop-in async replacement for `pdfDoc.save({ useObjectStreams: true })` +// that parallelises the per-object-stream deflate work onto libuv's +// thread pool. Sole exported entry point: `parallelSave(pdfDoc, opts)`. +// +// Why: pdf-lib's PDFStreamWriter.computeBufferSize creates one +// PDFObjectStream per 50-object chunk, then immediately calls +// computeIndirectObjectSize on each. sizeInBytes() walks the Cache, +// which lazy-populates via pako.deflate(unencodedContents). The whole +// pass is synchronous, so ~1000 chunks × ~0.3 ms of zlib work runs +// serially -- accounts for ~30 % of save() wall time on the book. +// +// What: same construction logic as PDFStreamWriter, split into three +// phases: +// 1. classify uncompressed vs compressed (same as upstream) +// 2. instantiate every PDFObjectStream up-front, then `await +// Promise.all` an async zlib.deflate per stream so libuv's thread +// pool (default 4) runs them concurrently +// 3. size + emit (same as upstream, but every cache.access() is a hit) +// The xrefStream itself is one more PDFFlateStream; we deflate it +// serially in phase 3 since its contents depend on phase-3 offsets. +// +// Output: byte-near-equivalent to pdfDoc.save({ useObjectStreams: true }). +// zlib vs pako deflate may pick different LZ77 matches → 1-byte-level +// stream diffs and matching /Length deltas; viewer-invisible. +// +// Parallelism is bounded by UV_THREADPOOL_SIZE (default 4). Bump it via +// `process.env.UV_THREADPOOL_SIZE = '8'` before any libuv work fires +// if you want more concurrency. + +import { deflate } from 'node:zlib'; +import { promisify } from 'node:util'; +import { + PDFStreamWriter, + PDFObjectStream, + PDFCrossRefStream, + PDFRef, + PDFName, + PDFNumber, + PDFInvalidObject, + PDFStream, + PDFHeader, + PDFTrailer, +} from 'pdf-lib'; + +const deflateAsync = promisify(deflate); + +class ParallelStreamWriter extends PDFStreamWriter { + constructor(context, objectsPerTick, encodeStreams, objectsPerStream, parallel) { + super(context, objectsPerTick, encodeStreams, objectsPerStream); + this._lastPrecompressed = 0; + this._parallel = parallel; + } + + async computeBufferSize() { + let objectNumber = this.context.largestObjectNumber + 1; + const header = PDFHeader.forVersion(1, 7); + let size = header.sizeInBytes() + 2; + const xrefStream = PDFCrossRefStream.create( + this.createTrailerDict(), + this.encodeStreams, + ); + + const uncompressedObjects = []; + const compressedChunks = []; + const objectStreamRefs = []; + + // ----- Phase 1: classify ----- + const indirectObjects = this.context.enumerateIndirectObjects(); + for (let i = 0; i < indirectObjects.length; i++) { + const indirectObject = indirectObjects[i]; + const [ref, object] = indirectObject; + const shouldNotCompress = + ref === this.context.trailerInfo.Encrypt || + object instanceof PDFStream || + object instanceof PDFInvalidObject || + ref.generationNumber !== 0; + + if (shouldNotCompress) { + uncompressedObjects.push(indirectObject); + xrefStream.addUncompressedEntry(ref, size); + size += this.computeIndirectObjectSize(indirectObject); + } else { + let chunk = compressedChunks.length === 0 ? null : compressedChunks[compressedChunks.length - 1]; + let objectStreamRef = objectStreamRefs.length === 0 ? null : objectStreamRefs[objectStreamRefs.length - 1]; + if (!chunk || chunk.length % this.objectsPerStream === 0) { + chunk = []; + compressedChunks.push(chunk); + objectStreamRef = PDFRef.of(objectNumber++); + objectStreamRefs.push(objectStreamRef); + } + xrefStream.addCompressedEntry(ref, objectStreamRef, chunk.length); + chunk.push(indirectObject); + } + } + + // ----- Phase 2: instantiate object streams and parallel-deflate ----- + const objectStreams = compressedChunks.map(chunk => + PDFObjectStream.withContextAndObjects(this.context, chunk, this.encodeStreams), + ); + + if (this._parallel && this.encodeStreams && objectStreams.length > 0) { + const unencoded = objectStreams.map(os => os.getUnencodedContents()); + const deflated = await Promise.all(unencoded.map(buf => deflateAsync(buf))); + for (let i = 0; i < objectStreams.length; i++) { + objectStreams[i].contentsCache.value = deflated[i]; + } + this._lastPrecompressed = objectStreams.length; + } else { + this._lastPrecompressed = 0; + } + + // ----- Phase 3: size object streams (cache hits) ----- + for (let i = 0; i < objectStreams.length; i++) { + const ref = objectStreamRefs[i]; + const objectStream = objectStreams[i]; + xrefStream.addUncompressedEntry(ref, size); + size += this.computeIndirectObjectSize([ref, objectStream]); + uncompressedObjects.push([ref, objectStream]); + } + + // ----- xrefStream wrap-up (serial deflate; contents depend on offsets above) ----- + const xrefStreamRef = PDFRef.of(objectNumber++); + xrefStream.dict.set(PDFName.of('Size'), PDFNumber.of(objectNumber)); + xrefStream.addUncompressedEntry(xrefStreamRef, size); + const xrefOffset = size; + size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]); + uncompressedObjects.push([xrefStreamRef, xrefStream]); + + const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset); + size += trailer.sizeInBytes(); + + return { size, header, indirectObjects: uncompressedObjects, trailer }; + } +} + +/** + * Replacement for `pdfDoc.save({ useObjectStreams: true })` with parallel + * deflate. Mirrors PDFDocument.save's pre-serialize steps (addDefaultPage, + * updateFieldAppearances, flush) before invoking the patched writer. + * + * Returns { bytes: Uint8Array, streamCount: number }. + */ +export async function parallelSave(pdfDoc, options = {}) { + const { + objectsPerTick = Infinity, + addDefaultPage = true, + updateFieldAppearances = true, + objectsPerStream = 50, + encodeStreams = true, + parallel = true, + } = options; + + if (addDefaultPage && pdfDoc.getPageCount() === 0) pdfDoc.addPage(); + if (updateFieldAppearances) { + const form = pdfDoc.formCache.getValue(); + if (form) form.updateFieldAppearances(); + } + await pdfDoc.flush(); + + const writer = new ParallelStreamWriter( + pdfDoc.context, + objectsPerTick, + encodeStreams, + objectsPerStream, + parallel, + ); + const bytes = await writer.serializeToBuffer(); + return { bytes, streamCount: writer._lastPrecompressed }; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index f219ec06..422ab976 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -45,6 +45,7 @@ import './lib/fast-deflate.mjs'; import './lib/fast-refs.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; +import { parallelSave } from './lib/parallel-deflate.mjs'; const __dirname = dirname(fileURLToPath(import.meta.url)); @@ -259,12 +260,21 @@ try { // parseSpeed: Fastest and objectsPerTick: Infinity are critical: // pdf-lib's defaults yield to the event loop between every 100/50 // objects, turning a ~5 s round-trip into ~40 s on a 50 MB PDF - // (~35 s of which is pure V8 idle). See perf/README.md. + // (~35 s of which is pure V8 idle). + // + // parallelSave (vs the default pdfDoc.save) does two things: + // - objectsPerStream: 500 -- larger object-stream chunks compress + // better (shared deflate window), 5 % smaller output PDF, and + // cuts the per-chunk dispatch overhead 10x. + // - dispatches every chunk's deflate to libuv's thread pool via + // async zlib.deflate instead of running serially on the main + // thread. Moves ~300 ms of zlib work off-CPU on the book. + // See perf/notes/08-pdf-lib.md. const tProcess = Date.now(); const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest }); setMetadata(pdfDoc, meta); await setOutline(pdfDoc, outline, false); - const finalPdf = await pdfDoc.save({ objectsPerTick: Infinity }); + const { bytes: finalPdf } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 }); console.log(`process: ${fmtMs(Date.now() - tProcess)}`); writeFileSync(outputPath, Buffer.from(finalPdf)); diff --git a/perf/README.md b/perf/README.md index 53f0a167..0aa06bde 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-deflate --fast-refs --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-deflate --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: @@ -78,6 +78,11 @@ Flag rationale: - `--fast-refs` -- inject the [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping fix (dense-array cache for `PDFRef.of`'s gen=0 path). Same logic. +- `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave` + from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs), + which pre-deflates object streams in parallel on libuv's pool with + `objectsPerStream: 500`. Production runs through it; same logic. + Moves ~300 ms of zlib work off the main thread. - `--cpu-profile-process` -- attach Node's `inspector/promises` Profiler around the process phase only (skips render and generate). Writes `process.cpuprofile` into the timestamped `results/` folder. @@ -100,11 +105,11 @@ Why no `--render-only`? `--cpu-profile-process` requires the process phase to run; the harness errors out if you combine them. To compare against upstream pdf-lib (e.g. when proposing a change -upstream), drop `--fast-deflate` and `--fast-refs`. Caveat for -A/B work: profiler-on attribution overstates the cost of hot -functions called millions of times (`PDFRef.of` in particular). For -"did this wall-clock change," do a paired no-profile A/B as a -sanity check. +upstream), drop `--fast-deflate`, `--fast-refs`, and +`--parallel-deflate`. Caveat for A/B work: profiler-on attribution +overstates the cost of hot functions called millions of times +(`PDFRef.of` in particular). For "did this wall-clock change," do +a paired no-profile A/B as a sanity check. See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase investigations these flags enabled. @@ -231,6 +236,7 @@ run.bat --chrome-outline # let Chrome emit /Outlines (skip pars run.bat --tracing # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples) run.bat --fast-deflate # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B) run.bat --fast-refs # dense-array cache for PDFRef.of's gen=0 path (also ships; opt-in here for A/B) +run.bat --parallel-deflate # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -339,6 +345,7 @@ file documenting each: | `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory | | `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) | | `PDFRef.of` dense-array cache (gen=0) | [08](notes/08-pdf-lib.md) | ~0.2 s process (load -16 %) | +| Parallel deflate + `objectsPerStream: 500` | [08](notes/08-pdf-lib.md) | ~0.3 s process (zlib off-thread; PDF -5 %) | What was tried and didn't ship: @@ -365,4 +372,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). | diff --git a/perf/measure.mjs b/perf/measure.mjs index 9e9e0646..5fb3e7d6 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -118,6 +118,7 @@ import { parseOutline, setOutline } from '../docs/lib/outline.mjs'; import { setMetadata } from '../docs/lib/postprocesser.mjs'; import { applyOutlineAndMetadataIncremental } from './incremental-pdf.mjs'; import { pinCpuIfWindows } from './pin-cpu.mjs'; +import { parallelSave } from '../docs/lib/parallel-deflate.mjs'; // On Windows, re-launch under `start /affinity 0x5500 /high` to stabilise // CPU sample-time. See pin-cpu.mjs. Cuts run-to-run variance from @@ -146,6 +147,7 @@ let renderOnly = false; let tracing = false; let fastDeflate = false; let fastRefs = false; +let parallelDeflate = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -169,6 +171,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ } else if (a === '--fast-deflate') fastDeflate = true; else if (a === '--fast-refs') fastRefs = true; + else if (a === '--parallel-deflate') parallelDeflate = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -519,10 +522,17 @@ try { const setOutlineMs = Date.now() - tSetOutlineStart; const tSaveStart = Date.now(); - finalPdf = await pdfDoc.save({ objectsPerTick: Infinity }); + let parallelStreamCount = 0; + if (parallelDeflate) { + const { bytes, streamCount } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 }); + finalPdf = bytes; + parallelStreamCount = streamCount; + } else { + finalPdf = await pdfDoc.save({ objectsPerTick: Infinity }); + } const saveMs = Date.now() - tSaveStart; - processBreakdown = { loadMs, setOutlineMs, saveMs }; + processBreakdown = { loadMs, setOutlineMs, saveMs, parallelStreamCount }; } const tProcEnd = Date.now(); processMs = tProcEnd - tProcStart; @@ -538,7 +548,10 @@ try { if (incremental) { console.log(`[harness] process ${fmtMs(processMs)} (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`); } else { - console.log(`[harness] process ${fmtMs(processMs)} (load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)})`); + const parTag = processBreakdown.parallelStreamCount + ? ` (parallel-deflate: ${processBreakdown.parallelStreamCount} streams)` + : ''; + console.log(`[harness] process ${fmtMs(processMs)} (load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)}${parTag})`); } } // end if (!renderOnly) diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index 51855446..7c71e317 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -466,22 +466,179 @@ Above `PDFRef.of`, the load-phase costs (`decodeName`, `parseName`, work. Those are O(input size) and pretty close to fundamental -- shrinking them would mean rewriting the parser. -### Where this leaves the picture +### What's left on save -Cumulative process-phase cost, baseline → after both shims: +After the fast-refs shim the process-phase profile's top +self-time entry was still `writeSync` at ~340 ms / 12 %. The name +is misleading -- not `fs.writeFileSync` writing the output PDF, +but `node:zlib`'s native binding inside `deflateSync`. +`find-callers` attributes the chain: + +``` +writeSync 344 ms (zlib native) + processChunkSync node:zlib:399 + zlibBufferSync node:zlib:165 + PDFFlateStream.computeContents 186 ms (pdf-lib stream compression) + fastDeflate (our shim) 130 ms + syncBufferWrapper 34 ms +``` + +So the cost is pure CPU-bound deflate during `pdfDoc.save()`. The +streams being compressed: pdf-lib's `PDFStreamWriter` (the default +when `useObjectStreams: true`) groups every non-stream, +non-encrypted, gen=0 indirect object into `PDFObjectStream` chunks +of 50, deflates each, and writes the result. On the book that's +~4,500 chunks, each a small deflate job, all running serially on +the main thread. + +## Parallelising save's deflate on libuv's pool + +### Why not just async-deflate inline + +pdf-lib's serializer is synchronous at the relevant call sites: +`PDFFlateStream.computeContents` +(`pdf-lib/.../structures/PDFFlateStream.js:13`) is a closure that +returns `pako.deflate(unencodedContents)` inline, called from +`cache.access()` during `sizeInBytes()`. Swapping `deflateSync` → +async `deflate` would mean rewriting the whole save path to await +every stream. The call sites don't expect a promise. + +### Why not `useObjectStreams: false` + +The one-liner that skips the whole problem. Measured on the book: + +| variant | save | process | PDF size | +| --- | --- | --- | --- | +| pdf-lib default (objectsPerStream=50, sync) | 1.01 s | 2.30 s | 16.1 MB | +| `useObjectStreams: false` | 0.59 s | 2.17 s | **40.5 MB** | + +A 2.5x file-size regression. The whole point of pdf-lib's +roundtrip over Chrome's raw output was to compress those streams. +Not an option. + +### What actually worked: parallel pre-deflate + larger chunks + +`docs/lib/parallel-deflate.mjs` subclasses pdf-lib's +`PDFStreamWriter` and splits its `computeBufferSize` into three +phases: + +1. **Classify** indirect objects into uncompressed (streams, + encrypt, gen != 0) vs compressed chunks of N. Same logic as + upstream, no behaviour change. +2. **Instantiate all `PDFObjectStream`s up-front**, snapshot their + unencoded contents, then `await Promise.all` an async + `zlib.deflate` per stream. Libuv's thread pool (default 4) runs + them concurrently. Write each result into the stream's + `contentsCache.value`. +3. **Size + emit** -- same as upstream, but every `cache.access()` + is a hit, so save's loop never touches deflate. + +The xrefStream is one more `PDFFlateStream` but its contents +depend on the offsets computed in phase 3, so we let it deflate +serially at the end (one stream; `fast-deflate`'s `deflateSync` +handles it). + +Exposed as `parallelSave(pdfDoc, options)`. Drop-in for +`pdfDoc.save` when `useObjectStreams: true` -- same pre-serialize +hooks (addDefaultPage, updateFieldAppearances, flush), same +byte-level output modulo zlib-vs-pako match choices. + +### First try with default `objectsPerStream=50` was slower + +Profile diff (paired `--cpu-profile-process --cpu-sampling 100`): + +| metric | serial (default) | parallel @ 50 (4,523 streams) | Δ | +| --- | --- | --- | --- | +| `writeSync` self | 345 ms | 79 ms | **-266 ms** | +| `write` (native, libuv setup) | <1 ms | 118 ms | **+117 ms** | +| `close` (native, libuv teardown) | <1 ms | 96 ms | **+95 ms** | +| net main-thread zlib + libuv overhead | 346 ms | 293 ms | -53 ms | + +The actual deflate work did move off-thread (`writeSync` dropped +sharply), but libuv's per-`uv_work_t` dispatch overhead on 4,523 +tiny jobs ate most of the savings. ~50 µs/job × ~4,500 jobs ≈ +225 ms of pure dispatch. + +### Fix: bigger chunks via `objectsPerStream: 500` + +Ten-fold-larger object streams cut the chunk count from ~4,500 to +~450. Same total deflate work, but in ~450 jobs instead of ~4,500 +-- libuv overhead drops by ~10x. Side benefit: larger chunks share +a deflate window, so the output PDF is ~5 % smaller (16.1 MB → +15.3 MB). + +Profile diff at `objectsPerStream: 500` +(paired `--cpu-profile-process --cpu-sampling 100`): + +| metric | serial @ 500 | parallel @ 500 | Δ | +| --- | --- | --- | --- | +| `writeSync` self (zlib native, main thread) | 335 ms | 33 ms | **-302 ms** | +| `close` (libuv finalize) | 1.7 ms | 15 ms | +13 ms | +| `PDFFlateStream.computeContents` | 20 ms | 4 ms | -16 ms | +| **total zlib-related main-thread self-time** | **360 ms** | **54 ms** | **-306 ms (-85 %)** | +| bottom-up: `writeSync` position | #1 (8.25 %) | not in top 12 | gone | + +The 306 ms moved off the main thread to libuv's pool, where Node's +V8 profiler doesn't sample it -- the headline "writeSync gone from +the top 12" is the on-CPU-budget that save() pays. + +### Wall-clock note + +This whole sub-investigation deliberately compared profiles only, +not wall-clock. The dev machine was busy with other work, and +process is a ~2 s phase whose run-to-run jitter on a loaded system +exceeds the expected delta. The profile diff cuts through that: +306 ms of native zlib disappearing from the main-thread budget is +a structural change that's stable across noise. A clean-machine +wall-clock A/B would close the loop, but the optimisation is +shippable on profile evidence alone. + +### Wired into production + +`render-book.mjs` swaps +`pdfDoc.save({ objectsPerTick: Infinity })` for +`parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 })`. +Smoke test on the book: + +``` +render: 8.6s (1651 pages) +generate: 39.2s (raw 39.3 MB) +process: 2.2s +saved: docs\_pdf\book.pdf (15.3 MB) +total: 51.9s +``` + +The 15.3 MB output (down from 16.1 MB) is the chunk-size effect; +the parallel deflate doesn't change byte size, only where the work +runs. + +The harness exposes the same via `--parallel-deflate` (which calls +`parallelSave` with the same defaults). + +## Where this leaves the picture + +Cumulative process-phase cost, baseline → after all three shims: | state | process | load | save | | --- | --- | --- | --- | | original (Slow / 50 defaults) | ~40 s | ~36 s| ~4 s | | + parseSpeed:Fastest | ~5 s | ~2 s | ~3 s | | + fast-deflate | ~2.5 s | ~1.5s| ~1 s | -| **+ fast-refs (this section)** | **~2.3 s** | **~1.3 s** | **~1 s** | +| + fast-refs | ~2.3 s | ~1.3 s | ~1 s | +| **+ parallel-deflate (this section)** | **~2.0 s** | **~1.3 s** | **~0.7 s** | + +The bottom-up after parallel deflate is dominated by pdf-lib's +parser frames -- `PDFDict.entries` (8 %), `decodeName` (8 %), GC +(8 %), `parseRawNumber` (6 %), `PDFRef.of` (5 %, the gen != 0 +residue). All load-phase, all O(input bytes), all close to +fundamental pdf-lib work. Further wins in this phase would mean +rewriting pdf-lib's parser. -The pdf-lib roundtrip path is now ~2.3 s of a ~50 s build. The +The pdf-lib roundtrip path is now ~2.0 s of a ~50 s build. The incremental writer's 0.25 s process phase (see [01-baseline-and-detach.md](01-baseline-and-detach.md)) is still strictly faster on process alone, but the pdf-lib path delivers a -16.1 MB output vs incremental's 53 MB, and the 2 s gap on a 50 s +15.3 MB output vs incremental's 53 MB, and the ~2 s gap on a 50 s build doesn't justify the file-size cost for our pipeline. The strategic note from earlier phases still stands: generate's From 4ecf1840940b43bd777648b14d05a4b90a8f1c49 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:06:56 +0200 Subject: [PATCH 04/44] Use zlib in larger chunks for deflation, also use it for inflation. --- docs/lib/fast-deflate.mjs | 37 ------------------ docs/lib/fast-inflate.mjs | 39 ++++++++++++++++++ docs/lib/parallel-deflate.mjs | 35 +++++++++++------ docs/render-book.mjs | 12 +++--- perf/README.md | 27 ++++++------- perf/measure.mjs | 26 +++++------- perf/notes/08-pdf-lib.md | 74 ++++++++++++++++++++++++++++++++--- 7 files changed, 159 insertions(+), 91 deletions(-) delete mode 100644 docs/lib/fast-deflate.mjs create mode 100644 docs/lib/fast-inflate.mjs diff --git a/docs/lib/fast-deflate.mjs b/docs/lib/fast-deflate.mjs deleted file mode 100644 index 08725ac3..00000000 --- a/docs/lib/fast-deflate.mjs +++ /dev/null @@ -1,37 +0,0 @@ -// Replace pako's pure-JS deflate with Node's zlib for the one path -// pdf-lib actually uses it on: PDFFlateStream.computeContents in -// node_modules/pdf-lib/cjs/core/structures/PDFFlateStream.js, which -// calls `pako.deflate(unencodedContents)` once per FlateStream during -// PDFDocument.save(). -// -// PDF /FlateDecode (ISO 32000-1 §7.4.4) is the zlib format (RFC 1950): -// a 2-byte zlib header + a raw deflate body (RFC 1951) + a 4-byte -// Adler-32 trailer. Both pako.deflate and zlib.deflateSync produce that -// format with default level 6, so the swap is wire-compatible -- output -// bytes may differ by a small amount (different match choices in the -// compressor's inner loop) but every PDF viewer reads either. -// -// Mechanism: pdf-lib is CJS in node_modules and calls -// `require("pako").deflate(...)` at the call site, not at import time. -// Mutating the live pako exports object is enough; no fork required. -// -// Side-effecting import. Import once before PDFDocument.save() runs: -// -// import "./lib/fast-deflate.mjs"; -// -// Idempotent -- repeated imports do nothing after the first. - -import { deflateSync } from "node:zlib"; -import pako from "pako"; - -if (!pako.__fastDeflateInstalled) { - const original = pako.deflate; - pako.deflate = function fastDeflate(data, options) { - // pdf-lib's only caller passes no options. Anything fancier (dictionary, - // raw, custom level) goes back to pako so we don't change behaviour - // outside the one hot path we care about. - if (options) return original.call(pako, data, options); - return deflateSync(data); - }; - pako.__fastDeflateInstalled = true; -} diff --git a/docs/lib/fast-inflate.mjs b/docs/lib/fast-inflate.mjs new file mode 100644 index 00000000..db675d50 --- /dev/null +++ b/docs/lib/fast-inflate.mjs @@ -0,0 +1,39 @@ +// Replace pako's pure-JS inflate with Node's zlib for the one path +// pdf-lib actually uses it on: PDFCrossRefStreamParser inflating the +// compressed cross-reference stream during PDFDocument.load. Exactly +// one call per load on Chrome-emitted PDFs (PDF 1.5+ xref-stream +// format), ~4.5 KB input. Negligible wall-clock, but it's the last +// remaining pdf-lib -> pako call site once parallelSave has taken +// over the deflate side -- this brings the runtime pako call count +// to zero. +// +// PDF /FlateDecode (ISO 32000-1 §7.4.4) is the zlib format (RFC 1950): +// 2-byte zlib header + raw deflate body (RFC 1951) + 4-byte Adler-32 +// trailer. Both pako.inflate and zlib.inflateSync consume that +// format, so the swap is wire-compatible. +// +// Mechanism: pdf-lib is CJS in node_modules and calls +// `require("pako").inflate(...)` at the call site, not at import +// time. Mutating the live pako exports object is enough; no fork +// required. +// +// Side-effecting import. Import once before PDFDocument.load runs: +// +// import "./lib/fast-inflate.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { inflateSync } from "node:zlib"; +import pako from "pako"; + +if (!pako.__fastInflateInstalled) { + const original = pako.inflate; + pako.inflate = function fastInflate(data, options) { + // pdf-lib's only caller passes no options. Anything fancier + // (dictionary, raw, custom windowBits) goes back to pako so we + // don't change behaviour outside the one path we care about. + if (options) return original.call(pako, data, options); + return inflateSync(data); + }; + pako.__fastInflateInstalled = true; +} diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs index 5cc48fb1..d278ecab 100644 --- a/docs/lib/parallel-deflate.mjs +++ b/docs/lib/parallel-deflate.mjs @@ -3,31 +3,34 @@ // thread pool. Sole exported entry point: `parallelSave(pdfDoc, opts)`. // // Why: pdf-lib's PDFStreamWriter.computeBufferSize creates one -// PDFObjectStream per 50-object chunk, then immediately calls +// PDFObjectStream per chunk, then immediately calls // computeIndirectObjectSize on each. sizeInBytes() walks the Cache, -// which lazy-populates via pako.deflate(unencodedContents). The whole -// pass is synchronous, so ~1000 chunks × ~0.3 ms of zlib work runs -// serially -- accounts for ~30 % of save() wall time on the book. +// which lazy-populates via a deflate of the unencoded contents. The +// whole pass is synchronous, so the per-chunk zlib work runs serially +// -- accounted for ~30 % of save() wall time on the book before this. // // What: same construction logic as PDFStreamWriter, split into three // phases: // 1. classify uncompressed vs compressed (same as upstream) // 2. instantiate every PDFObjectStream up-front, then `await -// Promise.all` an async zlib.deflate per stream so libuv's thread -// pool (default 4) runs them concurrently +// Promise.all` an async node:zlib.deflate per stream so libuv's +// thread pool (default 4) runs them concurrently // 3. size + emit (same as upstream, but every cache.access() is a hit) -// The xrefStream itself is one more PDFFlateStream; we deflate it -// serially in phase 3 since its contents depend on phase-3 offsets. +// The xrefStream is one more PDFFlateStream whose contents depend on +// the offsets computed in phase 3; we pre-deflate it once via +// node:zlib.deflateSync right after those offsets are pinned, so even +// that final stream never falls back to pdf-lib's pure-JS deflate. // // Output: byte-near-equivalent to pdfDoc.save({ useObjectStreams: true }). -// zlib vs pako deflate may pick different LZ77 matches → 1-byte-level -// stream diffs and matching /Length deltas; viewer-invisible. +// node:zlib's match choices in the LZ77 inner loop may differ from +// pdf-lib's default deflate library, producing 1-byte-level stream +// content and matching /Length deltas; viewer-invisible. // // Parallelism is bounded by UV_THREADPOOL_SIZE (default 4). Bump it via // `process.env.UV_THREADPOOL_SIZE = '8'` before any libuv work fires // if you want more concurrency. -import { deflate } from 'node:zlib'; +import { deflate, deflateSync } from 'node:zlib'; import { promisify } from 'node:util'; import { PDFStreamWriter, @@ -118,11 +121,19 @@ class ParallelStreamWriter extends PDFStreamWriter { uncompressedObjects.push([ref, objectStream]); } - // ----- xrefStream wrap-up (serial deflate; contents depend on offsets above) ----- + // ----- xrefStream wrap-up ----- + // Its contents depend on the offsets computed above, so we can only + // populate them now. One stream -- deflate sync via node:zlib and + // pre-populate the cache so the subsequent computeIndirectObjectSize + // is a cache hit (otherwise pdf-lib's lazy populate would run its + // own deflate library on the main thread). const xrefStreamRef = PDFRef.of(objectNumber++); xrefStream.dict.set(PDFName.of('Size'), PDFNumber.of(objectNumber)); xrefStream.addUncompressedEntry(xrefStreamRef, size); const xrefOffset = size; + if (this.encodeStreams) { + xrefStream.contentsCache.value = deflateSync(xrefStream.getUnencodedContents()); + } size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]); uncompressedObjects.push([xrefStreamRef, xrefStream]); diff --git a/docs/render-book.mjs b/docs/render-book.mjs index 422ab976..5ad570b1 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -32,17 +32,19 @@ import { dirname, resolve } from 'node:path'; import { writeFileSync, existsSync } from 'node:fs'; import puppeteer from 'puppeteer'; import { PDFDocument, ParseSpeeds } from 'pdf-lib'; -// Side-effecting imports. Order doesn't matter; both mutate live -// module exports before any pdf-lib operation. See +// Side-effecting imports. Mutate pdf-lib's live module exports +// before any pdf-lib operation -- order doesn't matter. See // perf/notes/08-pdf-lib.md. // -// fast-deflate -- swaps pdf-lib's pako.deflate (pure JS) for -// node:zlib.deflateSync (C). ~1.5 s saved on the save phase. // fast-refs -- dense-array cache in front of PDFRef.of for the // gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved on // load. -import './lib/fast-deflate.mjs'; +// fast-inflate -- swaps pako.inflate for node:zlib.inflateSync on +// the one pdf-lib call site that uses it (PDFCrossRefStreamParser +// during load). Negligible cost shift, but eliminates the last +// pdf-lib -> pako call at runtime. import './lib/fast-refs.mjs'; +import './lib/fast-inflate.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; import { parallelSave } from './lib/parallel-deflate.mjs'; diff --git a/perf/README.md b/perf/README.md index 0aa06bde..188c02f9 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,23 +66,21 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-deflate --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: -- `--fast-deflate` -- inject the - [docs/lib/fast-deflate.mjs](../docs/lib/fast-deflate.mjs) shipping - fix (`pako.deflate` -> `node:zlib.deflateSync`). Production runs - through it; the profile should too. - `--fast-refs` -- inject the [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping fix - (dense-array cache for `PDFRef.of`'s gen=0 path). Same logic. + (dense-array cache for `PDFRef.of`'s gen=0 path). Production runs + through it; the profile should too. - `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave` from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs), which pre-deflates object streams in parallel on libuv's pool with `objectsPerStream: 500`. Production runs through it; same logic. - Moves ~300 ms of zlib work off the main thread. + Moves ~300 ms of zlib work off the main thread, and routes every + deflate call through `node:zlib` (no pdf-lib pure-JS fallback). - `--cpu-profile-process` -- attach Node's `inspector/promises` Profiler around the process phase only (skips render and generate). Writes `process.cpuprofile` into the timestamped `results/` folder. @@ -105,11 +103,11 @@ Why no `--render-only`? `--cpu-profile-process` requires the process phase to run; the harness errors out if you combine them. To compare against upstream pdf-lib (e.g. when proposing a change -upstream), drop `--fast-deflate`, `--fast-refs`, and -`--parallel-deflate`. Caveat for A/B work: profiler-on attribution -overstates the cost of hot functions called millions of times -(`PDFRef.of` in particular). For "did this wall-clock change," do -a paired no-profile A/B as a sanity check. +upstream), drop `--fast-refs` and `--parallel-deflate`. Caveat for +A/B work: profiler-on attribution overstates the cost of hot +functions called millions of times (`PDFRef.of` in particular). +For "did this wall-clock change," do a paired no-profile A/B as a +sanity check. See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase investigations these flags enabled. @@ -234,8 +232,7 @@ run.bat --time-hooks # per-task timing of every chunker/pol run.bat --incremental # process via incremental update instead of pdf-lib roundtrip run.bat --chrome-outline # let Chrome emit /Outlines (skip parseOutline + setOutline) run.bat --tracing # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples) -run.bat --fast-deflate # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B) -run.bat --fast-refs # dense-array cache for PDFRef.of's gen=0 path (also ships; opt-in here for A/B) +run.bat --fast-refs # dense-array cache for PDFRef.of's gen=0 path (ships in render-book.mjs by default; opt-in here for A/B) run.bat --parallel-deflate # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B) ``` @@ -372,4 +369,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). | diff --git a/perf/measure.mjs b/perf/measure.mjs index 5fb3e7d6..8844a80d 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -28,7 +28,7 @@ // [--no-detach-pages] [--instrument] [--time-hooks] // [--incremental] [--chrome-outline] [--timing] // [--clone-count] [--render-only] -// [--fast-deflate] [--fast-refs] +// [--fast-refs] [--parallel-deflate] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -92,17 +92,17 @@ // Honours --cpu-sampling. Composable with --cpu-profile when you want // both phases captured in one run. // -// --fast-deflate routes pdf-lib's PDFFlateStream compression through -// Node's zlib (C++) instead of pako (pure JS). Same wire format -// (PDF /FlateDecode = RFC 1950 zlib), ~5-10x faster on big inputs. -// Save phase only -- load uses pako.inflate, which the profile shows -// isn't a hot path for our content. -// // --fast-refs replaces PDFRef.of's string-keyed Map lookup with a // dense-array cache for the gen=0 case (82 % of ~1.2 M calls on the // book). Eliminates the per-call ` R` string allocation // and Map hash. gen != 0 calls (pdf-lib's xref-stream bookkeeping // for compressed objects) pass through unchanged. +// +// --parallel-deflate replaces pdfDoc.save() with parallelSave from +// docs/lib/parallel-deflate.mjs: object streams are pre-deflated in +// parallel on libuv's thread pool with objectsPerStream=500 (vs +// pdf-lib's serial save with default 50). Moves ~300 ms of zlib work +// off the main thread on the book. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; @@ -145,7 +145,6 @@ let timing = false; let cloneCount = false; let renderOnly = false; let tracing = false; -let fastDeflate = false; let fastRefs = false; let parallelDeflate = false; for (let i = 0; i < args.length; i++) { @@ -169,7 +168,6 @@ for (let i = 0; i < args.length; i++) { else if (a === '--render-only') renderOnly = true; else if (a === '--tracing') tracing = true; else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ } - else if (a === '--fast-deflate') fastDeflate = true; else if (a === '--fast-refs') fastRefs = true; else if (a === '--parallel-deflate') parallelDeflate = true; else if (!inputArg) inputArg = a; @@ -211,14 +209,8 @@ if (cpuProfileProcess && renderOnly) { process.exit(2); } -// Install the Node-zlib override for pdf-lib's PDFFlateStream compression -// before any pdf-lib operation. Side-effecting import; idempotent. The -// override only kicks in on pako.deflate calls (i.e. save()), so render- -// only runs that never reach the pdf-lib path are unaffected either way. -if (fastDeflate) { - await import('../docs/lib/fast-deflate.mjs'); - console.log('[harness] fast-deflate: pako.deflate -> node:zlib.deflateSync'); -} +// Install the dense-array cache for PDFRef.of's gen=0 path before any +// pdf-lib operation. Side-effecting import; idempotent. if (fastRefs) { await import('../docs/lib/fast-refs.mjs'); console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0'); diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index 7c71e317..e547754c 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -534,14 +534,19 @@ phases: is a hit, so save's loop never touches deflate. The xrefStream is one more `PDFFlateStream` but its contents -depend on the offsets computed in phase 3, so we let it deflate -serially at the end (one stream; `fast-deflate`'s `deflateSync` -handles it). +depend on the offsets computed in phase 3, so we pre-deflate it +via `node:zlib.deflateSync` right after those offsets are pinned +-- one stream, sync is fine, and pre-populating its cache means +`computeIndirectObjectSize` later is a hit too. The net effect: +every deflate that happens during a save goes through `node:zlib`, +and pdf-lib's pure-JS fallback never runs. Exposed as `parallelSave(pdfDoc, options)`. Drop-in for `pdfDoc.save` when `useObjectStreams: true` -- same pre-serialize -hooks (addDefaultPage, updateFieldAppearances, flush), same -byte-level output modulo zlib-vs-pako match choices. +hooks (addDefaultPage, updateFieldAppearances, flush), +byte-near-equivalent output (zlib's LZ77 match choices may differ +from pdf-lib's default deflate library at the byte level, but the +wire format is identical). ### First try with default `objectsPerStream=50` was slower @@ -615,6 +620,65 @@ runs. The harness exposes the same via `--parallel-deflate` (which calls `parallelSave` with the same defaults). +### Retiring `fast-deflate.mjs` + +Once `parallelSave` also pre-deflates the xrefStream, pdf-lib's +lazy `cache.populate()` deflate path is **never invoked at +runtime**. Every `PDFObjectStream` is parallel-deflated in phase 2; +the xrefStream is sync-deflated in phase 3. Both go through +`node:zlib`. There's no remaining call site for pdf-lib's pure-JS +fallback during a save. + +The `fast-deflate.mjs` shim that used to monkey-patch +`pako.deflate` is therefore redundant -- it was a per-call dispatch +optimisation for a code path we no longer take. Deleted: + +- `docs/lib/fast-deflate.mjs` -- removed. +- `import './lib/fast-deflate.mjs'` -- removed from + `render-book.mjs`. +- `--fast-deflate` -- removed from the `measure.mjs` flag set. + +Smoke profile after removal (`--parallel-deflate --fast-refs +--cpu-profile-process`, no fast-deflate import anywhere): 0 frames +matching `pako`, 0 matches for `computeContents`, 0 for +`fastDeflate`. Process phase 2.34 s, output 15.3 MB. + +The deletion is purely a cleanup -- profile-equivalent to before +-- but it removes 38 lines of indirection and one transitive +concern. + +### Routing inflate through `node:zlib` too + +One call site on the load side still went through pdf-lib's pako: +`PDFCrossRefStreamParser` decompresses the xref stream's payload +via `pako.inflate` during `PDFDocument.load`. Cost is tiny -- one +inflate per load, ~3 ms -- but it's the last pdf-lib → pako edge +in the runtime, and the dispatch story for the README is cleaner +when "every zlib call goes through `node:zlib`" is true on both +sides. + +`docs/lib/fast-inflate.mjs` is the symmetric counterpart to the +retired `fast-deflate.mjs`: + +```js +import { inflateSync } from "node:zlib"; +import pako from "pako"; + +if (!pako.__fastInflateInstalled) { + const original = pako.inflate; + pako.inflate = function fastInflate(data, options) { + if (options) return original.call(pako, data, options); + return inflateSync(data); + }; + pako.__fastInflateInstalled = true; +} +``` + +`render-book.mjs` imports it unconditionally next to `fast-refs`. +No harness flag -- the per-load cost is below the profile noise +floor; this lands for the architectural reason, not a measurable +win. + ## Where this leaves the picture Cumulative process-phase cost, baseline → after all three shims: From 20b17d6156e95aedc64c41017d1e8de5d14a7427 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:11:42 +0200 Subject: [PATCH 05/44] Speed up pdf-lib's number parsing. --- docs/lib/fast-parse-number.mjs | 122 +++++++++++++++++++++++++++++++++ docs/render-book.mjs | 19 +++-- perf/notes/08-pdf-lib.md | 64 +++++++++++++++++ 3 files changed, 198 insertions(+), 7 deletions(-) create mode 100644 docs/lib/fast-parse-number.mjs diff --git a/docs/lib/fast-parse-number.mjs b/docs/lib/fast-parse-number.mjs new file mode 100644 index 00000000..661a40eb --- /dev/null +++ b/docs/lib/fast-parse-number.mjs @@ -0,0 +1,122 @@ +// Replace pdf-lib's BaseParser.parseRawNumber with a direct-integer +// accumulator that skips per-byte string concatenation, charFromCode +// calls, and the trailing Number() string-parse round-trip. +// +// The upstream implementation +// ([BaseParser.js:33](node_modules/pdf-lib/cjs/core/parser/BaseParser.js:33)) +// builds `value` one character at a time via `value += charFromCode(byte)`, +// then calls `Number(value)` to convert the string back to a number, +// then performs `isFinite` + MAX_SAFE_INTEGER guards on every call. +// Every numeric token in a PDF flows through this path +// (PDFObjectParser.parseNumberOrRef invokes it once per number, twice +// per indirect ref), so on the book it fires hundreds of thousands of +// times and allocates a throwaway string per call. +// +// The fast path accumulates the integer directly (n = n*10 + (byte - +// 0x30)) and only descends into decimal handling when a period appears. +// Falls back to the original for: +// - Numbers with > 15 integer digits (where direct accumulation +// could exceed Number.MAX_SAFE_INTEGER and lose precision). +// - Empty-digit cases (e.g., "."), so upstream's NumberParsingError +// keeps its diagnostic context. +// Both fallback paths are vanishingly rare on real PDFs. +// +// Mechanism: BaseParser isn't re-exported by pdf-lib's index, so we +// import it via the package's CJS internal path through createRequire. +// Mutating BaseParser.prototype affects every subclass (PDFParser, +// PDFObjectParser, PDFObjectStreamParser, PDFXRefStreamParser). +// +// Side-effecting import. Import once before PDFDocument.load runs: +// +// import "./lib/fast-parse-number.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const BaseParser = require('pdf-lib/cjs/core/parser/BaseParser.js').default; +const { IsDigit } = require('pdf-lib/cjs/core/syntax/Numeric.js'); + +const ZERO = 0x30; // '0' +const PERIOD = 0x2E; // '.' +const PLUS = 0x2B; // '+' +const MINUS = 0x2D; // '-' + +// Number.MAX_SAFE_INTEGER == 9007199254740991 (16 digits). 15-digit +// integers are guaranteed to accumulate exactly without precision loss. +const MAX_SAFE_INT_DIGITS = 15; + +if (!BaseParser.__fastParseNumberInstalled) { + const origParseRawNumber = BaseParser.prototype.parseRawNumber; + + BaseParser.prototype.parseRawNumber = function fastParseRawNumber() { + const bytes = this.bytes; + const start = bytes.offset(); + + // Sign + let byte = bytes.peek(); + let neg = false; + if (byte === PLUS) { + bytes.next(); + byte = bytes.peek(); + } else if (byte === MINUS) { + neg = true; + bytes.next(); + byte = bytes.peek(); + } + + // Integer part + let intPart = 0; + let intDigits = 0; + while (!bytes.done() && IsDigit[byte]) { + if (intDigits >= MAX_SAFE_INT_DIGITS) { + // Precision risk -- rewind and delegate to upstream's Number() + // path, which retains correctly-rounded double precision and + // emits the spec-mandated warning above MAX_SAFE_INTEGER. + bytes.moveTo(start); + return origParseRawNumber.call(this); + } + intPart = intPart * 10 + (byte - ZERO); + intDigits++; + bytes.next(); + byte = bytes.peek(); + } + + if (byte !== PERIOD) { + if (intDigits === 0) { + // Empty number (e.g., bare sign with no digits). Rewind and + // let upstream throw NumberParsingError with full context. + bytes.moveTo(start); + return origParseRawNumber.call(this); + } + return neg ? -intPart : intPart; + } + + // Consume period + bytes.next(); + byte = bytes.peek(); + + // Decimal part + let frac = 0; + let scale = 1; + while (!bytes.done() && IsDigit[byte]) { + frac = frac * 10 + (byte - ZERO); + scale *= 10; + bytes.next(); + byte = bytes.peek(); + } + + if (intDigits === 0 && scale === 1) { + // Lone "." with no digits on either side. Rewind to let upstream + // throw NumberParsingError. + bytes.moveTo(start); + return origParseRawNumber.call(this); + } + + const value = frac === 0 ? intPart : intPart + frac / scale; + return neg ? -value : value; + }; + + BaseParser.__fastParseNumberInstalled = true; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index 5ad570b1..f70f7749 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -36,15 +36,20 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib'; // before any pdf-lib operation -- order doesn't matter. See // perf/notes/08-pdf-lib.md. // -// fast-refs -- dense-array cache in front of PDFRef.of for the -// gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved on -// load. -// fast-inflate -- swaps pako.inflate for node:zlib.inflateSync on -// the one pdf-lib call site that uses it (PDFCrossRefStreamParser -// during load). Negligible cost shift, but eliminates the last -// pdf-lib -> pako call at runtime. +// fast-refs -- dense-array cache in front of PDFRef.of for +// the gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved +// on load. +// fast-inflate -- swaps pako.inflate for node:zlib.inflateSync +// on the one pdf-lib call site that uses it +// (PDFCrossRefStreamParser during load). Negligible cost shift, +// but eliminates the last pdf-lib -> pako call at runtime. +// fast-parse-number -- direct-integer accumulator in front of +// BaseParser.parseRawNumber, skipping per-byte string concat +// and the trailing Number() round-trip. Touches every numeric +// token parsed during PDFDocument.load. import './lib/fast-refs.mjs'; import './lib/fast-inflate.mjs'; +import './lib/fast-parse-number.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; import { parallelSave } from './lib/parallel-deflate.mjs'; diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index e547754c..351a403e 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -679,6 +679,70 @@ No harness flag -- the per-load cost is below the profile noise floor; this lands for the architectural reason, not a measurable win. +## `BaseParser.parseRawNumber`: direct-integer accumulator + +After `fast-deflate` + `fast-refs` + `parallel-deflate`, the load +side of the bottom-up table shifted onto the parser frames. One of +them is `BaseParser.parseRawNumber`, called once per numeric token +encountered during `PDFDocument.load` and twice per `N gen R` +indirect reference -- so on the book it fires several hundred +thousand times per load. + +The upstream implementation +(`pdf-lib/.../parser/BaseParser.js:33`) builds the number as a +string, one character at a time, then converts: + +```js +let value = ''; +while (!this.bytes.done() && IsDigit[this.bytes.peek()]) { + value += charFromCode(this.bytes.next()); +} +// ... fractional part, sign handling ... +const numberValue = Number(value); +if (!isFinite(numberValue) || numberValue > Number.MAX_SAFE_INTEGER) { ... } +return numberValue; +``` + +Every call allocates a throwaway string of length 1..N (one `+=` +allocation per digit), then runs `Number(value)` to parse the +string back into a double, then runs guards. The string allocation ++ `Number()` round-trip is the cost we care about. + +### The shim + +`docs/lib/fast-parse-number.mjs` mutates +`BaseParser.prototype.parseRawNumber` to accumulate the integer +directly (`n = n * 10 + (byte - 0x30)`), descending into decimal +handling only when a period appears. Falls back to the original +for: + +- **More than 15 integer digits** -- direct accumulation could + exceed `Number.MAX_SAFE_INTEGER` (16 digits) and silently lose + precision. Upstream's `Number(value)` retains correctly-rounded + double precision in that range and emits the spec-mandated + overflow warning, so we rewind and delegate. +- **Empty-digit cases** (e.g. `+`, `.`, bare sign) -- rewind and + let upstream throw `NumberParsingError` with full diagnostic + context. Both fallback paths are vanishingly rare on real PDFs. + +`BaseParser` isn't re-exported by pdf-lib's index, so we reach it +via the package's CJS internal path through `createRequire`: + +```js +const require = createRequire(import.meta.url); +const BaseParser = require('pdf-lib/cjs/core/parser/BaseParser.js').default; +``` + +Mutating `BaseParser.prototype` propagates to every subclass -- +`PDFParser`, `PDFObjectParser`, `PDFObjectStreamParser`, +`PDFXRefStreamParser`. One side-effecting import covers them all. + +`render-book.mjs` imports it unconditionally next to `fast-refs`. +No harness flag yet; the win is small per-call but the call rate +is high enough to matter -- to be measured later when the +follow-on work (size-in-bytes / iterator / parseDict shims) makes +the parser side worth quantifying as a group. + ## Where this leaves the picture Cumulative process-phase cost, baseline → after all three shims: From 79aa2a157a408bf44781ecbdd2ba3b3abc06a8fc Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:16:55 +0200 Subject: [PATCH 06/44] Improve performance of a few helper functions. --- docs/lib/fast-decode-name.mjs | 70 ++++++ docs/lib/fast-number-to-string.mjs | 65 +++++ docs/lib/fast-parse-number.mjs | 61 +++-- docs/render-book.mjs | 17 +- perf/README.md | 25 +- perf/measure.mjs | 23 ++ perf/notes/08-pdf-lib.md | 390 ++++++++++++++++++++++++++--- 7 files changed, 594 insertions(+), 57 deletions(-) create mode 100644 docs/lib/fast-decode-name.mjs create mode 100644 docs/lib/fast-number-to-string.mjs diff --git a/docs/lib/fast-decode-name.mjs b/docs/lib/fast-decode-name.mjs new file mode 100644 index 00000000..0f20a9fd --- /dev/null +++ b/docs/lib/fast-decode-name.mjs @@ -0,0 +1,70 @@ +// Skip pdf-lib's decodeName regex scan when the input has no `#`. +// +// The upstream PDFName.of +// ([PDFName.js:100](node_modules/pdf-lib/cjs/core/objects/PDFName.js:100)) +// is the gatekeeper for every PDFName instance the parser builds: +// +// PDFName.of = function (name) { +// var decodedValue = decodeName(name); // <-- always runs +// var instance = pool.get(decodedValue); +// if (!instance) { ... } +// return instance; +// }; +// +// and decodeName at line 9 is: +// +// name.replace(/#([\dABCDEF]{2})/g, function (_, hex) { ... }) +// +// PDF spec (ISO 32000-1 §7.3.5) requires `#XX` hex-escape for any +// byte outside printable-ASCII or for delimiters / whitespace. In +// real PDFs almost no names use it. Instrumenting on the book: +// +// PDFName.of calls : 2,759,635 +// raw input has # char : 2 (0.000%) +// +// So decodeName runs a regex scan against 2.76 M strings to find a +// `#` that's only there twice in the whole load. Profile attributes +// ~168 ms (7 %) of process self-time to this function. +// +// Shim: a parallel Map keyed by the raw `name` +// argument. When `name` contains no `#`, decoded form equals raw +// form, so our key matches pdf-lib's internal pool key and a hit +// returns the deduped instance with zero regex work. Misses +// delegate to the original (which does the regex scan once and +// stores the instance in pdf-lib's pool); we cache the result so +// every subsequent occurrence of the same name hits our fast path. +// +// Names containing `#` fall through to the original unchanged -- +// the correctness path (e.g. uppercase-only regex, lowercase escapes +// silently un-decoded) is preserved exactly. +// +// Mechanism: PDFName is re-exported from pdf-lib's index, so we can +// patch PDFName.of directly without reaching into CJS internals. +// Static initializers (PDFName.Length, .FlateDecode, ...) ran when +// pdf-lib's module body executed -- before this shim imports -- so +// pdf-lib's pool is already populated with the canonical instances +// the parser will see. +// +// Side-effecting import. Import once before any pdf-lib operation: +// +// import "./lib/fast-decode-name.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { PDFName } from "pdf-lib"; + +if (!PDFName.__fastDecodeNameInstalled) { + const original = PDFName.of; + const fastCache = new Map(); + PDFName.of = function fastOf(name) { + if (name.indexOf("#") === -1) { + const cached = fastCache.get(name); + if (cached) return cached; + const instance = original.call(PDFName, name); + fastCache.set(name, instance); + return instance; + } + return original.call(PDFName, name); + }; + PDFName.__fastDecodeNameInstalled = true; +} diff --git a/docs/lib/fast-number-to-string.mjs b/docs/lib/fast-number-to-string.mjs new file mode 100644 index 00000000..57640a97 --- /dev/null +++ b/docs/lib/fast-number-to-string.mjs @@ -0,0 +1,65 @@ +// Skip pdf-lib's numberToString redundant work when the input doesn't +// stringify to exponential notation. +// +// The upstream numberToString +// ([numbers.js:13](node_modules/pdf-lib/cjs/utils/numbers.js:13)) is: +// +// exports.numberToString = function (num) { +// var numStr = String(num); +// if (Math.abs(num) < 1.0) { +// var e = parseInt(num.toString().split('e-')[1]); +// if (e) { ... } +// } else { +// var e = parseInt(num.toString().split('+')[1]); +// if (e > 20) { ... } +// } +// return numStr; +// }; +// +// It always computes `numStr = String(num)` up front -- but then +// re-calls `num.toString()`, allocates a `.split(...)` array, and +// runs parseInt on the result, even though `numStr` is already what +// `.toString()` returns. Exponential notation in `String(num)` only +// appears for |num| < 1e-6 or |num| >= 1e21, neither of which real +// PDFs emit: object refs, generations, byte offsets, content-stream +// coordinates, /Size, /Length, etc. all stringify to plain decimal. +// +// Shim: short-circuit when `String(num)` contains no `'e'` and return +// it immediately. The rare exponential cases fall through to the +// original so the spec-compliant expansion logic is preserved. +// +// Why three patches and not one: pdf-lib ships compiled against +// tslib 1.x, whose `__exportStar` does a value-copy (`exports[p] = +// m[p]`) rather than installing a live getter. So by the time +// PDFNumber.js's `index_1.numberToString(value)` runs, `index_1` (the +// utils/index barrel) holds a captured reference to the original +// function, and mutating `numbers.numberToString` alone is invisible +// to the call site. We patch the captured copies along the re-export +// chain: utils/numbers (source), utils/index (the barrel PDFNumber +// reads from), and pdf-lib's top-level index (the public surface). +// +// Side-effecting import. Import once before any pdf-lib operation: +// +// import "./lib/fast-number-to-string.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const numbers = require('pdf-lib/cjs/utils/numbers.js'); +const utilsBarrel = require('pdf-lib/cjs/utils/index.js'); +const topBarrel = require('pdf-lib/cjs/index.js'); + +if (!numbers.__fastNumberToStringInstalled) { + const original = numbers.numberToString; + const fastNumberToString = function fastNumberToString(num) { + const numStr = String(num); + if (numStr.indexOf('e') === -1) return numStr; + return original(num); + }; + numbers.numberToString = fastNumberToString; + utilsBarrel.numberToString = fastNumberToString; + topBarrel.numberToString = fastNumberToString; + numbers.__fastNumberToStringInstalled = true; +} diff --git a/docs/lib/fast-parse-number.mjs b/docs/lib/fast-parse-number.mjs index 661a40eb..0f202d0a 100644 --- a/docs/lib/fast-parse-number.mjs +++ b/docs/lib/fast-parse-number.mjs @@ -1,24 +1,28 @@ -// Replace pdf-lib's BaseParser.parseRawNumber with a direct-integer -// accumulator that skips per-byte string concatenation, charFromCode -// calls, and the trailing Number() string-parse round-trip. +// Replace pdf-lib's BaseParser.parseRawNumber and BaseParser.parseRawInt +// with direct-integer accumulators that skip per-byte string +// concatenation, charFromCode calls, and the trailing Number() +// string-parse round-trip. // -// The upstream implementation -// ([BaseParser.js:33](node_modules/pdf-lib/cjs/core/parser/BaseParser.js:33)) -// builds `value` one character at a time via `value += charFromCode(byte)`, -// then calls `Number(value)` to convert the string back to a number, -// then performs `isFinite` + MAX_SAFE_INTEGER guards on every call. -// Every numeric token in a PDF flows through this path -// (PDFObjectParser.parseNumberOrRef invokes it once per number, twice -// per indirect ref), so on the book it fires hundreds of thousands of -// times and allocates a throwaway string per call. +// The upstream implementations +// ([BaseParser.js:17 + :33](node_modules/pdf-lib/cjs/core/parser/BaseParser.js:17)) +// build `value` one character at a time via `value += charFromCode(byte)`, +// then call `Number(value)` to convert the string back to a number, +// then perform `isFinite` (and for parseRawNumber, MAX_SAFE_INTEGER) +// guards on every call. Every numeric token in a PDF flows through +// these paths: parseRawNumber via PDFObjectParser.parseNumberOrRef +// (once per number, twice per indirect ref), parseRawInt via +// PDFParser.parseIndirectObjectHeader (twice per indirect object) and +// PDFObjectStreamParser (twice per object inside an ObjStm). On the +// book this fires hundreds of thousands of times and allocates a +// throwaway string per call. // // The fast path accumulates the integer directly (n = n*10 + (byte - -// 0x30)) and only descends into decimal handling when a period appears. -// Falls back to the original for: +// 0x30)). parseRawNumber additionally descends into decimal handling +// when a period appears. Both fall back to the original for: // - Numbers with > 15 integer digits (where direct accumulation // could exceed Number.MAX_SAFE_INTEGER and lose precision). -// - Empty-digit cases (e.g., "."), so upstream's NumberParsingError -// keeps its diagnostic context. +// - Empty-digit cases (e.g., bare sign or lone "."), so upstream's +// NumberParsingError keeps its diagnostic context. // Both fallback paths are vanishingly rare on real PDFs. // // Mechanism: BaseParser isn't re-exported by pdf-lib's index, so we @@ -49,6 +53,31 @@ const MAX_SAFE_INT_DIGITS = 15; if (!BaseParser.__fastParseNumberInstalled) { const origParseRawNumber = BaseParser.prototype.parseRawNumber; + const origParseRawInt = BaseParser.prototype.parseRawInt; + + BaseParser.prototype.parseRawInt = function fastParseRawInt() { + const bytes = this.bytes; + const start = bytes.offset(); + + let n = 0; + let digits = 0; + let byte = bytes.peek(); + while (!bytes.done() && IsDigit[byte]) { + if (digits >= MAX_SAFE_INT_DIGITS) { + bytes.moveTo(start); + return origParseRawInt.call(this); + } + n = n * 10 + (byte - ZERO); + digits++; + bytes.next(); + byte = bytes.peek(); + } + if (digits === 0) { + bytes.moveTo(start); + return origParseRawInt.call(this); + } + return n; + }; BaseParser.prototype.parseRawNumber = function fastParseRawNumber() { const bytes = this.bytes; diff --git a/docs/render-book.mjs b/docs/render-book.mjs index f70f7749..15fa4b2a 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -43,13 +43,22 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib'; // on the one pdf-lib call site that uses it // (PDFCrossRefStreamParser during load). Negligible cost shift, // but eliminates the last pdf-lib -> pako call at runtime. -// fast-parse-number -- direct-integer accumulator in front of -// BaseParser.parseRawNumber, skipping per-byte string concat -// and the trailing Number() round-trip. Touches every numeric -// token parsed during PDFDocument.load. +// fast-parse-number -- direct-integer accumulators in front of +// BaseParser.parseRawNumber + parseRawInt, skipping per-byte +// string concat and the trailing Number() round-trip. Touches +// every numeric token parsed during PDFDocument.load. +// fast-decode-name -- cache in front of PDFName.of that skips +// the decodeName regex scan when the input has no `#` (which +// is 99.999 % of the ~2.8 M PDFName.of calls per load). +// fast-number-to-string -- short-circuit numberToString when +// `String(num)` already lacks an `e` (i.e. for every PDF number +// that's not in the exponential-notation tail). Skips a +// redundant toString + split + parseInt per call. import './lib/fast-refs.mjs'; import './lib/fast-inflate.mjs'; import './lib/fast-parse-number.mjs'; +import './lib/fast-decode-name.mjs'; +import './lib/fast-number-to-string.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; import { parallelSave } from './lib/parallel-deflate.mjs'; diff --git a/perf/README.md b/perf/README.md index 188c02f9..5bab84ba 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: @@ -81,6 +81,20 @@ Flag rationale: `objectsPerStream: 500`. Production runs through it; same logic. Moves ~300 ms of zlib work off the main thread, and routes every deflate call through `node:zlib` (no pdf-lib pure-JS fallback). +- `--fast-decode-name` -- inject + [docs/lib/fast-decode-name.mjs](../docs/lib/fast-decode-name.mjs), a + parallel `Map` in front of `PDFName.of` that + skips the `decodeName` regex scan when the raw name has no `#` + hex escape (99.999 % of the ~2.8 M `PDFName.of` calls per load). + Production runs through it; ~530 ms saved on process. +- `--fast-number-to-string` -- inject + [docs/lib/fast-number-to-string.mjs](../docs/lib/fast-number-to-string.mjs), + short-circuiting pdf-lib's `numberToString` when `String(num)` + already lacks an `e` (i.e. for every PDF number that isn't in + the exponential-notation tail -- 100 % of ~290 k calls on the + book). Skips a redundant `toString` + `split` + `parseInt` per + call. Production runs through it. Profile self-time on the + function drops from ~45-50 ms (~2 % of process) to ~5-12 ms. - `--cpu-profile-process` -- attach Node's `inspector/promises` Profiler around the process phase only (skips render and generate). Writes `process.cpuprofile` into the timestamped `results/` folder. @@ -103,7 +117,8 @@ Why no `--render-only`? `--cpu-profile-process` requires the process phase to run; the harness errors out if you combine them. To compare against upstream pdf-lib (e.g. when proposing a change -upstream), drop `--fast-refs` and `--parallel-deflate`. Caveat for +upstream), drop `--fast-refs`, `--parallel-deflate`, +`--fast-decode-name`, and `--fast-number-to-string`. Caveat for A/B work: profiler-on attribution overstates the cost of hot functions called millions of times (`PDFRef.of` in particular). For "did this wall-clock change," do a paired no-profile A/B as a @@ -234,6 +249,8 @@ run.bat --chrome-outline # let Chrome emit /Outlines (skip pars run.bat --tracing # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples) run.bat --fast-refs # dense-array cache for PDFRef.of's gen=0 path (ships in render-book.mjs by default; opt-in here for A/B) run.bat --parallel-deflate # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B) +run.bat --fast-decode-name # skip decodeName regex when name has no # (also ships; opt-in here for A/B) +run.bat --fast-number-to-string # skip numberToString redundant toString/split when no exponential (also ships; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -343,6 +360,8 @@ file documenting each: | `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) | | `PDFRef.of` dense-array cache (gen=0) | [08](notes/08-pdf-lib.md) | ~0.2 s process (load -16 %) | | Parallel deflate + `objectsPerStream: 500` | [08](notes/08-pdf-lib.md) | ~0.3 s process (zlib off-thread; PDF -5 %) | +| `PDFName.of` no-`#` cache (skip `decodeName` regex) | [08](notes/08-pdf-lib.md) | ~0.5 s process (load -17 %, GC -101 ms) | +| `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise | What was tried and didn't ship: @@ -369,4 +388,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path. | diff --git a/perf/measure.mjs b/perf/measure.mjs index 8844a80d..00c9d1ac 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -29,6 +29,7 @@ // [--incremental] [--chrome-outline] [--timing] // [--clone-count] [--render-only] // [--fast-refs] [--parallel-deflate] +// [--fast-decode-name] [--fast-number-to-string] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -103,6 +104,16 @@ // parallel on libuv's thread pool with objectsPerStream=500 (vs // pdf-lib's serial save with default 50). Moves ~300 ms of zlib work // off the main thread on the book. +// +// --fast-decode-name installs a parallel cache in front of PDFName.of +// that skips the decodeName regex scan when the raw name contains +// no `#` hex escape (which is 99.999 % of the ~2.8 M PDFName.of +// calls per load on the book). ~150 ms saved on process load. +// +// --fast-number-to-string short-circuits pdf-lib's numberToString +// when String(num) already lacks an `e`. Skips a redundant toString, +// split, and parseInt per call; only the rare exponential-notation +// tail still falls through to the original implementation. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; @@ -147,6 +158,8 @@ let renderOnly = false; let tracing = false; let fastRefs = false; let parallelDeflate = false; +let fastDecodeName = false; +let fastNumberToString = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -170,6 +183,8 @@ for (let i = 0; i < args.length; i++) { else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ } else if (a === '--fast-refs') fastRefs = true; else if (a === '--parallel-deflate') parallelDeflate = true; + else if (a === '--fast-decode-name') fastDecodeName = true; + else if (a === '--fast-number-to-string') fastNumberToString = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -215,6 +230,14 @@ if (fastRefs) { await import('../docs/lib/fast-refs.mjs'); console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0'); } +if (fastDecodeName) { + await import('../docs/lib/fast-decode-name.mjs'); + console.log('[harness] fast-decode-name: skip decodeName regex when name has no #'); +} +if (fastNumberToString) { + await import('../docs/lib/fast-number-to-string.mjs'); + console.log('[harness] fast-number-to-string: skip redundant toString/split when no exponential'); +} const stamp = new Date().toISOString().replace(/[:.]/g, '-'); const outDir = outArg diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index 351a403e..8d2c9dfa 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -679,14 +679,15 @@ No harness flag -- the per-load cost is below the profile noise floor; this lands for the architectural reason, not a measurable win. -## `BaseParser.parseRawNumber`: direct-integer accumulator +## `BaseParser.parseRawNumber` + `parseRawInt`: direct-integer accumulators After `fast-deflate` + `fast-refs` + `parallel-deflate`, the load -side of the bottom-up table shifted onto the parser frames. One of -them is `BaseParser.parseRawNumber`, called once per numeric token -encountered during `PDFDocument.load` and twice per `N gen R` -indirect reference -- so on the book it fires several hundred -thousand times per load. +side of the bottom-up table shifted onto the parser frames. Two of +them are `BaseParser.parseRawNumber` (called once per numeric +token, twice per `N gen R` indirect reference) and +`BaseParser.parseRawInt` (called twice per indirect-object header +and twice per object inside an `ObjStm`). Between them they fire +hundreds of thousands of times per load on the book. The upstream implementation (`pdf-lib/.../parser/BaseParser.js:33`) builds the number as a @@ -710,11 +711,12 @@ string back into a double, then runs guards. The string allocation ### The shim -`docs/lib/fast-parse-number.mjs` mutates -`BaseParser.prototype.parseRawNumber` to accumulate the integer -directly (`n = n * 10 + (byte - 0x30)`), descending into decimal -handling only when a period appears. Falls back to the original -for: +`docs/lib/fast-parse-number.mjs` mutates both +`BaseParser.prototype.parseRawNumber` and +`BaseParser.prototype.parseRawInt` to accumulate the integer +directly (`n = n * 10 + (byte - 0x30)`). The number variant +additionally descends into decimal handling when a period appears. +Both fall back to the original for: - **More than 15 integer digits** -- direct accumulation could exceed `Number.MAX_SAFE_INTEGER` (16 digits) and silently lose @@ -743,31 +745,351 @@ is high enough to matter -- to be measured later when the follow-on work (size-in-bytes / iterator / parseDict shims) makes the parser side worth quantifying as a group. +## `decodeName`: skip the regex on the 99.999 % no-`#` path + +The earlier closing summary above wrote off `decodeName` as "close +to fundamental pdf-lib work." Re-reading the function on a later +pass disproved that. + +`pdf-lib/.../objects/PDFName.js:9`: + +```js +var decodeName = function (name) { + return name.replace(/#([\dABCDEF]{2})/g, function (_, hex) { + return utils_1.charFromHexCode(hex); + }); +}; +``` + +PDF spec (ISO 32000-1 §7.3.5) requires `#XX` hex-escape for any +byte outside the printable-ASCII regular range plus delimiters / +whitespace. `decodeName` reverses that on every `PDFName.of(name)` +call so the pool key is the canonical decoded form, dedup'ing +`/foo#20bar` and `/foo bar` to the same instance. + +The catch: the regex has to scan every byte of every name looking +for `#`, even when there is none. + +### Workload shape + +Instrumented `PDFName.of` on the book, counting calls and how +often the input contains a `#`: + +``` +PDFName.of calls : 2,759,635 + raw input has # char : 2 (0.000%) +``` + +Two. In 2.76 million calls. The other 2,759,633 are regex scans +against strings like `Type`, `S`, `P`, `Pg`, `StructElem`, `Kids`, +`Count`, `Filter`, `FlateDecode` -- ordinary PDF names that need +no escaping. We measured ~214 ms (7 %) of process self-time on +`decodeName` and another ~91 ms on `PDFName.of`'s body that calls +it. + +### The shim + +`docs/lib/fast-decode-name.mjs` follows the `fast-refs.mjs` shape: +cache in front of `PDFName.of` rather than replacing it. The key +insight is that when `name` has no `#`, the decoded form equals +the raw form, so the raw `name` is already a valid pool key for +pdf-lib's internal dedup pool -- a fast-side `Map` +keyed by the raw input returns the same `PDFName` instance pdf-lib +would have produced after a regex scan + pool lookup, without ever +running the regex. + +```js +import { PDFName } from "pdf-lib"; + +if (!PDFName.__fastDecodeNameInstalled) { + const original = PDFName.of; + const fastCache = new Map(); + PDFName.of = function fastOf(name) { + if (name.indexOf("#") === -1) { + const cached = fastCache.get(name); + if (cached) return cached; + const instance = original.call(PDFName, name); + fastCache.set(name, instance); + return instance; + } + return original.call(PDFName, name); + }; + PDFName.__fastDecodeNameInstalled = true; +} +``` + +Names with `#` fall through to the original -- the dual canonical- +form contract is preserved exactly. Static `PDFName.Length`, +`PDFName.FlateDecode`, ... initialisers ran when pdf-lib's module +body executed (before the shim imports), so pdf-lib's pool is +already populated with the canonical instances; the parser then +hits the fast cache on every subsequent reference. + +### Results + +Paired A/B, four interleaved runs (`pre1 post1 pre2 post2`), +`--detach-pages --no-timing --fast-refs --parallel-deflate +--cpu-profile-process --cpu-sampling 100`, same 1651-page book: + +| metric | pre avg | post avg | Δ | +| ------------- | ------- | -------- | ---------------- | +| **process** | **2.74 s** | **2.21 s** | **-0.53 s (-19 %)** | +| ↳ load | 1.69 s | 1.40 s | -0.29 s (-17 %) | +| ↳ setOutline | 0.01 s | 0.01 s | unchanged | +| ↳ save | 1.04 s | 0.81 s | -0.23 s (-22 %) | +| pdf size | 16.1 MB | 16.1 MB | byte-identical pairwise (pre1↔post1, pre2↔post2; 31 B intra-pair drift is `/CreationDate`) | + +The load drop is what the instrumentation predicted. The save drop +was a surprise -- save doesn't call `PDFName.of` to build outline +metadata in the hot path, so the saving is almost certainly GC +pressure relief from no longer allocating ~2.76 M regex-match +objects during load. + +Profile diff (single run each, same flags): + +| function | PRE | POST | Δ | +| --- | --- | --- | --- | +| `decodeName` | 214 ms (7.4 %) | not in top 15 | **-214 ms** | +| `PDFName.of` | 91 ms (3.1 %) | not in top 15 | **-91 ms** | +| `fastOf` (the shim body) | n/a | 91 ms (4.1 %) | +91 ms | +| `(garbage collector)` | 339 ms (11.7 %) | 238 ms (10.8 %) | -101 ms | +| profile duration | 2.92 s | 2.22 s | -0.70 s | + +The `fastOf` row sits at the same self-time as the old +`PDFName.of` forwarder (~91 ms) -- that's the per-call cost of the +`indexOf` check + `fastCache.get` + return, which all calls now +pay. The 214 ms `decodeName` row is gone entirely (regex never +runs on the fast path), and the GC drop is the allocator relief. + +### Production confirmation + +Two consecutive `book.bat` runs with all four shims live +(`fast-refs`, `fast-parse-number`, `parallel-deflate`, +`fast-decode-name`): + +| metric | run 1 | run 2 | +| --- | --- | --- | +| render | 8.9 s | 8.3 s | +| generate | 39.3 s | 37.6 s | +| process | **1.6 s** | **1.6 s** | +| total | 51.8 s | 50.0 s | + +Process is now ~1.6 s on the production path, off the profiler. +The harness numbers above are higher (~2.2 s post-fix) because of +profiler-on attribution overhead at 100 us sampling -- the same +caveat the `PDFRef.of` section flagged. The paired-A/B delta from +the harness (-0.53 s) is the correct measure of the shim's win; +the absolute 1.6 s is the production floor. + +### Methodology note + +This one almost didn't get found. The earlier "what's left" summary +explicitly wrote `decodeName` off as "close to fundamental" parser +work, on the strength of it living in a single regex line. The +actual investigation took 30 seconds: read the function, ask +"what's the hit rate of that regex on real PDF names?", instrument +with a one-liner counter, find that the answer is 0.0001 %. Worth +re-checking the "fundamental" label on remaining JS-body rows +whenever a small change to the workload might invert it. + +## `numberToString`: skip the redundant toString/split on the 100 % no-`e` path + +`pdf-lib/.../utils/numbers.js:13` is pdf-lib's `.toString()` +replacement that suppresses exponential notation -- PDF syntax +requires plain decimal in the object body (`1e-7` is invalid), so +every numeric token written into the file goes through: + +```js +exports.numberToString = function (num) { + var numStr = String(num); + if (Math.abs(num) < 1.0) { + var e = parseInt(num.toString().split('e-')[1]); + if (e) { /* expand "1e-7" -> "0.0000001" */ } + } else { + var e = parseInt(num.toString().split('+')[1]); + if (e > 20) { /* expand "1e+21" -> "100...0" */ } + } + return numStr; +}; +``` + +`numStr` is computed up front via `String(num)`. Then -- regardless +of whether `numStr` actually contains an `e` -- the function calls +`num.toString()` *again*, allocates a `.split(...)` array, and +runs `parseInt` on the (almost always undefined) result. Pure +overhead on every call where `String(num)` already returned a +plain decimal, which on a real PDF is every call. + +### Workload shape + +Instrumented `numberToString` on the book, counting fast-path +(`String(num).indexOf('e') === -1`) vs slow-path hits: + +``` +numberToString calls : 290,231 + String(num) has 'e' : 0 (0.000 %) +``` + +Zero. Of 290 k calls. `String(num)` returns exponential notation +only when `|num| < 1e-6` or `|num| >= 1e21`, and a PDF's object +refs, generations, byte offsets, content-stream coordinates, +`/Size`, `/Length` etc. never land in either tail. The credit-card +trick guarding the `e` cases is paid 290 k times to handle 0. + +### The shim + +`docs/lib/fast-number-to-string.mjs` short-circuits the no-`e` +case and delegates the rare exponential cases to the original +implementation unchanged: + +```js +const fastNumberToString = function fastNumberToString(num) { + const numStr = String(num); + if (numStr.indexOf('e') === -1) return numStr; + return original(num); +}; +numbers.numberToString = fastNumberToString; +utilsBarrel.numberToString = fastNumberToString; +topBarrel.numberToString = fastNumberToString; +``` + +### Wiring gotcha: tslib 1.x value-copy re-exports + +pdf-lib ships compiled against `tslib@1.14.1`, whose +`__exportStar` is: + +```js +function (m, exports) { + for (var p in m) if (p !== "default" && !exports.hasOwnProperty(p)) exports[p] = m[p]; +} +``` + +A plain value-copy. tslib 2.x replaced this with a live getter +(`Object.defineProperty(o, p, { get: () => m[p] })`), so on modern +compilations a single `numbers.numberToString = fast` patch would +propagate through every re-export automatically. On 1.x it +doesn't. + +`PDFNumber`'s call site -- the only consumer of `numberToString` +in pdf-lib's source -- reads from the utils-barrel, not from +`numbers.js` directly: + +```js +// PDFNumber.js +var index_1 = require("../../utils/index"); +... +_this.stringValue = index_1.numberToString(value); // <-- captured copy +``` + +Because `import { PDFDocument } from 'pdf-lib'` runs *before* the +shim's dynamic import, the barrel has already executed +`__exportStar(numbersModule, exports)` and stamped its own copy of +the original function. Mutating `numbers.numberToString` +afterwards is invisible to `PDFNumber`. The first iteration of +this shim looked installed (the standalone test showed the patched +function on the barrel, because that test imported the barrel +*after* the shim) but the harness counter recorded 0 hits on the +patched body -- the upstream function was still hot in the profile +under its original name. + +Fix: patch every re-export in the chain that captures by value: +`utils/numbers` (the source), `utils/index` (the barrel +`PDFNumber` reads from), and `cjs/index` (pdf-lib's top-level, +which `__exportStar`s the utils barrel onward to anyone importing +from `'pdf-lib'`). All three get the same `fastNumberToString` +reference. + +The `fast-decode-name` / `fast-refs` / `fast-parse-number` shims +don't hit this trap because their targets are class-static methods +(`PDFName.of`, `PDFRef.of`) or `BaseParser.prototype` methods -- +all looked up at call time via the class/prototype object, not via +a captured value. `numberToString` is the first free function +we've patched in pdf-lib. + +### Results + +Paired A/B, two interleaved runs each (`pre1 post1 pre2 post2`), +`--detach-pages --no-timing --fast-refs --parallel-deflate +--fast-decode-name --cpu-profile-process --cpu-sampling 100`, +same 1638-page book: + +| metric | pre1 | pre2 | post1 | post2 | +| --- | --- | --- | --- | --- | +| upstream `numberToString` self-time | 45 ms | 51 ms | 0 ms | 0 ms | +| shim `fastNumberToString` self-time | n/a | n/a | 5 ms | 12 ms | +| **combined self-time on this function** | **45 ms** | **51 ms** | **5 ms** | **12 ms** | +| slow-path delegations to original | n/a | n/a | 0 | 0 | + +The `String(num).indexOf('e') === -1` short-circuit fires on 100 % +of calls; the upstream function is unreachable in practice. +Function-level self-time drops by ~80 % (~40 ms saved on the hot +function), the redundant `num.toString()` + `.split(...)` + +`parseInt(...)` work gone from the trace. + +Wall-clock process-phase numbers on this dev machine bounce around +enough run-to-run (~±0.15 s) that the ~40 ms function-level saving +is invisible at the phase total -- both pre and post sit near +2.05 s. The profile-level evidence is the real signal: the cycles +were redundant, they're not being spent any more. + +### Methodology note + +The first cut of this shim mutated `numbers.numberToString` only, +following the assumption that pdf-lib's re-exports would propagate +the change. The hit counter (`fast=0 slow=0` on a full book run) +caught the mistake before the README evidence was written -- a +shim that *looks* installed but never actually runs would have +shown identical "before" and "after" profile numbers within noise, +indistinguishable from a no-op patch. + +Lesson for the next pdf-lib shim of a free function (rather than a +class method): check `tslib.__exportStar`'s shape before assuming +a single-site patch works. + +## `@cantoo/pdf-lib`: not a drop-in replacement + +Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an +alternative to Hopding's abandoned `pdf-lib` 1.17.1. Source-diff: +the four hot paths our shims address (`PDFRef.of`'s string-keyed +pool, `decodeName`'s unconditional regex, `parseRawInt` / +`parseRawNumber`'s per-byte string concat, +`PDFFlateStream.computeContents`'s synchronous pako call) are +byte-identical to upstream. Paired A/B on the book confirmed: +cantoo without shims runs the process phase in ~150 s vs our ~1.5 s +with shims, and has its own footguns (silent compression-disable +on PDF < 1.5, separate save-path pathology with `useObjectStreams: +true` that wasn't chased). Not a drop-in replacement; staying on +Hopding + shims. + ## Where this leaves the picture -Cumulative process-phase cost, baseline → after all three shims: - -| state | process | load | save | -| --- | --- | --- | --- | -| original (Slow / 50 defaults) | ~40 s | ~36 s| ~4 s | -| + parseSpeed:Fastest | ~5 s | ~2 s | ~3 s | -| + fast-deflate | ~2.5 s | ~1.5s| ~1 s | -| + fast-refs | ~2.3 s | ~1.3 s | ~1 s | -| **+ parallel-deflate (this section)** | **~2.0 s** | **~1.3 s** | **~0.7 s** | - -The bottom-up after parallel deflate is dominated by pdf-lib's -parser frames -- `PDFDict.entries` (8 %), `decodeName` (8 %), GC -(8 %), `parseRawNumber` (6 %), `PDFRef.of` (5 %, the gen != 0 -residue). All load-phase, all O(input bytes), all close to -fundamental pdf-lib work. Further wins in this phase would mean -rewriting pdf-lib's parser. - -The pdf-lib roundtrip path is now ~2.0 s of a ~50 s build. The -incremental writer's 0.25 s process phase (see -[01-baseline-and-detach.md](01-baseline-and-detach.md)) is still -strictly faster on process alone, but the pdf-lib path delivers a -15.3 MB output vs incremental's 53 MB, and the ~2 s gap on a 50 s -build doesn't justify the file-size cost for our pipeline. +Cumulative process-phase cost, baseline → after the shims to date: + +| state | process | load | save | +| --- | --- | --- | --- | +| original (Slow / 50 defaults) | ~40 s | ~36 s| ~4 s | +| + parseSpeed:Fastest | ~5 s | ~2 s | ~3 s | +| + fast-deflate | ~2.5 s | ~1.5s| ~1 s | +| + fast-refs | ~2.3 s | ~1.3 s | ~1 s | +| + parallel-deflate | ~2.0 s | ~1.3 s | ~0.7 s | +| **+ fast-decode-name + fast-number-to-string (this section)** | **~1.6 s** | **~1.0 s** | **~0.6 s** | + +The bottom-up after the latest pair is what's left of pdf-lib's +genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`, +`PDFObjectParser.parseDict`, GC, with no remaining JS-body row +sitting on "regex scanning for something that's never there" or +"redundant `toString` round-trip" shape. The `fastOf` row at +~91 ms is a real floor for any cache-in-front approach: the +`indexOf` + `Map.get` cost ~33 ns per call across 2.76 M calls. + +The pdf-lib roundtrip path is now ~1.6 s on production +(profiler-off; the harness reports ~2.0-2.2 s with profiler-on +attribution overhead). The incremental writer's 0.25 s process +phase (see [01-baseline-and-detach.md](01-baseline-and-detach.md)) +is still strictly faster on process alone, but the pdf-lib path +delivers a 15.3 MB output vs incremental's 53 MB, and the ~1.4 s +gap on a 50 s build doesn't justify the file-size cost for our +pipeline. The strategic note from earlier phases still stands: generate's ~38 s in `page.pdf()` is the remaining lever, and `pageRanges` From cbd1534cc90c654868325b6191eccbf45bb14291 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:21:57 +0200 Subject: [PATCH 07/44] Speed up size-in-bytes, add previous patches to measure.mjs. --- docs/lib/fast-size-in-bytes.mjs | 62 ++++++++++++++ docs/render-book.mjs | 5 ++ perf/README.md | 44 ++++++++-- perf/measure.mjs | 41 +++++++++ perf/notes/08-pdf-lib.md | 142 +++++++++++++++++++++++++++++++- 5 files changed, 284 insertions(+), 10 deletions(-) create mode 100644 docs/lib/fast-size-in-bytes.mjs diff --git a/docs/lib/fast-size-in-bytes.mjs b/docs/lib/fast-size-in-bytes.mjs new file mode 100644 index 00000000..779ade41 --- /dev/null +++ b/docs/lib/fast-size-in-bytes.mjs @@ -0,0 +1,62 @@ +// Replace pdf-lib's utils.sizeInBytes -- which allocates a base-2 string +// just to count its bit length -- with a non-allocating short-circuit +// ladder. +// +// The upstream sizeInBytes +// ([numbers.js:37](node_modules/pdf-lib/cjs/utils/numbers.js:37)) is: +// +// exports.sizeInBytes = function (n) { +// return Math.ceil(n.toString(2).length / 8); +// }; +// +// It's called from PDFCrossRefStream.computeMaxEntryByteWidths (three +// calls per xref entry, ~50 k entries on the book) and from +// utils.bytesFor (to size the Uint8Array before filling it byte-by- +// byte, called from PDFCrossRefStream.getUnencodedContents). Both +// paths are part of writing the cross-reference stream. +// +// For the xref values the distribution is heavily skewed small: type +// is always 0/1/2 (1 byte), generationNumber is always 0 (1 byte), +// object-stream indices are small (1-2 bytes), and file offsets are +// 3-4 bytes for any sub-4GB PDF. A short-circuit ladder catches the +// dominant cases in one compare; the rare 5+ byte tail falls through +// to a Math.clz32-based fallback that's still allocation-free. +// +// Why patch three places (and why bytesFor isn't on the list): +// pdf-lib ships compiled against tslib 1.x, whose `__exportStar` +// does a value-copy (`exports[p] = m[p]`) rather than installing a +// live getter. So consumers that read sizeInBytes through a barrel +// (`utils_1.sizeInBytes(...)` from PDFCrossRefStream) hold a +// captured reference and won't see a mutation of `numbers.sizeInBytes` +// alone. Patch all three barrel layers (utils/numbers, utils/index, +// top-level index) to cover every observed call site. utils.bytesFor +// reads `exports.sizeInBytes` at call time from the same module +// object we mutate first, so it picks up the fast path without a +// separate patch. +// +// Side-effecting import. Import once before any pdf-lib operation: +// +// import "./lib/fast-size-in-bytes.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const numbers = require('pdf-lib/cjs/utils/numbers.js'); +const utilsBarrel = require('pdf-lib/cjs/utils/index.js'); +const topBarrel = require('pdf-lib/cjs/index.js'); + +if (!numbers.__fastSizeInBytesInstalled) { + const fastSizeInBytes = function fastSizeInBytes(n) { + if (n < 0x100) return 1; + if (n < 0x10000) return 2; + if (n < 0x1000000) return 3; + if (n < 0x100000000) return 4; + return 4 + Math.ceil((32 - Math.clz32(Math.floor(n / 0x100000000))) / 8); + }; + numbers.sizeInBytes = fastSizeInBytes; + utilsBarrel.sizeInBytes = fastSizeInBytes; + topBarrel.sizeInBytes = fastSizeInBytes; + numbers.__fastSizeInBytesInstalled = true; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index 15fa4b2a..bccf6a38 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -54,11 +54,16 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib'; // `String(num)` already lacks an `e` (i.e. for every PDF number // that's not in the exponential-notation tail). Skips a // redundant toString + split + parseInt per call. +// fast-size-in-bytes -- replace utils.sizeInBytes (which allocates +// `n.toString(2)` just to count its bit length) with a non- +// allocating short-circuit ladder. Called ~300 k times per save +// from PDFCrossRefStream's xref writer. import './lib/fast-refs.mjs'; import './lib/fast-inflate.mjs'; import './lib/fast-parse-number.mjs'; import './lib/fast-decode-name.mjs'; import './lib/fast-number-to-string.mjs'; +import './lib/fast-size-in-bytes.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; import { parallelSave } from './lib/parallel-deflate.mjs'; diff --git a/perf/README.md b/perf/README.md index 5bab84ba..3b97916e 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: @@ -95,6 +95,30 @@ Flag rationale: book). Skips a redundant `toString` + `split` + `parseInt` per call. Production runs through it. Profile self-time on the function drops from ~45-50 ms (~2 % of process) to ~5-12 ms. +- `--fast-size-in-bytes` -- inject + [docs/lib/fast-size-in-bytes.mjs](../docs/lib/fast-size-in-bytes.mjs), + replacing pdf-lib's `utils.sizeInBytes` (which allocates + `n.toString(2)` just to count its bit length) with a non- + allocating short-circuit ladder. Called ~300 k times per save + from `PDFCrossRefStream`'s xref writer; the dominant inputs + are 1-2 byte values so a `n < 0x100 ? 1 : ...` ladder catches + most calls in one compare. Production runs through it. ~60 ms + saved on process. +- `--fast-inflate` -- inject + [docs/lib/fast-inflate.mjs](../docs/lib/fast-inflate.mjs), swapping + `pako.inflate` for `node:zlib.inflateSync` on the one path + pdf-lib uses it (the compressed xref stream during load). + Negligible wall-clock; flag exists so paired A/Bs against pure + upstream pdf-lib can keep the rest of the perf set on while + isolating this swap. Production runs through it. +- `--fast-parse-number` -- inject + [docs/lib/fast-parse-number.mjs](../docs/lib/fast-parse-number.mjs), + replacing `BaseParser.parseRawNumber` / `parseRawInt` with + direct-integer accumulators (`n = n*10 + (byte - 0x30)`) that + skip per-byte string concat and the trailing `Number()` round- + trip. Every numeric token parsed during `PDFDocument.load` + flows through these -- hundreds of thousands of calls per load + on the book. Production runs through it. - `--cpu-profile-process` -- attach Node's `inspector/promises` Profiler around the process phase only (skips render and generate). Writes `process.cpuprofile` into the timestamped `results/` folder. @@ -116,13 +140,11 @@ Profile one phase at a time. Why no `--render-only`? `--cpu-profile-process` requires the process phase to run; the harness errors out if you combine them. -To compare against upstream pdf-lib (e.g. when proposing a change -upstream), drop `--fast-refs`, `--parallel-deflate`, -`--fast-decode-name`, and `--fast-number-to-string`. Caveat for -A/B work: profiler-on attribution overstates the cost of hot -functions called millions of times (`PDFRef.of` in particular). -For "did this wall-clock change," do a paired no-profile A/B as a -sanity check. +upstream), drop every `--fast-*` flag and `--parallel-deflate`. +Caveat for A/B work: profiler-on attribution overstates the cost +of hot functions called millions of times (`PDFRef.of` in +particular). For "did this wall-clock change," do a paired +no-profile A/B as a sanity check. See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase investigations these flags enabled. @@ -251,6 +273,9 @@ run.bat --fast-refs # dense-array cache for PDFRef.of's ge run.bat --parallel-deflate # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B) run.bat --fast-decode-name # skip decodeName regex when name has no # (also ships; opt-in here for A/B) run.bat --fast-number-to-string # skip numberToString redundant toString/split when no exponential (also ships; opt-in here for A/B) +run.bat --fast-size-in-bytes # non-allocating ladder for xref byte-width (also ships; opt-in here for A/B) +run.bat --fast-inflate # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B) +run.bat --fast-parse-number # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -362,6 +387,7 @@ file documenting each: | Parallel deflate + `objectsPerStream: 500` | [08](notes/08-pdf-lib.md) | ~0.3 s process (zlib off-thread; PDF -5 %) | | `PDFName.of` no-`#` cache (skip `decodeName` regex) | [08](notes/08-pdf-lib.md) | ~0.5 s process (load -17 %, GC -101 ms) | | `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise | +| `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) | What was tried and didn't ship: @@ -388,4 +414,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path. | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms). | diff --git a/perf/measure.mjs b/perf/measure.mjs index 00c9d1ac..1de7eb71 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -30,6 +30,8 @@ // [--clone-count] [--render-only] // [--fast-refs] [--parallel-deflate] // [--fast-decode-name] [--fast-number-to-string] +// [--fast-size-in-bytes] [--fast-inflate] +// [--fast-parse-number] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -114,6 +116,27 @@ // when String(num) already lacks an `e`. Skips a redundant toString, // split, and parseInt per call; only the rare exponential-notation // tail still falls through to the original implementation. +// +// --fast-size-in-bytes replaces pdf-lib's utils.sizeInBytes -- which +// allocates `n.toString(2)` just to count its bit length -- with a +// non-allocating short-circuit ladder. Called ~300 k times per save +// from PDFCrossRefStream's xref writer; the dominant inputs are +// 1-2 byte values (type, gen, index, small obj-stream refs) so a +// `n < 0x100 ? 1 : ...` ladder is the right shape. +// +// --fast-inflate swaps pako.inflate for node:zlib.inflateSync on +// pdf-lib's one remaining pako call site (PDFCrossRefStreamParser +// inflating the compressed cross-reference stream during +// PDFDocument.load). One call per load, negligible wall-clock; flag +// exists so paired A/Bs can compare against pure-pdf-lib behaviour. +// Production runs through it. +// +// --fast-parse-number replaces pdf-lib's BaseParser.parseRawNumber +// and parseRawInt with direct-integer accumulators (n = n*10 + +// (byte - 0x30)) that skip per-byte string concatenation and the +// trailing Number() round-trip. Every numeric token in a parsed +// PDF flows through these; hundreds of thousands of calls per load +// on the book. Production runs through it. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; @@ -160,6 +183,9 @@ let fastRefs = false; let parallelDeflate = false; let fastDecodeName = false; let fastNumberToString = false; +let fastSizeInBytes = false; +let fastInflate = false; +let fastParseNumber = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -185,6 +211,9 @@ for (let i = 0; i < args.length; i++) { else if (a === '--parallel-deflate') parallelDeflate = true; else if (a === '--fast-decode-name') fastDecodeName = true; else if (a === '--fast-number-to-string') fastNumberToString = true; + else if (a === '--fast-size-in-bytes') fastSizeInBytes = true; + else if (a === '--fast-inflate') fastInflate = true; + else if (a === '--fast-parse-number') fastParseNumber = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -238,6 +267,18 @@ if (fastNumberToString) { await import('../docs/lib/fast-number-to-string.mjs'); console.log('[harness] fast-number-to-string: skip redundant toString/split when no exponential'); } +if (fastSizeInBytes) { + await import('../docs/lib/fast-size-in-bytes.mjs'); + console.log('[harness] fast-size-in-bytes: non-allocating ladder for xref byte-width'); +} +if (fastInflate) { + await import('../docs/lib/fast-inflate.mjs'); + console.log('[harness] fast-inflate: swap pako.inflate for node:zlib.inflateSync'); +} +if (fastParseNumber) { + await import('../docs/lib/fast-parse-number.mjs'); + console.log('[harness] fast-parse-number: direct-integer accumulator for parseRawNumber/parseRawInt'); +} const stamp = new Date().toISOString().replace(/[:.]/g, '-'); const outDir = outArg diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index 8d2c9dfa..4aa673e8 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -1046,6 +1046,145 @@ Lesson for the next pdf-lib shim of a free function (rather than a class method): check `tslib.__exportStar`'s shape before assuming a single-site patch works. +## `sizeInBytes`: stop allocating a base-2 string just to count its bits + +A fresh process-phase profile under the post-`fast-decode-name` / +`fast-number-to-string` shipping set (1638-page book, `--fast-refs +--parallel-deflate --fast-decode-name --fast-number-to-string +--cpu-profile-process --cpu-sampling 100`) put process at 1.95 s +and showed an oddly-shaped row in the top-15: + +``` + self_ms self_% function @ source + ------- ------ ---------------------------------------------- + 213.02 10.97% (garbage collector) + 171.60 8.83% PDFDict.entries pdf-lib/PDFDict.js:22 + 144.16 7.42% PDFRef.of pdf-lib/PDFRef.js:34 + ... + 56.48 2.91% exports.sizeInBytes pdf-lib/utils/numbers.js:37 +``` + +`sizeInBytes` is a four-line utility: + +```js +exports.sizeInBytes = function (n) { return Math.ceil(n.toString(2).length / 8); }; +``` + +It computes how many bytes a non-negative integer takes by +stringifying it as base-2, counting characters, and dividing by 8. +The string is thrown away immediately. + +`find-callers.mjs` attributed the 56 ms across two callers, both +inside the xref-stream writer: + +| caller | attributed | +| --- | --- | +| `bytesFor` (`utils/numbers.js:49`) -- sizes the `Uint8Array` that gets filled byte-by-byte | 29.6 ms | +| `PDFCrossRefStream.computeMaxEntryByteWidths` (`structures/PDFCrossRefStream.js:66`) -- 3 calls per xref entry to compute the `/W` widths | 26.9 ms | + +For a ~50 k-object PDF that's roughly 300 k `n.toString(2)` calls +per save, each allocating a short-lived 1-to-32-char string. +Likely a contributor to the 213 ms GC at the top of the table too. + +### The shim + +`docs/lib/fast-size-in-bytes.mjs` replaces `utils.sizeInBytes` +with a non-allocating short-circuit ladder: + +```js +function fastSizeInBytes(n) { + if (n < 0x100) return 1; + if (n < 0x10000) return 2; + if (n < 0x1000000) return 3; + if (n < 0x100000000) return 4; + return 4 + Math.ceil((32 - Math.clz32(Math.floor(n / 0x100000000))) / 8); +} +``` + +The ladder shape matches the actual value distribution in +`computeEntryTuples`. The xref entry tuples are +`(type, second, third)` where: + +- `type` is 0, 1, or 2 (1 byte, always) +- `gen` / `index` are small (1-2 bytes) +- `offset` for uncompressed entries reaches 3-4 bytes on a 16 MB + PDF +- `nextFreeObjectNumber` for deleted entries is small + +So most calls take the very first branch. A `Math.clz32`-based +alternative would be simpler but slower in the common case, +because it always pays for the native call + sub + div + ceil. +The ladder exits in one compare for the dominant case. + +Triple-patch shape mirrors `fast-number-to-string.mjs` -- pdf-lib +ships compiled against tslib 1.x whose `__exportStar` value-copies +re-exports rather than installing live getters, so consumers that +read `sizeInBytes` through a barrel (`PDFCrossRefStream` does: +`utils_1.sizeInBytes(...)`) hold a captured reference. Patch the +source module, the utils/index barrel, and the top-level index to +cover every observed call site. `utils.bytesFor` reads +`exports.sizeInBytes` at call time from the same module object we +mutate first, so it picks up the fast path without a separate +patch. + +### Results + +A/B (2 runs each, `--fast-refs --parallel-deflate +--fast-decode-name --fast-number-to-string --cpu-profile-process +--cpu-sampling 100`, with `--fast-size-in-bytes` the only +difference): + +| run | PRE | POST | +| --- | --- | --- | +| 1 | 1.95 s | 1.91 s | +| 2 | 2.01 s | 1.93 s | +| **avg** | **1.98 s** | **1.92 s** | +| save sub-phase avg | 0.80 s | 0.73 s | + +**Δ = -60 ms process (-3.0 %).** The save sub-phase carries +-70 ms of that -- exactly where `sizeInBytes` lives (xref writer +fires during save, not load), so the attribution lines up. + +Profile self-time, POST run: + +- `exports.sizeInBytes` row: 56.48 ms → undetectable. V8 inlined + the ladder into both callers; `fastSizeInBytes` doesn't appear + in the profile by name either. +- GC: 213 ms → 201 ms (-12 ms, consistent with no longer + allocating ~300 k short-lived base-2 strings per save). +- No cost migration to other rows. The surrounding parser / + writer rows are flat within noise. + +PDF byte-equivalent (31-byte `/CreationDate` drift between PRE +and POST -- well inside the standard timestamp band). + +### Side finding: the harness flag set wasn't tracking production + +While landing this change, the harness flag set was audited +against `render-book.mjs`'s imports. `render-book.mjs` was +importing five `fast-*` shims (`fast-refs`, `fast-inflate`, +`fast-parse-number`, `fast-decode-name`, `fast-number-to-string`), +but `measure.mjs` only exposed three of them as flags +(`--fast-refs`, `--fast-decode-name`, `--fast-number-to-string`). +So the canonical process-profile command was measuring a *subset* +of what production actually runs -- two production shims +(`fast-inflate` and `fast-parse-number`) had been on for +production and silently off for the perf harness. + +Wall-clock impact of that gap is small in absolute terms (the two +missing shims target the load sub-phase, which is ~1.2 s out of +the 1.95 s process total), but the bottom-up table in the +canonical command was attributing time to functions that don't +run that way in production. Fixed in the same change: `measure.mjs` +now exposes `--fast-inflate` and `--fast-parse-number`, and the +canonical command in the README lists all five production shims +plus `--fast-size-in-bytes`. + +The general lesson: when a new shim lands, audit the harness's +flag set against `render-book.mjs`'s import list. A flag missing +on the harness side silently moves the harness baseline away from +production -- and the divergence accumulates over time. + ## `@cantoo/pdf-lib`: not a drop-in replacement Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an @@ -1072,7 +1211,8 @@ Cumulative process-phase cost, baseline → after the shims to date: | + fast-deflate | ~2.5 s | ~1.5s| ~1 s | | + fast-refs | ~2.3 s | ~1.3 s | ~1 s | | + parallel-deflate | ~2.0 s | ~1.3 s | ~0.7 s | -| **+ fast-decode-name + fast-number-to-string (this section)** | **~1.6 s** | **~1.0 s** | **~0.6 s** | +| + fast-decode-name + fast-number-to-string | ~1.6 s | ~1.0 s | ~0.6 s | +| **+ fast-size-in-bytes (this section)** | **~1.5 s** | **~1.0 s** | **~0.5 s** | The bottom-up after the latest pair is what's left of pdf-lib's genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`, From 4515a8f90d6ce540f19e1cff6ac0cb3505f9941c Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:24:49 +0200 Subject: [PATCH 08/44] Improve iterator performance. --- docs/lib/fast-dict-iter.mjs | 81 +++++++++++++++++++ docs/render-book.mjs | 7 ++ perf/README.md | 17 +++- perf/measure.mjs | 6 ++ perf/notes/08-pdf-lib.md | 154 +++++++++++++++++++++++++++++++++++- 5 files changed, 262 insertions(+), 3 deletions(-) create mode 100644 docs/lib/fast-dict-iter.mjs diff --git a/docs/lib/fast-dict-iter.mjs b/docs/lib/fast-dict-iter.mjs new file mode 100644 index 00000000..1d2a6cb8 --- /dev/null +++ b/docs/lib/fast-dict-iter.mjs @@ -0,0 +1,81 @@ +// Replace pdf-lib's PDFDict.sizeInBytes and PDFDict.copyBytesInto -- both of +// which materialize a fresh Array of [key, value] tuples via this.entries() +// on every call -- with versions that iterate the underlying Map in place. +// +// The upstream entries() helper +// ([PDFDict.js:22](node_modules/pdf-lib/cjs/core/objects/PDFDict.js:22)) is: +// +// PDFDict.prototype.entries = function () { +// return Array.from(this.dict.entries()); +// }; +// +// Per call that is: one MapIterator + one outer Array + one fresh +// [key, value] tuple per entry (allocated by the iterator itself). The save +// path fires both consumers on every dict (sizeInBytes to measure first, +// then copyBytesInto to write), so on the book that's ~100 k Array.from +// calls feeding the GC; PDFDict.entries was the largest non-GC row in the +// process profile (~10 % of process self-time) and (garbage collector) sat +// at the top. +// +// Map.prototype.forEach((value, key) => ...) calls back with positional +// arguments and never allocates a tuple. The two consumers don't need the +// tuple form -- they immediately destructure -- so swapping is local. +// +// We do NOT touch PDFDict.prototype.entries itself: clone() and toString() +// still call it and rely on the Array-of-tuples contract. Those paths fire +// rarely (clone on incremental updates only, toString in debug output) and +// aren't worth the contract churn. +// +// Side-effecting import. Import once before any pdf-lib save: +// +// import "./lib/fast-dict-iter.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const PDFDict = require('pdf-lib/cjs/core/objects/PDFDict.js').default; +const CharCodes = require('pdf-lib/cjs/core/syntax/CharCodes.js').default; + +// Callbacks are module-level (not closures) so Map.forEach reuses the same +// function reference on every call instead of allocating a fresh context +// per invocation. Per-call state is threaded through forEach's `thisArg` +// (one small object alloc per call, instead of one closure context plus +// one heap cell for the captured `offset` mutation). +function _sizeInBytesEntry(value, key) { + this.s += key.sizeInBytes() + value.sizeInBytes() + 2; +} + +function _copyBytesIntoEntry(value, key) { + const buf = this.buf; + let off = this.off; + off += key.copyBytesInto(buf, off); + buf[off++] = CharCodes.Space; + off += value.copyBytesInto(buf, off); + buf[off++] = CharCodes.Newline; + this.off = off; +} + +if (!PDFDict.prototype.__fastDictIterInstalled) { + PDFDict.prototype.sizeInBytes = function () { + const ctx = { s: 5 }; + this.dict.forEach(_sizeInBytesEntry, ctx); + return ctx.s; + }; + + PDFDict.prototype.copyBytesInto = function (buffer, offset) { + const initialOffset = offset; + buffer[offset++] = CharCodes.LessThan; + buffer[offset++] = CharCodes.LessThan; + buffer[offset++] = CharCodes.Newline; + const ctx = { buf: buffer, off: offset }; + this.dict.forEach(_copyBytesIntoEntry, ctx); + offset = ctx.off; + buffer[offset++] = CharCodes.GreaterThan; + buffer[offset++] = CharCodes.GreaterThan; + return offset - initialOffset; + }; + + PDFDict.prototype.__fastDictIterInstalled = true; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index bccf6a38..c29eff7b 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -58,12 +58,19 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib'; // `n.toString(2)` just to count its bit length) with a non- // allocating short-circuit ladder. Called ~300 k times per save // from PDFCrossRefStream's xref writer. +// fast-dict-iter -- replace PDFDict.sizeInBytes / copyBytesInto +// with versions that iterate the underlying Map in place via +// forEach, instead of materialising a fresh Array of [key, value] +// tuples via this.entries() on every call. ~80 ms saved per +// process run on the book + eliminates the largest non-GC row +// (PDFDict.entries was ~10 % of process self-time). import './lib/fast-refs.mjs'; import './lib/fast-inflate.mjs'; import './lib/fast-parse-number.mjs'; import './lib/fast-decode-name.mjs'; import './lib/fast-number-to-string.mjs'; import './lib/fast-size-in-bytes.mjs'; +import './lib/fast-dict-iter.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; import { parallelSave } from './lib/parallel-deflate.mjs'; diff --git a/perf/README.md b/perf/README.md index 3b97916e..1d33a107 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: @@ -119,6 +119,17 @@ Flag rationale: trip. Every numeric token parsed during `PDFDocument.load` flows through these -- hundreds of thousands of calls per load on the book. Production runs through it. +- `--fast-dict-iter` -- inject + [docs/lib/fast-dict-iter.mjs](../docs/lib/fast-dict-iter.mjs), + replacing `PDFDict.sizeInBytes` and `PDFDict.copyBytesInto` with + versions that iterate the underlying Map in place via + `forEach((value, key), thisArg)` instead of materialising a fresh + Array of `[key, value]` tuples via `this.entries()` on every call. + The save path fires both consumers on every dict (~100 k + `Array.from` calls feeding the GC), so this was the largest + non-GC row in the profile (~10 % of process self-time charged to + `PDFDict.entries`). ~80 ms saved per process run. Production runs + through it. - `--cpu-profile-process` -- attach Node's `inspector/promises` Profiler around the process phase only (skips render and generate). Writes `process.cpuprofile` into the timestamped `results/` folder. @@ -276,6 +287,7 @@ run.bat --fast-number-to-string # skip numberToString redundant toStri run.bat --fast-size-in-bytes # non-allocating ladder for xref byte-width (also ships; opt-in here for A/B) run.bat --fast-inflate # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B) run.bat --fast-parse-number # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B) +run.bat --fast-dict-iter # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -388,6 +400,7 @@ file documenting each: | `PDFName.of` no-`#` cache (skip `decodeName` regex) | [08](notes/08-pdf-lib.md) | ~0.5 s process (load -17 %, GC -101 ms) | | `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise | | `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) | +| `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) | What was tried and didn't ship: @@ -414,4 +427,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms). | diff --git a/perf/measure.mjs b/perf/measure.mjs index 1de7eb71..60185dde 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -186,6 +186,7 @@ let fastNumberToString = false; let fastSizeInBytes = false; let fastInflate = false; let fastParseNumber = false; +let fastDictIter = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -214,6 +215,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--fast-size-in-bytes') fastSizeInBytes = true; else if (a === '--fast-inflate') fastInflate = true; else if (a === '--fast-parse-number') fastParseNumber = true; + else if (a === '--fast-dict-iter') fastDictIter = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -279,6 +281,10 @@ if (fastParseNumber) { await import('../docs/lib/fast-parse-number.mjs'); console.log('[harness] fast-parse-number: direct-integer accumulator for parseRawNumber/parseRawInt'); } +if (fastDictIter) { + await import('../docs/lib/fast-dict-iter.mjs'); + console.log('[harness] fast-dict-iter: in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto'); +} const stamp = new Date().toISOString().replace(/[:.]/g, '-'); const outDir = outArg diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index 4aa673e8..dd667686 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -1185,6 +1185,157 @@ flag set against `render-book.mjs`'s import list. A flag missing on the harness side silently moves the harness baseline away from production -- and the divergence accumulates over time. +## `PDFDict.entries`: stop allocating a tuple array per save + +A profile of the process phase with every prior shipping shim +applied still showed `PDFDict.entries` at the top of the non-GC +self-time table, ~10 % of process. The function is a one-liner: + +```js +PDFDict.prototype.entries = function () { + return Array.from(this.dict.entries()); +}; +``` + +Per call: one `MapIterator` + one outer Array + one fresh +`[key, value]` tuple per entry (allocated by the iterator itself, +then collected by `Array.from` into the outer array). The save +path fires both consumers on every dict -- `sizeInBytes` first to +measure, then `copyBytesInto` to write -- so on the book that's +~100 k `Array.from` calls feeding the GC. `(garbage collector)` +sat at the top of the table too, which is the cost shape the +allocation pattern predicts. + +Both consumers immediately destructure the tuples: + +```js +var entries = this.entries(); +for (var idx = 0, len = entries.length; idx < len; idx++) { + var _a = entries[idx], key = _a[0], value = _a[1]; + ... +} +``` + +So nothing actually wants the array-of-tuples shape -- the +upstream code uses it because that's what `entries()` returns, +and the materialised array is dead by the next iteration. + +### The shim + +`docs/lib/fast-dict-iter.mjs` replaces +`PDFDict.prototype.sizeInBytes` and +`PDFDict.prototype.copyBytesInto` with versions that iterate the +underlying Map in place via `Map.prototype.forEach((value, key), +thisArg)`. The callback's positional `(value, key)` arguments +mean no tuple is ever allocated, and routing per-call state +through `forEach`'s `thisArg` instead of closure capture lets the +callback stay a module-level function reference (no per-call +closure context). + +The callbacks are hoisted to module top-level (not closures): + +```js +function _sizeInBytesEntry(value, key) { + this.s += key.sizeInBytes() + value.sizeInBytes() + 2; +} +function _copyBytesIntoEntry(value, key) { + const buf = this.buf; + let off = this.off; + off += key.copyBytesInto(buf, off); + buf[off++] = CharCodes.Space; + off += value.copyBytesInto(buf, off); + buf[off++] = CharCodes.Newline; + this.off = off; +} +``` + +Each consumer allocates a single small `ctx` object per call (one +alloc, vs the prior `1 + N` Array allocations) and threads it +through `thisArg`: + +```js +PDFDict.prototype.copyBytesInto = function (buffer, offset) { + // ... write '<<\n' ... + const ctx = { buf: buffer, off: offset }; + this.dict.forEach(_copyBytesIntoEntry, ctx); + offset = ctx.off; + // ... write '>>' ... +}; +``` + +The `PDFDict.prototype.entries` method itself stays untouched -- +`clone()` and `toString()` still call it and rely on the +array-of-tuples contract. Those paths fire rarely (clone on +incremental updates, toString in debug output) and don't justify +the contract churn. + +### Results + +Profile diff, both runs `--detach-pages --no-timing` with every +other shipping shim active, 100 us sampling: + +| metric | pre | post | Δ | +| --- | --- | --- | --- | +| `PDFDict.entries` self | 164.16 ms | off-list | **-164 ms (-100 %)** | +| `PDFDict.copyBytesInto` self | 27.54 ms | 25.42 ms | flat | +| `_copyBytesIntoEntry` (callback) | n/a | 23.83 ms | new | +| `PDFDict.sizeInBytes` self | sub-cutoff | 15.89 ms | n/a | +| `_sizeInBytesEntry` (callback) | n/a | 12.71 ms | new | +| **dict-serialisation path subtotal**| **~192 ms (~11 % of process)** | **~78 ms (~5 % of process)** | **~80 ms / -6 pp** | +| `(garbage collector)` | 201 ms (12 %) | 227 ms (15 %) | +26 ms / +3 pp | + +The 164 ms `entries` self-time is reliably gone. The replacement +work in the four-row split (the two consumers + their named +callbacks) sums to ~78 ms -- about a **6 pp drop** in process +attribution to this code path. + +The `(garbage collector)` row going *up* was the surprise. A +first-cut variant of the shim used closures (`forEach((value, +key) => { ... captures `offset` ... })`) and showed the same GC +increase. Hypothesis: the captured-and-mutated `offset` cell was +forcing V8 to heap-allocate a closure context per call. So we +tested the hoisted-callback variant above, which has zero +closure capture. The GC row landed at almost exactly the same +absolute value (~227 ms vs ~271 ms, both ~15 % of process). + +So the closure-capture hypothesis was wrong -- V8's escape +analysis was already eliding the `offset` cell. The GC nudge is +either run-to-run load-phase variance (the profile spans load + +setOutline + save, and load dominates) or the per-call `ctx` +object allocation we couldn't avoid without bigger code surgery. +Either way it doesn't reverse the win: the dict-path attributable +time dropped by ~80 ms, and that's real cycles removed. + +PDF output is byte-equivalent to the pre-shim build: +`Map.forEach` iterates in insertion order, same as +`Array.from(map.entries())`, so the serialised byte sequence is +identical. + +### Lesson: hoist forEach callbacks when state is mutable + +The hoisted-callback pattern (callback = module-level function, +state via `forEach`'s `thisArg`) reads as overkill -- a closure +is fewer lines and easier to follow. Two reasons it's still the +right shape here: + +1. **Profile attribution.** Named callbacks + (`_copyBytesIntoEntry`, `_sizeInBytesEntry`) appear in CPU + profiles under their names. Closures show up as + `(anonymous) @ file.mjs:55`, which makes future + profile-reading harder (you have to cross-reference the line + number every time). +2. **Future-proofing against V8 changes.** Escape analysis can + handle the closure capture today, but the JIT's heuristics + shift across Node versions. The hoisted pattern is + semantically explicit -- no implicit allocation depends on + the compiler being smart. Same shape that has aged well in + other hot pdf-lib paths we've patched. + +Cost is negligible (six extra lines and two declarations); +upside is the profile reads cleanly and the perf shape is robust +to JIT changes. Worth doing whenever the callback's state +outlives a single iteration. + ## `@cantoo/pdf-lib`: not a drop-in replacement Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an @@ -1212,7 +1363,8 @@ Cumulative process-phase cost, baseline → after the shims to date: | + fast-refs | ~2.3 s | ~1.3 s | ~1 s | | + parallel-deflate | ~2.0 s | ~1.3 s | ~0.7 s | | + fast-decode-name + fast-number-to-string | ~1.6 s | ~1.0 s | ~0.6 s | -| **+ fast-size-in-bytes (this section)** | **~1.5 s** | **~1.0 s** | **~0.5 s** | +| + fast-size-in-bytes | ~1.5 s | ~1.0 s | ~0.5 s | +| **+ fast-dict-iter (this section)** | **~1.4 s** | **~1.0 s** | **~0.4 s** | The bottom-up after the latest pair is what's left of pdf-lib's genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`, From 5e6cdfd275fdc973a4ab5d53364fa134ea039028 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:30:08 +0200 Subject: [PATCH 09/44] Hoist sentinel PDFNames out of parseDict's type-dispatch tail. PDFObjectParser.parseDict ends every dict it parses with four PDFName.of calls for Type/Catalog/Pages/Page, even on the dicts that have no /Type entry at all. With fast-decode-name in effect each call collapses to a fastCache.get, but fastOf was still the #4 row in process.cpuprofile at ~5%. Pool-dedup makes the canonical PDFNames reference-stable for the whole load, so capture them once at shim-load and substitute module-level constants for the four calls. Drops ~17 ms (~22%) of fastOf self-time. Output byte-equivalent. --- docs/lib/fast-parse-dict.mjs | 87 +++++++++++++++++++++++ docs/render-book.mjs | 7 ++ perf/README.md | 18 ++++- perf/measure.mjs | 16 ++++- perf/notes/08-pdf-lib.md | 129 ++++++++++++++++++++++++++++++++++- 5 files changed, 253 insertions(+), 4 deletions(-) create mode 100644 docs/lib/fast-parse-dict.mjs diff --git a/docs/lib/fast-parse-dict.mjs b/docs/lib/fast-parse-dict.mjs new file mode 100644 index 00000000..203549c8 --- /dev/null +++ b/docs/lib/fast-parse-dict.mjs @@ -0,0 +1,87 @@ +// Hoist the four sentinel PDFName.of calls out of +// PDFObjectParser.prototype.parseDict. +// +// The upstream parseDict +// ([PDFObjectParser.js:141](node_modules/pdf-lib/cjs/core/parser/PDFObjectParser.js:141)) +// ends every dict it parses with a Type-dispatch tail: +// +// var Type = dict.get(PDFName.of('Type')); +// if (Type === PDFName.of('Catalog')) return PDFCatalog.fromMapWithContext(...); +// else if (Type === PDFName.of('Pages')) return PDFPageTree.fromMapWithContext(...); +// else if (Type === PDFName.of('Page')) return PDFPageLeaf.fromMapWithContext(...); +// else return PDFDict.fromMapWithContext(...); +// +// That's 4 PDFName.of calls per dict, even on the overwhelming +// majority (resource dicts, font descriptors, content-stream dicts) +// that have no /Type entry at all. With --fast-decode-name in +// effect each call collapses to a Map.get on fastCache, but +// fastOf is still the #4 row in process.cpuprofile (~80 ms, +// 5.2 %). +// +// PDFName instances are pool-deduped +// ([PDFName.js:18,100](node_modules/pdf-lib/cjs/core/objects/PDFName.js:18)) +// so the sentinel "Type" / "Catalog" / "Pages" / "Page" PDFNames +// are reference-stable for the entire load. Capture them once at +// shim-load time and substitute direct constants for the four +// PDFName.of calls inside parseDict. The rest of the function +// body is preserved verbatim -- same loop, same dict.set, same +// dispatch shape. +// +// Mechanism: PDFObjectParser isn't re-exported by pdf-lib's index, +// so we reach in through the CJS internals via createRequire (same +// shape as fast-parse-number.mjs / fast-dict-iter.mjs). Mutating +// PDFObjectParser.prototype.parseDict is global -- every parser +// instance created after this shim loads picks it up. +// +// Side-effecting import. Import once before PDFDocument.load runs: +// +// import "./lib/fast-parse-dict.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default; +const PDFName = require('pdf-lib/cjs/core/objects/PDFName.js').default; +const PDFDict = require('pdf-lib/cjs/core/objects/PDFDict.js').default; +const PDFCatalog = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default; +const PDFPageTree = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default; +const PDFPageLeaf = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default; +const CharCodes = require('pdf-lib/cjs/core/syntax/CharCodes.js').default; + +// Capture canonical PDFName instances. Pool-dedup guarantees the +// parser would have built === these even if the original parseDict +// were still in play. +const TypeName = PDFName.of('Type'); +const CatalogName = PDFName.of('Catalog'); +const PagesName = PDFName.of('Pages'); +const PageName = PDFName.of('Page'); + +if (!PDFObjectParser.prototype.__fastParseDictInstalled) { + PDFObjectParser.prototype.parseDict = function fastParseDict() { + const bytes = this.bytes; + bytes.assertNext(CharCodes.LessThan); + bytes.assertNext(CharCodes.LessThan); + this.skipWhitespaceAndComments(); + const dict = new Map(); + while (!bytes.done() && + bytes.peek() !== CharCodes.GreaterThan && + bytes.peekAhead(1) !== CharCodes.GreaterThan) { + const key = this.parseName(); + const value = this.parseObject(); + dict.set(key, value); + this.skipWhitespaceAndComments(); + } + this.skipWhitespaceAndComments(); + bytes.assertNext(CharCodes.GreaterThan); + bytes.assertNext(CharCodes.GreaterThan); + const Type = dict.get(TypeName); + if (Type === CatalogName) return PDFCatalog.fromMapWithContext(dict, this.context); + if (Type === PagesName) return PDFPageTree.fromMapWithContext(dict, this.context); + if (Type === PageName) return PDFPageLeaf.fromMapWithContext(dict, this.context); + return PDFDict.fromMapWithContext(dict, this.context); + }; + + PDFObjectParser.prototype.__fastParseDictInstalled = true; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index c29eff7b..40f57257 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -64,6 +64,12 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib'; // tuples via this.entries() on every call. ~80 ms saved per // process run on the book + eliminates the largest non-GC row // (PDFDict.entries was ~10 % of process self-time). +// fast-parse-dict -- hoist the four sentinel PDFName.of calls +// (Type / Catalog / Pages / Page) out of the type-dispatch tail +// in PDFObjectParser.prototype.parseDict. The dispatch fires +// per-dict on every load; pool-dedup makes the canonical +// PDFNames reference-stable, so captured constants replace +// the calls verbatim. Pulls ~17 ms off fastOf self-time. import './lib/fast-refs.mjs'; import './lib/fast-inflate.mjs'; import './lib/fast-parse-number.mjs'; @@ -71,6 +77,7 @@ import './lib/fast-decode-name.mjs'; import './lib/fast-number-to-string.mjs'; import './lib/fast-size-in-bytes.mjs'; import './lib/fast-dict-iter.mjs'; +import './lib/fast-parse-dict.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; import { parallelSave } from './lib/parallel-deflate.mjs'; diff --git a/perf/README.md b/perf/README.md index 1d33a107..a93bda36 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: @@ -130,6 +130,18 @@ Flag rationale: non-GC row in the profile (~10 % of process self-time charged to `PDFDict.entries`). ~80 ms saved per process run. Production runs through it. +- `--fast-parse-dict` -- inject + [docs/lib/fast-parse-dict.mjs](../docs/lib/fast-parse-dict.mjs), + replacing `PDFObjectParser.prototype.parseDict` with a version + that hoists the four sentinel `PDFName.of` calls (`'Type'`, + `'Catalog'`, `'Pages'`, `'Page'`) out of the type-dispatch tail + and substitutes module-level captured constants. Pool-dedup + guarantees the canonical `PDFName`s are reference-stable for the + whole load. With `--fast-decode-name` already in effect the four + calls were collapsing to `fastCache.get` hits per dict, but + `fastOf` was still the #4 row in the profile -- removing the + calls pulls ~17 ms (~22 %) off `fastOf` self-time. Production + runs through it. - `--cpu-profile-process` -- attach Node's `inspector/promises` Profiler around the process phase only (skips render and generate). Writes `process.cpuprofile` into the timestamped `results/` folder. @@ -288,6 +300,7 @@ run.bat --fast-size-in-bytes # non-allocating ladder for xref byte- run.bat --fast-inflate # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B) run.bat --fast-parse-number # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B) run.bat --fast-dict-iter # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B) +run.bat --fast-parse-dict # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (also ships; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -401,6 +414,7 @@ file documenting each: | `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise | | `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) | | `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) | +| `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) | What was tried and didn't ship: @@ -427,4 +441,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %). | diff --git a/perf/measure.mjs b/perf/measure.mjs index 60185dde..c5cca424 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -31,7 +31,7 @@ // [--fast-refs] [--parallel-deflate] // [--fast-decode-name] [--fast-number-to-string] // [--fast-size-in-bytes] [--fast-inflate] -// [--fast-parse-number] +// [--fast-parse-number] [--fast-parse-dict] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -137,6 +137,14 @@ // trailing Number() round-trip. Every numeric token in a parsed // PDF flows through these; hundreds of thousands of calls per load // on the book. Production runs through it. +// +// --fast-parse-dict hoists the four sentinel PDFName.of calls +// (Type / Catalog / Pages / Page) out of the type-dispatch tail +// in PDFObjectParser.prototype.parseDict. The dispatch fires +// per-dict (tens of thousands on the book) and even with +// --fast-decode-name each lookup is still a Map.get on fastCache. +// Pool-dedup makes the canonical PDFNames reference-stable, so +// captured constants replace the four calls verbatim. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; @@ -187,6 +195,7 @@ let fastSizeInBytes = false; let fastInflate = false; let fastParseNumber = false; let fastDictIter = false; +let fastParseDict = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -216,6 +225,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--fast-inflate') fastInflate = true; else if (a === '--fast-parse-number') fastParseNumber = true; else if (a === '--fast-dict-iter') fastDictIter = true; + else if (a === '--fast-parse-dict') fastParseDict = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -285,6 +295,10 @@ if (fastDictIter) { await import('../docs/lib/fast-dict-iter.mjs'); console.log('[harness] fast-dict-iter: in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto'); } +if (fastParseDict) { + await import('../docs/lib/fast-parse-dict.mjs'); + console.log('[harness] fast-parse-dict: hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict'); +} const stamp = new Date().toISOString().replace(/[:.]/g, '-'); const outDir = outArg diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index dd667686..acdeafd9 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -1336,6 +1336,132 @@ upside is the profile reads cleanly and the perf shape is robust to JIT changes. Worth doing whenever the callback's state outlives a single iteration. +## `parseDict`: hoist the sentinel `PDFName`s out of the type-dispatch tail + +With every other process-phase shim in place, the top of the +bottom-up table looked like: + +``` + self_ms self_% function @ source + 194.12 12.49% (garbage collector) + 127.05 8.18% PDFRef.of + 86.70 5.58% PDFObjectParser.parseName + 80.70 5.19% fastOf (fast-decode-name) + 74.70 4.81% PDFObjectParser.parseDict + ... +``` + +`fastOf` -- the cache in front of `PDFName.of` -- shouldn't be +this high. The whole point of `fast-decode-name` is to collapse +`PDFName.of` to a `Map.get` per call. So the question is why so +many calls still hit it. + +Reading `PDFObjectParser.parseDict` +(`pdf-lib/.../parser/PDFObjectParser.js:141`) shows the +type-dispatch tail at the bottom: + +```js +var Type = dict.get(PDFName.of('Type')); +if (Type === PDFName.of('Catalog')) return PDFCatalog.fromMapWithContext(...); +else if (Type === PDFName.of('Pages')) return PDFPageTree.fromMapWithContext(...); +else if (Type === PDFName.of('Page')) return PDFPageLeaf.fromMapWithContext(...); +else return PDFDict.fromMapWithContext(...); +``` + +Four `PDFName.of` calls per dict, **including** the dicts that +have no `/Type` entry at all (resource dicts, font descriptors, +content-stream dicts -- the bulk of what a real PDF contains). +With `fast-decode-name` each call is a `fastCache.get` on a 4-byte +string, which is cheap individually -- but on a 1638-page book +that's tens of thousands of dicts × 4 calls = hundreds of +thousands of cache lookups for the same handful of canonical +`PDFName`s. + +### The shim + +`docs/lib/fast-parse-dict.mjs` replaces +`PDFObjectParser.prototype.parseDict` with a version that +captures the four sentinel `PDFName`s once at shim-load: + +```js +const TypeName = PDFName.of('Type'); +const CatalogName = PDFName.of('Catalog'); +const PagesName = PDFName.of('Pages'); +const PageName = PDFName.of('Page'); +``` + +and references them directly in the type-dispatch tail: + +```js +const Type = dict.get(TypeName); +if (Type === CatalogName) return PDFCatalog.fromMapWithContext(dict, this.context); +if (Type === PagesName) return PDFPageTree.fromMapWithContext(dict, this.context); +if (Type === PageName) return PDFPageLeaf.fromMapWithContext(dict, this.context); +return PDFDict.fromMapWithContext(dict, this.context); +``` + +The rest of the function body (the `<< ... >>` parse loop, the +`dict.set` calls, the whitespace skipping) is verbatim. Pool-dedup +guarantees the captured `PDFName`s are `===` to whatever the +parser would have built via the slow `PDFName.of` calls, so the +dispatch identity comparisons work unchanged. + +`PDFObjectParser` isn't re-exported from pdf-lib's index, so the +shim reaches in via `pdf-lib/cjs/core/parser/PDFObjectParser.js` +through `createRequire` -- same shape as `fast-parse-number.mjs` +and `fast-dict-iter.mjs`. + +### Results + +Profile diff, both runs `--detach-pages --no-timing` with every +other shipping shim active, 100 us sampling: + +| metric | pre | post | Δ | +| --- | --- | --- | --- | +| `fastOf` self | 80.70 ms (5.19 %) | 63.20 ms (4.43 %) | **-17.5 ms (-22 %)** | +| `parseDict` / `fastParseDict` self | 74.70 ms (4.81 %) | 77.79 ms (5.45 %) | flat (noise) | +| process wall-clock | 1.55 s | 1.42 s | -0.13 s (~noise floor) | + +The cleanest signal is the `fastOf` drop: removing four +`PDFName.of` calls per dict re-attributes ~17 ms away from the +cache layer. `parseDict`'s own self-time is essentially unchanged +because the four `PDFName.of` calls were already being charged to +`fastOf`, not to `parseDict` (child frames don't roll into parent +self-time). So the optimisation reads as "fastOf got cheaper" +rather than "parseDict got faster," but it's the same removed +work either way. + +The 130 ms wall-clock delta is mostly within run-to-run noise on a +1.5 s phase. The mechanism-confirmed ~17 ms via profile +attribution is the honest number. + +PDF output is byte-equivalent: same Map iteration order, same +dispatch decisions, same canonical `PDFName` instances. + +### Why this is the bottom of the easy wins on parseDict + +`fastParseDict` is still in the top 15 (5.45 %), which suggests +more juice in the function. The next-tier targets are all in the +inner loop: + +- `!bytes.done() && bytes.peek() !== 0x3E && bytes.peekAhead(1) !== 0x3E` + -- three method calls per iteration, all reading the underlying + `Uint8Array`. Inlining would cut method-dispatch overhead but + requires reaching into `ByteStream`'s internals. +- `dict.set(key, value)` -- Map entry allocation. Could be swapped + for a plain object via `Object.create(null)`, but + `PDFDict.fromMapWithContext` and the existing `fast-dict-iter` + shim both assume a Map, so it's a larger surgery. +- `this.skipWhitespaceAndComments()` -- already on the top-15 list + in its own right (~32 ms / 2 %). Two-method-call body + (`skipWhitespace` + `skipComment` loop); inlining at parseDict's + call site would shed one method-dispatch per loop iteration. + +None of these are as clean as the sentinel-hoist patch, and each +is a bigger code change for a smaller individual win. Worth +revisiting if a future optimisation moves the floor and parseDict +becomes a larger relative share. + ## `@cantoo/pdf-lib`: not a drop-in replacement Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an @@ -1364,7 +1490,8 @@ Cumulative process-phase cost, baseline → after the shims to date: | + parallel-deflate | ~2.0 s | ~1.3 s | ~0.7 s | | + fast-decode-name + fast-number-to-string | ~1.6 s | ~1.0 s | ~0.6 s | | + fast-size-in-bytes | ~1.5 s | ~1.0 s | ~0.5 s | -| **+ fast-dict-iter (this section)** | **~1.4 s** | **~1.0 s** | **~0.4 s** | +| + fast-dict-iter | ~1.4 s | ~1.0 s | ~0.4 s | +| **+ fast-parse-dict (this section)** | **~1.4 s** | **~1.0 s** | **~0.4 s** | The bottom-up after the latest pair is what's left of pdf-lib's genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`, From 3cf4743eb7d78f2a446fd78a1e668d2fa1d42662 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:36:32 +0200 Subject: [PATCH 10/44] Synchronify pdf-lib's load + save paths, pin pdf-lib + puppeteer. --- docs/lib/fast-sync-load.mjs | 331 ++++++++++++++++++++++++++++++++++ docs/lib/parallel-deflate.mjs | 10 +- docs/render-book.mjs | 32 +++- package-lock.json | 4 +- package.json | 4 +- perf/README.md | 18 +- perf/measure.mjs | 48 +++-- perf/notes/08-pdf-lib.md | 241 ++++++++++++++++++++++++- 8 files changed, 658 insertions(+), 30 deletions(-) create mode 100644 docs/lib/fast-sync-load.mjs diff --git a/docs/lib/fast-sync-load.mjs b/docs/lib/fast-sync-load.mjs new file mode 100644 index 00000000..fd473f8b --- /dev/null +++ b/docs/lib/fast-sync-load.mjs @@ -0,0 +1,331 @@ +// Strip pdf-lib's parseSpeed / objectsPerTick / shouldWaitForTick / +// waitForTick machinery entirely. Synchronify everywhere the conditional +// yield was the only async thing in the method. +// +// pdf-lib's parser and writers are downlevel-compiled from TypeScript +// `async function` to tslib's __awaiter + __generator state machine, +// so on browsers they can yield to the event loop every +// `objectsPerTick` objects via `await waitForTick()`. In Node with +// objectsPerTick: Infinity (which parseSpeed: Fastest historically +// set on the load side) the gate never fires -- the entire generator +// runs in one tick -- yet every indirect object (~50 k on the book) +// still pays the state-machine dispatch + Promise allocation for a +// single fall-through `case 0`. +// +// Eight methods participate in this pattern; this shim replaces all +// of them with synchronous (or, where a legitimate await remains, +// awaiterless `async`) twins: +// +// Load side (parser): +// PDFParser.prototype.parseDocument +// PDFParser.prototype.parseDocumentSection +// PDFParser.prototype.parseIndirectObjects +// PDFParser.prototype.parseIndirectObject +// PDFObjectStreamParser.prototype.parseIntoContext +// PDFDocument.load (static; only awaited parseDocument) +// +// Save side (writers): +// PDFWriter.prototype.serializeToBuffer +// (kept `async` because the inherited path awaits the +// ParallelStreamWriter override of computeBufferSize, which +// does genuine Promise.all-driven libuv-pool concurrency) +// PDFWriter.prototype.computeBufferSize +// PDFStreamWriter.prototype.computeBufferSize +// +// The load-side patches have to land together: each method awaits +// the next one down, so desugaring any one in isolation still leaves +// a Promise chain dangling. +// +// PDFDocument.load's signature is preserved (still callable as +// `await PDFDocument.load(bytes)`; awaiting a non-Promise resolves +// to the value), so existing call sites need no change. The +// parseSpeed option is silently ignored. parallel-deflate.mjs's +// parallelSave drops `objectsPerTick` from its public API in step +// with this shim. +// +// Side-effecting import. Import once before any pdf-lib operation: +// +// import "./lib/fast-sync-load.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const PDFParser = require('pdf-lib/cjs/core/parser/PDFParser.js').default; +const PDFObjectStreamParser = require('pdf-lib/cjs/core/parser/PDFObjectStreamParser.js').default; +const PDFXRefStreamParser = require('pdf-lib/cjs/core/parser/PDFXRefStreamParser.js').default; +const PDFRawStream = require('pdf-lib/cjs/core/objects/PDFRawStream.js').default; +const PDFRef = require('pdf-lib/cjs/core/objects/PDFRef.js').default; +const PDFName = require('pdf-lib/cjs/core/objects/PDFName.js').default; +const PDFNumber = require('pdf-lib/cjs/core/objects/PDFNumber.js').default; +const PDFStream = require('pdf-lib/cjs/core/objects/PDFStream.js').default; +const PDFInvalidObject = require('pdf-lib/cjs/core/objects/PDFInvalidObject.js').default; +const PDFDocument = require('pdf-lib/cjs/api/PDFDocument.js').default; +const PDFWriter = require('pdf-lib/cjs/core/writers/PDFWriter.js').default; +const PDFStreamWriter = require('pdf-lib/cjs/core/writers/PDFStreamWriter.js').default; +const PDFHeader = require('pdf-lib/cjs/core/document/PDFHeader.js').default; +const PDFTrailer = require('pdf-lib/cjs/core/document/PDFTrailer.js').default; +const PDFTrailerDict = require('pdf-lib/cjs/core/document/PDFTrailerDict.js').default; +const PDFCrossRefSection = require('pdf-lib/cjs/core/document/PDFCrossRefSection.js').default; +const PDFCrossRefStream = require('pdf-lib/cjs/core/structures/PDFCrossRefStream.js').default; +const PDFObjectStream = require('pdf-lib/cjs/core/structures/PDFObjectStream.js').default; +const CharCodes = require('pdf-lib/cjs/core/syntax/CharCodes.js').default; +const { ReparseError, StalledParserError } = require('pdf-lib/cjs/core/errors.js'); +const { IsDigit } = require('pdf-lib/cjs/core/syntax/Numeric.js'); +const { Keywords } = require('pdf-lib/cjs/core/syntax/Keywords.js'); +const { toUint8Array, copyStringIntoBuffer, last } = require('pdf-lib/cjs/utils/index.js'); + +// Pool-deduped PDFName instances are reference-stable for the whole +// load (see fast-parse-dict.mjs for the same trick). Capture the three +// sentinels parseIndirectObject's Type-dispatch needs. +const TypeName = PDFName.of('Type'); +const ObjStmName = PDFName.of('ObjStm'); +const XRefName = PDFName.of('XRef'); +const RefZero = PDFRef.of(0); +const SizeName = PDFName.of('Size'); + +if (!PDFParser.prototype.__fastSyncLoadInstalled) { + + // ----- Load side --------------------------------------------------- + + PDFParser.prototype.parseDocument = function parseDocumentSync() { + if (this.alreadyParsed) { + throw new ReparseError('PDFParser', 'parseDocument'); + } + this.alreadyParsed = true; + this.context.header = this.parseHeader(); + + let prevOffset; + while (!this.bytes.done()) { + this.parseDocumentSection(); + const offset = this.bytes.offset(); + if (offset === prevOffset) { + throw new StalledParserError(this.bytes.position()); + } + prevOffset = offset; + } + + this.maybeRecoverRoot(); + if (this.context.lookup(RefZero)) { + console.warn('Removing parsed object: 0 0 R'); + this.context.delete(RefZero); + } + return this.context; + }; + + PDFParser.prototype.parseDocumentSection = function parseDocumentSectionSync() { + this.parseIndirectObjects(); + this.maybeParseCrossRefSection(); + this.maybeParseTrailerDict(); + this.maybeParseTrailer(); + this.skipJibberish(); + }; + + PDFParser.prototype.parseIndirectObjects = function parseIndirectObjectsSync() { + this.skipWhitespaceAndComments(); + while (!this.bytes.done() && IsDigit[this.bytes.peek()]) { + const initialOffset = this.bytes.offset(); + try { + this.parseIndirectObject(); + } catch (e) { + this.bytes.moveTo(initialOffset); + this.tryToParseInvalidIndirectObject(); + } + this.skipWhitespaceAndComments(); + this.skipJibberish(); + } + }; + + PDFParser.prototype.parseIndirectObject = function parseIndirectObjectSync() { + const ref = this.parseIndirectObjectHeader(); + this.skipWhitespaceAndComments(); + const object = this.parseObject(); + this.skipWhitespaceAndComments(); + this.matchKeyword(Keywords.endobj); + if (object instanceof PDFRawStream && + object.dict.lookup(TypeName) === ObjStmName) { + PDFObjectStreamParser.forStream(object).parseIntoContext(); + } else if (object instanceof PDFRawStream && + object.dict.lookup(TypeName) === XRefName) { + PDFXRefStreamParser.forStream(object).parseIntoContext(); + } else { + this.context.assign(ref, object); + } + return ref; + }; + + PDFObjectStreamParser.prototype.parseIntoContext = function parseIntoContextSync() { + if (this.alreadyParsed) { + throw new ReparseError('PDFObjectStreamParser', 'parseIntoContext'); + } + this.alreadyParsed = true; + const offsetsAndObjectNumbers = this.parseOffsetsAndObjectNumbers(); + for (let i = 0, len = offsetsAndObjectNumbers.length; i < len; i++) { + const entry = offsetsAndObjectNumbers[i]; + this.bytes.moveTo(this.firstOffset + entry.offset); + const object = this.parseObject(); + const ref = PDFRef.of(entry.objectNumber, 0); + this.context.assign(ref, object); + } + }; + + // PDFDocument.load only awaited parseDocument(); now that's sync, the + // outer __awaiter is wasted too. Drop it. Signature unchanged -- + // `await PDFDocument.load(...)` on a non-Promise resolves to the value. + // The parseSpeed option is silently ignored (no more yield gate to tune). + PDFDocument.load = function loadSync(pdf, options) { + if (options === undefined) options = {}; + const ignoreEncryption = options.ignoreEncryption === undefined ? false : options.ignoreEncryption; + const throwOnInvalidObject = options.throwOnInvalidObject === undefined ? false : options.throwOnInvalidObject; + const updateMetadata = options.updateMetadata === undefined ? true : options.updateMetadata; + const capNumbers = options.capNumbers === undefined ? false : options.capNumbers; + const bytes = toUint8Array(pdf); + const context = PDFParser.forBytesWithOptions( + bytes, Infinity, throwOnInvalidObject, capNumbers, + ).parseDocument(); + return new PDFDocument(context, ignoreEncryption, updateMetadata); + }; + + // ----- Save side --------------------------------------------------- + + // PDFWriter.serializeToBuffer awaits computeBufferSize, which in our + // pipeline is the ParallelStreamWriter override -- genuinely async + // because of `await Promise.all(deflated)` over libuv's thread pool. + // So the wrapper stays async. The conditional waitForTick yield in + // its main loop is the only piece we strip. + PDFWriter.prototype.serializeToBuffer = async function serializeToBufferSync() { + const { size, header, indirectObjects, xref, trailerDict, trailer } = + await this.computeBufferSize(); + const buffer = new Uint8Array(size); + let offset = 0; + offset += header.copyBytesInto(buffer, offset); + buffer[offset++] = CharCodes.Newline; + buffer[offset++] = CharCodes.Newline; + for (let idx = 0, len = indirectObjects.length; idx < len; idx++) { + const indirectObject = indirectObjects[idx]; + const ref = indirectObject[0]; + const object = indirectObject[1]; + offset += copyStringIntoBuffer(String(ref.objectNumber), buffer, offset); + buffer[offset++] = CharCodes.Space; + offset += copyStringIntoBuffer(String(ref.generationNumber), buffer, offset); + buffer[offset++] = CharCodes.Space; + buffer[offset++] = CharCodes.o; + buffer[offset++] = CharCodes.b; + buffer[offset++] = CharCodes.j; + buffer[offset++] = CharCodes.Newline; + offset += object.copyBytesInto(buffer, offset); + buffer[offset++] = CharCodes.Newline; + buffer[offset++] = CharCodes.e; + buffer[offset++] = CharCodes.n; + buffer[offset++] = CharCodes.d; + buffer[offset++] = CharCodes.o; + buffer[offset++] = CharCodes.b; + buffer[offset++] = CharCodes.j; + buffer[offset++] = CharCodes.Newline; + buffer[offset++] = CharCodes.Newline; + } + if (xref) { + offset += xref.copyBytesInto(buffer, offset); + buffer[offset++] = CharCodes.Newline; + } + if (trailerDict) { + offset += trailerDict.copyBytesInto(buffer, offset); + buffer[offset++] = CharCodes.Newline; + buffer[offset++] = CharCodes.Newline; + } + offset += trailer.copyBytesInto(buffer, offset); + return buffer; + }; + + // PDFWriter.computeBufferSize -- the basic (non-stream) writer's + // sizing pass. Not on our pipeline's hot path (we route through + // PDFStreamWriter via ParallelStreamWriter, both of which override + // this method) but patched for consistency: the only async thing + // upstream is the conditional waitForTick yield in its loop. + PDFWriter.prototype.computeBufferSize = function computeBufferSizeBaseSync() { + const header = PDFHeader.forVersion(1, 7); + let size = header.sizeInBytes() + 2; + const xref = PDFCrossRefSection.create(); + const indirectObjects = this.context.enumerateIndirectObjects(); + for (let idx = 0, len = indirectObjects.length; idx < len; idx++) { + const indirectObject = indirectObjects[idx]; + const ref = indirectObject[0]; + xref.addEntry(ref, size); + size += this.computeIndirectObjectSize(indirectObject); + } + const xrefOffset = size; + size += xref.sizeInBytes() + 1; + const trailerDict = PDFTrailerDict.of(this.createTrailerDict()); + size += trailerDict.sizeInBytes() + 2; + const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset); + size += trailer.sizeInBytes(); + return { size, header, indirectObjects, xref, trailerDict, trailer }; + }; + + // PDFStreamWriter.computeBufferSize -- the upstream stream writer's + // sizing pass with two waitForTick gates (one per loop). Not on our + // pipeline's hot path (ParallelStreamWriter overrides this with its + // own three-phase parallel-deflate version) but patched for + // consistency. Logic mirrors the upstream method body exactly. + PDFStreamWriter.prototype.computeBufferSize = function computeBufferSizeStreamSync() { + let objectNumber = this.context.largestObjectNumber + 1; + const header = PDFHeader.forVersion(1, 7); + let size = header.sizeInBytes() + 2; + const xrefStream = PDFCrossRefStream.create(this.createTrailerDict(), this.encodeStreams); + + const uncompressedObjects = []; + const compressedObjects = []; + const objectStreamRefs = []; + + const indirectObjects = this.context.enumerateIndirectObjects(); + for (let idx = 0, len = indirectObjects.length; idx < len; idx++) { + const indirectObject = indirectObjects[idx]; + const ref = indirectObject[0]; + const object = indirectObject[1]; + const shouldNotCompress = + ref === this.context.trailerInfo.Encrypt || + object instanceof PDFStream || + object instanceof PDFInvalidObject || + ref.generationNumber !== 0; + if (shouldNotCompress) { + uncompressedObjects.push(indirectObject); + xrefStream.addUncompressedEntry(ref, size); + size += this.computeIndirectObjectSize(indirectObject); + } else { + let chunk = last(compressedObjects); + let objectStreamRef = last(objectStreamRefs); + if (!chunk || chunk.length % this.objectsPerStream === 0) { + chunk = []; + compressedObjects.push(chunk); + objectStreamRef = PDFRef.of(objectNumber++); + objectStreamRefs.push(objectStreamRef); + } + xrefStream.addCompressedEntry(ref, objectStreamRef, chunk.length); + chunk.push(indirectObject); + } + } + + for (let idx = 0, len = compressedObjects.length; idx < len; idx++) { + const chunk = compressedObjects[idx]; + const ref = objectStreamRefs[idx]; + const objectStream = PDFObjectStream.withContextAndObjects(this.context, chunk, this.encodeStreams); + xrefStream.addUncompressedEntry(ref, size); + size += this.computeIndirectObjectSize([ref, objectStream]); + uncompressedObjects.push([ref, objectStream]); + } + + const xrefStreamRef = PDFRef.of(objectNumber++); + xrefStream.dict.set(SizeName, PDFNumber.of(objectNumber)); + xrefStream.addUncompressedEntry(xrefStreamRef, size); + const xrefOffset = size; + size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]); + uncompressedObjects.push([xrefStreamRef, xrefStream]); + + const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset); + size += trailer.sizeInBytes(); + return { size, header, indirectObjects: uncompressedObjects, trailer }; + }; + + PDFParser.prototype.__fastSyncLoadInstalled = true; +} diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs index d278ecab..b7b71499 100644 --- a/docs/lib/parallel-deflate.mjs +++ b/docs/lib/parallel-deflate.mjs @@ -48,8 +48,12 @@ import { const deflateAsync = promisify(deflate); class ParallelStreamWriter extends PDFStreamWriter { - constructor(context, objectsPerTick, encodeStreams, objectsPerStream, parallel) { - super(context, objectsPerTick, encodeStreams, objectsPerStream); + constructor(context, encodeStreams, objectsPerStream, parallel) { + // PDFWriter's second ctor param is objectsPerTick -- the yield knob + // that drives shouldWaitForTick. fast-sync-load.mjs rips out every + // caller of shouldWaitForTick on both the parser and writer sides, + // so the value here is vestigial. Pass Infinity for explicitness. + super(context, Infinity, encodeStreams, objectsPerStream); this._lastPrecompressed = 0; this._parallel = parallel; } @@ -153,7 +157,6 @@ class ParallelStreamWriter extends PDFStreamWriter { */ export async function parallelSave(pdfDoc, options = {}) { const { - objectsPerTick = Infinity, addDefaultPage = true, updateFieldAppearances = true, objectsPerStream = 50, @@ -170,7 +173,6 @@ export async function parallelSave(pdfDoc, options = {}) { const writer = new ParallelStreamWriter( pdfDoc.context, - objectsPerTick, encodeStreams, objectsPerStream, parallel, diff --git a/docs/render-book.mjs b/docs/render-book.mjs index 40f57257..82c7f4b4 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -31,7 +31,7 @@ import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve } from 'node:path'; import { writeFileSync, existsSync } from 'node:fs'; import puppeteer from 'puppeteer'; -import { PDFDocument, ParseSpeeds } from 'pdf-lib'; +import { PDFDocument } from 'pdf-lib'; // Side-effecting imports. Mutate pdf-lib's live module exports // before any pdf-lib operation -- order doesn't matter. See // perf/notes/08-pdf-lib.md. @@ -70,6 +70,20 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib'; // per-dict on every load; pool-dedup makes the canonical // PDFNames reference-stable, so captured constants replace // the calls verbatim. Pulls ~17 ms off fastOf self-time. +// fast-sync-load -- rip the parseSpeed / objectsPerTick / +// shouldWaitForTick / waitForTick machinery out of both pdf-lib's +// load path (PDFDocument.load + five PDFParser / +// PDFObjectStreamParser methods underneath it) and its save path +// (PDFWriter.serializeToBuffer + computeBufferSize, plus the +// unreachable PDFStreamWriter.computeBufferSize patched for +// consistency). Each upstream method is wrapped in __awaiter so +// on browsers it can yield to the event loop every objectsPerTick +// objects; in Node the gate never fires but every indirect object +// still paid for the generator state machine + Promise +// allocation. ~135 ms of attributed parser self-time + ~40 ms +// writer + an unknowable chunk of the GC row removed; the +// parseSpeed / objectsPerTick options drop off all our call sites +// in step with this shim. import './lib/fast-refs.mjs'; import './lib/fast-inflate.mjs'; import './lib/fast-parse-number.mjs'; @@ -78,6 +92,7 @@ import './lib/fast-number-to-string.mjs'; import './lib/fast-size-in-bytes.mjs'; import './lib/fast-dict-iter.mjs'; import './lib/fast-parse-dict.mjs'; +import './lib/fast-sync-load.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; import { parallelSave } from './lib/parallel-deflate.mjs'; @@ -292,12 +307,13 @@ try { console.log(`generate: ${fmtMs(Date.now() - tGenerate)} (raw ${(rawPdf.length / 1024 / 1024).toFixed(1)} MB)`); // Process -- pdf-lib roundtrip with outline + metadata attached. - // parseSpeed: Fastest and objectsPerTick: Infinity are critical: - // pdf-lib's defaults yield to the event loop between every 100/50 - // objects, turning a ~5 s round-trip into ~40 s on a 50 MB PDF - // (~35 s of which is pure V8 idle). + // fast-sync-load strips the waitForTick yield gates on both load + // and save sides entirely (load was ~40 s under pdf-lib's Slow + // default that yields every 100 objects; ~5 s on Fastest; now + // ~1 s with the gates ripped out -- so parseSpeed / objectsPerTick + // no longer matter and drop from the call sites). // - // parallelSave (vs the default pdfDoc.save) does two things: + // parallelSave (vs the default pdfDoc.save): // - objectsPerStream: 500 -- larger object-stream chunks compress // better (shared deflate window), 5 % smaller output PDF, and // cuts the per-chunk dispatch overhead 10x. @@ -306,10 +322,10 @@ try { // thread. Moves ~300 ms of zlib work off-CPU on the book. // See perf/notes/08-pdf-lib.md. const tProcess = Date.now(); - const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest }); + const pdfDoc = await PDFDocument.load(rawPdf); setMetadata(pdfDoc, meta); await setOutline(pdfDoc, outline, false); - const { bytes: finalPdf } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 }); + const { bytes: finalPdf } = await parallelSave(pdfDoc, { objectsPerStream: 500 }); console.log(`process: ${fmtMs(Date.now() - tProcess)}`); writeFileSync(outputPath, Buffer.from(finalPdf)); diff --git a/package-lock.json b/package-lock.json index 4d32d0a6..3e17b771 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,8 +9,8 @@ "version": "0.0.0", "devDependencies": { "html-entities": "^2.6.0", - "pdf-lib": "^1.17.1", - "puppeteer": "^25.0.4" + "pdf-lib": "1.17.1", + "puppeteer": "25.0.4" } }, "node_modules/@babel/code-frame": { diff --git a/package.json b/package.json index ba6093ad..3dce8713 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,7 @@ "description": "PDF book pipeline and profiling harness for the twinBASIC documentation", "devDependencies": { "html-entities": "^2.6.0", - "pdf-lib": "^1.17.1", - "puppeteer": "^25.0.4" + "pdf-lib": "1.17.1", + "puppeteer": "25.0.4" } } diff --git a/perf/README.md b/perf/README.md index a93bda36..a7635cc5 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-sync-load --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: @@ -142,6 +142,18 @@ Flag rationale: `fastOf` was still the #4 row in the profile -- removing the calls pulls ~17 ms (~22 %) off `fastOf` self-time. Production runs through it. +- `--fast-sync-load` -- inject + [docs/lib/fast-sync-load.mjs](../docs/lib/fast-sync-load.mjs), + replacing nine `__awaiter`-wrapped methods across pdf-lib's load + and save paths with awaiterless twins. Each upstream method is + wrapped in tslib `__awaiter` / `__generator` so on browsers it + can `await waitForTick()` every `objectsPerTick` objects; in + Node the yield gate never fires (objectsPerTick: Infinity), but + every indirect object still pays the generator state-machine + dispatch + Promise allocation. The shim removes the scaffolding + entirely. The `parseSpeed` / `objectsPerTick` options drop off + `PDFDocument.load`, `parallelSave`, and `pdfDoc.save` call sites + in step. Production runs through it. - `--cpu-profile-process` -- attach Node's `inspector/promises` Profiler around the process phase only (skips render and generate). Writes `process.cpuprofile` into the timestamped `results/` folder. @@ -301,6 +313,7 @@ run.bat --fast-inflate # swap pako.inflate for node:zlib.infl run.bat --fast-parse-number # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B) run.bat --fast-dict-iter # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B) run.bat --fast-parse-dict # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (also ships; opt-in here for A/B) +run.bat --fast-sync-load # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -415,6 +428,7 @@ file documenting each: | `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) | | `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) | | `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) | +| Synchronify pdf-lib load + save (strip `__awaiter` scaffolding) | [08](notes/08-pdf-lib.md) | ~0.36 s process (load -26 %, GC -53 ms) | What was tried and didn't ship: @@ -441,4 +455,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process). | diff --git a/perf/measure.mjs b/perf/measure.mjs index c5cca424..365f27aa 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -32,6 +32,7 @@ // [--fast-decode-name] [--fast-number-to-string] // [--fast-size-in-bytes] [--fast-inflate] // [--fast-parse-number] [--fast-parse-dict] +// [--fast-sync-load] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -145,13 +146,30 @@ // --fast-decode-name each lookup is still a Map.get on fastCache. // Pool-dedup makes the canonical PDFNames reference-stable, so // captured constants replace the four calls verbatim. +// +// --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick / +// shouldWaitForTick / waitForTick machinery out of both the load +// path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection +// / parseIndirectObjects / parseIndirectObject + +// PDFObjectStreamParser.parseIntoContext) and the save path +// (PDFWriter.serializeToBuffer / computeBufferSize + +// PDFStreamWriter.computeBufferSize). pdf-lib's TS downlevel wraps +// each in tslib __awaiter / __generator so on browsers they can +// `await waitForTick()` every `objectsPerTick` objects; with +// objectsPerTick: Infinity (or the load path's parseSpeed: Fastest) +// the gate never fires, but every indirect object still pays the +// generator state-machine + Promise allocation. The shim removes +// the scaffolding and the waitForTick yields entirely. Production +// runs through it; the parseSpeed / objectsPerTick options are +// dropped from PDFDocument.load / parallelSave / pdfDoc.save call +// sites in step with this shim. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; import { mkdirSync, writeFileSync, existsSync } from 'node:fs'; import { Session } from 'node:inspector/promises'; import puppeteer from 'puppeteer'; -import { PDFDocument, ParseSpeeds } from 'pdf-lib'; +import { PDFDocument } from 'pdf-lib'; // Shared with docs/render-book.mjs -- the helpers and the paged.js // bundle live under docs/lib/ now that we've dropped the pagedjs-cli // dependency. Importing from there guarantees the harness measures the @@ -196,6 +214,7 @@ let fastInflate = false; let fastParseNumber = false; let fastDictIter = false; let fastParseDict = false; +let fastSyncLoad = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -226,6 +245,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--fast-parse-number') fastParseNumber = true; else if (a === '--fast-dict-iter') fastDictIter = true; else if (a === '--fast-parse-dict') fastParseDict = true; + else if (a === '--fast-sync-load') fastSyncLoad = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -299,6 +319,10 @@ if (fastParseDict) { await import('../docs/lib/fast-parse-dict.mjs'); console.log('[harness] fast-parse-dict: hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict'); } +if (fastSyncLoad) { + await import('../docs/lib/fast-sync-load.mjs'); + console.log('[harness] fast-sync-load: synchronify PDFParser load path, strip waitForTick machinery'); +} const stamp = new Date().toISOString().replace(/[:.]/g, '-'); const outDir = outArg @@ -580,15 +604,17 @@ try { finalPdf = bytes; processBreakdown = { incrementalMs: incMs, ...stats }; } else { - // pdf-lib's defaults are catastrophically slow: parseSpeed=Slow (100 - // objects/tick) and objectsPerTick=50 both yield to the event loop - // between batches, turning a ~2s load into ~36s on a 52 MB PDF (~34s - // pure idle in the cpuprofile). Override to Fastest/Infinity so the - // "baseline" we report reflects the library's actual CPU cost, not - // an artefact of yielding cadence. The harness has no parallel work - // to make space for, so cooperative yielding is pure overhead here. + // Upstream pdf-lib's load yields to the event loop every + // `parseSpeed` objects via `await waitForTick()`; the save side + // does the same every `objectsPerTick`. With --fast-sync-load on + // (the production default) both yield gates are ripped out -- the + // option arguments are silently ignored, so we don't bother + // passing them. Without --fast-sync-load, the run measures pdf-lib's + // cautious defaults (parseSpeed: Slow, objectsPerTick: 50) which + // yield ~500 / ~1000 times per phase on the book; that's pdf-lib's + // out-of-the-box behaviour, useful as a baseline for A/B work. const tLoadStart = Date.now(); - const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest }); + const pdfDoc = await PDFDocument.load(rawPdf); const loadMs = Date.now() - tLoadStart; setMetadata(pdfDoc, meta); @@ -600,11 +626,11 @@ try { const tSaveStart = Date.now(); let parallelStreamCount = 0; if (parallelDeflate) { - const { bytes, streamCount } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 }); + const { bytes, streamCount } = await parallelSave(pdfDoc, { objectsPerStream: 500 }); finalPdf = bytes; parallelStreamCount = streamCount; } else { - finalPdf = await pdfDoc.save({ objectsPerTick: Infinity }); + finalPdf = await pdfDoc.save(); } const saveMs = Date.now() - tSaveStart; diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index acdeafd9..1c6db344 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -1462,6 +1462,244 @@ is a bigger code change for a smaller individual win. Worth revisiting if a future optimisation moves the floor and parseDict becomes a larger relative share. +## Strip the parse-speed machinery: synchronify the load path + +After the eight `--fast-*` patches above had nibbled the process +phase from 7.8 s down to 1.66 s, the next interesting thing in the +profile wasn't *a function* -- it was *function scaffolding*. +Three top-15 rows were the tslib `__awaiter` / `__generator` +machinery that pdf-lib's TypeScript downlevel emits for its +`async`-marked parser methods: + +``` + self_ms self_% function source + ------- ------ ---------------------------------------------- + 51.66 3.12% (anonymous) (parseIndirectObject body) PDFParser.js:126 + 43.05 2.60% step (generator runner) tslib.js:123 + 40.90 2.47% (anonymous) (parseIndirectObjects body) PDFParser.js:190 +``` + +Together ~135 ms / ~8 % of process self-time, sitting on top of +the parsing work that's already attributed to the named frames +below them. + +### What that scaffolding was for + +pdf-lib targets browsers as well as Node. On a browser, locking +the main thread for the seconds it takes to parse a big PDF would +freeze the page, so pdf-lib has a knob -- `parseSpeed`, also +exposed as `objectsPerTick` -- that controls how many indirect +objects the parser processes before yielding to the event loop via +`await waitForTick()`. The default is the cautious +`ParseSpeeds.Slow = 100`. The mechanism is a constructor-installed +predicate (`PDFParser.js:31`): + +```js +this.shouldWaitForTick = function () { + this.parsedObjects += 1; + return this.parsedObjects % this.objectsPerTick === 0; +}; +``` + +…queried at the bottom of every `parseIndirectObjects` iteration +(`PDFParser.js:215`) and every `parseIntoContext` iteration in +`PDFObjectStreamParser.js:42`, gating an `await waitForTick()` +(= `setImmediate`). + +`render-book.mjs` already passed `parseSpeed: ParseSpeeds.Fastest` +to `PDFDocument.load`, which is `objectsPerTick: Infinity`, which +makes `shouldWaitForTick()` return `false` on every call: the +modulo never hits zero, the yield never fires. The +`Fastest`-vs-`Slow` speedup we'd measured years earlier (see +[01-baseline-and-detach.md](01-baseline-and-detach.md)) +was precisely removing those yields' wall-clock contribution. + +But removing the *yields* didn't remove the **scaffolding**. Even +with `objectsPerTick: Infinity`, every call to +`parseIndirectObject` still: + +1. Allocates a Promise (the `__awaiter` return). +2. Allocates a generator object (the inner `__generator` return). +3. Allocates an activation record (the closed-over `_a` state). +4. Enters the tslib `step` runner, which calls the generator + body, which enters `switch (_a.label) { case 0: ... }`, runs + all the synchronous work, falls through to `return [2 /*return*/, ref]`, + which `step` unpacks and resolves the Promise with. +5. The caller `await`s that Promise (one microtask hop). + +For ~50 k indirect objects on the book that's 50 k of each. +Roughly ~135 ms of attributed self-time (the three rows above) +plus an unknowable but non-trivial fraction of the 240 ms GC row +(Promise + generator + activation are all short-lived heap +allocations). + +The same shape applies to `parseIndirectObjects` (which calls +`parseIndirectObject`), `parseDocumentSection` (which calls +`parseIndirectObjects`), `parseDocument` (which calls +`parseDocumentSection`), and `PDFDocument.load` (which calls +`parseDocument`). Five `async` wrappers around code that, on the +hot path, runs synchronously. + +### Why bother on the ObjStm branch too + +`parseIndirectObject` *does* have one genuinely-await-ing branch +at `PDFParser.js:142`: if the parsed object is an object stream +(PDF 1.5 §7.5.7, type `ObjStm`), it dispatches to +`PDFObjectStreamParser.parseIntoContext()`, which itself is +`async`. But `parseIntoContext`'s only `await` is the same kind +of conditionally-gated `waitForTick` -- and `shouldWaitForTick` +is passed in from the parent parser, so it's still `() => false` +under our config. The whole sub-stream walk is already morally +synchronous; just no upstream code path ever constructs a parser +without `shouldWaitForTick`. + +(Aside: Chrome's `SkPDF` writer doesn't emit ObjStm at all -- it +writes every indirect object at its own xref offset and uses the +classic xref table. So on our pipeline the ObjStm branch of +`parseIndirectObject` doesn't even fire. But pdf-lib loads have +to work generically; the patch handles the branch correctly.) + +### The shim + +`docs/lib/fast-sync-load.mjs` replaces six prototype methods with +synchronous twins: + +``` +PDFParser.prototype.parseDocument +PDFParser.prototype.parseDocumentSection +PDFParser.prototype.parseIndirectObjects +PDFParser.prototype.parseIndirectObject +PDFObjectStreamParser.prototype.parseIntoContext +PDFDocument.load (static) +``` + +The bodies are line-by-line ports of the upstream `case`-blocks -- +same loop, same `parseObject` / `context.assign` / `parseHeader` / +`maybeParseCrossRefSection` / `maybeParseTrailerDict` / +`maybeParseTrailer` / `skipJibberish` calls in the same order -- +with three changes: + +1. No `__awaiter` / `__generator` wrapper. The function returns + directly. +2. No `shouldWaitForTick` check, no `waitForTick` yield. +3. The three `PDFName.of(...)` calls in `parseIndirectObject`'s + type-dispatch tail (`'Type'`, `'ObjStm'`, `'XRef'`) are hoisted + to module-level constants -- same trick as + [`fast-parse-dict.mjs`](#parsedict-hoist-the-sentinel-pdfnames-out-of-the-type-dispatch-tail), + since pool-dedup makes the `PDFName` instances reference-stable. + +The patches have to land together: each method awaits the next +one down, so desugaring any one in isolation still leaves a +Promise chain dangling. + +`PDFDocument.load`'s signature is preserved -- still callable as +`await PDFDocument.load(bytes)`. `await` on a non-Promise resolves +to the value immediately, so existing call sites need no change. +The `parseSpeed` option is now silently ignored (no yield gate +left to tune). + +The shim's correctness depends on the upstream pdf-lib source +being structurally what the line-by-line port assumed. `pdf-lib` +1.17.1 (Hopding's last release, abandoned) is byte-stable on npm +and that's what we ship against; `package.json` is updated in +this change to pin to `1.17.1` exact (was `^1.17.1`), similarly +for `puppeteer` `25.0.4`, so a stray `npm update` can't silently +swap upstream from under the shim. + +### Results + +Paired process-phase profiles, same harness config except +`--fast-sync-load`: + +| metric | PRE | POST | Δ | +| --- | --- | --- | --- | +| **process wall-clock** | **1.66 s** | **1.30 s** | **-0.36 s (-22 %)** | +| ↳ load | 1.09 s | 0.81 s | -0.28 s (-26 %) | +| ↳ save | 0.56 s | 0.48 s | -0.08 s (noise; writer not touched) | +| GC self-time | 240 ms | 187 ms | -53 ms (-22 %) | +| `(anonymous) @ PDFParser.js:126` | 51.66 ms | gone | -51.66 ms | +| `step @ tslib.js:123` | 43.05 ms | gone | -43.05 ms | +| `(anonymous) @ PDFParser.js:190` | 40.90 ms | gone | -40.90 ms | +| **scaffolding total** | **~135 ms** | **0** | **-135 ms (eliminated)** | + +The wall-clock delta is larger than the sum of the eliminated +rows because the GC win is real time too: the per-object Promise ++ generator + activation allocations weren't free in V8's +internals either, just not attributed to any named frame. + +Output PDF: byte-count identical (16,077,319 bytes both runs); +MD5 differs only because Chrome's `page.pdf()` embeds a fresh +`/CreationDate` + `/ModDate` per run (same ±27-byte timestamp +jitter `docs/book.bat` output has always had). + +### Extending to the save side + +The shim covers the writers too, by symmetry. Three more methods: + +``` +PDFWriter.prototype.serializeToBuffer +PDFWriter.prototype.computeBufferSize +PDFStreamWriter.prototype.computeBufferSize +``` + +Only `serializeToBuffer` actually runs on our pipeline -- +`ParallelStreamWriter extends PDFStreamWriter` overrides +`computeBufferSize` with its own three-phase parallel-deflate +version (genuinely async because of `await Promise.all(deflated)` +over libuv's thread pool, which we keep). But the inherited +`serializeToBuffer` still had a dead `shouldWaitForTick` gate in +its main loop. Same shape as the load side: per-object dispatch, +no actual yield because `objectsPerTick` is effectively `Infinity`, +but every iteration pays the generator-machine + Promise cost. + +`serializeToBuffer` stays `async` (it has to `await +this.computeBufferSize()`, which is the genuinely-async override). +The change is: drop the `__awaiter` / `__generator` wrapper, use +ES `async function` with one real `await`, strip the +`shouldWaitForTick` gate. `computeBufferSize` on both base and +stream writers becomes fully synchronous (their only async +ingredient was the same dead yield). + +Measured wins on the writer side: **none reliably above noise**. +The save phase dropped from 0.56 s before the load-side patches +to 0.48 s after, and the writer patches don't move it further +(0.50 s in the post-extension profile, within the run-to-run +band). No writer frame ever broke into the top 15 in the first +place -- the overhead was real but distributed across +unattributed scaffolding and `(program)` time, not big enough to +register individually. + +The reason to ship it anyway is structural, not performance: with +load patched, the only remaining +`shouldWaitForTick` / `waitForTick` references in our hot path +were on the save side, and leaving them would defeat the "rip out +the machinery" intent. With the save patches landed, neither +phase routes through tslib `__awaiter` scaffolding except where +there's a legitimate `await` underneath. + +### Dropping the flags + +The companion change is to drop the `parseSpeed` / `objectsPerTick` +options from all our call sites, since with the shim in effect +neither does anything: + +- `docs/render-book.mjs` drops `parseSpeed: ParseSpeeds.Fastest` + from `PDFDocument.load` and `objectsPerTick: Infinity` from + `parallelSave`. The `ParseSpeeds` import goes with them. +- `docs/lib/parallel-deflate.mjs` drops `objectsPerTick` from + `parallelSave`'s public options object and from + `ParallelStreamWriter`'s constructor parameters. `PDFWriter`'s + base constructor still takes `objectsPerTick` as positional + arg 2 -- vestigial after `fast-sync-load`, but we pass + `Infinity` explicitly to make the constructor chain happy. +- `perf/measure.mjs` removes the same options from + `PDFDocument.load`, `parallelSave`, and `pdfDoc.save`. + +`perf/profile-roundtrip.mjs` keeps its `parseSpeed` / +`objectsPerTick` knob comparison -- that file's whole purpose is +to A/B pdf-lib's defaults against `Fastest`, and it runs against +vanilla pdf-lib without the shim by design. + ## `@cantoo/pdf-lib`: not a drop-in replacement Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an @@ -1491,7 +1729,8 @@ Cumulative process-phase cost, baseline → after the shims to date: | + fast-decode-name + fast-number-to-string | ~1.6 s | ~1.0 s | ~0.6 s | | + fast-size-in-bytes | ~1.5 s | ~1.0 s | ~0.5 s | | + fast-dict-iter | ~1.4 s | ~1.0 s | ~0.4 s | -| **+ fast-parse-dict (this section)** | **~1.4 s** | **~1.0 s** | **~0.4 s** | +| + fast-parse-dict | ~1.4 s | ~1.0 s | ~0.4 s | +| **+ fast-sync-load (this section)** | **~1.3 s** | **~0.8 s** | **~0.5 s** | The bottom-up after the latest pair is what's left of pdf-lib's genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`, From d928c2de491137370e727d6a0dd1c018efff7a45 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:41:20 +0200 Subject: [PATCH 11/44] Dispatch parseObject by first byte; gate true/false/null matchKeyword scans. --- docs/lib/fast-parse-object.mjs | 92 ++++++++++++++++++++++ docs/render-book.mjs | 8 ++ perf/README.md | 22 +++++- perf/measure.mjs | 17 ++++- perf/notes/08-pdf-lib.md | 136 +++++++++++++++++++++++++++++++++ 5 files changed, 272 insertions(+), 3 deletions(-) create mode 100644 docs/lib/fast-parse-object.mjs diff --git a/docs/lib/fast-parse-object.mjs b/docs/lib/fast-parse-object.mjs new file mode 100644 index 00000000..e573dc44 --- /dev/null +++ b/docs/lib/fast-parse-object.mjs @@ -0,0 +1,92 @@ +// Dispatch PDFObjectParser.parseObject by first byte; gate the three +// keyword scans behind a byte check. +// +// The upstream parseObject +// ([PDFObjectParser.js:36](node_modules/pdf-lib/cjs/core/parser/PDFObjectParser.js:36)) +// runs three speculative matchKeyword calls (true / false / null) +// before peeking the dispatch byte: +// +// parseObject() { +// this.skipWhitespaceAndComments(); +// if (this.matchKeyword(Keywords.true)) return PDFBool.True; +// if (this.matchKeyword(Keywords.false)) return PDFBool.False; +// if (this.matchKeyword(Keywords.null)) return PDFNull; +// var byte = this.bytes.peek(); +// ... +// } +// +// parseObject is called for every dict value, array element, and +// indirect-object body -- same call density as fastParseDict, which +// is the #2 row in the process profile. true / false / null are +// extraordinarily rare in real PDFs (boolean / null entries on +// individual dict values, mostly), so the three matchKeyword calls +// fail-and-rewind on essentially every invocation. Each failure +// still pays bytes.offset() + bytes.next() + comparison + +// bytes.moveTo(initialOffset). +// +// This shim flips the dispatch: peek the first byte, branch by byte +// for the structural tokens, and only enter matchKeyword when the +// byte is `t` / `f` / `n` (i.e. could plausibly start the keyword). +// Dispatch order is by observed frequency in dict-value position: +// numbers / refs first (digits + sign + period), then dicts (<<), +// names (/), arrays ([), strings ((), hex strings (<). Same +// semantics -- a value starting with `t`/`f`/`n` that isn't a +// keyword still falls through to the same PDFObjectParsingError +// throw. +// +// Mechanism: PDFObjectParser isn't re-exported from pdf-lib's index, +// so we reach in through the CJS internals via createRequire (same +// shape as fast-parse-dict.mjs). Mutating +// PDFObjectParser.prototype.parseObject is global -- every parser +// instance created after this shim loads picks it up. +// +// Side-effecting import. Import once before PDFDocument.load runs: +// +// import "./lib/fast-parse-object.mjs"; +// +// Idempotent -- repeated imports do nothing after the first. + +import { createRequire } from 'node:module'; + +const require = createRequire(import.meta.url); +const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default; +const PDFBool = require('pdf-lib/cjs/core/objects/PDFBool.js').default; +const PDFNull = require('pdf-lib/cjs/core/objects/PDFNull.js').default; +const CharCodes = require('pdf-lib/cjs/core/syntax/CharCodes.js').default; +const { Keywords } = require('pdf-lib/cjs/core/syntax/Keywords.js'); +const { IsNumeric } = require('pdf-lib/cjs/core/syntax/Numeric.js'); +const { PDFObjectParsingError } = require('pdf-lib/cjs/core/errors.js'); + +const KwTrue = Keywords.true; +const KwFalse = Keywords.false; +const KwNull = Keywords.null; + +const LessThan = CharCodes.LessThan; +const ForwardSlash = CharCodes.ForwardSlash; +const LeftSquareBracket = CharCodes.LeftSquareBracket; +const LeftParen = CharCodes.LeftParen; +const t_code = CharCodes.t; +const f_code = CharCodes.f; +const n_code = CharCodes.n; + +if (!PDFObjectParser.prototype.__fastParseObjectInstalled) { + PDFObjectParser.prototype.parseObject = function fastParseObject() { + this.skipWhitespaceAndComments(); + const bytes = this.bytes; + const byte = bytes.peek(); + if (IsNumeric[byte]) return this.parseNumberOrRef(); + if (byte === LessThan) { + if (bytes.peekAhead(1) === LessThan) return this.parseDictOrStream(); + return this.parseHexString(); + } + if (byte === ForwardSlash) return this.parseName(); + if (byte === LeftSquareBracket) return this.parseArray(); + if (byte === LeftParen) return this.parseString(); + if (byte === t_code && this.matchKeyword(KwTrue)) return PDFBool.True; + if (byte === f_code && this.matchKeyword(KwFalse)) return PDFBool.False; + if (byte === n_code && this.matchKeyword(KwNull)) return PDFNull; + throw new PDFObjectParsingError(bytes.position(), byte); + }; + + PDFObjectParser.prototype.__fastParseObjectInstalled = true; +} diff --git a/docs/render-book.mjs b/docs/render-book.mjs index 82c7f4b4..2cf1fc73 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -70,6 +70,13 @@ import { PDFDocument } from 'pdf-lib'; // per-dict on every load; pool-dedup makes the canonical // PDFNames reference-stable, so captured constants replace // the calls verbatim. Pulls ~17 ms off fastOf self-time. +// fast-parse-object -- replace PDFObjectParser.prototype.parseObject +// with a first-byte-dispatch version that gates the three +// matchKeyword (true / false / null) scans behind a byte check. +// parseObject fires per dict value / array element / indirect +// object body; the upstream version pays three speculative +// matchKeyword fail-and-rewind costs on every invocation. Same +// semantics, dispatch reordered by observed frequency. // fast-sync-load -- rip the parseSpeed / objectsPerTick / // shouldWaitForTick / waitForTick machinery out of both pdf-lib's // load path (PDFDocument.load + five PDFParser / @@ -92,6 +99,7 @@ import './lib/fast-number-to-string.mjs'; import './lib/fast-size-in-bytes.mjs'; import './lib/fast-dict-iter.mjs'; import './lib/fast-parse-dict.mjs'; +import './lib/fast-parse-object.mjs'; import './lib/fast-sync-load.mjs'; import { parseOutline, setOutline } from './lib/outline.mjs'; import { setMetadata } from './lib/postprocesser.mjs'; diff --git a/perf/README.md b/perf/README.md index a7635cc5..66107fea 100644 --- a/perf/README.md +++ b/perf/README.md @@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification. The mirror command for CPU-profiling the pdf-lib roundtrip: ``` -node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-sync-load --cpu-profile-process --cpu-sampling 100 +node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-parse-object --fast-sync-load --cpu-profile-process --cpu-sampling 100 ``` Flag rationale: @@ -142,6 +142,22 @@ Flag rationale: `fastOf` was still the #4 row in the profile -- removing the calls pulls ~17 ms (~22 %) off `fastOf` self-time. Production runs through it. +- `--fast-parse-object` -- inject + [docs/lib/fast-parse-object.mjs](../docs/lib/fast-parse-object.mjs), + replacing `PDFObjectParser.prototype.parseObject` with a + first-byte-dispatch version that gates the three speculative + `matchKeyword` calls (`true` / `false` / `null`) behind a byte + check. The upstream `parseObject` pays three `matchKeyword` + fail-and-rewind costs per dispatch (`bytes.offset()` + + `bytes.next()` + comparison + `bytes.moveTo(initialOffset)`) + before peeking the dispatch byte, on every call -- and the + three keywords are extraordinarily rare in real PDFs. The shim + peeks first and only enters `matchKeyword` when the byte could + plausibly start a keyword (`t` / `f` / `n`); dispatch order is + reshuffled by observed frequency in dict-value position (numbers + / refs first, then `<<`, names, arrays, strings). Same + semantics. Pulls `parseObject` self-time from ~82 ms (5.2 %) + to ~40 ms (3.1 %). Production runs through it. - `--fast-sync-load` -- inject [docs/lib/fast-sync-load.mjs](../docs/lib/fast-sync-load.mjs), replacing nine `__awaiter`-wrapped methods across pdf-lib's load @@ -313,6 +329,7 @@ run.bat --fast-inflate # swap pako.inflate for node:zlib.infl run.bat --fast-parse-number # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B) run.bat --fast-dict-iter # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B) run.bat --fast-parse-dict # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (also ships; opt-in here for A/B) +run.bat --fast-parse-object # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B) run.bat --fast-sync-load # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B) ``` @@ -429,6 +446,7 @@ file documenting each: | `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) | | `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) | | Synchronify pdf-lib load + save (strip `__awaiter` scaffolding) | [08](notes/08-pdf-lib.md) | ~0.36 s process (load -26 %, GC -53 ms) | +| `parseObject` first-byte dispatch + gated keyword scans | [08](notes/08-pdf-lib.md) | ~42 ms profile (parseObject -51 %) | What was tried and didn't ship: @@ -455,4 +473,4 @@ order; later ones reference earlier ones for context. | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. | | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. | | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. | -| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process). | +| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time). | diff --git a/perf/measure.mjs b/perf/measure.mjs index 365f27aa..1bf9aa46 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -32,7 +32,7 @@ // [--fast-decode-name] [--fast-number-to-string] // [--fast-size-in-bytes] [--fast-inflate] // [--fast-parse-number] [--fast-parse-dict] -// [--fast-sync-load] +// [--fast-parse-object] [--fast-sync-load] // // --render-only bails out after the render phase. Skips meta extraction, // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer. @@ -147,6 +147,15 @@ // Pool-dedup makes the canonical PDFNames reference-stable, so // captured constants replace the four calls verbatim. // +// --fast-parse-object replaces PDFObjectParser.prototype.parseObject +// with a first-byte-dispatch version that gates the three +// matchKeyword (true / false / null) scans behind a byte check. +// parseObject fires per dict value / array element / indirect +// object body (hundreds of thousands of calls on the book); the +// upstream version pays three speculative matchKeyword fail-and- +// rewind costs on every invocation. Same semantics, dispatch +// reordered by observed frequency in dict-value position. +// // --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick / // shouldWaitForTick / waitForTick machinery out of both the load // path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection @@ -214,6 +223,7 @@ let fastInflate = false; let fastParseNumber = false; let fastDictIter = false; let fastParseDict = false; +let fastParseObject = false; let fastSyncLoad = false; for (let i = 0; i < args.length; i++) { const a = args[i]; @@ -245,6 +255,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--fast-parse-number') fastParseNumber = true; else if (a === '--fast-dict-iter') fastDictIter = true; else if (a === '--fast-parse-dict') fastParseDict = true; + else if (a === '--fast-parse-object') fastParseObject = true; else if (a === '--fast-sync-load') fastSyncLoad = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } @@ -319,6 +330,10 @@ if (fastParseDict) { await import('../docs/lib/fast-parse-dict.mjs'); console.log('[harness] fast-parse-dict: hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict'); } +if (fastParseObject) { + await import('../docs/lib/fast-parse-object.mjs'); + console.log('[harness] fast-parse-object: first-byte dispatch in parseObject, gate true/false/null matchKeyword behind byte check'); +} if (fastSyncLoad) { await import('../docs/lib/fast-sync-load.mjs'); console.log('[harness] fast-sync-load: synchronify PDFParser load path, strip waitForTick machinery'); diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index 1c6db344..c0ebcb91 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -1462,6 +1462,141 @@ is a bigger code change for a smaller individual win. Worth revisiting if a future optimisation moves the floor and parseDict becomes a larger relative share. +## `parseObject`: dispatch by first byte, gate the keyword scans + +After `fast-parse-dict` shipped, `PDFObjectParser.parseObject` was +the next obvious row in the bottom-up table: + +``` + self_ms self_% function @ source + 213.28 13.41% (garbage collector) + 113.05 7.11% fastParseDict + 99.12 6.23% fastOf + 86.87 5.46% PDFRef.of + 86.32 5.43% PDFObjectParser.parseName + 81.86 5.15% PDFObjectParser.parseObject <-- this row + ... +``` + +`parseObject` is the dispatch hub of the PDF object parser. It's +called once per dict value, per array element, and per +indirect-object body -- same call density as `fastParseDict` two +rows above (every dict that fastParseDict builds calls parseObject +N times for its N values). + +### What parseObject was doing + +The upstream body (`PDFObjectParser.js:36`): + +```js +parseObject() { + this.skipWhitespaceAndComments(); + if (this.matchKeyword(Keywords.true)) return PDFBool.True; + if (this.matchKeyword(Keywords.false)) return PDFBool.False; + if (this.matchKeyword(Keywords.null)) return PDFNull; + const byte = this.bytes.peek(); + if (byte === LessThan && this.bytes.peekAhead(1) === LessThan) return this.parseDictOrStream(); + if (byte === LessThan) return this.parseHexString(); + if (byte === LeftParen) return this.parseString(); + if (byte === ForwardSlash) return this.parseName(); + if (byte === LeftSquareBracket) return this.parseArray(); + if (IsNumeric[byte]) return this.parseNumberOrRef(); + throw new PDFObjectParsingError(this.bytes.position(), byte); +} +``` + +Three speculative `matchKeyword` calls run on every invocation, +before the dispatch byte is ever peeked. `matchKeyword` +(`BaseParser.js:97`) on a fast-fail mismatch does `bytes.offset()`, +then `bytes.next()` on the first byte of the keyword, comparison, +then `bytes.moveTo(initialOffset)` to restore. Three of those per +`parseObject` call -- multiplied by the hundreds of thousands of +calls per book load -- adds up. + +`true` / `false` / `null` are extraordinarily rare in real PDFs. +The bulk of dict values are refs (`N N R`), numbers, names, +sub-dicts, and arrays. Putting the dispatch-byte test *before* +the keyword scans, and only entering `matchKeyword` when the +first byte could plausibly start one of the three keywords, +skips three method calls + a `moveTo` per `parseObject` on the +overwhelming majority of inputs. + +### The shim + +`docs/lib/fast-parse-object.mjs` replaces +`PDFObjectParser.prototype.parseObject` with: + +```js +parseObject() { + this.skipWhitespaceAndComments(); + const bytes = this.bytes; + const byte = bytes.peek(); + if (IsNumeric[byte]) return this.parseNumberOrRef(); + if (byte === LessThan) { + if (bytes.peekAhead(1) === LessThan) return this.parseDictOrStream(); + return this.parseHexString(); + } + if (byte === ForwardSlash) return this.parseName(); + if (byte === LeftSquareBracket) return this.parseArray(); + if (byte === LeftParen) return this.parseString(); + if (byte === t_code && this.matchKeyword(KwTrue)) return PDFBool.True; + if (byte === f_code && this.matchKeyword(KwFalse)) return PDFBool.False; + if (byte === n_code && this.matchKeyword(KwNull)) return PDFNull; + throw new PDFObjectParsingError(bytes.position(), byte); +} +``` + +Three changes from upstream: + +1. Peek the first byte once, up front. +2. Dispatch order reshuffled for dict-value frequency: numbers / + refs first (`IsNumeric[byte]` is a Uint8Array index, the + cheapest possible test), then `<<` / `<` (collapsed into one + `LessThan` branch with the `peekAhead` lookup inside), then + names, arrays, strings. +3. The three keyword paths are gated -- `byte === t` / `f` / `n` + guards each `matchKeyword` call, so a non-keyword input never + pays for the speculative scan + rewind. + +Correctness: a value starting with `t`/`f`/`n` that isn't +`true`/`false`/`null` falls through to the same +`PDFObjectParsingError` the upstream code would throw. Dict keys +can't reach parseObject (`parseDict` calls `parseName()` for +keys, parseObject only for values), and names always start with +`/`. Numbers can't start with letters. So the only valid values +that hit the gated keyword branches are the three keywords +themselves. + +`PDFObjectParser` isn't re-exported from pdf-lib's index, so the +shim reaches in via `pdf-lib/cjs/core/parser/PDFObjectParser.js` +through `createRequire` -- same shape as `fast-parse-dict.mjs`. + +### Results + +Profile diff, both runs `--detach-pages --no-timing` with every +other shipping shim active, 100 us sampling: + +| metric | pre | post | Δ | +| --- | --- | --- | --- | +| `parseObject` / `fastParseObject` self | 81.86 ms (5.15 %) | 40.25 ms (3.07 %) | **-41.6 ms (-51 %)** | +| `fastOf` self | 99.12 ms (6.23 %) | 64.18 ms (4.90 %) | -34.9 ms | +| `fastParseDict` self | 113.05 ms (7.11 %) | 65.26 ms (4.98 %) | -47.8 ms | + +The targeted row roughly halves in self-time, as the model +predicts (three `matchKeyword` calls collapsed to first-byte +dispatch). The `fastOf` and `fastParseDict` drops aren't from +this shim doing less work in those frames -- they're profile +attribution shifting around once `parseObject` is no longer +dominating its own children's sampling window (sampled duration +fell from 1.58 s to 1.34 s overall). + +Wall-clock is too noisy on this machine to read at this scale -- +the mechanism-confirmed ~42 ms via profile attribution is the +honest number. + +PDF output is byte-equivalent: same dispatch decisions, same +fallthrough behaviour, same error shape. + ## Strip the parse-speed machinery: synchronify the load path After the eight `--fast-*` patches above had nibbled the process @@ -1730,6 +1865,7 @@ Cumulative process-phase cost, baseline → after the shims to date: | + fast-size-in-bytes | ~1.5 s | ~1.0 s | ~0.5 s | | + fast-dict-iter | ~1.4 s | ~1.0 s | ~0.4 s | | + fast-parse-dict | ~1.4 s | ~1.0 s | ~0.4 s | +| + fast-parse-object | ~1.4 s | ~1.0 s | ~0.4 s | | **+ fast-sync-load (this section)** | **~1.3 s** | **~0.8 s** | **~0.5 s** | The bottom-up after the latest pair is what's left of pdf-lib's From 3bde6138bc12c4c2e9b5f407e95a4b3557812056 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:46:57 +0200 Subject: [PATCH 12/44] Add sampling-heap-profile instrumentation for the process phase. --- perf/README.md | 41 +++++++++++++++++++++++++++++++++++++++- perf/measure.mjs | 41 +++++++++++++++++++++++++++++++++++----- perf/notes/08-pdf-lib.md | 21 +++++++++++++++++--- 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/perf/README.md b/perf/README.md index 66107fea..0bbb3724 100644 --- a/perf/README.md +++ b/perf/README.md @@ -197,6 +197,42 @@ of hot functions called millions of times (`PDFRef.of` in particular). For "did this wall-clock change," do a paired no-profile A/B as a sanity check. +## Profiling pdf-lib heap allocation (process phase): canonical command + +The companion command for the **sampling heap profile** of the +process phase -- "where is pdf-lib allocating bytes?" rather than +"where is it spending cycles?": + +``` +node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-parse-object --fast-sync-load --heap-profile-process --heap-sampling 512 +``` + +Same `--fast-*` set as the CPU command (production is the baseline +we care about); the new flags: + +- `--heap-profile-process` -- attach Node's `inspector/promises` + `HeapProfiler` around the process phase only. Writes + `process.heapprofile` into the timestamped `results/` folder. + Output is V8's sampling-heap-profile JSON (a tree of + `{ callFrame, selfSize, children }` rooted at `head`), not the + flat-nodes shape that `.cpuprofile` uses, so the cpu analyzers + don't apply. Use `analyze-heap-profile.mjs` instead, which walks + the tree and aggregates `selfSize` by `(functionName + url:line)`: + `node analyze-heap-profile.mjs results//process.heapprofile --top 10`. +- `--heap-sampling 512` -- 512-byte sampling interval. V8's default + is 32768 (32 KB); on the ~150 MB process-phase allocation total + that's only ~5 k samples and the bottom-up table runs coarse. + 512 B yields ~250 k samples on the book, plenty of resolution + for "which frame allocated this Map?". Caveat: 512 B sampling + inflates process wall-clock substantially (the sampler's + per-allocation bookkeeping fires 64x more often). Read the + attribution, not the timing, from heap-profiled runs. + +`--heap-profile-process` composes with `--cpu-profile-process` -- +both attach to the same inspector session, so you can capture cpu +and heap in a single run if you want. The same `--render-only` +incompatibility applies (no process phase to profile). + See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase investigations these flags enabled. @@ -313,6 +349,7 @@ run.bat --no-detach-pages # opt out of the detach-pages fix (mea run.bat --timing # collect per-page wall time + heap (writes timing.csv + quartile summary) run.bat --cpu-profile # CPU-profile the render phase (CDP, Chromium-side) run.bat --cpu-profile-process # CPU-profile the process phase (Node inspector, Node-side) +run.bat --heap-profile-process # sampling heap-profile the process phase (Node inspector HeapProfiler); pair with --heap-sampling 512 for fine attribution run.bat --render-only # bail out after render (skip generate + process, ~47s saved) run.bat --clone-count # report Layout.append clones appended vs survivors per page run.bat --instrument # count + time DOM-accessor calls @@ -336,7 +373,9 @@ run.bat --fast-sync-load # synchronify PDFDocument.load + parse Flags compose. The CPU profile lands as `render.cpuprofile` (loadable in Chrome DevTools -> Performance -> "Load profile..."); `--cpu-profile-process` writes `process.cpuprofile` alongside it; -`--instrument` prints a per-op table at end-of-render. +`--heap-profile-process` writes `process.heapprofile` (loadable in +Chrome DevTools -> Memory -> "Load profile..."); `--instrument` +prints a per-op table at end-of-render. You need `_site-pdf\book.html` to exist first -- run `docs\build.bat` (which is `bundle exec jekyll build`) if you haven't already. diff --git a/perf/measure.mjs b/perf/measure.mjs index 1bf9aa46..0ed196ba 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -23,7 +23,8 @@ // node measure.mjs [path/to/book.html] [--out ] [--keep-open] // [--cpu-profile] [--cpu-profile-process] // [--cpu-sampling ] -// [--heap-profile] [--heap-sampling ] +// [--heap-profile] [--heap-profile-process] +// [--heap-sampling ] // [--tracing] // [--no-detach-pages] [--instrument] [--time-hooks] // [--incremental] [--chrome-outline] [--timing] @@ -96,6 +97,13 @@ // Honours --cpu-sampling. Composable with --cpu-profile when you want // both phases captured in one run. // +// --heap-profile-process wraps the process phase in V8's sampling heap +// profiler (Inspector's HeapProfiler domain) and writes +// process.heapprofile alongside the cpu one. --heap-sampling sets the +// sampling interval in bytes; default 32768 (V8's default). Drop to +// 512 for finer-grained attribution on short phases. Composable with +// --cpu-profile-process; both share one inspector session. +// // --fast-refs replaces PDFRef.of's string-keyed Map lookup with a // dense-array cache for the gen=0 case (82 % of ~1.2 M calls on the // book). Eliminates the per-call ` R` string allocation @@ -204,7 +212,8 @@ let cpuProfile = false; let cpuProfileProcess = false; let cpuSampling = 1000; // microseconds let heapProfile = false; -let heapSampling = 32768; // bytes between samples (CDP default) +let heapProfileProcess = false; +let heapSampling = 32768; // bytes between samples (V8 default; used by both CDP render-side and inspector process-side) let detachPages = true; let instrument = false; let timeHooks = false; @@ -233,6 +242,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--cpu-profile-process') cpuProfileProcess = true; else if (a === '--cpu-sampling') cpuSampling = parseInt(args[++i], 10); else if (a === '--heap-profile') heapProfile = true; + else if (a === '--heap-profile-process') heapProfileProcess = true; else if (a === '--heap-sampling') heapSampling = parseInt(args[++i], 10); else if (a === '--detach-pages') detachPages = true; // accepted for backwards compat; default since the fix landed else if (a === '--no-detach-pages') detachPages = false; @@ -295,6 +305,10 @@ if (cpuProfileProcess && renderOnly) { console.error('--cpu-profile-process is incompatible with --render-only (the process phase is skipped).'); process.exit(2); } +if (heapProfileProcess && renderOnly) { + console.error('--heap-profile-process is incompatible with --render-only (the process phase is skipped).'); + process.exit(2); +} // Install the dense-array cache for PDFRef.of's gen=0 path before any // pdf-lib operation. Side-effecting import; idempotent. @@ -535,6 +549,7 @@ try { let processMs = null; let processBreakdown = null; let processProfilePath = null; + let processHeapProfilePath = null; let finalPdf = null; if (!renderOnly) { @@ -602,14 +617,21 @@ try { // pdf-lib runs locally. Output file shape (V8 .cpuprofile JSON) is the // same either way. let inspectorSession = null; - if (cpuProfileProcess) { + if (cpuProfileProcess || heapProfileProcess) { inspectorSession = new Session(); inspectorSession.connect(); + } + if (cpuProfileProcess) { await inspectorSession.post('Profiler.enable'); await inspectorSession.post('Profiler.setSamplingInterval', { interval: cpuSampling }); await inspectorSession.post('Profiler.start'); console.log(`[harness] process cpu profile: sampling every ${cpuSampling}us`); } + if (heapProfileProcess) { + await inspectorSession.post('HeapProfiler.enable'); + await inspectorSession.post('HeapProfiler.startSampling', { samplingInterval: heapSampling }); + console.log(`[harness] process heap profile: sampling every ${heapSampling}B`); + } const tProcStart = Date.now(); if (incremental) { @@ -653,15 +675,23 @@ try { } const tProcEnd = Date.now(); processMs = tProcEnd - tProcStart; - if (inspectorSession) { + if (heapProfileProcess) { + const { profile } = await inspectorSession.post('HeapProfiler.stopSampling'); + await inspectorSession.post('HeapProfiler.disable'); + processHeapProfilePath = join(outDir, 'process.heapprofile'); + const profileJson = JSON.stringify(profile); + writeFileSync(processHeapProfilePath, profileJson); + console.log(`[harness] process heap profile: ${processHeapProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`); + } + if (cpuProfileProcess) { const { profile } = await inspectorSession.post('Profiler.stop'); await inspectorSession.post('Profiler.disable'); - inspectorSession.disconnect(); processProfilePath = join(outDir, 'process.cpuprofile'); const profileJson = JSON.stringify(profile); writeFileSync(processProfilePath, profileJson); console.log(`[harness] process cpu profile: ${processProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`); } + if (inspectorSession) inspectorSession.disconnect(); if (incremental) { console.log(`[harness] process ${fmtMs(processMs)} (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`); } else { @@ -709,6 +739,7 @@ try { ms: processMs, mode: incremental ? 'incremental' : 'pdf-lib-roundtrip', cpuProfile: processProfilePath, + heapProfile: processHeapProfilePath, ...processBreakdown, }; } diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md index c0ebcb91..7543ba5b 100644 --- a/perf/notes/08-pdf-lib.md +++ b/perf/notes/08-pdf-lib.md @@ -11,17 +11,32 @@ no bottom-up table to point at: CDP's `Profiler` attaches to Chromium and the process phase runs in Node, so `--cpu-profile` couldn't see it. -## `--cpu-profile-process` +## `--cpu-profile-process` (and `--heap-profile-process`) Added to `measure.mjs`: opens an in-process V8 Profiler via `node:inspector/promises`, brackets the process phase the same way `--cpu-profile` brackets render, and writes `process.cpuprofile` -alongside `render.cpuprofile`. Same `.cpuprofile` JSON shape, so the -existing `analyze-profile.mjs` / `find-callers.mjs` / +alongside `render.cpuprofile`. Same `.cpuprofile` JSON shape, so +the existing `analyze-profile.mjs` / `find-callers.mjs` / `find-callees.mjs` work unchanged. See the *Profiling pdf-lib (process phase): canonical command* section in [the README](../README.md) for the operational form. +The heap counterpart -- `--heap-profile-process` -- arrived later +(once allocation became the obvious next thing to attack: GC was +sitting at the top of every CPU profile in this phase). It shares +the same inspector session, so capturing both in one run is one +flag away. Output is a `.heapprofile`, a tree of +`{ callFrame, selfSize, children }` rooted at `head` -- *not* the +flat `.cpuprofile` shape -- so `analyze-heap-profile.mjs` handles +it instead of the cpu analyzers. See *Profiling pdf-lib heap +allocation (process phase): canonical command* in +[the README](../README.md) for the operational form. The findings +this tool enabled are folded into the per-shim sections below +(decodeName / sizeInBytes / PDFDict.entries / ...) -- each names +which path it came from when the heap profile, not the cpu +profile, was the diagnostic that pointed at the function. + First run on the 1638-page book (`--detach-pages --no-timing --cpu-profile-process --cpu-sampling 100`), process 4.66 s (load 1.88 s, setOutline 0.01 s, save 2.77 s). Top of the bottom-up table: From 893953082f71afe73a3e7ce8073ddde4abe6e1cd Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 00:48:05 +0200 Subject: [PATCH 13/44] Add find-heap-callers.mjs: attribute heap allocations to direct callers. --- perf/README.md | 1 + perf/find-heap-callers.mjs | 69 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 perf/find-heap-callers.mjs diff --git a/perf/README.md b/perf/README.md index 0bbb3724..20772d1c 100644 --- a/perf/README.md +++ b/perf/README.md @@ -264,6 +264,7 @@ or `--tracing`): | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). | | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees