From 470917c866995624a8767093bca076c8a7733df3 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 23:48:44 +0200
Subject: [PATCH 01/44] Use node's built-in zlib instead of pako, saves >1s
 from the PDF build.

---
 docs/lib/fast-deflate.mjs |  37 +++++
 docs/render-book.mjs      |   4 +
 perf/README.md            |  29 +++-
 perf/measure.mjs          |  64 ++++++++-
 perf/notes/08-pdf-lib.md  | 293 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 423 insertions(+), 4 deletions(-)
 create mode 100644 docs/lib/fast-deflate.mjs
 create mode 100644 perf/notes/08-pdf-lib.md

diff --git a/docs/lib/fast-deflate.mjs b/docs/lib/fast-deflate.mjs
new file mode 100644
index 00000000..08725ac3
--- /dev/null
+++ b/docs/lib/fast-deflate.mjs
@@ -0,0 +1,37 @@
+// Replace pako's pure-JS deflate with Node's zlib for the one path
+// pdf-lib actually uses it on: PDFFlateStream.computeContents in
+// node_modules/pdf-lib/cjs/core/structures/PDFFlateStream.js, which
+// calls `pako.deflate(unencodedContents)` once per FlateStream during
+// PDFDocument.save().
+//
+// PDF /FlateDecode (ISO 32000-1 §7.4.4) is the zlib format (RFC 1950):
+// a 2-byte zlib header + a raw deflate body (RFC 1951) + a 4-byte
+// Adler-32 trailer. Both pako.deflate and zlib.deflateSync produce that
+// format with default level 6, so the swap is wire-compatible -- output
+// bytes may differ by a small amount (different match choices in the
+// compressor's inner loop) but every PDF viewer reads either.
+//
+// Mechanism: pdf-lib is CJS in node_modules and calls
+// `require("pako").deflate(...)` at the call site, not at import time.
+// Mutating the live pako exports object is enough; no fork required.
+//
+// Side-effecting import. Import once before PDFDocument.save() runs:
+//
+//   import "./lib/fast-deflate.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { deflateSync } from "node:zlib";
+import pako from "pako";
+
+if (!pako.__fastDeflateInstalled) {
+  const original = pako.deflate;
+  pako.deflate = function fastDeflate(data, options) {
+    // pdf-lib's only caller passes no options. Anything fancier (dictionary,
+    // raw, custom level) goes back to pako so we don't change behaviour
+    // outside the one hot path we care about.
+    if (options) return original.call(pako, data, options);
+    return deflateSync(data);
+  };
+  pako.__fastDeflateInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index e7ad9bfc..942fee5c 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -32,6 +32,10 @@ import { dirname, resolve } from 'node:path';
 import { writeFileSync, existsSync } from 'node:fs';
 import puppeteer from 'puppeteer';
 import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+// Side-effecting import: swaps pdf-lib's pako.deflate (pure JS) for
+// node:zlib.deflateSync (C). Save phase only, same /FlateDecode output,
+// ~1.5 s saved on the book. See perf/notes/08-pdf-lib.md.
+import './lib/fast-deflate.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 
diff --git a/perf/README.md b/perf/README.md
index 81e67b6f..4decd14b 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -61,6 +61,26 @@ Drop `--render-only` whenever you need to also measure generate /
 process (e.g. confirming a fix doesn't shift cost into `page.pdf()`
 or pdf-lib), or to write `book.pdf` for behavioural verification.
 
+## Profiling pdf-lib (process phase): canonical command
+
+The mirror command for CPU-profiling the pdf-lib roundtrip:
+
+```
+node measure.mjs --cpu-profile-process --cpu-sampling 100
+```
+
+`--cpu-profile-process` is the symmetric counterpart of
+`--cpu-profile`. The render-side profile attaches to Chromium's V8
+via CDP because paged.js runs there; the process-side profile
+attaches to Node's V8 via `node:inspector/promises` because pdf-lib
+runs locally. Both produce the same `.cpuprofile` JSON shape, so
+`analyze-profile.mjs` / `find-callers.mjs` / `find-callees.mjs` /
+`grep-profile.mjs` work against either one. The two flags compose
+when you want both phases captured in a single run.
+
+See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase
+investigations the flag enabled.
+
 ## What's in this folder
 
 The harness and core probes:
@@ -172,7 +192,8 @@ run.bat path\to\some-other.html           # explicit input
 run.bat --out my-run                      # explicit output directory
 run.bat --no-detach-pages                 # opt out of the detach-pages fix (measure pre-fix O(n²) baseline)
 run.bat --timing                          # collect per-page wall time + heap (writes timing.csv + quartile summary)
-run.bat --cpu-profile                     # CPU-profile the render phase
+run.bat --cpu-profile                     # CPU-profile the render phase (CDP, Chromium-side)
+run.bat --cpu-profile-process             # CPU-profile the process phase (Node inspector, Node-side)
 run.bat --render-only                     # bail out after render (skip generate + process, ~47s saved)
 run.bat --clone-count                     # report Layout.append clones appended vs survivors per page
 run.bat --instrument                      # count + time DOM-accessor calls
@@ -180,10 +201,12 @@ run.bat --time-hooks                      # per-task timing of every chunker/pol
 run.bat --incremental                     # process via incremental update instead of pdf-lib roundtrip
 run.bat --chrome-outline                  # let Chrome emit /Outlines (skip parseOutline + setOutline)
 run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
+run.bat --fast-deflate                    # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
 (loadable in Chrome DevTools -> Performance -> "Load profile...");
+`--cpu-profile-process` writes `process.cpuprofile` alongside it;
 `--instrument` prints a per-op table at end-of-render.
 
 You need `_site-pdf\book.html` to exist first -- run `docs\build.bat`
@@ -285,6 +308,7 @@ file documenting each:
 | Disable WhiteSpaceFilter | [05](notes/05-blink-trace.md) | ~0.7 s render |
 | Full sync chain (RunMicrotasks → 0) | [06](notes/06-microtasks-pageranges-css.md) | re-attribution |
 | `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory |
+| `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) |
 
 What was tried and didn't ship:
 
@@ -298,7 +322,7 @@ What was tried and didn't ship:
 
 ## Investigation log
 
-The seven phase files in [`notes/`](notes/) cover the full investigation
+The phase files in [`notes/`](notes/) cover the full investigation
 narrative. Each is self-contained but they're written in chronological
 order; later ones reference earlier ones for context.
 
@@ -311,3 +335,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; finding pako's per-stream init dominates with ~4 500 small streams; routing `pako.deflate` through `node:zlib` (save -58 %, GC -383 ms). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index c3ec049a..48b2aa8f 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -21,12 +21,13 @@
 //
 // Usage:
 //   node measure.mjs [path/to/book.html] [--out <dir>] [--keep-open]
-//                    [--cpu-profile] [--cpu-sampling <microseconds>]
+//                    [--cpu-profile] [--cpu-profile-process]
+//                    [--cpu-sampling <microseconds>]
 //                    [--heap-profile] [--heap-sampling <bytes>]
 //                    [--tracing]
 //                    [--no-detach-pages] [--instrument] [--time-hooks]
 //                    [--incremental] [--chrome-outline] [--timing]
-//                    [--clone-count] [--render-only]
+//                    [--clone-count] [--render-only] [--fast-deflate]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -82,10 +83,24 @@
 // chrome://tracing or perfetto.dev, or run analyze-trace.mjs against it
 // for a top-N self-time table grouped by event name. Composable with
 // --cpu-profile; uses an independent CDP domain.
+//
+// --cpu-profile-process wraps the process phase only (pdf-lib roundtrip
+// or incremental writer) in a V8 Profiler trace via Node's inspector
+// module -- the process phase runs in Node, not Chromium, so CDP's
+// Profiler can't see it. Writes process.cpuprofile alongside render's.
+// Honours --cpu-sampling. Composable with --cpu-profile when you want
+// both phases captured in one run.
+//
+// --fast-deflate routes pdf-lib's PDFFlateStream compression through
+// Node's zlib (C++) instead of pako (pure JS). Same wire format
+// (PDF /FlateDecode = RFC 1950 zlib), ~5-10x faster on big inputs.
+// Save phase only -- load uses pako.inflate, which the profile shows
+// isn't a hot path for our content.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
 import { mkdirSync, writeFileSync, existsSync } from 'node:fs';
+import { Session } from 'node:inspector/promises';
 import puppeteer from 'puppeteer';
 import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 // Shared with docs/render-book.mjs -- the helpers and the paged.js
@@ -109,6 +124,7 @@ let inputArg = null;
 let outArg = null;
 let keepOpen = false;
 let cpuProfile = false;
+let cpuProfileProcess = false;
 let cpuSampling = 1000; // microseconds
 let heapProfile = false;
 let heapSampling = 32768; // bytes between samples (CDP default)
@@ -121,11 +137,13 @@ let timing = false;
 let cloneCount = false;
 let renderOnly = false;
 let tracing = false;
+let fastDeflate = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
   else if (a === '--keep-open') keepOpen = true;
   else if (a === '--cpu-profile') cpuProfile = true;
+  else if (a === '--cpu-profile-process') cpuProfileProcess = true;
   else if (a === '--cpu-sampling') cpuSampling = parseInt(args[++i], 10);
   else if (a === '--heap-profile') heapProfile = true;
   else if (a === '--heap-sampling') heapSampling = parseInt(args[++i], 10);
@@ -141,6 +159,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--render-only') renderOnly = true;
   else if (a === '--tracing') tracing = true;
   else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
+  else if (a === '--fast-deflate') fastDeflate = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -175,6 +194,20 @@ for (const p of required) {
   }
 }
 
+if (cpuProfileProcess && renderOnly) {
+  console.error('--cpu-profile-process is incompatible with --render-only (the process phase is skipped).');
+  process.exit(2);
+}
+
+// Install the Node-zlib override for pdf-lib's PDFFlateStream compression
+// before any pdf-lib operation. Side-effecting import; idempotent. The
+// override only kicks in on pako.deflate calls (i.e. save()), so render-
+// only runs that never reach the pdf-lib path are unaffected either way.
+if (fastDeflate) {
+  await import('../docs/lib/fast-deflate.mjs');
+  console.log('[harness] fast-deflate: pako.deflate -> node:zlib.deflateSync');
+}
+
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
   ? resolve(process.cwd(), outArg)
@@ -370,6 +403,7 @@ try {
   let rawPdfBytes = null;
   let processMs = null;
   let processBreakdown = null;
+  let processProfilePath = null;
   let finalPdf = null;
 
   if (!renderOnly) {
@@ -430,6 +464,22 @@ try {
   //
   // Either way we time the full phase plus the meaningful sub-steps so the
   // breakdown matches across runs.
+  //
+  // --cpu-profile-process attaches Node's inspector Profiler around this
+  // block. The render phase profiles via CDP because the work happens in
+  // Chromium; the process phase profiles via Node's inspector because
+  // pdf-lib runs locally. Output file shape (V8 .cpuprofile JSON) is the
+  // same either way.
+  let inspectorSession = null;
+  if (cpuProfileProcess) {
+    inspectorSession = new Session();
+    inspectorSession.connect();
+    await inspectorSession.post('Profiler.enable');
+    await inspectorSession.post('Profiler.setSamplingInterval', { interval: cpuSampling });
+    await inspectorSession.post('Profiler.start');
+    console.log(`[harness] process cpu profile: sampling every ${cpuSampling}us`);
+  }
+
   const tProcStart = Date.now();
   if (incremental) {
     const tIncStart = Date.now();
@@ -463,6 +513,15 @@ try {
   }
   const tProcEnd  = Date.now();
   processMs = tProcEnd - tProcStart;
+  if (inspectorSession) {
+    const { profile } = await inspectorSession.post('Profiler.stop');
+    await inspectorSession.post('Profiler.disable');
+    inspectorSession.disconnect();
+    processProfilePath = join(outDir, 'process.cpuprofile');
+    const profileJson = JSON.stringify(profile);
+    writeFileSync(processProfilePath, profileJson);
+    console.log(`[harness] process cpu profile: ${processProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`);
+  }
   if (incremental) {
     console.log(`[harness] process  ${fmtMs(processMs)}  (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`);
   } else {
@@ -506,6 +565,7 @@ try {
     record.phases.process = {
       ms: processMs,
       mode: incremental ? 'incremental' : 'pdf-lib-roundtrip',
+      cpuProfile: processProfilePath,
       ...processBreakdown,
     };
   }
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
new file mode 100644
index 00000000..9abc127b
--- /dev/null
+++ b/perf/notes/08-pdf-lib.md
@@ -0,0 +1,293 @@
+# pdf-lib: profiling the process phase
+
+Wiring `--cpu-profile-process` so the pdf-lib roundtrip becomes visible to the same `analyze-profile.mjs` toolchain we already use on the render phase, then following the bottom-up table: pako dominates with per-stream init overhead, and routing `pako.deflate` through `node:zlib` saves ~1.5 s of process wall (save -58 %).
+
+The render-side investigations (notes [01](01-baseline-and-detach.md)
+through [07](07-memory.md)) brought render down from ~104 s to ~8 s
+and process from ~40 s to ~5 s. By [`pdf-lib parseSpeed: Fastest`](01-baseline-and-detach.md)
+the process phase was a flat ~5 s of `load + setOutline + save`, the
+sub-step numbers were the only thing we knew about it, and there was
+no bottom-up table to point at: CDP's `Profiler` attaches to Chromium
+and the process phase runs in Node, so `--cpu-profile` couldn't see
+it.
+
+## `--cpu-profile-process`
+
+Added to `measure.mjs`: opens an in-process V8 Profiler via
+`node:inspector/promises`, brackets the process phase the same way
+`--cpu-profile` brackets render, and writes `process.cpuprofile`
+alongside `render.cpuprofile`. Same `.cpuprofile` JSON shape, so the
+existing `analyze-profile.mjs` / `find-callers.mjs` /
+`find-callees.mjs` work unchanged. See the *Profiling pdf-lib
+(process phase): canonical command* section in [the README](../README.md)
+for the operational form.
+
+First run on the 1638-page book (`--detach-pages --no-timing
+--cpu-profile-process --cpu-sampling 100`), process 4.66 s (load
+1.88 s, setOutline 0.01 s, save 2.77 s). Top of the bottom-up table:
+
+```
+samples: 8560   duration: 4.68s   us/sample: 547
+
+   self_ms   self_%   function  @  source
+   -------   ------   ----------------------------------------------
+    645.24   13.85%   (garbage collector)
+    460.42    9.88%   longest_match            pako/lib/zlib/deflate.js:231
+    428.15    9.19%   deflateInit2             pako/lib/zlib/deflate.js:1327
+    374.02    8.03%   PDFRef.of                pdf-lib/.../PDFRef.js:34
+    218.73    4.69%   decodeName               pdf-lib/.../PDFName.js:9
+    218.73    4.69%   PDFDict.entries          pdf-lib/.../PDFDict.js:22
+    182.64    3.92%   deflate_slow             pako/lib/zlib/deflate.js:726
+    119.75    2.57%   parseRawNumber           pdf-lib/.../BaseParser.js:33
+    114.28    2.45%   DeflateState             pako/lib/zlib/deflate.js:1092
+    113.19    2.43%   parseName                pdf-lib/.../PDFObjectParser.js:117
+     ... pako rows and parser rows continue down the table ...
+```
+
+Adding up pako frames (`longest_match` + `deflateInit2` +
+`deflate_slow` + `DeflateState` + `lm_init` + `compress_block` +
+`build_tree` + `Deflate.push` + `adler32`) lands at **~1.42 s, ~30 %
+of the process phase**. Of that, the *initialization* group
+(`deflateInit2` + `DeflateState` + `lm_init`) was **~628 ms** -- so
+~44 % of pako's time was spent setting up Deflate state, not
+compressing bytes. A setup share that large only makes sense if
+the call count is high.
+
+## Are we compressing Chrome's already-compressed streams?
+
+Reasonable hypothesis: pdf-lib loads, decompresses Chrome's content
+streams, and then re-compresses them on save. That would put Chrome's
+~52 MB of content through deflate twice, and explain the heavy
+pako time as wasted work.
+
+Walking the code:
+
+- `PDFObjectParser.parseDictOrStream` (`pdf-lib/.../parser/PDFObjectParser.js:171`)
+  always ends with `return PDFRawStream.of(dict, contents)`. Every
+  stream pdf-lib parses out of the input is a `PDFRawStream` holding
+  the verbatim bytes between `stream` / `endstream`. No decompression.
+- `PDFRawStream.getContents` (`pdf-lib/.../objects/PDFRawStream.js:22`)
+  returns those bytes unchanged.
+- `PDFStreamWriter.computeBufferSize` (`pdf-lib/.../writers/PDFStreamWriter.js:43-46`)
+  marks `shouldNotCompress = true` for anything that's `instanceof
+  PDFStream` (which includes `PDFRawStream`). Those go out verbatim
+  with the original `/Filter` preserved.
+
+`pako.deflate` lives in `PDFFlateStream.computeContents`
+(`pdf-lib/.../structures/PDFFlateStream.js:15`); the only subclasses
+are `PDFContentStream`, `PDFCrossRefStream`, and `PDFObjectStream`.
+None of those are instantiated by the parser. So **Chrome's content
+streams ride through as `PDFRawStream` and never see pako**.
+
+Confirmed by instrumenting `pako.deflate` and re-running the save
+on the produced book.pdf:
+
+```
+deflate calls during save : 4524
+bytes fed to deflate      : 24.28 MB
+bytes produced            :  4.39 MB
+final pdf size            : 16.08 MB
+```
+
+The 4,524 deflate calls are pdf-lib's **own** new streams:
+
+- ~4,523 `PDFObjectStream` chunks. `PDFStreamWriter.forContext`
+  defaults to `objectsPerStream = 50`; of the book's **228,191 indirect
+  objects**, the ~226,130 non-stream ones (streams can't be packed) fill ~4,523 chunks of 50.
+- 1 `PDFCrossRefStream` for the xref.
+
+## Wait -- the pdf-lib output is *smaller* than Chrome's. What's going on?
+
+Chrome's raw PDF is 39.3 MB, pdf-lib's final PDF is 16.1 MB. That
+23 MB shrink isn't pdf-lib throwing anything away -- it's compressing
+something Chrome chose to emit verbatim.
+
+Tallying the 228,191 indirect objects pdf-lib sees by type:
+
+```
+130,787  StructElem /S=/NonStruct      (a11y wrapper around content w/o structural role)
+ 22,193  StructElem /S=/Strong         (bold)
+ 11,003  Dict /Type=/Annot             (mostly hyperlinks)
+ 10,054  StructElem /S=/Link
+  9,164  StructElem /S=/P              (paragraph)
+  8,417  StructElem /S=/Em             (emphasis)
+  5,270  StructElem /S=/TD             (table cell)
+  4,822  StructElem /S=/Code
+  3,392  StructElem /S=/LI             (list item)
+  3,040  StructElem /S=/H5
+    ... another ~15 k StructElems in long tail (H1-H6, L, TR, Art, ...)
+  2,061  PDFRawStream                  (Chrome's content + font + image streams)
+  1,651  Dict /Type=/Page
+   ... ~3.5 k misc dicts ...
+```
+
+**Over 225,000 are tiny `<<...>>` StructElem dicts** -- the
+tagged-PDF structure tree, which Chrome emits because we pass
+`tagged: true` to `page.pdf()`. Each `StructElem` is something like
+`<</Type /StructElem /S /P /P [123 0 R] /K [...] /Pg 5 0 R>>` -- a
+few hundred bytes of mostly boilerplate.
+
+Chrome writes them as plain text indirect objects -- 225k × a few
+hundred bytes ≈ 28 MB of `<<...>>` source. pdf-lib's
+`PDFStreamWriter` packs those 50 at a time into PDFObjectStreams,
+each of which is then deflate-compressed. The dict syntax is wildly
+repetitive across siblings (`/Type /StructElem` literally appears
+225k times), so deflate compresses the packed text ~5.5x. The
+24.28 MB of small-dict text fed to deflate above comes out the
+other side at 4.39 MB. Add the ~11 MB of `PDFRawStream` bytes that
+pass through verbatim, plus a few KB of misc, and the 16.1 MB total
+checks out.
+
+The pdf-lib roundtrip's win over Chrome's raw output is **encoding
+the same information** in PDF 1.5's compressed-object-streams
+feature instead of as plain `<<...>>` text. Skia's PDF writer
+chooses not to use that feature.
+
+This also explains the pako profile shape. The workload is *many
+small streams* (~4,500 of them at ~5.4 KB input each), which is
+exactly where per-stream initialization dominates: the 628 ms in
+`deflateInit2` + `DeflateState` + `lm_init` is the sum of ~4,500
+separate setups (~0.14 ms each), while each payload is so small that
+the actual compression work (~755 ms across `longest_match` +
+`deflate_slow` + `compress_block` + `build_tree` + `adler32`) is
+barely larger than the setup cost.
+
+## The shim
+
+PDF `/FlateDecode` (ISO 32000-1 §7.4.4) is the zlib format
+(RFC 1950): 2-byte zlib header + raw deflate body (RFC 1951) + 4-byte
+Adler-32 trailer. Both `pako.deflate(data)` and Node's
+`zlib.deflateSync(data)` produce that format at default level 6.
+Verified head-to-head: each compresses to an equivalent-size zlib
+stream starting `78 9c`, and either can decompress the other's
+output back to the original input bytes.
+
+`docs/lib/fast-deflate.mjs` is a side-effecting import that mutates
+the live `pako` exports:
+
+```js
+import { deflateSync } from "node:zlib";
+import pako from "pako";
+
+if (!pako.__fastDeflateInstalled) {
+  const original = pako.deflate;
+  pako.deflate = function fastDeflate(data, options) {
+    if (options) return original.call(pako, data, options);
+    return deflateSync(data);
+  };
+  pako.__fastDeflateInstalled = true;
+}
+```
+
+pdf-lib's CJS code reads `require("pako").deflate` at call time
+(`pako_1.default.deflate(unencodedContents)` inside
+`PDFFlateStream.computeContents`), so mutating the live module
+exports propagates without forking pdf-lib. The `options`
+fallthrough means any caller that needs pako's non-default
+behaviour (dictionaries, raw deflate, custom level) is unaffected;
+pdf-lib's only call site passes no options.
+
+Microbenchmark on the harness machine; neither input is the book's save workload:
+
+```
+zlib.deflateSync(50 MB of ASCII)                        112 ms
+zlib.deflateSync(book.pdf as input, 16.1 MB)            283 ms
+```
+
+For comparison, pako spent ~1.42 s on the book's actual save
+workload (~24 MB across 4,524 calls). Same order of magnitude as
+the raw-throughput numbers above, but with more per-call overhead
+-- which matches what a JS implementation is expected to lose
+against C when amortised across many small calls.
+
+`docs/render-book.mjs` imports the shim unconditionally near its
+pdf-lib import; production runs through it. `measure.mjs` adds a
+`--fast-deflate` flag, opt-in in the harness so paired pre/post
+A/Bs are still easy.
+
+## Results
+
+Paired A/B, four interleaved runs (`pre1 post1 pre2 post2`) with
+`--detach-pages --no-timing --cpu-profile-process --cpu-sampling
+100`, same 1638-page book each:
+
+| metric        | pre1   | pre2   | pre avg | post1  | post2  | post avg | Δ                |
+| ------------- | ------ | ------ | ------- | ------ | ------ | -------- | ---------------- |
+| **process**   | 4.20 s | 4.27 s | **4.24 s** | 2.79 s | 2.74 s | **2.77 s** | **-1.47 s (-35 %)** |
+| ↳ load        | 1.53 s | 1.54 s | 1.54 s  | 1.67 s | 1.61 s | 1.64 s   | +0.10 s (noise; load goes through `pako.inflate`, untouched) |
+| ↳ setOutline  | 0.01 s | 0.01 s | 0.01 s  | 0.01 s | 0.01 s | 0.01 s   | unchanged |
+| ↳ **save**    | 2.66 s | 2.72 s | **2.69 s** | 1.11 s | 1.12 s | **1.12 s** | **-1.57 s (-58 %)** |
+| pdf size      | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | 16.1 MB | identical |
+
+Render and generate wall-clock numbers varied ±5 s between runs
+(machine load) but the process numbers are tight to ±0.05 s.
+
+Post-fix bottom-up profile, same flags:
+
+```
+samples: 5229   duration: 2.82s   us/sample: 540
+
+   self_ms   self_%   function
+   -------   ------   --------------------------------------------------
+    348.83   12.48%   writeSync                  (node:zlib native binding)
+    335.87   12.01%   PDFRef.of                  pdf-lib/.../PDFRef.js:34
+    262.44    9.39%   (garbage collector)
+    165.24    5.91%   PDFDict.entries
+    159.84    5.72%   decodeName
+    108.00    3.86%   parseName
+    102.60    3.67%   parseRawNumber
+     88.56    3.17%   parseRawInt
+     72.90    2.61%   PDFName.of
+     71.28    2.55%   parseDict
+     ... pako rows absent from the table ...
+```
+
+Two structural changes worth calling out:
+
+- All pako frames dropped out of the top 20. `writeSync` at 12.48 %
+  is the native zlib binding that `deflateSync` calls; the compression
+  runs synchronously on the JS thread inside that C++ call, so it is
+  charged to this one frame. The ~349 ms here is the total native
+  compression time across all ~4,500 calls.
+- `(garbage collector)` dropped from 645 ms to 262 ms (-383 ms).
+  That matches the per-call allocator pressure from creating a fresh
+  `Deflate` instance + `DeflateState` per pako call, now gone.
+
+End-to-end `book.bat` run with the shim:
+
+```
+render:   8.5s   (1651 pages)
+generate: 37.1s  (raw 39.3 MB)
+process:  2.5s
+saved:    docs\_pdf\book.pdf  (16.1 MB)
+total:    50.1s
+```
+
+Process is now under three seconds on the production path. Wall-clock
+total ~50 s vs the prior ~70 s baseline. Output PDF byte size
+unchanged from the pre-shim build (16.1 MB; standard `/CreationDate`
+drift between runs).
+
+## What this didn't fix
+
+After the shim the bottom-up profile points at the next two
+JS-attributable buckets:
+
+- `PDFRef.of` at 336 ms self-time (12 %). The function builds a
+  string key `<num> <gen> R` per call and Map-looks it up; the
+  string allocation per call is the cost. A drop-in fix would
+  replace the `Map<string>` pool with a flat array for the gen=0
+  case (the overwhelming majority) and a fallback Map for gen ≠ 0.
+  Estimated ~300 ms saved.
+- `(garbage collector)` at 262 ms (9 %). Tied to `PDFRef.of` and
+  the per-object dict allocations in the writer; would likely
+  shrink along with the first item.
+
+Neither moves the wall-clock total meaningfully on its own --
+process is now 2.77 s of a ~50 s build -- so they're left in place
+unless or until they become the bottleneck.
+
+The strategic note from earlier phases still stands: `pageRanges`
+sharding of generate is the only remaining knob with a profile
+target large enough to move total wall-clock by more than a few
+seconds.

From c423231047cae9fb64c2438cddfa16274fb61221 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 23:55:00 +0200
Subject: [PATCH 02/44] Speed up pdf loading by ~0.3s.

---
 docs/lib/fast-refs.mjs   |  46 ++++++++
 docs/render-book.mjs     |  13 ++-
 perf/README.md           |  52 ++++++++--
 perf/measure.mjs         |  15 ++-
 perf/notes/08-pdf-lib.md | 219 +++++++++++++++++++++++++++++++++++++--
 5 files changed, 319 insertions(+), 26 deletions(-)
 create mode 100644 docs/lib/fast-refs.mjs

diff --git a/docs/lib/fast-refs.mjs b/docs/lib/fast-refs.mjs
new file mode 100644
index 00000000..4212162c
--- /dev/null
+++ b/docs/lib/fast-refs.mjs
@@ -0,0 +1,46 @@
+// Replace pdf-lib's PDFRef.of pool lookup with a dense-array cache
+// for the generation=0 case (the overwhelmingly common one).
+//
+// The upstream implementation
+// (node_modules/pdf-lib/cjs/core/objects/PDFRef.js) keys its pool by
+// a freshly-built string `<obj> <gen> R` on every call:
+//
+//   var tag = objectNumber + " " + generationNumber + " R";
+//   var instance = pool.get(tag);
+//
+// On the book we see ~1.2 M PDFRef.of calls per load, 82 % of them
+// with gen=0; each call allocates the tag string before Map.get can
+// hash it. That's ~330 ms of self-time on the process-phase profile
+// plus measurable GC pressure.
+//
+// Shim: dense array indexed by objectNumber for the gen=0 branch.
+// Plain array indexing, no string alloc, no Map hash. Cache-in-front
+// of the original PDFRef.of so we don't need its module-private
+// ENFORCER -- on miss we delegate, on hit we return our cached
+// instance.
+//
+// gen != 0 calls (the other 18 %, pdf-lib's xref-stream bookkeeping
+// where the "generation" field encodes an in-ObjStm index per
+// PDF 1.5 spec, see PDFXRefStreamParser.js:74-80) pass through to
+// the original unchanged.
+//
+// Side-effecting import. Import once before any pdf-lib operation.
+// Idempotent.
+
+import { PDFRef } from "pdf-lib";
+
+if (!PDFRef.__fastPoolInstalled) {
+  const original = PDFRef.of;
+  const pool0 = [];
+  PDFRef.of = function fastOf(objectNumber, generationNumber) {
+    if (generationNumber === undefined || generationNumber === 0) {
+      const existing = pool0[objectNumber];
+      if (existing) return existing;
+      const fresh = original.call(PDFRef, objectNumber, 0);
+      pool0[objectNumber] = fresh;
+      return fresh;
+    }
+    return original.call(PDFRef, objectNumber, generationNumber);
+  };
+  PDFRef.__fastPoolInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 942fee5c..f219ec06 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -32,10 +32,17 @@ import { dirname, resolve } from 'node:path';
 import { writeFileSync, existsSync } from 'node:fs';
 import puppeteer from 'puppeteer';
 import { PDFDocument, ParseSpeeds } from 'pdf-lib';
-// Side-effecting import: swaps pdf-lib's pako.deflate (pure JS) for
-// node:zlib.deflateSync (C). Save phase only, same /FlateDecode output,
-// ~1.5 s saved on the book. See perf/notes/08-pdf-lib.md.
+// Side-effecting imports. Order doesn't matter; both mutate live
+// module exports before any pdf-lib operation. See
+// perf/notes/08-pdf-lib.md.
+//
+//   fast-deflate -- swaps pdf-lib's pako.deflate (pure JS) for
+//     node:zlib.deflateSync (C). ~1.5 s saved on the save phase.
+//   fast-refs    -- dense-array cache in front of PDFRef.of for the
+//     gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved on
+//     load.
 import './lib/fast-deflate.mjs';
+import './lib/fast-refs.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 
diff --git a/perf/README.md b/perf/README.md
index 4decd14b..53f0a167 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,20 +66,48 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-deflate --fast-refs --cpu-profile-process --cpu-sampling 100
 ```
 
-`--cpu-profile-process` is the symmetric counterpart of
-`--cpu-profile`. The render-side profile attaches to Chromium's V8
-via CDP because paged.js runs there; the process-side profile
-attaches to Node's V8 via `node:inspector/promises` because pdf-lib
-runs locally. Both produce the same `.cpuprofile` JSON shape, so
-`analyze-profile.mjs` / `find-callers.mjs` / `find-callees.mjs` /
-`grep-profile.mjs` work against either one. The two flags compose
-when you want both phases captured in a single run.
+Flag rationale:
+
+- `--fast-deflate` -- inject the
+  [docs/lib/fast-deflate.mjs](../docs/lib/fast-deflate.mjs) shipping
+  fix (`pako.deflate` -> `node:zlib.deflateSync`). Production runs
+  through it; the profile should too.
+- `--fast-refs` -- inject the
+  [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping fix
+  (dense-array cache for `PDFRef.of`'s gen=0 path). Same logic.
+- `--cpu-profile-process` -- attach Node's `inspector/promises`
+  Profiler around the process phase only (skips render and generate).
+  Writes `process.cpuprofile` into the timestamped `results/` folder.
+  The render-phase `--cpu-profile` is CDP / Chromium; this one is
+  Node / Node's V8 -- different runtimes, same `.cpuprofile` JSON
+  shape, so `analyze-profile.mjs` / `find-callers.mjs` /
+  `find-callees.mjs` / `grep-profile.mjs` work against either.
+- `--cpu-sampling 100` -- 100 us sampling. The process phase is now
+  ~2.3 s; at 1 ms default sampling that's only ~2300 samples and the
+  bottom-up table runs noisy. 100 us is the right resolution for
+  this length.
+
+The command intentionally **does not** pass `--cpu-profile`. There's
+no rule against running both at once -- they attach to different V8s
+and don't interfere -- but the render profile dilutes the bottom-up
+view of "what's left in pdf-lib," and the trace files are large.
+Profile one phase at a time.
+
+Why no `--render-only`? `--cpu-profile-process` requires the process
+phase to run; the harness errors out if you combine them.
+
+To compare against upstream pdf-lib (e.g. when proposing a change
+upstream), drop `--fast-deflate` and `--fast-refs`. Caveat for
+A/B work: profiler-on attribution overstates the cost of hot
+functions called millions of times (`PDFRef.of` in particular). For
+"did this wall-clock change," do a paired no-profile A/B as a
+sanity check.
 
 See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase
-investigations the flag enabled.
+investigations these flags enabled.
 
 ## What's in this folder
 
@@ -202,6 +230,7 @@ run.bat --incremental                     # process via incremental update inste
 run.bat --chrome-outline                  # let Chrome emit /Outlines (skip parseOutline + setOutline)
 run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
 run.bat --fast-deflate                    # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B)
+run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -309,6 +338,7 @@ file documenting each:
 | Full sync chain (RunMicrotasks → 0) | [06](notes/06-microtasks-pageranges-css.md) | re-attribution |
 | `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory |
 | `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) |
+| `PDFRef.of` dense-array cache (gen=0) | [08](notes/08-pdf-lib.md) | ~0.2 s process (load -16 %) |
 
 What was tried and didn't ship:
 
@@ -335,4 +365,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; finding pako's per-stream init dominates with ~4 500 small streams; routing `pako.deflate` through `node:zlib` (save -58 %, GC -383 ms). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 48b2aa8f..9e9e0646 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -27,7 +27,8 @@
 //                    [--tracing]
 //                    [--no-detach-pages] [--instrument] [--time-hooks]
 //                    [--incremental] [--chrome-outline] [--timing]
-//                    [--clone-count] [--render-only] [--fast-deflate]
+//                    [--clone-count] [--render-only]
+//                    [--fast-deflate] [--fast-refs]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -96,6 +97,12 @@
 // (PDF /FlateDecode = RFC 1950 zlib), ~5-10x faster on big inputs.
 // Save phase only -- load uses pako.inflate, which the profile shows
 // isn't a hot path for our content.
+//
+// --fast-refs replaces PDFRef.of's string-keyed Map lookup with a
+// dense-array cache for the gen=0 case (82 % of ~1.2 M calls on the
+// book). Eliminates the per-call `<obj> <gen> R` string allocation
+// and Map hash. gen != 0 calls (pdf-lib's xref-stream bookkeeping
+// for compressed objects) pass through unchanged.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
@@ -138,6 +145,7 @@ let cloneCount = false;
 let renderOnly = false;
 let tracing = false;
 let fastDeflate = false;
+let fastRefs = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -160,6 +168,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--tracing') tracing = true;
   else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
   else if (a === '--fast-deflate') fastDeflate = true;
+  else if (a === '--fast-refs') fastRefs = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -207,6 +216,10 @@ if (fastDeflate) {
   await import('../docs/lib/fast-deflate.mjs');
   console.log('[harness] fast-deflate: pako.deflate -> node:zlib.deflateSync');
 }
+if (fastRefs) {
+  await import('../docs/lib/fast-refs.mjs');
+  console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 9abc127b..51855446 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -268,7 +268,7 @@ total ~50 s vs the prior ~70 s baseline. Output PDF byte size
 unchanged from the pre-shim build (16.1 MB; standard `/CreationDate`
 drift between runs).
 
-## What this didn't fix
+## After the shim: what's left
 
 After the shim the bottom-up profile points at the next two
 JS-attributable buckets:
@@ -277,17 +277,214 @@ JS-attributable buckets:
   string key `<num> <gen> R` per call and Map-looks it up; the
   string allocation per call is the cost. A drop-in fix would
   replace the `Map<string>` pool with a flat array for the gen=0
-  case (the overwhelming majority) and a fallback Map for gen ≠ 0.
-  Estimated ~300 ms saved.
+  case and a fallback Map for gen ≠ 0. Followed up below.
 - `(garbage collector)` at 262 ms (9 %). Tied to `PDFRef.of` and
-  the per-object dict allocations in the writer; would likely
+  the per-object dict allocations in the writer; expected to
   shrink along with the first item.
 
-Neither moves the wall-clock total meaningfully on its own --
-process is now 2.77 s of a ~50 s build -- so they're left in place
-unless or until they become the bottleneck.
+## `PDFRef.of`: dense-array cache for the gen=0 path
 
-The strategic note from earlier phases still stands: `pageRanges`
-sharding of generate is the only remaining knob with a profile
-target large enough to move total wall-clock by more than a few
-seconds.
+The upstream implementation:
+
+```js
+var pool = new Map();
+PDFRef.of = function (objectNumber, generationNumber) {
+    if (generationNumber === void 0) { generationNumber = 0; }
+    var tag = objectNumber + " " + generationNumber + " R";   // alloc
+    var instance = pool.get(tag);                              // hash
+    if (!instance) {
+        instance = new PDFRef(ENFORCER, objectNumber, generationNumber);
+        pool.set(tag, instance);
+    }
+    return instance;
+};
+```
+
+Per call: build a fresh `<obj> <gen> R` string, hand it to a
+`Map<string>` lookup that has to hash it, branch on miss. The
+string allocation is the cost we care about -- the dedup pool
+itself works correctly, it's just paying for its key on every read.
+
+### Workload shape
+
+Instrumented `PDFRef.of` and re-ran the harness through load + save:
+
+```
+total PDFRef.of calls     : 1,231,643
+  gen=0 (or undefined)    : 1,010,034  (82 %)
+  gen != 0                :   221,608  (18 %)
+gen=N value distribution (top, 4523 calls each):
+  gen=1, gen=2, ... gen=50: 4523 calls/value
+```
+
+The 1.0 M gen=0 calls are what the parser does for every
+encountered `N 0 R` reference and every per-object PDFRef
+construction. The 221 k gen != 0 calls are pdf-lib's xref-stream
+bookkeeping for PDF 1.5+ compressed-object entries: in a
+cross-reference stream's type-2 entry, the spec uses the
+"generation number" field to store the **index of the object
+within its ObjStm**, and pdf-lib feeds that index straight to
+`PDFRef.of` (`PDFXRefStreamParser.js:74-80`). 4,523 ObjStms × 50
+entries each ≈ the observed 221 k.
+
+So 82 % of calls have generationNumber=0. That's the path worth
+optimising.
+
+### The shim
+
+`docs/lib/fast-refs.mjs` is a side-effecting import, the counterpart of
+`fast-deflate`:
+
+```js
+import { PDFRef } from "pdf-lib";
+
+if (!PDFRef.__fastPoolInstalled) {
+  const original = PDFRef.of;
+  const pool0 = [];
+  PDFRef.of = function fastOf(objectNumber, generationNumber) {
+    if (generationNumber === undefined || generationNumber === 0) {
+      const existing = pool0[objectNumber];
+      if (existing) return existing;
+      const fresh = original.call(PDFRef, objectNumber, 0);
+      pool0[objectNumber] = fresh;
+      return fresh;
+    }
+    return original.call(PDFRef, objectNumber, generationNumber);
+  };
+  PDFRef.__fastPoolInstalled = true;
+}
+```
+
+Dense-array indexed by `objectNumber` for the gen=0 case -- no
+string alloc, no Map hash, just an array read. gen != 0 passes
+through to the original (which still allocates the tag and runs
+the Map lookup, but that's only 18 % of calls).
+
+The cache is **in front of** the original `PDFRef.of`, not a
+replacement: on a miss we call the original to produce the PDFRef
+instance, then cache it. That dodges the module-private `ENFORCER`
+token the upstream constructor demands. Memory cost is a second
+reference per PDFRef on top of the upstream pool's entry -- ~228 k
+tiny objects, negligible.
+
+The interning contract is preserved: `PDFRef.of(42) === PDFRef.of(42, 0)`
+and both `!== PDFRef.of(42, 1)`, as before.
+
+### Results: profiler-on vs profiler-off matters
+
+First A/B with the process-phase profiler attached (paired,
+`--detach-pages --no-timing --cpu-profile-process --cpu-sampling 100
+--fast-deflate [--fast-refs]`):
+
+| metric    | pre (no fast-refs) | post (+ fast-refs) | Δ |
+| ---       | ---                | ---                | --- |
+| process   | 2.94 s             | 2.52 s             | **-0.42 s (-14 %)** |
+| ↳ load    | 1.81 s             | 1.42 s             | -0.39 s |
+| ↳ save    | 1.12 s             | 1.08 s             | flat |
+| `PDFRef.of` self in profile | 336 ms (12 %) | 148 ms (5.9 %) | -188 ms |
+| `(garbage collector)` self  | 262 ms (9 %) | 194 ms (7.8 %) | -68 ms |
+
+`PDFRef.of`'s self-time roughly halved, GC pressure dropped, and
+the wall-clock saving (390 ms on load) looked like a clean win.
+
+But: paired A/B *without* the profiler attached told a different
+story:
+
+| metric    | pre (no fast-refs) | post (+ fast-refs) | Δ |
+| ---       | ---                | ---                | --- |
+| process   | 2.48 s             | 2.26 s             | **-0.22 s (-9 %)** |
+| ↳ load    | 1.51 s             | 1.27 s             | **-0.24 s (-16 %)** |
+| ↳ save    | 0.96 s             | 0.98 s             | flat |
+
+**Real wall-clock saving is ~240 ms**, not 390 ms. The remaining
+~150 ms of the profiler-on delta was profiler-attribution overhead
+that our shim removed by making the hot function shorter -- fewer
+samples landing on `PDFRef.of`, less per-sample tax. The profiler
+isn't lying about which function is expensive; it's overstating
+*how much* that expense will move wall-clock once you fix it.
+
+The diagnostic question to tell these apart: *what's the call
+rate?* At 1.2 M calls per load, even a few hundred nanoseconds of
+sampling overhead per call adds up to hundreds of milliseconds in
+the profile. Functions called millions of times need a no-profile
+A/B as a sanity check before claiming the wall-clock saving the
+profile implied. Functions called a few times per page (or once
+per render) don't.
+
+Both numbers are real -- the bottom-up profile is the right
+*target* for "what's worth fixing," but a no-profile A/B is the
+right *measurement* for "how big the win was."
+
+### Production confirmation
+
+`book.bat` with both shims, two consecutive runs:
+
+```
+render:   9.1s   (1651 pages)
+generate: 37.5s
+process:  2.3s
+saved:    docs\_pdf\book.pdf  (16.1 MB)
+total:    50.7s
+```
+
+Process dropped from the prior 2.5 s (with just `fast-deflate`) to
+2.3 s. `book.bat` rounds to 0.1 s and is single-run so individual
+phase numbers carry some run-to-run jitter, but the harness's
+2.48 → 2.26 paired-A/B confirms the ~200 ms move is real.
+
+### What this didn't fix
+
+The post-`fast-refs` bottom-up table:
+
+```
+samples: 4668   duration: 2.53s   us/sample: 542
+
+   self_ms   self_%   function                   source
+   -------   ------   --------------------------------------------------
+    341.17   13.59%   writeSync                  (Node libuv -- zlib's C++ work)
+    194.41    7.75%   (garbage collector)
+    181.96    7.25%   PDFDict.entries            pdf-lib/.../PDFDict.js:22
+    172.21    6.86%   decodeName                 pdf-lib/.../PDFName.js:9
+    147.84    5.89%   PDFRef.of                  pdf-lib/.../PDFRef.js:34  (the 18 % gen != 0 residue)
+     96.40    3.84%   parseName
+     95.31    3.80%   parseRawNumber
+     78.52    3.13%   parseDict
+     ...
+```
+
+`PDFRef.of` is still on the list at 148 ms -- that's the 221 k
+gen != 0 calls still going through the upstream string-keyed Map.
+Optimising those would require either: (a) a 2D structure keyed by
+gen first then objectNumber, or (b) accepting that the in-ObjStm
+"index as generation" usage is short-lived bookkeeping (the parser
+creates these refs once to populate xref tables, then mostly
+re-resolves the actual `N 0 R` form). Neither moves the wall-clock
+total enough to justify the work -- 150 ms of a 50 s build is in the noise.
+
+Apart from `PDFRef.of`, the load-phase costs (`decodeName`, `parseName`,
+`parseRawNumber`, `parseDict`, etc.) are pdf-lib's actual parser
+work. Those are O(input size) and pretty close to fundamental --
+shrinking them would mean rewriting the parser.
+
+### Where this leaves the picture
+
+Cumulative process-phase cost, baseline → after both shims:
+
+| state                              | process | load | save |
+| ---                                | ---     | ---  | ---  |
+| original (Slow / 50 defaults)      | ~40 s   | ~36 s| ~4 s |
+| + parseSpeed:Fastest               | ~5 s    | ~2 s | ~3 s |
+| + fast-deflate                     | ~2.5 s  | ~1.5s| ~1 s |
+| **+ fast-refs (this section)**     | **~2.3 s** | **~1.3 s** | **~1 s** |
+
+The pdf-lib roundtrip path is now ~2.3 s of a ~50 s build. The
+incremental writer's 0.25 s process phase (see
+[01-baseline-and-detach.md](01-baseline-and-detach.md)) is still
+strictly faster on process alone, but the pdf-lib path delivers a
+16.1 MB output vs incremental's 53 MB, and the 2 s gap on a 50 s
+build doesn't justify the file-size cost for our pipeline.
+
+The strategic note from earlier phases still stands: generate's
+~38 s in `page.pdf()` is the remaining lever, and `pageRanges`
+sharding is the only knob plausibly large enough to move the
+wall-clock total by more than a few seconds.

From 272413df6570db8d155480db178b46cff8699dc5 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:00:29 +0200
Subject: [PATCH 03/44] Parallelize deflate when saving the pdf.

---
 docs/lib/parallel-deflate.mjs | 169 ++++++++++++++++++++++++++++++++++
 docs/render-book.mjs          |  14 ++-
 perf/README.md                |  21 +++--
 perf/measure.mjs              |  19 +++-
 perf/notes/08-pdf-lib.md      | 167 ++++++++++++++++++++++++++++++++-
 5 files changed, 373 insertions(+), 17 deletions(-)
 create mode 100644 docs/lib/parallel-deflate.mjs

diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs
new file mode 100644
index 00000000..5cc48fb1
--- /dev/null
+++ b/docs/lib/parallel-deflate.mjs
@@ -0,0 +1,169 @@
+// Drop-in async replacement for `pdfDoc.save({ useObjectStreams: true })`
+// that parallelises the per-object-stream deflate work onto libuv's
+// thread pool. Sole exported entry point: `parallelSave(pdfDoc, opts)`.
+//
+// Why: pdf-lib's PDFStreamWriter.computeBufferSize creates one
+// PDFObjectStream per 50-object chunk, then immediately calls
+// computeIndirectObjectSize on each. sizeInBytes() walks the Cache,
+// which lazy-populates via pako.deflate(unencodedContents). The whole
+// pass is synchronous, so ~1000 chunks × ~0.3 ms of zlib work runs
+// serially -- accounts for ~30 % of save() wall time on the book.
+//
+// What: same construction logic as PDFStreamWriter, split into three
+// phases:
+//   1. classify uncompressed vs compressed (same as upstream)
+//   2. instantiate every PDFObjectStream up-front, then `await
+//      Promise.all` an async zlib.deflate per stream so libuv's thread
+//      pool (default 4) runs them concurrently
+//   3. size + emit (same as upstream, but every cache.access() is a hit)
+// The xrefStream itself is one more PDFFlateStream; we deflate it
+// serially in phase 3 since its contents depend on phase-3 offsets.
+//
+// Output: byte-near-equivalent to pdfDoc.save({ useObjectStreams: true }).
+// zlib vs pako deflate may pick different LZ77 matches → 1-byte-level
+// stream diffs and matching /Length deltas; viewer-invisible.
+//
+// Parallelism is bounded by UV_THREADPOOL_SIZE (default 4). Bump it via
+// `process.env.UV_THREADPOOL_SIZE = '8'` before any libuv work fires
+// if you want more concurrency.
+
+import { deflate } from 'node:zlib';
+import { promisify } from 'node:util';
+import {
+  PDFStreamWriter,
+  PDFObjectStream,
+  PDFCrossRefStream,
+  PDFRef,
+  PDFName,
+  PDFNumber,
+  PDFInvalidObject,
+  PDFStream,
+  PDFHeader,
+  PDFTrailer,
+} from 'pdf-lib';
+
+const deflateAsync = promisify(deflate);
+
+class ParallelStreamWriter extends PDFStreamWriter {
+  constructor(context, objectsPerTick, encodeStreams, objectsPerStream, parallel) {
+    super(context, objectsPerTick, encodeStreams, objectsPerStream);
+    this._lastPrecompressed = 0;
+    this._parallel = parallel;
+  }
+
+  async computeBufferSize() {
+    let objectNumber = this.context.largestObjectNumber + 1;
+    const header = PDFHeader.forVersion(1, 7);
+    let size = header.sizeInBytes() + 2;
+    const xrefStream = PDFCrossRefStream.create(
+      this.createTrailerDict(),
+      this.encodeStreams,
+    );
+
+    const uncompressedObjects = [];
+    const compressedChunks = [];
+    const objectStreamRefs = [];
+
+    // ----- Phase 1: classify -----
+    const indirectObjects = this.context.enumerateIndirectObjects();
+    for (let i = 0; i < indirectObjects.length; i++) {
+      const indirectObject = indirectObjects[i];
+      const [ref, object] = indirectObject;
+      const shouldNotCompress =
+        ref === this.context.trailerInfo.Encrypt ||
+        object instanceof PDFStream ||
+        object instanceof PDFInvalidObject ||
+        ref.generationNumber !== 0;
+
+      if (shouldNotCompress) {
+        uncompressedObjects.push(indirectObject);
+        xrefStream.addUncompressedEntry(ref, size);
+        size += this.computeIndirectObjectSize(indirectObject);
+      } else {
+        let chunk = compressedChunks.length === 0 ? null : compressedChunks[compressedChunks.length - 1];
+        let objectStreamRef = objectStreamRefs.length === 0 ? null : objectStreamRefs[objectStreamRefs.length - 1];
+        if (!chunk || chunk.length % this.objectsPerStream === 0) {
+          chunk = [];
+          compressedChunks.push(chunk);
+          objectStreamRef = PDFRef.of(objectNumber++);
+          objectStreamRefs.push(objectStreamRef);
+        }
+        xrefStream.addCompressedEntry(ref, objectStreamRef, chunk.length);
+        chunk.push(indirectObject);
+      }
+    }
+
+    // ----- Phase 2: instantiate object streams and parallel-deflate -----
+    const objectStreams = compressedChunks.map(chunk =>
+      PDFObjectStream.withContextAndObjects(this.context, chunk, this.encodeStreams),
+    );
+
+    if (this._parallel && this.encodeStreams && objectStreams.length > 0) {
+      const unencoded = objectStreams.map(os => os.getUnencodedContents());
+      const deflated = await Promise.all(unencoded.map(buf => deflateAsync(buf)));
+      for (let i = 0; i < objectStreams.length; i++) {
+        objectStreams[i].contentsCache.value = deflated[i];
+      }
+      this._lastPrecompressed = objectStreams.length;
+    } else {
+      this._lastPrecompressed = 0;
+    }
+
+    // ----- Phase 3: size object streams (cache hits) -----
+    for (let i = 0; i < objectStreams.length; i++) {
+      const ref = objectStreamRefs[i];
+      const objectStream = objectStreams[i];
+      xrefStream.addUncompressedEntry(ref, size);
+      size += this.computeIndirectObjectSize([ref, objectStream]);
+      uncompressedObjects.push([ref, objectStream]);
+    }
+
+    // ----- xrefStream wrap-up (serial deflate; contents depend on offsets above) -----
+    const xrefStreamRef = PDFRef.of(objectNumber++);
+    xrefStream.dict.set(PDFName.of('Size'), PDFNumber.of(objectNumber));
+    xrefStream.addUncompressedEntry(xrefStreamRef, size);
+    const xrefOffset = size;
+    size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]);
+    uncompressedObjects.push([xrefStreamRef, xrefStream]);
+
+    const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset);
+    size += trailer.sizeInBytes();
+
+    return { size, header, indirectObjects: uncompressedObjects, trailer };
+  }
+}
+
+/**
+ * Replacement for `pdfDoc.save({ useObjectStreams: true })` with parallel
+ * deflate. Mirrors PDFDocument.save's pre-serialize steps (addDefaultPage,
+ * updateFieldAppearances, flush) before invoking the patched writer.
+ *
+ * Returns { bytes: Uint8Array, streamCount: number }.
+ */
+export async function parallelSave(pdfDoc, options = {}) {
+  const {
+    objectsPerTick = Infinity,
+    addDefaultPage = true,
+    updateFieldAppearances = true,
+    objectsPerStream = 50,
+    encodeStreams = true,
+    parallel = true,
+  } = options;
+
+  if (addDefaultPage && pdfDoc.getPageCount() === 0) pdfDoc.addPage();
+  if (updateFieldAppearances) {
+    const form = pdfDoc.formCache.getValue();
+    if (form) form.updateFieldAppearances();
+  }
+  await pdfDoc.flush();
+
+  const writer = new ParallelStreamWriter(
+    pdfDoc.context,
+    objectsPerTick,
+    encodeStreams,
+    objectsPerStream,
+    parallel,
+  );
+  const bytes = await writer.serializeToBuffer();
+  return { bytes, streamCount: writer._lastPrecompressed };
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index f219ec06..422ab976 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -45,6 +45,7 @@ import './lib/fast-deflate.mjs';
 import './lib/fast-refs.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
+import { parallelSave }             from './lib/parallel-deflate.mjs';
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
 
@@ -259,12 +260,21 @@ try {
   // parseSpeed: Fastest and objectsPerTick: Infinity are critical:
   // pdf-lib's defaults yield to the event loop between every 100/50
   // objects, turning a ~5 s round-trip into ~40 s on a 50 MB PDF
-  // (~35 s of which is pure V8 idle). See perf/README.md.
+  // (~35 s of which is pure V8 idle).
+  //
+  // parallelSave (vs the default pdfDoc.save) does two things:
+  //  - objectsPerStream: 500 -- larger object-stream chunks compress
+  //    better (shared deflate window), 5 % smaller output PDF, and
+  //    cuts the per-chunk dispatch overhead 10x.
+  //  - dispatches every chunk's deflate to libuv's thread pool via
+  //    async zlib.deflate instead of running serially on the main
+  //    thread. Moves ~300 ms of zlib work onto pool threads on the book.
+  // See perf/notes/08-pdf-lib.md.
   const tProcess = Date.now();
   const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest });
   setMetadata(pdfDoc, meta);
   await setOutline(pdfDoc, outline, false);
-  const finalPdf = await pdfDoc.save({ objectsPerTick: Infinity });
+  const { bytes: finalPdf } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 });
   console.log(`process:  ${fmtMs(Date.now() - tProcess)}`);
 
   writeFileSync(outputPath, Buffer.from(finalPdf));
diff --git a/perf/README.md b/perf/README.md
index 53f0a167..0aa06bde 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-deflate --fast-refs --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-deflate --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -78,6 +78,11 @@ Flag rationale:
 - `--fast-refs` -- inject the
   [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping fix
   (dense-array cache for `PDFRef.of`'s gen=0 path). Same logic.
+- `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave`
+  from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs),
+  which pre-deflates object streams in parallel on libuv's pool with
+  `objectsPerStream: 500`. Production runs through it; same logic.
+  Moves ~300 ms of zlib work off the main thread.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -100,11 +105,11 @@ Why no `--render-only`? `--cpu-profile-process` requires the process
 phase to run; the harness errors out if you combine them.
 
 To compare against upstream pdf-lib (e.g. when proposing a change
-upstream), drop `--fast-deflate` and `--fast-refs`. Caveat for
-A/B work: profiler-on attribution overstates the cost of hot
-functions called millions of times (`PDFRef.of` in particular). For
-"did this wall-clock change," do a paired no-profile A/B as a
-sanity check.
+upstream), drop `--fast-deflate`, `--fast-refs`, and
+`--parallel-deflate`. Caveat for A/B work: profiler-on attribution
+overstates the cost of hot functions called millions of times
+(`PDFRef.of` in particular). For "did this wall-clock change," do
+a paired no-profile A/B as a sanity check.
 
 See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase
 investigations these flags enabled.
@@ -231,6 +236,7 @@ run.bat --chrome-outline                  # let Chrome emit /Outlines (skip pars
 run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
 run.bat --fast-deflate                    # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B)
 run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path (also ships; opt-in here for A/B)
+run.bat --parallel-deflate                # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -339,6 +345,7 @@ file documenting each:
 | `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory |
 | `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) |
 | `PDFRef.of` dense-array cache (gen=0) | [08](notes/08-pdf-lib.md) | ~0.2 s process (load -16 %) |
+| Parallel deflate + `objectsPerStream: 500` | [08](notes/08-pdf-lib.md) | ~0.3 s process (zlib off-thread; PDF -5 %) |
 
 What was tried and didn't ship:
 
@@ -365,4 +372,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 9e9e0646..5fb3e7d6 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -118,6 +118,7 @@ import { parseOutline, setOutline } from '../docs/lib/outline.mjs';
 import { setMetadata }              from '../docs/lib/postprocesser.mjs';
 import { applyOutlineAndMetadataIncremental } from './incremental-pdf.mjs';
 import { pinCpuIfWindows } from './pin-cpu.mjs';
+import { parallelSave } from '../docs/lib/parallel-deflate.mjs';
 
 // On Windows, re-launch under `start /affinity 0x5500 /high` to stabilise
 // CPU sample-time. See pin-cpu.mjs. Cuts run-to-run variance from
@@ -146,6 +147,7 @@ let renderOnly = false;
 let tracing = false;
 let fastDeflate = false;
 let fastRefs = false;
+let parallelDeflate = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -169,6 +171,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
   else if (a === '--fast-deflate') fastDeflate = true;
   else if (a === '--fast-refs') fastRefs = true;
+  else if (a === '--parallel-deflate') parallelDeflate = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -519,10 +522,17 @@ try {
     const setOutlineMs = Date.now() - tSetOutlineStart;
 
     const tSaveStart = Date.now();
-    finalPdf = await pdfDoc.save({ objectsPerTick: Infinity });
+    let parallelStreamCount = 0;
+    if (parallelDeflate) {
+      const { bytes, streamCount } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 });
+      finalPdf = bytes;
+      parallelStreamCount = streamCount;
+    } else {
+      finalPdf = await pdfDoc.save({ objectsPerTick: Infinity });
+    }
     const saveMs = Date.now() - tSaveStart;
 
-    processBreakdown = { loadMs, setOutlineMs, saveMs };
+    processBreakdown = { loadMs, setOutlineMs, saveMs, parallelStreamCount };
   }
   const tProcEnd  = Date.now();
   processMs = tProcEnd - tProcStart;
@@ -538,7 +548,10 @@ try {
   if (incremental) {
     console.log(`[harness] process  ${fmtMs(processMs)}  (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`);
   } else {
-    console.log(`[harness] process  ${fmtMs(processMs)}  (load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)})`);
+    const parTag = processBreakdown.parallelStreamCount
+      ? ` (parallel-deflate: ${processBreakdown.parallelStreamCount} streams)`
+      : '';
+    console.log(`[harness] process  ${fmtMs(processMs)}  (load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)}${parTag})`);
   }
   }  // end if (!renderOnly)
 
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 51855446..7c71e317 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -466,22 +466,179 @@ Above `PDFRef.of`, the load-phase costs (`decodeName`, `parseName`,
 work. Those are O(input size) and pretty close to fundamental --
 shrinking them would mean rewriting the parser.
 
-### Where this leaves the picture
+### What's left on save
 
-Cumulative process-phase cost, baseline → after both shims:
+After the fast-refs shim the process-phase profile's top
+self-time entry was still `writeSync` at ~340 ms / 12 %. The name
+is misleading -- not `fs.writeFileSync` writing the output PDF,
+but `node:zlib`'s native binding inside `deflateSync`.
+`find-callers` attributes the chain:
+
+```
+writeSync                 344 ms   (zlib native)
+  processChunkSync        node:zlib:399
+  zlibBufferSync          node:zlib:165
+    PDFFlateStream.computeContents     186 ms   (pdf-lib stream compression)
+    fastDeflate (our shim)             130 ms
+    syncBufferWrapper                   34 ms
+```
+
+So the cost is pure CPU-bound deflate during `pdfDoc.save()`. The
+streams being compressed: pdf-lib's `PDFStreamWriter` (the default
+when `useObjectStreams: true`) groups every non-stream,
+non-encrypted, gen=0 indirect object into `PDFObjectStream` chunks
+of 50, deflates each, and writes the result. On the book that's
+~4,500 chunks, each a small deflate job, all running serially on
+the main thread.
+
+## Parallelising save's deflate on libuv's pool
+
+### Why not just async-deflate inline
+
+pdf-lib's serializer is synchronous at the relevant call sites:
+`PDFFlateStream.computeContents`
+(`pdf-lib/.../structures/PDFFlateStream.js:13`) is a closure that
+returns `pako.deflate(unencodedContents)` inline, called from
+`cache.access()` during `sizeInBytes()`. Swapping `deflateSync` →
+async `deflate` would mean rewriting the whole save path to await
+every stream. The call sites don't expect a promise.
+
+### Why not `useObjectStreams: false`
+
+The one-liner that skips the whole problem. Measured on the book:
+
+| variant | save | process | PDF size |
+| --- | --- | --- | --- |
+| pdf-lib default (objectsPerStream=50, sync) | 1.01 s | 2.30 s | 16.1 MB |
+| `useObjectStreams: false`                   | 0.59 s | 2.17 s | **40.5 MB** |
+
+A 2.5x file-size regression. The whole point of pdf-lib's
+roundtrip over Chrome's raw output was to compress those streams.
+Not an option.
+
+### What actually worked: parallel pre-deflate + larger chunks
+
+`docs/lib/parallel-deflate.mjs` subclasses pdf-lib's
+`PDFStreamWriter` and splits its `computeBufferSize` into three
+phases:
+
+1. **Classify** indirect objects into uncompressed (streams,
+   encrypt, gen != 0) vs compressed chunks of N. Same logic as
+   upstream, no behaviour change.
+2. **Instantiate all `PDFObjectStream`s up-front**, snapshot their
+   unencoded contents, then `await Promise.all` an async
+   `zlib.deflate` per stream. Libuv's thread pool (default 4) runs
+   them concurrently. Write each result into the stream's
+   `contentsCache.value`.
+3. **Size + emit** -- same as upstream, but every `cache.access()`
+   is a hit, so save's loop never touches deflate.
+
+The xrefStream is one more `PDFFlateStream` but its contents
+depend on the offsets computed in phase 3, so we let it deflate
+serially at the end (one stream; `fast-deflate`'s `deflateSync`
+handles it).
+
+Exposed as `parallelSave(pdfDoc, options)`. Drop-in for
+`pdfDoc.save` when `useObjectStreams: true` -- same pre-serialize
+hooks (addDefaultPage, updateFieldAppearances, flush), same
+byte-level output modulo zlib-vs-pako match choices.
+
+### First try with default `objectsPerStream=50` was slower
+
+Profile diff (paired `--cpu-profile-process --cpu-sampling 100`):
+
+| metric | serial (default) | parallel @ 50 (4,523 streams) | Δ |
+| --- | --- | --- | --- |
+| `writeSync` self  | 345 ms | 79 ms | **-266 ms** |
+| `write` (native, libuv setup) | <1 ms | 118 ms | **+117 ms** |
+| `close` (native, libuv teardown) | <1 ms | 96 ms | **+95 ms** |
+| net main-thread zlib + libuv overhead | 346 ms | 293 ms | -53 ms |
+
+The actual deflate work did move off-thread (`writeSync` dropped
+sharply), but libuv's per-`uv_work_t` dispatch overhead on 4,523
+tiny jobs ate most of the savings. ~50 µs/job × ~4,500 jobs ≈
+225 ms of pure dispatch.
+
+### Fix: bigger chunks via `objectsPerStream: 500`
+
+Ten-fold-larger object streams cut the chunk count from ~4,500 to
+~450. Same total deflate work, but in ~450 jobs instead of ~4,500
+-- libuv overhead drops by ~10x. Side benefit: larger chunks share
+a deflate window, so the output PDF is ~5 % smaller (16.1 MB →
+15.3 MB).
+
+Profile diff at `objectsPerStream: 500`
+(paired `--cpu-profile-process --cpu-sampling 100`):
+
+| metric                                          | serial @ 500 | parallel @ 500 | Δ |
+| ---                                             | ---          | ---            | --- |
+| `writeSync` self (zlib native, main thread)     | 335 ms       | 33 ms          | **-302 ms** |
+| `close` (libuv finalize)                        | 1.7 ms       | 15 ms          | +13 ms |
+| `PDFFlateStream.computeContents`                | 20 ms        | 4 ms           | -16 ms |
+| **total zlib-related main-thread self-time**    | **360 ms**   | **54 ms**      | **-306 ms (-85 %)** |
+| bottom-up: `writeSync` position                 | #1 (8.25 %)  | not in top 12  | gone |
+
+The 306 ms moved off the main thread to libuv's pool, where Node's
+V8 profiler doesn't sample it -- so the headline "writeSync gone
+from the top 12" describes only the main-thread CPU that save() pays.
+
+### Wall-clock note
+
+This whole sub-investigation deliberately compared profiles only,
+not wall-clock. The dev machine was busy with other work, and
+process is a ~2 s phase whose run-to-run jitter on a loaded system
+exceeds the expected delta. The profile diff cuts through that:
+306 ms of native zlib disappearing from the main-thread budget is
+a structural change that's stable across noise. A clean-machine
+wall-clock A/B would close the loop, but the optimisation is
+shippable on profile evidence alone.
+
+### Wired into production
+
+`render-book.mjs` swaps
+`pdfDoc.save({ objectsPerTick: Infinity })` for
+`parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 })`.
+Smoke test on the book:
+
+```
+render:   8.6s  (1651 pages)
+generate: 39.2s  (raw 39.3 MB)
+process:  2.2s
+saved:    docs\_pdf\book.pdf  (15.3 MB)
+total:    51.9s
+```
+
+The 15.3 MB output (down from 16.1 MB) is the chunk-size effect;
+the parallel deflate doesn't change byte size, only where the work
+runs.
+
+The harness exposes the same via `--parallel-deflate` (which calls
+`parallelSave` with the same defaults).
+
+## Where this leaves the picture
+
+Cumulative process-phase cost, baseline → after all three shims:
 
 | state                              | process | load | save |
 | ---                                | ---     | ---  | ---  |
 | original (Slow / 50 defaults)      | ~40 s   | ~36 s| ~4 s |
 | + parseSpeed:Fastest               | ~5 s    | ~2 s | ~3 s |
 | + fast-deflate                     | ~2.5 s  | ~1.5s| ~1 s |
-| **+ fast-refs (this section)**     | **~2.3 s** | **~1.3 s** | **~1 s** |
+| + fast-refs                        | ~2.3 s  | ~1.3 s | ~1 s |
+| **+ parallel-deflate (this section)** | **~2.0 s** | **~1.3 s** | **~0.7 s** |
+
+The bottom-up after parallel deflate is dominated by pdf-lib's
+parser frames -- `PDFDict.entries` (8 %), `decodeName` (8 %), GC
+(8 %), `parseRawNumber` (6 %), `PDFRef.of` (5 %, the gen != 0
+residue). All load-phase, all O(input bytes), all close to
+fundamental pdf-lib work. Further wins in this phase would mean
+rewriting pdf-lib's parser.
 
-The pdf-lib roundtrip path is now ~2.3 s of a ~50 s build. The
+The pdf-lib roundtrip path is now ~2.0 s of a ~50 s build. The
 incremental writer's 0.25 s process phase (see
 [01-baseline-and-detach.md](01-baseline-and-detach.md)) is still
 strictly faster on process alone, but the pdf-lib path delivers a
-16.1 MB output vs incremental's 53 MB, and the 2 s gap on a 50 s
+15.3 MB output vs incremental's 53 MB, and the ~2 s gap on a 50 s
 build doesn't justify the file-size cost for our pipeline.
 
 The strategic note from earlier phases still stands: generate's

From 4ecf1840940b43bd777648b14d05a4b90a8f1c49 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:06:56 +0200
Subject: [PATCH 04/44] Use zlib in larger chunks for deflation, also use it
 for inflation.

---
 docs/lib/fast-deflate.mjs     | 37 ------------------
 docs/lib/fast-inflate.mjs     | 39 ++++++++++++++++++
 docs/lib/parallel-deflate.mjs | 35 +++++++++++------
 docs/render-book.mjs          | 12 +++---
 perf/README.md                | 27 ++++++-------
 perf/measure.mjs              | 26 +++++-------
 perf/notes/08-pdf-lib.md      | 74 ++++++++++++++++++++++++++++++++---
 7 files changed, 159 insertions(+), 91 deletions(-)
 delete mode 100644 docs/lib/fast-deflate.mjs
 create mode 100644 docs/lib/fast-inflate.mjs

diff --git a/docs/lib/fast-deflate.mjs b/docs/lib/fast-deflate.mjs
deleted file mode 100644
index 08725ac3..00000000
--- a/docs/lib/fast-deflate.mjs
+++ /dev/null
@@ -1,37 +0,0 @@
-// Replace pako's pure-JS deflate with Node's zlib for the one path
-// pdf-lib actually uses it on: PDFFlateStream.computeContents in
-// node_modules/pdf-lib/cjs/core/structures/PDFFlateStream.js, which
-// calls `pako.deflate(unencodedContents)` once per FlateStream during
-// PDFDocument.save().
-//
-// PDF /FlateDecode (ISO 32000-1 §7.4.4) is the zlib format (RFC 1950):
-// a 2-byte zlib header + a raw deflate body (RFC 1951) + a 4-byte
-// Adler-32 trailer. Both pako.deflate and zlib.deflateSync produce that
-// format with default level 6, so the swap is wire-compatible -- output
-// bytes may differ by a small amount (different match choices in the
-// compressor's inner loop) but every PDF viewer reads either.
-//
-// Mechanism: pdf-lib is CJS in node_modules and calls
-// `require("pako").deflate(...)` at the call site, not at import time.
-// Mutating the live pako exports object is enough; no fork required.
-//
-// Side-effecting import. Import once before PDFDocument.save() runs:
-//
-//   import "./lib/fast-deflate.mjs";
-//
-// Idempotent -- repeated imports do nothing after the first.
-
-import { deflateSync } from "node:zlib";
-import pako from "pako";
-
-if (!pako.__fastDeflateInstalled) {
-  const original = pako.deflate;
-  pako.deflate = function fastDeflate(data, options) {
-    // pdf-lib's only caller passes no options. Anything fancier (dictionary,
-    // raw, custom level) goes back to pako so we don't change behaviour
-    // outside the one hot path we care about.
-    if (options) return original.call(pako, data, options);
-    return deflateSync(data);
-  };
-  pako.__fastDeflateInstalled = true;
-}
diff --git a/docs/lib/fast-inflate.mjs b/docs/lib/fast-inflate.mjs
new file mode 100644
index 00000000..db675d50
--- /dev/null
+++ b/docs/lib/fast-inflate.mjs
@@ -0,0 +1,39 @@
+// Replace pako's pure-JS inflate with Node's zlib for the one path
+// pdf-lib actually uses it on: PDFCrossRefStreamParser inflating the
+// compressed cross-reference stream during PDFDocument.load. Exactly
+// one call per load on Chrome-emitted PDFs (PDF 1.5+ xref-stream
+// format), ~4.5 KB input. Negligible wall-clock, but it's the last
+// remaining pdf-lib -> pako call site once parallelSave has taken
+// over the deflate side -- this brings the runtime pako call count
+// to zero.
+//
+// PDF /FlateDecode (ISO 32000-1 §7.4.4) is the zlib format (RFC 1950):
+// 2-byte zlib header + raw deflate body (RFC 1951) + 4-byte Adler-32
+// trailer. Both pako.inflate and zlib.inflateSync consume that
+// format, so the swap is wire-compatible.
+//
+// Mechanism: pdf-lib is CJS in node_modules and calls
+// `require("pako").inflate(...)` at the call site, not at import
+// time. Mutating the live pako exports object is enough; no fork
+// required.
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-inflate.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { inflateSync } from "node:zlib";
+import pako from "pako";
+
+if (!pako.__fastInflateInstalled) {
+  const original = pako.inflate;
+  pako.inflate = function fastInflate(data, options) {
+    // pdf-lib's only caller passes no options. Anything fancier
+    // (dictionary, raw, custom windowBits) goes back to pako so we
+    // don't change behaviour outside the one path we care about.
+    if (options) return original.call(pako, data, options);
+    return inflateSync(data);
+  };
+  pako.__fastInflateInstalled = true;
+}
diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs
index 5cc48fb1..d278ecab 100644
--- a/docs/lib/parallel-deflate.mjs
+++ b/docs/lib/parallel-deflate.mjs
@@ -3,31 +3,34 @@
 // thread pool. Sole exported entry point: `parallelSave(pdfDoc, opts)`.
 //
 // Why: pdf-lib's PDFStreamWriter.computeBufferSize creates one
-// PDFObjectStream per 50-object chunk, then immediately calls
+// PDFObjectStream per chunk, then immediately calls
 // computeIndirectObjectSize on each. sizeInBytes() walks the Cache,
-// which lazy-populates via pako.deflate(unencodedContents). The whole
-// pass is synchronous, so ~1000 chunks × ~0.3 ms of zlib work runs
-// serially -- accounts for ~30 % of save() wall time on the book.
+// which lazy-populates via a deflate of the unencoded contents. The
+// whole pass is synchronous, so the per-chunk zlib work runs serially
+// -- accounted for ~30 % of save() wall time on the book before this.
 //
 // What: same construction logic as PDFStreamWriter, split into three
 // phases:
 //   1. classify uncompressed vs compressed (same as upstream)
 //   2. instantiate every PDFObjectStream up-front, then `await
-//      Promise.all` an async zlib.deflate per stream so libuv's thread
-//      pool (default 4) runs them concurrently
+//      Promise.all` an async node:zlib.deflate per stream so libuv's
+//      thread pool (default 4) runs them concurrently
 //   3. size + emit (same as upstream, but every cache.access() is a hit)
-// The xrefStream itself is one more PDFFlateStream; we deflate it
-// serially in phase 3 since its contents depend on phase-3 offsets.
+// The xrefStream is one more PDFFlateStream whose contents depend on
+// the offsets computed in phase 3; we pre-deflate it once via
+// node:zlib.deflateSync right after those offsets are pinned, so even
+// that final stream never falls back to pdf-lib's pure-JS deflate.
 //
 // Output: byte-near-equivalent to pdfDoc.save({ useObjectStreams: true }).
-// zlib vs pako deflate may pick different LZ77 matches → 1-byte-level
-// stream diffs and matching /Length deltas; viewer-invisible.
+// node:zlib's match choices in the LZ77 inner loop may differ from
+// pdf-lib's default deflate library, producing byte-level stream
+// diffs and matching /Length deltas; viewer-invisible.
 //
 // Parallelism is bounded by UV_THREADPOOL_SIZE (default 4). Bump it via
 // `process.env.UV_THREADPOOL_SIZE = '8'` before any libuv work fires
 // if you want more concurrency.
 
-import { deflate } from 'node:zlib';
+import { deflate, deflateSync } from 'node:zlib';
 import { promisify } from 'node:util';
 import {
   PDFStreamWriter,
@@ -118,11 +121,19 @@ class ParallelStreamWriter extends PDFStreamWriter {
       uncompressedObjects.push([ref, objectStream]);
     }
 
-    // ----- xrefStream wrap-up (serial deflate; contents depend on offsets above) -----
+    // ----- xrefStream wrap-up -----
+    // Its contents depend on the offsets computed above, so we can only
+    // populate them now. One stream -- deflate sync via node:zlib and
+    // pre-populate the cache so the subsequent computeIndirectObjectSize
+    // is a cache hit (otherwise pdf-lib's lazy populate would run its
+    // own deflate library on the main thread).
     const xrefStreamRef = PDFRef.of(objectNumber++);
     xrefStream.dict.set(PDFName.of('Size'), PDFNumber.of(objectNumber));
     xrefStream.addUncompressedEntry(xrefStreamRef, size);
     const xrefOffset = size;
+    if (this.encodeStreams) {
+      xrefStream.contentsCache.value = deflateSync(xrefStream.getUnencodedContents());
+    }
     size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]);
     uncompressedObjects.push([xrefStreamRef, xrefStream]);
 
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 422ab976..5ad570b1 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -32,17 +32,19 @@ import { dirname, resolve } from 'node:path';
 import { writeFileSync, existsSync } from 'node:fs';
 import puppeteer from 'puppeteer';
 import { PDFDocument, ParseSpeeds } from 'pdf-lib';
-// Side-effecting imports. Order doesn't matter; both mutate live
-// module exports before any pdf-lib operation. See
+// Side-effecting imports. Mutate pdf-lib's live module exports
+// before any pdf-lib operation -- order doesn't matter. See
 // perf/notes/08-pdf-lib.md.
 //
-//   fast-deflate -- swaps pdf-lib's pako.deflate (pure JS) for
-//     node:zlib.deflateSync (C). ~1.5 s saved on the save phase.
 //   fast-refs    -- dense-array cache in front of PDFRef.of for the
 //     gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved on
 //     load.
-import './lib/fast-deflate.mjs';
+//   fast-inflate -- swaps pako.inflate for node:zlib.inflateSync on
+//     the one pdf-lib call site that uses it (PDFCrossRefStreamParser
+//     during load). Negligible cost shift, but eliminates the last
+//     pdf-lib -> pako call at runtime.
 import './lib/fast-refs.mjs';
+import './lib/fast-inflate.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/README.md b/perf/README.md
index 0aa06bde..188c02f9 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,23 +66,21 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-deflate --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
 
-- `--fast-deflate` -- inject the
-  [docs/lib/fast-deflate.mjs](../docs/lib/fast-deflate.mjs) shipping
-  fix (`pako.deflate` -> `node:zlib.deflateSync`). Production runs
-  through it; the profile should too.
 - `--fast-refs` -- inject the
   [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping fix
-  (dense-array cache for `PDFRef.of`'s gen=0 path). Same logic.
+  (dense-array cache for `PDFRef.of`'s gen=0 path). Production runs
+  through it; the profile should too.
 - `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave`
   from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs),
   which pre-deflates object streams in parallel on libuv's pool with
   `objectsPerStream: 500`. Production runs through it; same logic.
-  Moves ~300 ms of zlib work off the main thread.
+  Moves ~300 ms of zlib work off the main thread, and routes every
+  deflate call through `node:zlib` (no pdf-lib pure-JS fallback).
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -105,11 +103,11 @@ Why no `--render-only`? `--cpu-profile-process` requires the process
 phase to run; the harness errors out if you combine them.
 
 To compare against upstream pdf-lib (e.g. when proposing a change
-upstream), drop `--fast-deflate`, `--fast-refs`, and
-`--parallel-deflate`. Caveat for A/B work: profiler-on attribution
-overstates the cost of hot functions called millions of times
-(`PDFRef.of` in particular). For "did this wall-clock change," do
-a paired no-profile A/B as a sanity check.
+upstream), drop `--fast-refs` and `--parallel-deflate`. Caveat for
+A/B work: profiler-on attribution overstates the cost of hot
+functions called millions of times (`PDFRef.of` in particular).
+For "did this wall-clock change," do a paired no-profile A/B as a
+sanity check.
 
 See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase
 investigations these flags enabled.
@@ -234,8 +232,7 @@ run.bat --time-hooks                      # per-task timing of every chunker/pol
 run.bat --incremental                     # process via incremental update instead of pdf-lib roundtrip
 run.bat --chrome-outline                  # let Chrome emit /Outlines (skip parseOutline + setOutline)
 run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
-run.bat --fast-deflate                    # route pdf-lib's deflate through node:zlib (ships in render-book.mjs by default; opt-in here for A/B)
-run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path (also ships; opt-in here for A/B)
+run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path (ships in render-book.mjs by default; opt-in here for A/B)
 run.bat --parallel-deflate                # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B)
 ```
 
@@ -372,4 +369,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing `pako.deflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 5fb3e7d6..8844a80d 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -28,7 +28,7 @@
 //                    [--no-detach-pages] [--instrument] [--time-hooks]
 //                    [--incremental] [--chrome-outline] [--timing]
 //                    [--clone-count] [--render-only]
-//                    [--fast-deflate] [--fast-refs]
+//                    [--fast-refs] [--parallel-deflate]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -92,17 +92,17 @@
 // Honours --cpu-sampling. Composable with --cpu-profile when you want
 // both phases captured in one run.
 //
-// --fast-deflate routes pdf-lib's PDFFlateStream compression through
-// Node's zlib (C++) instead of pako (pure JS). Same wire format
-// (PDF /FlateDecode = RFC 1950 zlib), ~5-10x faster on big inputs.
-// Save phase only -- load uses pako.inflate, which the profile shows
-// isn't a hot path for our content.
-//
 // --fast-refs replaces PDFRef.of's string-keyed Map lookup with a
 // dense-array cache for the gen=0 case (82 % of ~1.2 M calls on the
 // book). Eliminates the per-call `<obj> <gen> R` string allocation
 // and Map hash. gen != 0 calls (pdf-lib's xref-stream bookkeeping
 // for compressed objects) pass through unchanged.
+//
+// --parallel-deflate replaces pdfDoc.save() with parallelSave from
+// docs/lib/parallel-deflate.mjs: object streams are pre-deflated in
+// parallel on libuv's thread pool with objectsPerStream=500 (vs
+// pdf-lib's serial save with default 50). Moves ~300 ms of zlib work
+// off the main thread on the book.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
@@ -145,7 +145,6 @@ let timing = false;
 let cloneCount = false;
 let renderOnly = false;
 let tracing = false;
-let fastDeflate = false;
 let fastRefs = false;
 let parallelDeflate = false;
 for (let i = 0; i < args.length; i++) {
@@ -169,7 +168,6 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--render-only') renderOnly = true;
   else if (a === '--tracing') tracing = true;
   else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
-  else if (a === '--fast-deflate') fastDeflate = true;
   else if (a === '--fast-refs') fastRefs = true;
   else if (a === '--parallel-deflate') parallelDeflate = true;
   else if (!inputArg) inputArg = a;
@@ -211,14 +209,8 @@ if (cpuProfileProcess && renderOnly) {
   process.exit(2);
 }
 
-// Install the Node-zlib override for pdf-lib's PDFFlateStream compression
-// before any pdf-lib operation. Side-effecting import; idempotent. The
-// override only kicks in on pako.deflate calls (i.e. save()), so render-
-// only runs that never reach the pdf-lib path are unaffected either way.
-if (fastDeflate) {
-  await import('../docs/lib/fast-deflate.mjs');
-  console.log('[harness] fast-deflate: pako.deflate -> node:zlib.deflateSync');
-}
+// Install the dense-array cache for PDFRef.of's gen=0 path before any
+// pdf-lib operation. Side-effecting import; idempotent.
 if (fastRefs) {
   await import('../docs/lib/fast-refs.mjs');
   console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0');
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 7c71e317..e547754c 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -534,14 +534,19 @@ phases:
    is a hit, so save's loop never touches deflate.
 
 The xrefStream is one more `PDFFlateStream` but its contents
-depend on the offsets computed in phase 3, so we let it deflate
-serially at the end (one stream; `fast-deflate`'s `deflateSync`
-handles it).
+depend on the offsets computed in phase 3, so we pre-deflate it
+via `node:zlib.deflateSync` right after those offsets are pinned
+-- one stream, sync is fine, and pre-populating its cache means
+`computeIndirectObjectSize` later is a hit too. The net effect:
+every deflate that happens during a save goes through `node:zlib`,
+and pdf-lib's pure-JS fallback never runs.
 
 Exposed as `parallelSave(pdfDoc, options)`. Drop-in for
 `pdfDoc.save` when `useObjectStreams: true` -- same pre-serialize
-hooks (addDefaultPage, updateFieldAppearances, flush), same
-byte-level output modulo zlib-vs-pako match choices.
+hooks (addDefaultPage, updateFieldAppearances, flush),
+byte-near-equivalent output (zlib's LZ77 match choices may differ
+from pdf-lib's default deflate library at the byte level, but the
+wire format is identical).
 
 ### First try with default `objectsPerStream=50` was slower
 
@@ -615,6 +620,65 @@ runs.
 The harness exposes the same via `--parallel-deflate` (which calls
 `parallelSave` with the same defaults).
 
+### Retiring `fast-deflate.mjs`
+
+Once `parallelSave` also pre-deflates the xrefStream, pdf-lib's
+lazy `cache.populate()` deflate path is **never invoked at
+runtime**. Every `PDFObjectStream` is parallel-deflated in phase 2;
+the xrefStream is sync-deflated in phase 3. Both go through
+`node:zlib`. There's no remaining call site for pdf-lib's pure-JS
+fallback during a save.
+
+The `fast-deflate.mjs` shim that used to monkey-patch
+`pako.deflate` is therefore redundant -- it was a per-call dispatch
+optimisation for a code path we no longer take. Deleted:
+
+- `docs/lib/fast-deflate.mjs` -- removed.
+- `import './lib/fast-deflate.mjs'` -- removed from
+  `render-book.mjs`.
+- `--fast-deflate` -- removed from the `measure.mjs` flag set.
+
+Smoke profile after removal (`--parallel-deflate --fast-refs
+--cpu-profile-process`, no fast-deflate import anywhere): 0 frames
+matching `pako`, 0 matches for `computeContents`, 0 for
+`fastDeflate`. Process phase 2.34 s, output 15.3 MB.
+
+The deletion is purely a cleanup -- profile-equivalent to before
+-- but it removes 38 lines of indirection and one transitive
+concern.
+
+### Routing inflate through `node:zlib` too
+
+One call site on the load side still went through pdf-lib's pako:
+`PDFCrossRefStreamParser` decompresses the xref stream's payload
+via `pako.inflate` during `PDFDocument.load`. Cost is tiny -- one
+inflate per load, ~3 ms -- but it's the last pdf-lib → pako edge
+in the runtime, and the dispatch story for the README is cleaner
+when "every zlib call goes through `node:zlib`" is true on both
+sides.
+
+`docs/lib/fast-inflate.mjs` is the symmetric counterpart to the
+retired `fast-deflate.mjs`:
+
+```js
+import { inflateSync } from "node:zlib";
+import pako from "pako";
+
+if (!pako.__fastInflateInstalled) {
+  const original = pako.inflate;
+  pako.inflate = function fastInflate(data, options) {
+    if (options) return original.call(pako, data, options);
+    return inflateSync(data);
+  };
+  pako.__fastInflateInstalled = true;
+}
+```
+
+`render-book.mjs` imports it unconditionally next to `fast-refs`.
+No harness flag -- the per-load cost is below the profile noise
+floor; this lands for the architectural reason, not a measurable
+win.
+
 ## Where this leaves the picture
 
 Cumulative process-phase cost, baseline → after all three shims:

From 20b17d6156e95aedc64c41017d1e8de5d14a7427 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:11:42 +0200
Subject: [PATCH 05/44] Speed up pdf-lib's number parsing.

---
 docs/lib/fast-parse-number.mjs | 122 +++++++++++++++++++++++++++++++++
 docs/render-book.mjs           |  19 +++--
 perf/notes/08-pdf-lib.md       |  64 +++++++++++++++++
 3 files changed, 198 insertions(+), 7 deletions(-)
 create mode 100644 docs/lib/fast-parse-number.mjs

diff --git a/docs/lib/fast-parse-number.mjs b/docs/lib/fast-parse-number.mjs
new file mode 100644
index 00000000..661a40eb
--- /dev/null
+++ b/docs/lib/fast-parse-number.mjs
@@ -0,0 +1,122 @@
+// Replace pdf-lib's BaseParser.parseRawNumber with a direct-integer
+// accumulator that skips per-byte string concatenation, charFromCode
+// calls, and the trailing Number() string-parse round-trip.
+//
+// The upstream implementation
+// ([BaseParser.js:33](node_modules/pdf-lib/cjs/core/parser/BaseParser.js:33))
+// builds `value` one character at a time via `value += charFromCode(byte)`,
+// then calls `Number(value)` to convert the string back to a number,
+// then performs `isFinite` + MAX_SAFE_INTEGER guards on every call.
+// Every numeric token in a PDF flows through this path
+// (PDFObjectParser.parseNumberOrRef invokes it once per number, twice
+// per indirect ref), so on the book it fires hundreds of thousands of
+// times and allocates a throwaway string per call.
+//
+// The fast path accumulates the integer directly (n = n*10 + (byte -
+// 0x30)) and only descends into decimal handling when a period appears.
+// Falls back to the original for:
+//   - Numbers with > 15 integer digits (where direct accumulation
+//     could exceed Number.MAX_SAFE_INTEGER and lose precision).
+//   - Empty-digit cases (e.g., "."), so upstream's NumberParsingError
+//     keeps its diagnostic context.
+// Both fallback paths are vanishingly rare on real PDFs.
+//
+// Mechanism: BaseParser isn't re-exported by pdf-lib's index, so we
+// import it via the package's CJS internal path through createRequire.
+// Mutating BaseParser.prototype affects every subclass (PDFParser,
+// PDFObjectParser, PDFObjectStreamParser, PDFXRefStreamParser).
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-parse-number.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const BaseParser = require('pdf-lib/cjs/core/parser/BaseParser.js').default;
+const { IsDigit } = require('pdf-lib/cjs/core/syntax/Numeric.js');
+
+const ZERO = 0x30;   // '0'
+const PERIOD = 0x2E; // '.'
+const PLUS = 0x2B;   // '+'
+const MINUS = 0x2D;  // '-'
+
+// Number.MAX_SAFE_INTEGER == 9007199254740991 (16 digits). 15-digit
+// integers are guaranteed to accumulate exactly without precision loss.
+const MAX_SAFE_INT_DIGITS = 15;
+
+if (!BaseParser.__fastParseNumberInstalled) {
+  const origParseRawNumber = BaseParser.prototype.parseRawNumber;
+
+  BaseParser.prototype.parseRawNumber = function fastParseRawNumber() {
+    const bytes = this.bytes;
+    const start = bytes.offset();
+
+    // Optional leading sign. Only "-" changes the result; "+" is just
+    // consumed.
+    let byte = bytes.peek();
+    let neg = false;
+    if (byte === PLUS) {
+      bytes.next();
+      byte = bytes.peek();
+    } else if (byte === MINUS) {
+      neg = true;
+      bytes.next();
+      byte = bytes.peek();
+    }
+
+    // Integer and fractional digits accumulate into ONE exact integer
+    // mantissa; `scale` ends up as 10^(number of fractional digits).
+    // With at most 15 digits in total, both mantissa (< 10^15) and
+    // scale (<= 10^15) are exactly representable doubles, so the single
+    // IEEE-754 division at the end is correctly rounded and yields the
+    // same double as upstream's Number(string).
+    //
+    // Do NOT compute this as intPart + frac / scale: that rounds twice
+    // and drifts by one ulp on ordinary inputs -- "1.57" would parse as
+    // 1.5699999999999998, which then re-serializes as a different (and
+    // longer) token on save.
+    let mantissa = 0;
+    let digits = 0;
+    let scale = 1;
+    let seenPeriod = false;
+    while (!bytes.done()) {
+      if (IsDigit[byte]) {
+        if (digits >= MAX_SAFE_INT_DIGITS) {
+          // A 16th digit (on either side of the period) could push the
+          // mantissa past exact-integer range. Rewind and delegate to
+          // upstream's Number() path, which stays correctly rounded
+          // and emits its warning above MAX_SAFE_INTEGER.
+          bytes.moveTo(start);
+          return origParseRawNumber.call(this);
+        }
+        mantissa = mantissa * 10 + (byte - ZERO);
+        digits++;
+        if (seenPeriod) scale *= 10;
+      } else if (byte === PERIOD && !seenPeriod) {
+        // The first period switches to the fractional part; a second
+        // one is not a digit and therefore ends the token.
+        seenPeriod = true;
+      } else {
+        break;
+      }
+      bytes.next();
+      byte = bytes.peek();
+    }
+
+    if (digits === 0) {
+      // No digits at all (bare sign, lone ".", or a non-numeric byte).
+      // Rewind and let upstream throw NumberParsingError with full
+      // context.
+      bytes.moveTo(start);
+      return origParseRawNumber.call(this);
+    }
+
+    const value = scale === 1 ? mantissa : mantissa / scale;
+    return neg ? -value : value;
+  };
+
+  BaseParser.__fastParseNumberInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 5ad570b1..f70f7749 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -36,15 +36,20 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 // before any pdf-lib operation -- order doesn't matter. See
 // perf/notes/08-pdf-lib.md.
 //
-//   fast-refs    -- dense-array cache in front of PDFRef.of for the
-//     gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved on
-//     load.
-//   fast-inflate -- swaps pako.inflate for node:zlib.inflateSync on
-//     the one pdf-lib call site that uses it (PDFCrossRefStreamParser
-//     during load). Negligible cost shift, but eliminates the last
-//     pdf-lib -> pako call at runtime.
+//   fast-refs         -- dense-array cache in front of PDFRef.of for
+//     the gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved
+//     on load.
+//   fast-inflate      -- swaps pako.inflate for node:zlib.inflateSync
+//     on the one pdf-lib call site that uses it
+//     (PDFCrossRefStreamParser during load). Negligible cost shift,
+//     but eliminates the last pdf-lib -> pako call at runtime.
+//   fast-parse-number -- direct-integer accumulator in front of
+//     BaseParser.parseRawNumber, skipping per-byte string concat
+//     and the trailing Number() round-trip. Touches every numeric
+//     token parsed during PDFDocument.load.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
+import './lib/fast-parse-number.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index e547754c..351a403e 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -679,6 +679,70 @@ No harness flag -- the per-load cost is below the profile noise
 floor; this lands for the architectural reason, not a measurable
 win.
 
+## `BaseParser.parseRawNumber`: direct-integer accumulator
+
+After `fast-deflate` + `fast-refs` + `parallel-deflate`, the load
+side of the bottom-up table shifted onto the parser frames. One of
+them is `BaseParser.parseRawNumber`, called once per numeric token
+encountered during `PDFDocument.load` and twice per `N gen R`
+indirect reference -- so on the book it fires several hundred
+thousand times per load.
+
+The upstream implementation
+(`pdf-lib/.../parser/BaseParser.js:33`) builds the number as a
+string, one character at a time, then converts:
+
+```js
+let value = '';
+while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {
+  value += charFromCode(this.bytes.next());
+}
+// ... fractional part, sign handling ...
+const numberValue = Number(value);
+if (!isFinite(numberValue) || numberValue > Number.MAX_SAFE_INTEGER) { ... }
+return numberValue;
+```
+
+Every call allocates a throwaway string of length 1..N (one `+=`
+allocation per digit), then runs `Number(value)` to parse the
+string back into a double, then runs guards. The string allocation
++ `Number()` round-trip is the cost we care about.
+
+### The shim
+
+`docs/lib/fast-parse-number.mjs` mutates
+`BaseParser.prototype.parseRawNumber` to accumulate the integer
+directly (`n = n * 10 + (byte - 0x30)`), descending into decimal
+handling only when a period appears. Falls back to the original
+for:
+
+- **More than 15 integer digits** -- direct accumulation could
+  exceed `Number.MAX_SAFE_INTEGER` (16 digits) and silently lose
+  precision. Upstream's `Number(value)` retains correctly-rounded
+  double precision in that range and emits the spec-mandated
+  overflow warning, so we rewind and delegate.
+- **Empty-digit cases** (e.g. `+`, `.`, bare sign) -- rewind and
+  let upstream throw `NumberParsingError` with full diagnostic
+  context. Both fallback paths are vanishingly rare on real PDFs.
+
+`BaseParser` isn't re-exported by pdf-lib's index, so we reach it
+via the package's CJS internal path through `createRequire`:
+
+```js
+const require = createRequire(import.meta.url);
+const BaseParser = require('pdf-lib/cjs/core/parser/BaseParser.js').default;
+```
+
+Mutating `BaseParser.prototype` propagates to every subclass --
+`PDFParser`, `PDFObjectParser`, `PDFObjectStreamParser`,
+`PDFXRefStreamParser`. One side-effecting import covers them all.
+
+`render-book.mjs` imports it unconditionally next to `fast-refs`.
+No harness flag yet; the win is small per-call but the call rate
+is high enough to matter -- to be measured later when the
+follow-on work (size-in-bytes / iterator / parseDict shims) makes
+the parser side worth quantifying as a group.
+
 ## Where this leaves the picture
 
 Cumulative process-phase cost, baseline → after all three shims:

From 79aa2a157a408bf44781ecbdd2ba3b3abc06a8fc Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:16:55 +0200
Subject: [PATCH 06/44] Improve performance of a few helper functions.

---
 docs/lib/fast-decode-name.mjs      |  70 ++++++
 docs/lib/fast-number-to-string.mjs |  65 +++++
 docs/lib/fast-parse-number.mjs     |  61 +++--
 docs/render-book.mjs               |  17 +-
 perf/README.md                     |  25 +-
 perf/measure.mjs                   |  23 ++
 perf/notes/08-pdf-lib.md           | 390 ++++++++++++++++++++++++++---
 7 files changed, 594 insertions(+), 57 deletions(-)
 create mode 100644 docs/lib/fast-decode-name.mjs
 create mode 100644 docs/lib/fast-number-to-string.mjs

diff --git a/docs/lib/fast-decode-name.mjs b/docs/lib/fast-decode-name.mjs
new file mode 100644
index 00000000..0f20a9fd
--- /dev/null
+++ b/docs/lib/fast-decode-name.mjs
@@ -0,0 +1,70 @@
+// Skip pdf-lib's decodeName regex scan when the input has no `#`.
+//
+// The upstream PDFName.of
+// ([PDFName.js:100](node_modules/pdf-lib/cjs/core/objects/PDFName.js:100))
+// is the gatekeeper for every PDFName instance the parser builds:
+//
+//   PDFName.of = function (name) {
+//       var decodedValue = decodeName(name);   // <-- always runs
+//       var instance = pool.get(decodedValue);
+//       if (!instance) { ... }
+//       return instance;
+//   };
+//
+// and decodeName at line 9 is:
+//
+//   name.replace(/#([\dABCDEF]{2})/g, function (_, hex) { ... })
+//
+// PDF spec (ISO 32000-1 §7.3.5) requires `#XX` hex-escape for any
+// byte outside printable-ASCII or for delimiters / whitespace. In
+// real PDFs almost no names use it. Instrumenting on the book:
+//
+//   PDFName.of calls       : 2,759,635
+//     raw input has # char : 2 (0.000%)
+//
+// So decodeName runs a regex scan against 2.76 M strings to find a
+// `#` that's only there twice in the whole load. Profile attributes
+// ~168 ms (7 %) of process self-time to this function.
+//
+// Shim: a parallel Map<string, PDFName> keyed by the raw `name`
+// argument. When `name` contains no `#`, decoded form equals raw
+// form, so our key matches pdf-lib's internal pool key and a hit
+// returns the deduped instance with zero regex work. Misses
+// delegate to the original (which does the regex scan once and
+// stores the instance in pdf-lib's pool); we cache the result so
+// every subsequent occurrence of the same name hits our fast path.
+//
+// Names containing `#` fall through to the original unchanged --
+// the correctness path (e.g. uppercase-only regex, lowercase escapes
+// silently un-decoded) is preserved exactly.
+//
+// Mechanism: PDFName is re-exported from pdf-lib's index, so we can
+// patch PDFName.of directly without reaching into CJS internals.
+// Static initializers (PDFName.Length, .FlateDecode, ...) ran when
+// pdf-lib's module body executed -- before this shim imports -- so
+// pdf-lib's pool is already populated with the canonical instances
+// the parser will see.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-decode-name.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { PDFName } from "pdf-lib";
+
+if (!PDFName.__fastDecodeNameInstalled) { // install-once guard
+  const original = PDFName.of;
+  const fastCache = new Map(); // raw name string -> PDFName returned by original
+  PDFName.of = function fastOf(name) {
+    if (name.indexOf("#") === -1) { // no hex escape in the raw name
+      const cached = fastCache.get(name);
+      if (cached) return cached;
+      const instance = original.call(PDFName, name); // first sighting: upstream path
+      fastCache.set(name, instance);
+      return instance;
+    }
+    return original.call(PDFName, name); // names with `#` are never cached here
+  };
+  PDFName.__fastDecodeNameInstalled = true;
+}
diff --git a/docs/lib/fast-number-to-string.mjs b/docs/lib/fast-number-to-string.mjs
new file mode 100644
index 00000000..57640a97
--- /dev/null
+++ b/docs/lib/fast-number-to-string.mjs
@@ -0,0 +1,65 @@
+// Skip pdf-lib's numberToString redundant work when the input doesn't
+// stringify to exponential notation.
+//
+// The upstream numberToString
+// ([numbers.js:13](node_modules/pdf-lib/cjs/utils/numbers.js:13)) is:
+//
+//   exports.numberToString = function (num) {
+//       var numStr = String(num);
+//       if (Math.abs(num) < 1.0) {
+//           var e = parseInt(num.toString().split('e-')[1]);
+//           if (e) { ... }
+//       } else {
+//           var e = parseInt(num.toString().split('+')[1]);
+//           if (e > 20) { ... }
+//       }
+//       return numStr;
+//   };
+//
+// It always computes `numStr = String(num)` up front -- but then
+// re-calls `num.toString()`, allocates a `.split(...)` array, and
+// runs parseInt on the result, even though `numStr` is already what
+// `.toString()` returns. Exponential notation in `String(num)` only
+// appears for |num| < 1e-6 or |num| >= 1e21, neither of which real
+// PDFs emit: object refs, generations, byte offsets, content-stream
+// coordinates, /Size, /Length, etc. all stringify to plain decimal.
+//
+// Shim: short-circuit when `String(num)` contains no `'e'` and return
+// it immediately. The rare exponential cases fall through to the
+// original so the spec-compliant expansion logic is preserved.
+//
+// Why three patches and not one: pdf-lib ships compiled against
+// tslib 1.x, whose `__exportStar` does a value-copy (`exports[p] =
+// m[p]`) rather than installing a live getter. So by the time
+// PDFNumber.js's `index_1.numberToString(value)` runs, `index_1` (the
+// utils/index barrel) holds a captured reference to the original
+// function, and mutating `numbers.numberToString` alone is invisible
+// to the call site. We patch the captured copies along the re-export
+// chain: utils/numbers (source), utils/index (the barrel PDFNumber
+// reads from), and pdf-lib's top-level index (the public surface).
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-number-to-string.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const numbers     = require('pdf-lib/cjs/utils/numbers.js');
+const utilsBarrel = require('pdf-lib/cjs/utils/index.js');
+const topBarrel   = require('pdf-lib/cjs/index.js');
+
+if (!numbers.__fastNumberToStringInstalled) { // install-once guard
+  const original = numbers.numberToString;
+  const fastNumberToString = function fastNumberToString(num) {
+    const numStr = String(num);
+    if (numStr.indexOf('e') === -1) return numStr; // plain decimal: nothing to expand
+    return original(num); // exponential form: upstream expands it
+  };
+  numbers.numberToString     = fastNumberToString; // source module
+  utilsBarrel.numberToString = fastNumberToString; // barrel that PDFNumber reads from
+  topBarrel.numberToString   = fastNumberToString; // public pdf-lib index
+  numbers.__fastNumberToStringInstalled = true;
+}
diff --git a/docs/lib/fast-parse-number.mjs b/docs/lib/fast-parse-number.mjs
index 661a40eb..0f202d0a 100644
--- a/docs/lib/fast-parse-number.mjs
+++ b/docs/lib/fast-parse-number.mjs
@@ -1,24 +1,28 @@
-// Replace pdf-lib's BaseParser.parseRawNumber with a direct-integer
-// accumulator that skips per-byte string concatenation, charFromCode
-// calls, and the trailing Number() string-parse round-trip.
+// Replace pdf-lib's BaseParser.parseRawNumber and BaseParser.parseRawInt
+// with direct-integer accumulators that skip per-byte string
+// concatenation, charFromCode calls, and the trailing Number()
+// string-parse round-trip.
 //
-// The upstream implementation
-// ([BaseParser.js:33](node_modules/pdf-lib/cjs/core/parser/BaseParser.js:33))
-// builds `value` one character at a time via `value += charFromCode(byte)`,
-// then calls `Number(value)` to convert the string back to a number,
-// then performs `isFinite` + MAX_SAFE_INTEGER guards on every call.
-// Every numeric token in a PDF flows through this path
-// (PDFObjectParser.parseNumberOrRef invokes it once per number, twice
-// per indirect ref), so on the book it fires hundreds of thousands of
-// times and allocates a throwaway string per call.
+// The upstream implementations
+// ([BaseParser.js:17 + :33](node_modules/pdf-lib/cjs/core/parser/BaseParser.js:17))
+// build `value` one character at a time via `value += charFromCode(byte)`,
+// then call `Number(value)` to convert the string back to a number,
+// then perform `isFinite` (and for parseRawNumber, MAX_SAFE_INTEGER)
+// guards on every call. Every numeric token in a PDF flows through
+// these paths: parseRawNumber via PDFObjectParser.parseNumberOrRef
+// (once per number, twice per indirect ref), parseRawInt via
+// PDFParser.parseIndirectObjectHeader (twice per indirect object) and
+// PDFObjectStreamParser (twice per object inside an ObjStm). On the
+// book this fires hundreds of thousands of times and allocates a
+// throwaway string per call.
 //
 // The fast path accumulates the integer directly (n = n*10 + (byte -
-// 0x30)) and only descends into decimal handling when a period appears.
-// Falls back to the original for:
+// 0x30)). parseRawNumber additionally descends into decimal handling
+// when a period appears. Both fall back to the original for:
 //   - Numbers with > 15 integer digits (where direct accumulation
 //     could exceed Number.MAX_SAFE_INTEGER and lose precision).
-//   - Empty-digit cases (e.g., "."), so upstream's NumberParsingError
-//     keeps its diagnostic context.
+//   - Empty-digit cases (e.g., bare sign or lone "."), so upstream's
+//     NumberParsingError keeps its diagnostic context.
 // Both fallback paths are vanishingly rare on real PDFs.
 //
 // Mechanism: BaseParser isn't re-exported by pdf-lib's index, so we
@@ -49,6 +53,31 @@ const MAX_SAFE_INT_DIGITS = 15;
 
 if (!BaseParser.__fastParseNumberInstalled) {
   const origParseRawNumber = BaseParser.prototype.parseRawNumber;
+  const origParseRawInt = BaseParser.prototype.parseRawInt;
+
+  BaseParser.prototype.parseRawInt = function fastParseRawInt() { // unsigned-integer fast path
+    const bytes = this.bytes;
+    const start = bytes.offset(); // remembered so the fallbacks can rewind
+
+    let n = 0;
+    let digits = 0;
+    let byte = bytes.peek();
+    while (!bytes.done() && IsDigit[byte]) {
+      if (digits >= MAX_SAFE_INT_DIGITS) { // a 16th digit could lose precision
+        bytes.moveTo(start);
+        return origParseRawInt.call(this); // upstream re-parses from `start`
+      }
+      n = n * 10 + (byte - ZERO);
+      digits++;
+      bytes.next();
+      byte = bytes.peek();
+    }
+    if (digits === 0) { // no digits: let upstream raise its own error
+      bytes.moveTo(start);
+      return origParseRawInt.call(this);
+    }
+    return n;
+  };
 
   BaseParser.prototype.parseRawNumber = function fastParseRawNumber() {
     const bytes = this.bytes;
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index f70f7749..15fa4b2a 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -43,13 +43,22 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 //     on the one pdf-lib call site that uses it
 //     (PDFCrossRefStreamParser during load). Negligible cost shift,
 //     but eliminates the last pdf-lib -> pako call at runtime.
-//   fast-parse-number -- direct-integer accumulator in front of
-//     BaseParser.parseRawNumber, skipping per-byte string concat
-//     and the trailing Number() round-trip. Touches every numeric
-//     token parsed during PDFDocument.load.
+//   fast-parse-number -- direct-integer accumulators in front of
+//     BaseParser.parseRawNumber + parseRawInt, skipping per-byte
+//     string concat and the trailing Number() round-trip. Touches
+//     every numeric token parsed during PDFDocument.load.
+//   fast-decode-name -- cache in front of PDFName.of that skips
+//     the decodeName regex scan when the input has no `#` (which
+//     is 99.999 % of the ~2.8 M PDFName.of calls per load).
+//   fast-number-to-string -- short-circuit numberToString when
+//     `String(num)` already lacks an `e` (i.e. for every PDF number
+//     that's not in the exponential-notation tail). Skips a
+//     redundant toString + split + parseInt per call.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
+import './lib/fast-decode-name.mjs';
+import './lib/fast-number-to-string.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/README.md b/perf/README.md
index 188c02f9..5bab84ba 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -81,6 +81,20 @@ Flag rationale:
   `objectsPerStream: 500`. Production runs through it; same logic.
   Moves ~300 ms of zlib work off the main thread, and routes every
   deflate call through `node:zlib` (no pdf-lib pure-JS fallback).
+- `--fast-decode-name` -- inject
+  [docs/lib/fast-decode-name.mjs](../docs/lib/fast-decode-name.mjs), a
+  parallel `Map<string, PDFName>` in front of `PDFName.of` that
+  skips the `decodeName` regex scan when the raw name has no `#`
+  hex escape (99.999 % of the ~2.8 M `PDFName.of` calls per load).
+  Production runs through it; ~530 ms saved on process.
+- `--fast-number-to-string` -- inject
+  [docs/lib/fast-number-to-string.mjs](../docs/lib/fast-number-to-string.mjs),
+  short-circuiting pdf-lib's `numberToString` when `String(num)`
+  already lacks an `e` (i.e. for every PDF number that isn't in
+  the exponential-notation tail -- 100 % of ~290 k calls on the
+  book). Skips a redundant `toString` + `split` + `parseInt` per
+  call. Production runs through it. Profile self-time on the
+  function drops from ~45-50 ms (~2 % of process) to ~5-12 ms.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -103,7 +117,8 @@ Why no `--render-only`? `--cpu-profile-process` requires the process
 phase to run; the harness errors out if you combine them.
 
 To compare against upstream pdf-lib (e.g. when proposing a change
-upstream), drop `--fast-refs` and `--parallel-deflate`. Caveat for
+upstream), drop `--fast-refs`, `--parallel-deflate`,
+`--fast-decode-name`, and `--fast-number-to-string`. Caveat for
 A/B work: profiler-on attribution overstates the cost of hot
 functions called millions of times (`PDFRef.of` in particular).
 For "did this wall-clock change," do a paired no-profile A/B as a
@@ -234,6 +249,8 @@ run.bat --chrome-outline                  # let Chrome emit /Outlines (skip pars
 run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
 run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path (ships in render-book.mjs by default; opt-in here for A/B)
 run.bat --parallel-deflate                # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B)
+run.bat --fast-decode-name                # skip decodeName regex when name has no # (also ships; opt-in here for A/B)
+run.bat --fast-number-to-string           # skip numberToString redundant toString/split when no exponential (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -343,6 +360,8 @@ file documenting each:
 | `pako.deflate` → `node:zlib.deflateSync` | [08](notes/08-pdf-lib.md) | ~1.5 s process (save -58 %) |
 | `PDFRef.of` dense-array cache (gen=0) | [08](notes/08-pdf-lib.md) | ~0.2 s process (load -16 %) |
 | Parallel deflate + `objectsPerStream: 500` | [08](notes/08-pdf-lib.md) | ~0.3 s process (zlib off-thread; PDF -5 %) |
+| `PDFName.of` no-`#` cache (skip `decodeName` regex) | [08](notes/08-pdf-lib.md) | ~0.5 s process (load -17 %, GC -101 ms) |
+| `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise |
 
 What was tried and didn't ship:
 
@@ -369,4 +388,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path. |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 8844a80d..00c9d1ac 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -29,6 +29,7 @@
 //                    [--incremental] [--chrome-outline] [--timing]
 //                    [--clone-count] [--render-only]
 //                    [--fast-refs] [--parallel-deflate]
+//                    [--fast-decode-name] [--fast-number-to-string]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -103,6 +104,16 @@
 // parallel on libuv's thread pool with objectsPerStream=500 (vs
 // pdf-lib's serial save with default 50). Moves ~300 ms of zlib work
 // off the main thread on the book.
+//
+// --fast-decode-name installs a parallel cache in front of PDFName.of
+// that skips the decodeName regex scan when the raw name contains
+// no `#` hex escape (which is 99.999 % of the ~2.8 M PDFName.of
+// calls per load on the book). ~0.5 s saved on the process phase.
+//
+// --fast-number-to-string short-circuits pdf-lib's numberToString
+// when String(num) already lacks an `e`. Skips a redundant toString,
+// split, and parseInt per call; only the rare exponential-notation
+// tail still falls through to the original implementation.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
@@ -147,6 +158,8 @@ let renderOnly = false;
 let tracing = false;
 let fastRefs = false;
 let parallelDeflate = false;
+let fastDecodeName = false;
+let fastNumberToString = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -170,6 +183,8 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
   else if (a === '--fast-refs') fastRefs = true;
   else if (a === '--parallel-deflate') parallelDeflate = true;
+  else if (a === '--fast-decode-name') fastDecodeName = true;
+  else if (a === '--fast-number-to-string') fastNumberToString = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -215,6 +230,14 @@ if (fastRefs) {
   await import('../docs/lib/fast-refs.mjs');
   console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0');
 }
+if (fastDecodeName) {
+  await import('../docs/lib/fast-decode-name.mjs');
+  console.log('[harness] fast-decode-name: skip decodeName regex when name has no #');
+}
+if (fastNumberToString) {
+  await import('../docs/lib/fast-number-to-string.mjs');
+  console.log('[harness] fast-number-to-string: skip redundant toString/split when no exponential');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 351a403e..8d2c9dfa 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -679,14 +679,15 @@ No harness flag -- the per-load cost is below the profile noise
 floor; this lands for the architectural reason, not a measurable
 win.
 
-## `BaseParser.parseRawNumber`: direct-integer accumulator
+## `BaseParser.parseRawNumber` + `parseRawInt`: direct-integer accumulators
 
 After `fast-deflate` + `fast-refs` + `parallel-deflate`, the load
-side of the bottom-up table shifted onto the parser frames. One of
-them is `BaseParser.parseRawNumber`, called once per numeric token
-encountered during `PDFDocument.load` and twice per `N gen R`
-indirect reference -- so on the book it fires several hundred
-thousand times per load.
+side of the bottom-up table shifted onto the parser frames. Two of
+them are `BaseParser.parseRawNumber` (called once per numeric
+token, twice per `N gen R` indirect reference) and
+`BaseParser.parseRawInt` (called twice per indirect-object header
+and twice per object inside an `ObjStm`). Between them they fire
+hundreds of thousands of times per load on the book.
 
 The upstream implementation
 (`pdf-lib/.../parser/BaseParser.js:33`) builds the number as a
@@ -710,11 +711,12 @@ string back into a double, then runs guards. The string allocation
 
 ### The shim
 
-`docs/lib/fast-parse-number.mjs` mutates
-`BaseParser.prototype.parseRawNumber` to accumulate the integer
-directly (`n = n * 10 + (byte - 0x30)`), descending into decimal
-handling only when a period appears. Falls back to the original
-for:
+`docs/lib/fast-parse-number.mjs` mutates both
+`BaseParser.prototype.parseRawNumber` and
+`BaseParser.prototype.parseRawInt` to accumulate the integer
+directly (`n = n * 10 + (byte - 0x30)`). The number variant
+additionally descends into decimal handling when a period appears.
+Both fall back to the original for:
 
 - **More than 15 integer digits** -- direct accumulation could
   exceed `Number.MAX_SAFE_INTEGER` (16 digits) and silently lose
@@ -743,31 +745,351 @@ is high enough to matter -- to be measured later when the
 follow-on work (size-in-bytes / iterator / parseDict shims) makes
 the parser side worth quantifying as a group.
 
+## `decodeName`: skip the regex on the 99.999 % no-`#` path
+
+The earlier closing summary above wrote off `decodeName` as "close
+to fundamental pdf-lib work." Re-reading the function on a later
+pass disproved that.
+
+`pdf-lib/.../objects/PDFName.js:9`:
+
+```js
+var decodeName = function (name) {
+    return name.replace(/#([\dABCDEF]{2})/g, function (_, hex) {
+        return utils_1.charFromHexCode(hex);
+    });
+};
+```
+
+PDF spec (ISO 32000-1 §7.3.5) requires `#XX` hex-escape for any
+byte outside the printable-ASCII regular range plus delimiters /
+whitespace. `decodeName` reverses that on every `PDFName.of(name)`
+call so the pool key is the canonical decoded form, dedup'ing
+`/foo#20bar` and `/foo bar` to the same instance.
+
+The catch: the regex has to scan every byte of every name looking
+for `#`, even when there is none.
+
+### Workload shape
+
+Instrumented `PDFName.of` on the book, counting calls and how
+often the input contains a `#`:
+
+```
+PDFName.of calls       : 2,759,635
+  raw input has # char : 2 (0.000%)
+```
+
+Two. In 2.76 million calls. The other 2,759,633 are regex scans
+against strings like `Type`, `S`, `P`, `Pg`, `StructElem`, `Kids`,
+`Count`, `Filter`, `FlateDecode` -- ordinary PDF names that need
+no escaping. We measured ~214 ms (7 %) of process self-time on
+`decodeName` and another ~91 ms on `PDFName.of`'s body that calls
+it.
+
+### The shim
+
+`docs/lib/fast-decode-name.mjs` follows the `fast-refs.mjs` shape:
+cache in front of `PDFName.of` rather than replacing it. The key
+insight is that when `name` has no `#`, the decoded form equals
+the raw form, so the raw `name` is already a valid pool key for
+pdf-lib's internal dedup pool -- a fast-side `Map<string, PDFName>`
+keyed by the raw input returns the same `PDFName` instance pdf-lib
+would have produced after a regex scan + pool lookup, without ever
+running the regex.
+
+```js
+import { PDFName } from "pdf-lib";
+
+if (!PDFName.__fastDecodeNameInstalled) {
+  const original = PDFName.of;
+  const fastCache = new Map();
+  PDFName.of = function fastOf(name) {
+    if (name.indexOf("#") === -1) {
+      const cached = fastCache.get(name);
+      if (cached) return cached;
+      const instance = original.call(PDFName, name);
+      fastCache.set(name, instance);
+      return instance;
+    }
+    return original.call(PDFName, name);
+  };
+  PDFName.__fastDecodeNameInstalled = true;
+}
+```
+
+Names with `#` fall through to the original -- the dual canonical-
+form contract is preserved exactly. Static `PDFName.Length`,
+`PDFName.FlateDecode`, ... initialisers ran when pdf-lib's module
+body executed (before the shim imports), so pdf-lib's pool is
+already populated with the canonical instances; the parser then
+hits the fast cache on every subsequent reference.
+
+### Results
+
+Paired A/B, four interleaved runs (`pre1 post1 pre2 post2`),
+`--detach-pages --no-timing --fast-refs --parallel-deflate
+--cpu-profile-process --cpu-sampling 100`, same 1651-page book:
+
+| metric        | pre avg | post avg | Δ                |
+| ------------- | ------- | -------- | ---------------- |
+| **process**   | **2.74 s** | **2.21 s** | **-0.53 s (-19 %)** |
+| ↳ load        | 1.69 s  | 1.40 s   | -0.29 s (-17 %) |
+| ↳ setOutline  | 0.01 s  | 0.01 s   | unchanged |
+| ↳ save        | 1.04 s  | 0.81 s   | -0.23 s (-22 %) |
+| pdf size      | 16.1 MB | 16.1 MB  | byte-identical pairwise (pre1↔post1, pre2↔post2; 31 B intra-pair drift is `/CreationDate`) |
+
+The load drop is what the instrumentation predicted. The save drop
+was a surprise -- save doesn't call `PDFName.of` to build outline
+metadata in the hot path, so the saving is almost certainly GC
+pressure relief from no longer allocating ~2.76 M regex-match
+objects during load.
+
+Profile diff (single run each, same flags):
+
+| function | PRE | POST | Δ |
+| --- | --- | --- | --- |
+| `decodeName`             | 214 ms (7.4 %) | not in top 15 | **-214 ms** |
+| `PDFName.of`             |  91 ms (3.1 %) | not in top 15 | **-91 ms** |
+| `fastOf` (the shim body) | n/a            |  91 ms (4.1 %) | +91 ms |
+| `(garbage collector)`    | 339 ms (11.7 %) | 238 ms (10.8 %) | -101 ms |
+| profile duration         | 2.92 s         | 2.22 s | -0.70 s |
+
+The `fastOf` row sits at the same self-time as the old
+`PDFName.of` forwarder (~91 ms) -- that's the per-call cost of the
+`indexOf` check + `fastCache.get` + return, which all calls now
+pay. The 214 ms `decodeName` row is gone entirely (regex never
+runs on the fast path), and the GC drop is the allocator relief.
+
+### Production confirmation
+
+Two consecutive `book.bat` runs with all four shims live
+(`fast-refs`, `fast-parse-number`, `parallel-deflate`,
+`fast-decode-name`):
+
+| metric | run 1 | run 2 |
+| --- | --- | --- |
+| render   | 8.9 s | 8.3 s |
+| generate | 39.3 s | 37.6 s |
+| process  | **1.6 s** | **1.6 s** |
+| total    | 51.8 s | 50.0 s |
+
+Process is now ~1.6 s on the production path, off the profiler.
+The harness numbers above are higher (~2.2 s post-fix) because of
+profiler-on attribution overhead at 100 us sampling -- the same
+caveat the `PDFRef.of` section flagged. The paired-A/B delta from
+the harness (-0.53 s) is the correct measure of the shim's win;
+the absolute 1.6 s is the production floor.
+
+### Methodology note
+
+This one almost didn't get found. The earlier "what's left" summary
+explicitly wrote `decodeName` off as "close to fundamental" parser
+work, on the strength of it living in a single regex line. The
+actual investigation took 30 seconds: read the function, ask
+"what's the hit rate of that regex on real PDF names?", instrument
+with a one-liner counter, find that the answer is 0.0001 %. Worth
+re-checking the "fundamental" label on remaining JS-body rows
+whenever a small change to the workload might invert it.
+
+## `numberToString`: skip the redundant toString/split on the 100 % no-`e` path
+
+`pdf-lib/.../utils/numbers.js:13` is pdf-lib's `.toString()`
+replacement that suppresses exponential notation -- PDF syntax
+requires plain decimal in the object body (`1e-7` is invalid), so
+every numeric token written into the file goes through:
+
+```js
+exports.numberToString = function (num) {
+    var numStr = String(num);
+    if (Math.abs(num) < 1.0) {
+        var e = parseInt(num.toString().split('e-')[1]);
+        if (e) { /* expand "1e-7" -> "0.0000001" */ }
+    } else {
+        var e = parseInt(num.toString().split('+')[1]);
+        if (e > 20) { /* expand "1e+21" -> "100...0" */ }
+    }
+    return numStr;
+};
+```
+
+`numStr` is computed up front via `String(num)`. Then -- regardless
+of whether `numStr` actually contains an `e` -- the function calls
+`num.toString()` *again*, allocates a `.split(...)` array, and
+runs `parseInt` on the (almost always undefined) result. Pure
+overhead on every call where `String(num)` already returned a
+plain decimal, which on a real PDF is every call.
+
+### Workload shape
+
+Instrumented `numberToString` on the book, counting fast-path
+(`String(num).indexOf('e') === -1`) vs slow-path hits:
+
+```
+numberToString calls : 290,231
+  String(num) has 'e' : 0 (0.000 %)
+```
+
+Zero. Of 290 k calls. `String(num)` returns exponential notation
+only when `|num| < 1e-6` or `|num| >= 1e21`, and a PDF's object
+refs, generations, byte offsets, content-stream coordinates,
+`/Size`, `/Length` etc. never land in either tail. The `split` /
+`parseInt` probe guarding the `e` cases is paid 290 k times to handle 0.
+
+### The shim
+
+`docs/lib/fast-number-to-string.mjs` short-circuits the no-`e`
+case and delegates the rare exponential cases to the original
+implementation unchanged:
+
+```js
+const fastNumberToString = function fastNumberToString(num) {
+  const numStr = String(num);
+  if (numStr.indexOf('e') === -1) return numStr;
+  return original(num);
+};
+numbers.numberToString     = fastNumberToString;
+utilsBarrel.numberToString = fastNumberToString;
+topBarrel.numberToString   = fastNumberToString;
+```
+
+### Wiring gotcha: tslib 1.x value-copy re-exports
+
+pdf-lib ships compiled against `tslib@1.14.1`, whose
+`__exportStar` is:
+
+```js
+function (m, exports) {
+    for (var p in m) if (p !== "default" && !exports.hasOwnProperty(p)) exports[p] = m[p];
+}
+```
+
+A plain value-copy. tslib 2.x replaced this with a live getter
+(`Object.defineProperty(o, p, { get: () => m[p] })`), so on modern
+compilations a single `numbers.numberToString = fast` patch would
+propagate through every re-export automatically. On 1.x it
+doesn't.
+
+`PDFNumber`'s call site -- the only consumer of `numberToString`
+in pdf-lib's source -- reads from the utils-barrel, not from
+`numbers.js` directly:
+
+```js
+// PDFNumber.js
+var index_1 = require("../../utils/index");
+...
+_this.stringValue = index_1.numberToString(value);   // <-- captured copy
+```
+
+Because `import { PDFDocument } from 'pdf-lib'` runs *before* the
+shim's dynamic import, the barrel has already executed
+`__exportStar(numbersModule, exports)` and stamped its own copy of
+the original function. Mutating `numbers.numberToString`
+afterwards is invisible to `PDFNumber`. The first iteration of
+this shim looked installed (the standalone test showed the patched
+function on the barrel, because that test imported the barrel
+*after* the shim) but the harness counter recorded 0 hits on the
+patched body -- the upstream function was still hot in the profile
+under its original name.
+
+Fix: patch every re-export in the chain that captures by value:
+`utils/numbers` (the source), `utils/index` (the barrel
+`PDFNumber` reads from), and `cjs/index` (pdf-lib's top-level,
+which `__exportStar`s the utils barrel onward to anyone importing
+from `'pdf-lib'`). All three get the same `fastNumberToString`
+reference.
+
+The `fast-decode-name` / `fast-refs` / `fast-parse-number` shims
+don't hit this trap because their targets are class-static methods
+(`PDFName.of`, `PDFRef.of`) or `BaseParser.prototype` methods --
+all looked up at call time via the class/prototype object, not via
+a captured value. `numberToString` is the first free function
+we've patched in pdf-lib.
+
+### Results
+
+Paired A/B, two interleaved runs each (`pre1 post1 pre2 post2`),
+`--detach-pages --no-timing --fast-refs --parallel-deflate
+--fast-decode-name --cpu-profile-process --cpu-sampling 100`,
+same 1638-page book:
+
+| metric                                  | pre1   | pre2   | post1  | post2  |
+| ---                                     | ---    | ---    | ---    | ---    |
+| upstream `numberToString` self-time     | 45 ms  | 51 ms  | 0 ms   | 0 ms   |
+| shim `fastNumberToString` self-time     | n/a    | n/a    | 5 ms   | 12 ms  |
+| **combined self-time on this function** | **45 ms** | **51 ms** | **5 ms** | **12 ms** |
+| slow-path delegations to original       | n/a    | n/a    | 0      | 0      |
+
+The `String(num).indexOf('e') === -1` short-circuit fires on 100 %
+of calls; the upstream function is unreachable in practice.
+Function-level self-time drops by ~80 % (~40 ms saved on the hot
+function), the redundant `num.toString()` + `.split(...)` +
+`parseInt(...)` work gone from the trace.
+
+Wall-clock process-phase numbers on this dev machine bounce around
+enough run-to-run (~±0.15 s) that the ~40 ms function-level saving
+is invisible at the phase total -- both pre and post sit near
+2.05 s. The profile-level evidence is the real signal: the cycles
+were redundant, they're not being spent any more.
+
+### Methodology note
+
+The first cut of this shim mutated `numbers.numberToString` only,
+following the assumption that pdf-lib's re-exports would propagate
+the change. The hit counter (`fast=0 slow=0` on a full book run)
+caught the mistake before the README evidence was written -- a
+shim that *looks* installed but never actually runs would have
+shown identical "before" and "after" profile numbers within noise,
+indistinguishable from a no-op patch.
+
+Lesson for the next pdf-lib shim of a free function (rather than a
+class method): check `tslib.__exportStar`'s shape before assuming
+a single-site patch works.
+
+## `@cantoo/pdf-lib`: not a drop-in replacement
+
+Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
+alternative to Hopding's abandoned `pdf-lib` 1.17.1. Source-diff:
+the four hot paths our shims address (`PDFRef.of`'s string-keyed
+pool, `decodeName`'s unconditional regex, `parseRawInt` /
+`parseRawNumber`'s per-byte string concat,
+`PDFFlateStream.computeContents`'s synchronous pako call) are
+byte-identical to upstream. Paired A/B on the book confirmed:
+cantoo without shims runs the process phase in ~150 s vs our ~1.5 s
+with shims, and has its own footguns (silent compression-disable
+on PDF < 1.5, separate save-path pathology with `useObjectStreams:
+true` that wasn't chased). Not a drop-in replacement; staying on
+Hopding + shims.
+
 ## Where this leaves the picture
 
-Cumulative process-phase cost, baseline → after all three shims:
-
-| state                              | process | load | save |
-| ---                                | ---     | ---  | ---  |
-| original (Slow / 50 defaults)      | ~40 s   | ~36 s| ~4 s |
-| + parseSpeed:Fastest               | ~5 s    | ~2 s | ~3 s |
-| + fast-deflate                     | ~2.5 s  | ~1.5s| ~1 s |
-| + fast-refs                        | ~2.3 s  | ~1.3 s | ~1 s |
-| **+ parallel-deflate (this section)** | **~2.0 s** | **~1.3 s** | **~0.7 s** |
-
-The bottom-up after parallel deflate is dominated by pdf-lib's
-parser frames -- `PDFDict.entries` (8 %), `decodeName` (8 %), GC
-(8 %), `parseRawNumber` (6 %), `PDFRef.of` (5 %, the gen != 0
-residue). All load-phase, all O(input bytes), all close to
-fundamental pdf-lib work. Further wins in this phase would mean
-rewriting pdf-lib's parser.
-
-The pdf-lib roundtrip path is now ~2.0 s of a ~50 s build. The
-incremental writer's 0.25 s process phase (see
-[01-baseline-and-detach.md](01-baseline-and-detach.md)) is still
-strictly faster on process alone, but the pdf-lib path delivers a
-15.3 MB output vs incremental's 53 MB, and the ~2 s gap on a 50 s
-build doesn't justify the file-size cost for our pipeline.
+Cumulative process-phase cost, baseline → after the shims to date:
+
+| state                                | process | load | save |
+| ---                                  | ---     | ---  | ---  |
+| original (Slow / 50 defaults)        | ~40 s   | ~36 s| ~4 s |
+| + parseSpeed:Fastest                 | ~5 s    | ~2 s | ~3 s |
+| + fast-deflate                       | ~2.5 s  | ~1.5s| ~1 s |
+| + fast-refs                          | ~2.3 s  | ~1.3 s | ~1 s |
+| + parallel-deflate                   | ~2.0 s  | ~1.3 s | ~0.7 s |
+| **+ fast-decode-name + fast-number-to-string (this section)** | **~1.6 s** | **~1.0 s** | **~0.6 s** |
+
+The bottom-up after the latest pair is what's left of pdf-lib's
+genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,
+`PDFObjectParser.parseDict`, GC, with no remaining JS-body row
+sitting on "regex scanning for something that's never there" or
+"redundant `toString` round-trip" shape. The `fastOf` row at
+~91 ms is a real floor for any cache-in-front approach: the
+`indexOf` + `Map.get` cost ~33 ns per call across 2.76 M calls.
+
+The pdf-lib roundtrip path is now ~1.6 s on production
+(profiler-off; the harness reports ~2.0-2.2 s with profiler-on
+attribution overhead). The incremental writer's 0.25 s process
+phase (see [01-baseline-and-detach.md](01-baseline-and-detach.md))
+is still strictly faster on process alone, but the pdf-lib path
+delivers a 15.3 MB output vs incremental's 53 MB, and the ~1.4 s
+gap on a 50 s build doesn't justify the file-size cost for our
+pipeline.
 
 The strategic note from earlier phases still stands: generate's
 ~38 s in `page.pdf()` is the remaining lever, and `pageRanges`

From cbd1534cc90c654868325b6191eccbf45bb14291 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:21:57 +0200
Subject: [PATCH 07/44] Speed up size-in-bytes, add previous patches to
 measure.mjs.

---
 docs/lib/fast-size-in-bytes.mjs |  62 ++++++++++++++
 docs/render-book.mjs            |   5 ++
 perf/README.md                  |  44 ++++++++--
 perf/measure.mjs                |  41 +++++++++
 perf/notes/08-pdf-lib.md        | 142 +++++++++++++++++++++++++++++++-
 5 files changed, 284 insertions(+), 10 deletions(-)
 create mode 100644 docs/lib/fast-size-in-bytes.mjs

diff --git a/docs/lib/fast-size-in-bytes.mjs b/docs/lib/fast-size-in-bytes.mjs
new file mode 100644
index 00000000..779ade41
--- /dev/null
+++ b/docs/lib/fast-size-in-bytes.mjs
@@ -0,0 +1,62 @@
+// Replace pdf-lib's utils.sizeInBytes -- which allocates a base-2 string
+// just to count its bit length -- with a non-allocating short-circuit
+// ladder.
+//
+// The upstream sizeInBytes
+// ([numbers.js:37](node_modules/pdf-lib/cjs/utils/numbers.js:37)) is:
+//
+//   exports.sizeInBytes = function (n) {
+//       return Math.ceil(n.toString(2).length / 8);
+//   };
+//
+// It's called from PDFCrossRefStream.computeMaxEntryByteWidths (three
+// calls per xref entry, ~50 k entries on the book) and from
+// utils.bytesFor (to size the Uint8Array before filling it byte-by-
+// byte, called from PDFCrossRefStream.getUnencodedContents). Both
+// paths are part of writing the cross-reference stream.
+//
+// For the xref values the distribution is heavily skewed small: type
+// is always 0/1/2 (1 byte), generationNumber is always 0 (1 byte),
+// object-stream indices are small (1-2 bytes), and file offsets are
+// 3-4 bytes for any sub-4GB PDF. A short-circuit ladder catches the
+// dominant cases in one compare; the rare 5+ byte tail falls through
+// to a Math.clz32-based fallback that's still allocation-free.
+//
+// Why patch three places (and why bytesFor isn't on the list):
+// pdf-lib ships compiled against tslib 1.x, whose `__exportStar`
+// does a value-copy (`exports[p] = m[p]`) rather than installing a
+// live getter. So consumers that read sizeInBytes through a barrel
+// (`utils_1.sizeInBytes(...)` from PDFCrossRefStream) hold a
+// captured reference and won't see a mutation of `numbers.sizeInBytes`
+// alone. Patch all three barrel layers (utils/numbers, utils/index,
+// top-level index) to cover every observed call site. utils.bytesFor
+// reads `exports.sizeInBytes` at call time from the same module
+// object we mutate first, so it picks up the fast path without a
+// separate patch.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-size-in-bytes.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const numbers     = require('pdf-lib/cjs/utils/numbers.js');
+const utilsBarrel = require('pdf-lib/cjs/utils/index.js');
+const topBarrel   = require('pdf-lib/cjs/index.js');
+
+if (!numbers.__fastSizeInBytesInstalled) { // install once; the flag set at the bottom turns repeated imports into no-ops
+  const fastSizeInBytes = function fastSizeInBytes(n) { // bytes needed to hold the non-negative integer n, without building n.toString(2)
+    if (n < 0x100) return 1; // below 2^8
+    if (n < 0x10000) return 2; // below 2^16
+    if (n < 0x1000000) return 3; // below 2^24
+    if (n < 0x100000000) return 4; // below 2^32
+    return 4 + Math.ceil((32 - Math.clz32(Math.floor(n / 0x100000000))) / 8); // 4 low bytes + byte length of the high word (n / 2^32); NOTE(review): Math.clz32 coerces to uint32, so this only holds for n < 2^64
+  };
+  numbers.sizeInBytes     = fastSizeInBytes; // utils/numbers: the defining module
+  utilsBarrel.sizeInBytes = fastSizeInBytes; // utils/index: holds its own value-copy of the export
+  topBarrel.sizeInBytes   = fastSizeInBytes; // cjs/index: top-level re-export, also a value-copy
+  numbers.__fastSizeInBytesInstalled = true; // checked by the guard above
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 15fa4b2a..bccf6a38 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -54,11 +54,16 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 //     `String(num)` already lacks an `e` (i.e. for every PDF number
 //     that's not in the exponential-notation tail). Skips a
 //     redundant toString + split + parseInt per call.
+//   fast-size-in-bytes -- replace utils.sizeInBytes (which allocates
+//     `n.toString(2)` just to count its bit length) with a non-
+//     allocating short-circuit ladder. Called ~300 k times per save
+//     from PDFCrossRefStream's xref writer.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
 import './lib/fast-decode-name.mjs';
 import './lib/fast-number-to-string.mjs';
+import './lib/fast-size-in-bytes.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/README.md b/perf/README.md
index 5bab84ba..3b97916e 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -95,6 +95,30 @@ Flag rationale:
   book). Skips a redundant `toString` + `split` + `parseInt` per
   call. Production runs through it. Profile self-time on the
   function drops from ~45-50 ms (~2 % of process) to ~5-12 ms.
+- `--fast-size-in-bytes` -- inject
+  [docs/lib/fast-size-in-bytes.mjs](../docs/lib/fast-size-in-bytes.mjs),
+  replacing pdf-lib's `utils.sizeInBytes` (which allocates
+  `n.toString(2)` just to count its bit length) with a non-
+  allocating short-circuit ladder. Called ~300 k times per save
+  from `PDFCrossRefStream`'s xref writer; the dominant inputs
+  are 1-2 byte values so a `n < 0x100 ? 1 : ...` ladder catches
+  most calls in one compare. Production runs through it. ~60 ms
+  saved on process.
+- `--fast-inflate` -- inject
+  [docs/lib/fast-inflate.mjs](../docs/lib/fast-inflate.mjs), swapping
+  `pako.inflate` for `node:zlib.inflateSync` on the one path
+  pdf-lib uses it (the compressed xref stream during load).
+  Negligible wall-clock; flag exists so paired A/Bs against pure
+  upstream pdf-lib can keep the rest of the perf set on while
+  isolating this swap. Production runs through it.
+- `--fast-parse-number` -- inject
+  [docs/lib/fast-parse-number.mjs](../docs/lib/fast-parse-number.mjs),
+  replacing `BaseParser.parseRawNumber` / `parseRawInt` with
+  direct-integer accumulators (`n = n*10 + (byte - 0x30)`) that
+  skip per-byte string concat and the trailing `Number()` round-
+  trip. Every numeric token parsed during `PDFDocument.load`
+  flows through these -- hundreds of thousands of calls per load
+  on the book. Production runs through it.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -116,13 +140,11 @@ Profile one phase at a time.
 Why no `--render-only`? `--cpu-profile-process` requires the process
 phase to run; the harness errors out if you combine them.
 
-To compare against upstream pdf-lib (e.g. when proposing a change
-upstream), drop `--fast-refs`, `--parallel-deflate`,
-`--fast-decode-name`, and `--fast-number-to-string`. Caveat for
-A/B work: profiler-on attribution overstates the cost of hot
-functions called millions of times (`PDFRef.of` in particular).
-For "did this wall-clock change," do a paired no-profile A/B as a
-sanity check.
+To compare against upstream pdf-lib (e.g. when proposing a change
+upstream), drop every `--fast-*` flag and `--parallel-deflate`.
+Caveat for A/B work: profiler-on attribution overstates the cost of hot
+functions called millions of times (`PDFRef.of` in particular). For "did
+this wall-clock change," do a paired no-profile A/B as a sanity check.
 
 See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase
 investigations these flags enabled.
@@ -251,6 +273,9 @@ run.bat --fast-refs                       # dense-array cache for PDFRef.of's ge
 run.bat --parallel-deflate                # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B)
 run.bat --fast-decode-name                # skip decodeName regex when name has no # (also ships; opt-in here for A/B)
 run.bat --fast-number-to-string           # skip numberToString redundant toString/split when no exponential (also ships; opt-in here for A/B)
+run.bat --fast-size-in-bytes              # non-allocating ladder for xref byte-width (also ships; opt-in here for A/B)
+run.bat --fast-inflate                    # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B)
+run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -362,6 +387,7 @@ file documenting each:
 | Parallel deflate + `objectsPerStream: 500` | [08](notes/08-pdf-lib.md) | ~0.3 s process (zlib off-thread; PDF -5 %) |
 | `PDFName.of` no-`#` cache (skip `decodeName` regex) | [08](notes/08-pdf-lib.md) | ~0.5 s process (load -17 %, GC -101 ms) |
 | `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise |
+| `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) |
 
 What was tried and didn't ship:
 
@@ -388,4 +414,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 00c9d1ac..1de7eb71 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -30,6 +30,8 @@
 //                    [--clone-count] [--render-only]
 //                    [--fast-refs] [--parallel-deflate]
 //                    [--fast-decode-name] [--fast-number-to-string]
+//                    [--fast-size-in-bytes] [--fast-inflate]
+//                    [--fast-parse-number]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -114,6 +116,27 @@
 // when String(num) already lacks an `e`. Skips a redundant toString,
 // split, and parseInt per call; only the rare exponential-notation
 // tail still falls through to the original implementation.
+//
+// --fast-size-in-bytes replaces pdf-lib's utils.sizeInBytes -- which
+// allocates `n.toString(2)` just to count its bit length -- with a
+// non-allocating short-circuit ladder. Called ~300 k times per save
+// from PDFCrossRefStream's xref writer; the dominant inputs are
+// 1-2 byte values (type, gen, index, small obj-stream refs) so a
+// `n < 0x100 ? 1 : ...` ladder is the right shape.
+//
+// --fast-inflate swaps pako.inflate for node:zlib.inflateSync on
+// pdf-lib's one remaining pako call site (PDFCrossRefStreamParser
+// inflating the compressed cross-reference stream during
+// PDFDocument.load). One call per load, negligible wall-clock; flag
+// exists so paired A/Bs can compare against pure-pdf-lib behaviour.
+// Production runs through it.
+//
+// --fast-parse-number replaces pdf-lib's BaseParser.parseRawNumber
+// and parseRawInt with direct-integer accumulators (n = n*10 +
+// (byte - 0x30)) that skip per-byte string concatenation and the
+// trailing Number() round-trip. Every numeric token in a parsed
+// PDF flows through these; hundreds of thousands of calls per load
+// on the book. Production runs through it.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
@@ -160,6 +183,9 @@ let fastRefs = false;
 let parallelDeflate = false;
 let fastDecodeName = false;
 let fastNumberToString = false;
+let fastSizeInBytes = false;
+let fastInflate = false;
+let fastParseNumber = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -185,6 +211,9 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--parallel-deflate') parallelDeflate = true;
   else if (a === '--fast-decode-name') fastDecodeName = true;
   else if (a === '--fast-number-to-string') fastNumberToString = true;
+  else if (a === '--fast-size-in-bytes') fastSizeInBytes = true;
+  else if (a === '--fast-inflate') fastInflate = true;
+  else if (a === '--fast-parse-number') fastParseNumber = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -238,6 +267,18 @@ if (fastNumberToString) {
   await import('../docs/lib/fast-number-to-string.mjs');
   console.log('[harness] fast-number-to-string: skip redundant toString/split when no exponential');
 }
+if (fastSizeInBytes) {
+  await import('../docs/lib/fast-size-in-bytes.mjs');
+  console.log('[harness] fast-size-in-bytes: non-allocating ladder for xref byte-width');
+}
+if (fastInflate) {
+  await import('../docs/lib/fast-inflate.mjs');
+  console.log('[harness] fast-inflate: swap pako.inflate for node:zlib.inflateSync');
+}
+if (fastParseNumber) {
+  await import('../docs/lib/fast-parse-number.mjs');
+  console.log('[harness] fast-parse-number: direct-integer accumulator for parseRawNumber/parseRawInt');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 8d2c9dfa..4aa673e8 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -1046,6 +1046,145 @@ Lesson for the next pdf-lib shim of a free function (rather than a
 class method): check `tslib.__exportStar`'s shape before assuming
 a single-site patch works.
 
+## `sizeInBytes`: stop allocating a base-2 string just to count its bits
+
+A fresh process-phase profile under the post-`fast-decode-name` /
+`fast-number-to-string` shipping set (1638-page book, `--fast-refs
+--parallel-deflate --fast-decode-name --fast-number-to-string
+--cpu-profile-process --cpu-sampling 100`) put process at 1.95 s
+and showed an oddly-shaped row in the top-15:
+
+```
+   self_ms   self_%   function  @  source
+   -------   ------   ----------------------------------------------
+    213.02   10.97%   (garbage collector)
+    171.60    8.83%   PDFDict.entries          pdf-lib/PDFDict.js:22
+    144.16    7.42%   PDFRef.of                pdf-lib/PDFRef.js:34
+    ...
+     56.48    2.91%   exports.sizeInBytes      pdf-lib/utils/numbers.js:37
+```
+
+`sizeInBytes` is a one-line utility:
+
+```js
+exports.sizeInBytes = function (n) { return Math.ceil(n.toString(2).length / 8); };
+```
+
+It computes how many bytes a non-negative integer takes by
+stringifying it as base-2, counting characters, and dividing by 8.
+The string is thrown away immediately.
+
+`find-callers.mjs` attributed the 56 ms across two callers, both
+inside the xref-stream writer:
+
+| caller | attributed |
+| --- | --- |
+| `bytesFor` (`utils/numbers.js:49`) -- sizes the `Uint8Array` that gets filled byte-by-byte | 29.6 ms |
+| `PDFCrossRefStream.computeMaxEntryByteWidths` (`structures/PDFCrossRefStream.js:66`) -- 3 calls per xref entry to compute the `/W` widths | 26.9 ms |
+
+For a ~50 k-object PDF that's roughly 300 k `n.toString(2)` calls
+per save, each allocating a short-lived 1-to-32-char string.
+Likely a contributor to the 213 ms GC at the top of the table too.
+
+### The shim
+
+`docs/lib/fast-size-in-bytes.mjs` replaces `utils.sizeInBytes`
+with a non-allocating short-circuit ladder:
+
+```js
+function fastSizeInBytes(n) {
+  if (n < 0x100) return 1;
+  if (n < 0x10000) return 2;
+  if (n < 0x1000000) return 3;
+  if (n < 0x100000000) return 4;
+  return 4 + Math.ceil((32 - Math.clz32(Math.floor(n / 0x100000000))) / 8);
+}
+```
+
+The ladder shape matches the actual value distribution in
+`computeEntryTuples`. The xref entry tuples are
+`(type, second, third)` where:
+
+- `type` is 0, 1, or 2 (1 byte, always)
+- `gen` / `index` are small (1-2 bytes)
+- `offset` for uncompressed entries reaches 3-4 bytes on a 16 MB
+  PDF
+- `nextFreeObjectNumber` for deleted entries is small
+
+So most calls take the very first branch. A `Math.clz32`-based
+alternative would be simpler but slower in the common case,
+because it always pays for the native call + sub + div + ceil.
+The ladder exits in one compare for the dominant case.
+
+Triple-patch shape mirrors `fast-number-to-string.mjs` -- pdf-lib
+ships compiled against tslib 1.x whose `__exportStar` value-copies
+re-exports rather than installing live getters, so consumers that
+read `sizeInBytes` through a barrel (`PDFCrossRefStream` does:
+`utils_1.sizeInBytes(...)`) hold a captured reference. Patch the
+source module, the utils/index barrel, and the top-level index to
+cover every observed call site. `utils.bytesFor` reads
+`exports.sizeInBytes` at call time from the same module object we
+mutate first, so it picks up the fast path without a separate
+patch.
+
+### Results
+
+A/B (2 runs each, `--fast-refs --parallel-deflate
+--fast-decode-name --fast-number-to-string --cpu-profile-process
+--cpu-sampling 100`, with `--fast-size-in-bytes` the only
+difference):
+
+| run | PRE | POST |
+| --- | --- | --- |
+| 1 | 1.95 s | 1.91 s |
+| 2 | 2.01 s | 1.93 s |
+| **avg** | **1.98 s** | **1.92 s** |
+| save sub-phase avg | 0.80 s | 0.73 s |
+
+**Δ = -60 ms process (-3.0 %).** The save sub-phase carries
+-70 ms of that -- exactly where `sizeInBytes` lives (xref writer
+fires during save, not load), so the attribution lines up.
+
+Profile self-time, POST run:
+
+- `exports.sizeInBytes` row: 56.48 ms → undetectable. V8 inlined
+  the ladder into both callers; `fastSizeInBytes` doesn't appear
+  in the profile by name either.
+- GC: 213 ms → 201 ms (-12 ms, consistent with no longer
+  allocating ~300 k short-lived base-2 strings per save).
+- No cost migration to other rows. The surrounding parser /
+  writer rows are flat within noise.
+
+PDF byte-equivalent (31-byte `/CreationDate` drift between PRE
+and POST -- well inside the standard timestamp band).
+
+### Side finding: the harness flag set wasn't tracking production
+
+While landing this change, the harness flag set was audited
+against `render-book.mjs`'s imports. `render-book.mjs` was
+importing five `fast-*` shims (`fast-refs`, `fast-inflate`,
+`fast-parse-number`, `fast-decode-name`, `fast-number-to-string`),
+but `measure.mjs` only exposed three of them as flags
+(`--fast-refs`, `--fast-decode-name`, `--fast-number-to-string`).
+So the canonical process-profile command was measuring a *subset*
+of what production actually runs -- two production shims
+(`fast-inflate` and `fast-parse-number`) had been on for
+production and silently off for the perf harness.
+
+Wall-clock impact of that gap is small in absolute terms (the two
+missing shims target the load sub-phase, which is ~1.2 s out of
+the 1.95 s process total), but the bottom-up table in the
+canonical command was attributing time to functions that don't
+run that way in production. Fixed in the same change: `measure.mjs`
+now exposes `--fast-inflate` and `--fast-parse-number`, and the
+canonical command in the README lists all five production shims
+plus `--fast-size-in-bytes`.
+
+The general lesson: when a new shim lands, audit the harness's
+flag set against `render-book.mjs`'s import list. A flag missing
+on the harness side silently moves the harness baseline away from
+production -- and the divergence accumulates over time.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -1072,7 +1211,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-deflate                       | ~2.5 s  | ~1.5s| ~1 s |
 | + fast-refs                          | ~2.3 s  | ~1.3 s | ~1 s |
 | + parallel-deflate                   | ~2.0 s  | ~1.3 s | ~0.7 s |
-| **+ fast-decode-name + fast-number-to-string (this section)** | **~1.6 s** | **~1.0 s** | **~0.6 s** |
+| + fast-decode-name + fast-number-to-string | ~1.6 s  | ~1.0 s | ~0.6 s |
+| **+ fast-size-in-bytes (this section)** | **~1.5 s** | **~1.0 s** | **~0.5 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 4515a8f90d6ce540f19e1cff6ac0cb3505f9941c Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:24:49 +0200
Subject: [PATCH 08/44] Improve iterator performance.

---
 docs/lib/fast-dict-iter.mjs |  81 +++++++++++++++++++
 docs/render-book.mjs        |   7 ++
 perf/README.md              |  17 +++-
 perf/measure.mjs            |   6 ++
 perf/notes/08-pdf-lib.md    | 154 +++++++++++++++++++++++++++++++++++-
 5 files changed, 262 insertions(+), 3 deletions(-)
 create mode 100644 docs/lib/fast-dict-iter.mjs

diff --git a/docs/lib/fast-dict-iter.mjs b/docs/lib/fast-dict-iter.mjs
new file mode 100644
index 00000000..1d2a6cb8
--- /dev/null
+++ b/docs/lib/fast-dict-iter.mjs
@@ -0,0 +1,81 @@
+// Replace pdf-lib's PDFDict.sizeInBytes and PDFDict.copyBytesInto -- both of
+// which materialize a fresh Array of [key, value] tuples via this.entries()
+// on every call -- with versions that iterate the underlying Map in place.
+//
+// The upstream entries() helper
+// ([PDFDict.js:22](node_modules/pdf-lib/cjs/core/objects/PDFDict.js:22)) is:
+//
+//   PDFDict.prototype.entries = function () {
+//       return Array.from(this.dict.entries());
+//   };
+//
+// Per call that is: one MapIterator + one outer Array + one fresh
+// [key, value] tuple per entry (allocated by the iterator itself). The save
+// path fires both consumers on every dict (sizeInBytes to measure first,
+// then copyBytesInto to write), so on the book that's ~100 k Array.from
+// calls feeding the GC; PDFDict.entries was the largest non-GC row in the
+// process profile (~10 % of process self-time) and (garbage collector) sat
+// at the top.
+//
+// Map.prototype.forEach((value, key) => ...) calls back with positional
+// arguments and never allocates a tuple. The two consumers don't need the
+// tuple form -- they immediately destructure -- so swapping is local.
+//
+// We do NOT touch PDFDict.prototype.entries itself: clone() and toString()
+// still call it and rely on the Array-of-tuples contract. Those paths fire
+// rarely (clone on incremental updates only, toString in debug output) and
+// aren't worth the contract churn.
+//
+// Side-effecting import. Import once before any pdf-lib save:
+//
+//   import "./lib/fast-dict-iter.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require    = createRequire(import.meta.url);
+const PDFDict    = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const CharCodes  = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// Callbacks are module-level (not closures) so Map.forEach reuses the same
+// function reference on every call instead of allocating a fresh context
+// per invocation. Per-call state is threaded through forEach's `thisArg`
+// (one small object alloc per call, instead of one closure context plus
+// one heap cell for the captured `offset` mutation).
+function _sizeInBytesEntry(value, key) {
+  this.s += key.sizeInBytes() + value.sizeInBytes() + 2;
+}
+
+function _copyBytesIntoEntry(value, key) {
+  const buf = this.buf;
+  let off = this.off;
+  off += key.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Space;
+  off += value.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Newline;
+  this.off = off;
+}
+
+if (!PDFDict.prototype.__fastDictIterInstalled) {
+  PDFDict.prototype.sizeInBytes = function () {
+    const ctx = { s: 5 };
+    this.dict.forEach(_sizeInBytesEntry, ctx);
+    return ctx.s;
+  };
+
+  PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.Newline;
+    const ctx = { buf: buffer, off: offset };
+    this.dict.forEach(_copyBytesIntoEntry, ctx);
+    offset = ctx.off;
+    buffer[offset++] = CharCodes.GreaterThan;
+    buffer[offset++] = CharCodes.GreaterThan;
+    return offset - initialOffset;
+  };
+
+  PDFDict.prototype.__fastDictIterInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index bccf6a38..c29eff7b 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -58,12 +58,19 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 //     `n.toString(2)` just to count its bit length) with a non-
 //     allocating short-circuit ladder. Called ~300 k times per save
 //     from PDFCrossRefStream's xref writer.
+//   fast-dict-iter -- replace PDFDict.sizeInBytes / copyBytesInto
+//     with versions that iterate the underlying Map in place via
+//     forEach, instead of materialising a fresh Array of [key, value]
+//     tuples via this.entries() on every call. ~80 ms saved per
+//     process run on the book + eliminates the largest non-GC row
+//     (PDFDict.entries was ~10 % of process self-time).
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
 import './lib/fast-decode-name.mjs';
 import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
+import './lib/fast-dict-iter.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/README.md b/perf/README.md
index 3b97916e..1d33a107 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -119,6 +119,17 @@ Flag rationale:
   trip. Every numeric token parsed during `PDFDocument.load`
   flows through these -- hundreds of thousands of calls per load
   on the book. Production runs through it.
+- `--fast-dict-iter` -- inject
+  [docs/lib/fast-dict-iter.mjs](../docs/lib/fast-dict-iter.mjs),
+  replacing `PDFDict.sizeInBytes` and `PDFDict.copyBytesInto` with
+  versions that iterate the underlying Map in place via
+  `forEach((value, key), thisArg)` instead of materialising a fresh
+  Array of `[key, value]` tuples via `this.entries()` on every call.
+  The save path fires both consumers on every dict (~100 k
+  `Array.from` calls feeding the GC), so this was the largest
+  non-GC row in the profile (~10 % of process self-time charged to
+  `PDFDict.entries`). ~80 ms saved per process run. Production runs
+  through it.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -276,6 +287,7 @@ run.bat --fast-number-to-string           # skip numberToString redundant toStri
 run.bat --fast-size-in-bytes              # non-allocating ladder for xref byte-width (also ships; opt-in here for A/B)
 run.bat --fast-inflate                    # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B)
 run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
+run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -388,6 +400,7 @@ file documenting each:
 | `PDFName.of` no-`#` cache (skip `decodeName` regex) | [08](notes/08-pdf-lib.md) | ~0.5 s process (load -17 %, GC -101 ms) |
 | `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise |
 | `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) |
+| `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) |
 
 What was tried and didn't ship:
 
@@ -414,4 +427,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 1de7eb71..60185dde 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -186,6 +186,7 @@ let fastNumberToString = false;
 let fastSizeInBytes = false;
 let fastInflate = false;
 let fastParseNumber = false;
+let fastDictIter = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -214,6 +215,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-size-in-bytes') fastSizeInBytes = true;
   else if (a === '--fast-inflate') fastInflate = true;
   else if (a === '--fast-parse-number') fastParseNumber = true;
+  else if (a === '--fast-dict-iter') fastDictIter = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -279,6 +281,10 @@ if (fastParseNumber) {
   await import('../docs/lib/fast-parse-number.mjs');
   console.log('[harness] fast-parse-number: direct-integer accumulator for parseRawNumber/parseRawInt');
 }
+if (fastDictIter) {
+  await import('../docs/lib/fast-dict-iter.mjs');
+  console.log('[harness] fast-dict-iter: in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 4aa673e8..dd667686 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -1185,6 +1185,157 @@ flag set against `render-book.mjs`'s import list. A flag missing
 on the harness side silently moves the harness baseline away from
 production -- and the divergence accumulates over time.
 
+## `PDFDict.entries`: stop allocating a tuple array per save
+
+A profile of the process phase with every prior shipping shim
+applied still showed `PDFDict.entries` at the top of the non-GC
+self-time table, ~10 % of process. The function is a one-liner:
+
+```js
+PDFDict.prototype.entries = function () {
+    return Array.from(this.dict.entries());
+};
+```
+
+Per call: one `MapIterator` + one outer Array + one fresh
+`[key, value]` tuple per entry (allocated by the iterator itself,
+then collected by `Array.from` into the outer array). The save
+path fires both consumers on every dict -- `sizeInBytes` first to
+measure, then `copyBytesInto` to write -- so on the book that's
+~100 k `Array.from` calls feeding the GC. `(garbage collector)`
+sat at the top of the table too, which is the cost shape the
+allocation pattern predicts.
+
+Both consumers immediately destructure the tuples:
+
+```js
+var entries = this.entries();
+for (var idx = 0, len = entries.length; idx < len; idx++) {
+    var _a = entries[idx], key = _a[0], value = _a[1];
+    ...
+}
+```
+
+So nothing actually wants the array-of-tuples shape -- the
+upstream code uses it because that's what `entries()` returns,
+and the materialised array is dead by the next iteration.
+
+### The shim
+
+`docs/lib/fast-dict-iter.mjs` replaces
+`PDFDict.prototype.sizeInBytes` and
+`PDFDict.prototype.copyBytesInto` with versions that iterate the
+underlying Map in place via `Map.prototype.forEach((value, key),
+thisArg)`. The callback's positional `(value, key)` arguments
+mean no tuple is ever allocated, and routing per-call state
+through `forEach`'s `thisArg` instead of closure capture lets the
+callback stay a module-level function reference (no per-call
+closure context).
+
+The callbacks are hoisted to module top-level (not closures):
+
+```js
+function _sizeInBytesEntry(value, key) {
+  this.s += key.sizeInBytes() + value.sizeInBytes() + 2;
+}
+function _copyBytesIntoEntry(value, key) {
+  const buf = this.buf;
+  let off = this.off;
+  off += key.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Space;
+  off += value.copyBytesInto(buf, off);
+  buf[off++] = CharCodes.Newline;
+  this.off = off;
+}
+```
+
+Each consumer allocates a single small `ctx` object per call (one
+alloc, vs the prior `1 + N` Array allocations) and threads it
+through `thisArg`:
+
+```js
+PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+  // ... write '<<\n' ...
+  const ctx = { buf: buffer, off: offset };
+  this.dict.forEach(_copyBytesIntoEntry, ctx);
+  offset = ctx.off;
+  // ... write '>>' ...
+};
+```
+
+The `PDFDict.prototype.entries` method itself stays untouched --
+`clone()` and `toString()` still call it and rely on the
+array-of-tuples contract. Those paths fire rarely (clone on
+incremental updates, toString in debug output) and don't justify
+the contract churn.
+
+### Results
+
+Profile diff, both runs `--detach-pages --no-timing` with every
+other shipping shim active, 100 us sampling:
+
+| metric                              | pre        | post       | Δ                  |
+| ---                                 | ---        | ---        | ---                |
+| `PDFDict.entries` self              | 164.16 ms  | off-list   | **-164 ms (-100 %)** |
+| `PDFDict.copyBytesInto` self        | 27.54 ms   | 25.42 ms   | flat               |
+| `_copyBytesIntoEntry` (callback)    | n/a        | 23.83 ms   | new                |
+| `PDFDict.sizeInBytes` self          | sub-cutoff | 15.89 ms   | n/a                |
+| `_sizeInBytesEntry` (callback)      | n/a        | 12.71 ms   | new                |
+| **dict-serialisation path subtotal**| **~192 ms (~11 % of process)** | **~78 ms (~5 % of process)** | **~80 ms / -6 pp** |
+| `(garbage collector)`               | 201 ms (12 %) | 227 ms (15 %) | +26 ms / +3 pp  |
+
+The 164 ms `entries` self-time is reliably gone. The replacement
+work in the four-row split (the two consumers + their named
+callbacks) sums to ~78 ms -- about a **6 pp drop** in process
+attribution to this code path.
+
+The `(garbage collector)` row going *up* was the surprise. A
+first-cut variant of the shim used closures (``forEach((value,
+key) => { ... captures `offset` ... })``) and showed the same GC
+increase. Hypothesis: the captured-and-mutated `offset` cell was
+forcing V8 to heap-allocate a closure context per call. So we
+tested the hoisted-callback variant above, which has zero
+closure capture. The GC row landed at almost exactly the same
+share of process (~227 ms vs ~271 ms, both ~15 %).
+
+So the closure-capture hypothesis was wrong -- V8's escape
+analysis was already eliding the `offset` cell. The GC nudge is
+either run-to-run load-phase variance (the profile spans load +
+setOutline + save, and load dominates) or the per-call `ctx`
+object allocation we couldn't avoid without bigger code surgery.
+Either way it doesn't reverse the win: the dict-path attributable
+time dropped by ~80 ms, and that's real cycles removed.
+
+PDF output is byte-equivalent to the pre-shim build:
+`Map.forEach` iterates in insertion order, same as
+`Array.from(map.entries())`, so the serialised byte sequence is
+identical.
+
+### Lesson: hoist forEach callbacks when state is mutable
+
+The hoisted-callback pattern (callback = module-level function,
+state via `forEach`'s `thisArg`) reads as overkill -- a closure
+is fewer lines and easier to follow. Two reasons it's still the
+right shape here:
+
+1. **Profile attribution.** Named callbacks
+   (`_copyBytesIntoEntry`, `_sizeInBytesEntry`) appear in CPU
+   profiles under their names. Closures show up as
+   `(anonymous) @ file.mjs:55`, which makes future
+   profile-reading harder (you have to cross-reference the line
+   number every time).
+2. **Future-proofing against V8 changes.** Escape analysis can
+   handle the closure capture today, but the JIT's heuristics
+   shift across Node versions. The hoisted pattern is
+   semantically explicit -- no implicit allocation depends on
+   the compiler being smart. Same shape that has aged well in
+   other hot pdf-lib paths we've patched.
+
+Cost is negligible (six extra lines and two declarations);
+upside is the profile reads cleanly and the perf shape is robust
+to JIT changes. Worth doing whenever the callback's state
+outlives a single iteration.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -1212,7 +1363,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-refs                          | ~2.3 s  | ~1.3 s | ~1 s |
 | + parallel-deflate                   | ~2.0 s  | ~1.3 s | ~0.7 s |
 | + fast-decode-name + fast-number-to-string | ~1.6 s  | ~1.0 s | ~0.6 s |
-| **+ fast-size-in-bytes (this section)** | **~1.5 s** | **~1.0 s** | **~0.5 s** |
+| + fast-size-in-bytes                 | ~1.5 s  | ~1.0 s | ~0.5 s |
+| **+ fast-dict-iter (this section)**  | **~1.4 s** | **~1.0 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 5e6cdfd275fdc973a4ab5d53364fa134ea039028 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:30:08 +0200
Subject: [PATCH 09/44] Hoist sentinel PDFNames out of parseDict's
 type-dispatch tail.

PDFObjectParser.parseDict ends every dict it parses with four
PDFName.of calls for Type/Catalog/Pages/Page, even on the dicts
that have no /Type entry at all. With fast-decode-name in effect
each call collapses to a fastCache.get, but fastOf was still the
#4 row in process.cpuprofile at ~5%.

Pool-dedup makes the canonical PDFNames reference-stable for the
whole load, so capture them once at shim-load and substitute
module-level constants for the four calls. Drops ~17 ms (~22%)
of fastOf self-time. Output byte-equivalent.
---
 docs/lib/fast-parse-dict.mjs |  87 +++++++++++++++++++++++
 docs/render-book.mjs         |   7 ++
 perf/README.md               |  18 ++++-
 perf/measure.mjs             |  16 ++++-
 perf/notes/08-pdf-lib.md     | 129 ++++++++++++++++++++++++++++++++++-
 5 files changed, 253 insertions(+), 4 deletions(-)
 create mode 100644 docs/lib/fast-parse-dict.mjs

diff --git a/docs/lib/fast-parse-dict.mjs b/docs/lib/fast-parse-dict.mjs
new file mode 100644
index 00000000..203549c8
--- /dev/null
+++ b/docs/lib/fast-parse-dict.mjs
@@ -0,0 +1,87 @@
+// Hoist the four sentinel PDFName.of calls out of
+// PDFObjectParser.prototype.parseDict.
+//
+// The upstream parseDict
+// ([PDFObjectParser.js:141](node_modules/pdf-lib/cjs/core/parser/PDFObjectParser.js:141))
+// ends every dict it parses with a Type-dispatch tail:
+//
+//   var Type = dict.get(PDFName.of('Type'));
+//   if (Type === PDFName.of('Catalog')) return PDFCatalog.fromMapWithContext(...);
+//   else if (Type === PDFName.of('Pages')) return PDFPageTree.fromMapWithContext(...);
+//   else if (Type === PDFName.of('Page'))  return PDFPageLeaf.fromMapWithContext(...);
+//   else                                   return PDFDict.fromMapWithContext(...);
+//
+// That's 4 PDFName.of calls per dict, even on the overwhelming
+// majority (resource dicts, font descriptors, content-stream dicts)
+// that have no /Type entry at all. With --fast-decode-name in
+// effect each call collapses to a Map.get on fastCache, but
+// fastOf is still the #4 row in process.cpuprofile (~80 ms,
+// 5.2 %).
+//
+// PDFName instances are pool-deduped
+// ([PDFName.js:18,100](node_modules/pdf-lib/cjs/core/objects/PDFName.js:18))
+// so the sentinel "Type" / "Catalog" / "Pages" / "Page" PDFNames
+// are reference-stable for the entire load. Capture them once at
+// shim-load time and substitute direct constants for the four
+// PDFName.of calls inside parseDict. The rest of the function
+// body is preserved verbatim -- same loop, same dict.set, same
+// dispatch shape.
+//
+// Mechanism: PDFObjectParser isn't re-exported by pdf-lib's index,
+// so we reach in through the CJS internals via createRequire (same
+// shape as fast-parse-number.mjs / fast-dict-iter.mjs). Mutating
+// PDFObjectParser.prototype.parseDict is global -- every parser
+// instance created after this shim loads picks it up.
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-parse-dict.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFDict         = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFCatalog      = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree     = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf     = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// Capture canonical PDFName instances. Pool-dedup guarantees the
+// parser would have built === these even if the original parseDict
+// were still in play.
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+
+if (!PDFObjectParser.prototype.__fastParseDictInstalled) {  // install guard: patch the prototype at most once
+  PDFObjectParser.prototype.parseDict = function fastParseDict() {
+    const bytes = this.bytes;
+    bytes.assertNext(CharCodes.LessThan);  // two asserts consume the opening `<<`
+    bytes.assertNext(CharCodes.LessThan);
+    this.skipWhitespaceAndComments();
+    const dict = new Map();  // PDFName key -> parsed value, in parse order
+    while (!bytes.done() &&  // three-part guard kept verbatim from the upstream loop
+           bytes.peek() !== CharCodes.GreaterThan &&
+           bytes.peekAhead(1) !== CharCodes.GreaterThan) {
+      const key = this.parseName();
+      const value = this.parseObject();
+      dict.set(key, value);
+      this.skipWhitespaceAndComments();
+    }
+    this.skipWhitespaceAndComments();
+    bytes.assertNext(CharCodes.GreaterThan);  // two asserts consume the closing `>>`
+    bytes.assertNext(CharCodes.GreaterThan);
+    const Type = dict.get(TypeName);  // undefined when the dict has no /Type entry
+    if (Type === CatalogName) return PDFCatalog.fromMapWithContext(dict, this.context);
+    if (Type === PagesName)   return PDFPageTree.fromMapWithContext(dict, this.context);
+    if (Type === PageName)    return PDFPageLeaf.fromMapWithContext(dict, this.context);
+    return PDFDict.fromMapWithContext(dict, this.context);  // fallback: missing or any other /Type
+  };
+
+  PDFObjectParser.prototype.__fastParseDictInstalled = true;  // marks the patch as installed for the guard above
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index c29eff7b..40f57257 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -64,6 +64,12 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 //     tuples via this.entries() on every call. ~80 ms saved per
 //     process run on the book + eliminates the largest non-GC row
 //     (PDFDict.entries was ~10 % of process self-time).
+//   fast-parse-dict -- hoist the four sentinel PDFName.of calls
+//     (Type / Catalog / Pages / Page) out of the type-dispatch tail
+//     in PDFObjectParser.prototype.parseDict. The dispatch fires
+//     per-dict on every load; pool-dedup makes the canonical
+//     PDFNames reference-stable, so captured constants replace
+//     the calls verbatim. Pulls ~17 ms off fastOf self-time.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
@@ -71,6 +77,7 @@ import './lib/fast-decode-name.mjs';
 import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
 import './lib/fast-dict-iter.mjs';
+import './lib/fast-parse-dict.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/README.md b/perf/README.md
index 1d33a107..a93bda36 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -130,6 +130,18 @@ Flag rationale:
   non-GC row in the profile (~10 % of process self-time charged to
   `PDFDict.entries`). ~80 ms saved per process run. Production runs
   through it.
+- `--fast-parse-dict` -- inject
+  [docs/lib/fast-parse-dict.mjs](../docs/lib/fast-parse-dict.mjs),
+  replacing `PDFObjectParser.prototype.parseDict` with a version
+  that hoists the four sentinel `PDFName.of` calls (`'Type'`,
+  `'Catalog'`, `'Pages'`, `'Page'`) out of the type-dispatch tail
+  and substitutes module-level captured constants. Pool-dedup
+  guarantees the canonical `PDFName`s are reference-stable for the
+  whole load. With `--fast-decode-name` already in effect the four
+  calls were collapsing to `fastCache.get` hits per dict, but
+  `fastOf` was still the #4 row in the profile -- removing the
+  calls pulls ~17 ms (~22 %) off `fastOf` self-time. Production
+  runs through it.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -288,6 +300,7 @@ run.bat --fast-size-in-bytes              # non-allocating ladder for xref byte-
 run.bat --fast-inflate                    # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B)
 run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
 run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B)
+run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -401,6 +414,7 @@ file documenting each:
 | `numberToString` no-`e` short-circuit | [08](notes/08-pdf-lib.md) | ~40 ms profile, below wall-clock noise |
 | `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) |
 | `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) |
+| `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) |
 
 What was tried and didn't ship:
 
@@ -427,4 +441,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 60185dde..c5cca424 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -31,7 +31,7 @@
 //                    [--fast-refs] [--parallel-deflate]
 //                    [--fast-decode-name] [--fast-number-to-string]
 //                    [--fast-size-in-bytes] [--fast-inflate]
-//                    [--fast-parse-number]
+//                    [--fast-parse-number] [--fast-dict-iter] [--fast-parse-dict]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -137,6 +137,14 @@
 // trailing Number() round-trip. Every numeric token in a parsed
 // PDF flows through these; hundreds of thousands of calls per load
 // on the book. Production runs through it.
+//
+// --fast-parse-dict hoists the four sentinel PDFName.of calls
+// (Type / Catalog / Pages / Page) out of the type-dispatch tail
+// in PDFObjectParser.prototype.parseDict. The dispatch fires
+// per-dict (tens of thousands on the book) and even with
+// --fast-decode-name each lookup is still a Map.get on fastCache.
+// Pool-dedup makes the canonical PDFNames reference-stable, so
+// captured constants replace the four calls verbatim.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
@@ -187,6 +195,7 @@ let fastSizeInBytes = false;
 let fastInflate = false;
 let fastParseNumber = false;
 let fastDictIter = false;
+let fastParseDict = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -216,6 +225,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-inflate') fastInflate = true;
   else if (a === '--fast-parse-number') fastParseNumber = true;
   else if (a === '--fast-dict-iter') fastDictIter = true;
+  else if (a === '--fast-parse-dict') fastParseDict = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -285,6 +295,10 @@ if (fastDictIter) {
   await import('../docs/lib/fast-dict-iter.mjs');
   console.log('[harness] fast-dict-iter: in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto');
 }
+if (fastParseDict) {
+  await import('../docs/lib/fast-parse-dict.mjs');
+  console.log('[harness] fast-parse-dict: hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index dd667686..acdeafd9 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -1336,6 +1336,132 @@ upside is the profile reads cleanly and the perf shape is robust
 to JIT changes. Worth doing whenever the callback's state
 outlives a single iteration.
 
+## `parseDict`: hoist the sentinel `PDFName`s out of the type-dispatch tail
+
+With every other process-phase shim in place, the top of the
+bottom-up table looked like:
+
+```
+   self_ms   self_%   function  @  source
+    194.12   12.49%   (garbage collector)
+    127.05    8.18%   PDFRef.of
+     86.70    5.58%   PDFObjectParser.parseName
+     80.70    5.19%   fastOf                       (fast-decode-name)
+     74.70    4.81%   PDFObjectParser.parseDict
+     ...
+```
+
+`fastOf` -- the cache in front of `PDFName.of` -- shouldn't be
+this high. The whole point of `fast-decode-name` is to collapse
+`PDFName.of` to a `Map.get` per call. So the question is why so
+many calls still hit it.
+
+Reading `PDFObjectParser.parseDict`
+(`pdf-lib/.../parser/PDFObjectParser.js:141`) shows the
+type-dispatch tail at the bottom:
+
+```js
+var Type = dict.get(PDFName.of('Type'));
+if (Type === PDFName.of('Catalog')) return PDFCatalog.fromMapWithContext(...);
+else if (Type === PDFName.of('Pages')) return PDFPageTree.fromMapWithContext(...);
+else if (Type === PDFName.of('Page'))  return PDFPageLeaf.fromMapWithContext(...);
+else                                   return PDFDict.fromMapWithContext(...);
+```
+
+Four `PDFName.of` calls per dict, **including** the dicts that
+have no `/Type` entry at all (resource dicts, font descriptors,
+content-stream dicts -- the bulk of what a real PDF contains).
+With `fast-decode-name` each call is a `fastCache.get` on a 4-byte
+string, which is cheap individually -- but on a 1638-page book
+that's tens of thousands of dicts × 4 calls = hundreds of
+thousands of cache lookups for the same handful of canonical
+`PDFName`s.
+
+### The shim
+
+`docs/lib/fast-parse-dict.mjs` replaces
+`PDFObjectParser.prototype.parseDict` with a version that
+captures the four sentinel `PDFName`s once at shim-load:
+
+```js
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+```
+
+and references them directly in the type-dispatch tail:
+
+```js
+const Type = dict.get(TypeName);
+if (Type === CatalogName) return PDFCatalog.fromMapWithContext(dict, this.context);
+if (Type === PagesName)   return PDFPageTree.fromMapWithContext(dict, this.context);
+if (Type === PageName)    return PDFPageLeaf.fromMapWithContext(dict, this.context);
+return PDFDict.fromMapWithContext(dict, this.context);
+```
+
+The rest of the function body (the `<< ... >>` parse loop, the
+`dict.set` calls, the whitespace skipping) is verbatim. Pool-dedup
+guarantees the captured `PDFName`s are `===` to whatever the
+parser would have built via the slow `PDFName.of` calls, so the
+dispatch identity comparisons work unchanged.
+
+`PDFObjectParser` isn't re-exported from pdf-lib's index, so the
+shim reaches in via `pdf-lib/cjs/core/parser/PDFObjectParser.js`
+through `createRequire` -- same shape as `fast-parse-number.mjs`
+and `fast-dict-iter.mjs`.
+
+### Results
+
+Profile diff, both runs `--detach-pages --no-timing` with every
+other shipping shim active, 100 us sampling:
+
+| metric                              | pre        | post       | Δ                  |
+| ---                                 | ---        | ---        | ---                |
+| `fastOf` self                       | 80.70 ms (5.19 %) | 63.20 ms (4.43 %) | **-17.5 ms (-22 %)** |
+| `parseDict` / `fastParseDict` self  | 74.70 ms (4.81 %) | 77.79 ms (5.45 %) | flat (noise)       |
+| process wall-clock                  | 1.55 s     | 1.42 s     | -0.13 s (~noise floor) |
+
+The cleanest signal is the `fastOf` drop: removing four
+`PDFName.of` calls per dict re-attributes ~17 ms away from the
+cache layer. `parseDict`'s own self-time is essentially unchanged
+because the four `PDFName.of` calls were already being charged to
+`fastOf`, not to `parseDict` (child frames don't roll into parent
+self-time). So the optimisation reads as "fastOf got cheaper"
+rather than "parseDict got faster," but it's the same removed
+work either way.
+
+The 130 ms wall-clock delta is mostly within run-to-run noise on a
+1.5 s phase. The mechanism-confirmed ~17 ms via profile
+attribution is the honest number.
+
+PDF output is byte-equivalent: same Map iteration order, same
+dispatch decisions, same canonical `PDFName` instances.
+
+### Why this is the bottom of the easy wins on parseDict
+
+`fastParseDict` is still in the top 15 (5.45 %), which suggests
+more juice in the function. The next-tier targets are all in the
+inner loop:
+
+- `!bytes.done() && bytes.peek() !== 0x3E && bytes.peekAhead(1) !== 0x3E`
+  -- three method calls per iteration, all reading the underlying
+  `Uint8Array`. Inlining would cut method-dispatch overhead but
+  requires reaching into `ByteStream`'s internals.
+- `dict.set(key, value)` -- Map entry allocation. Could be swapped
+  for a plain object via `Object.create(null)`, but
+  `PDFDict.fromMapWithContext` and the existing `fast-dict-iter`
+  shim both assume a Map, so it's a larger surgery.
+- `this.skipWhitespaceAndComments()` -- already on the top-15 list
+  in its own right (~32 ms / 2 %). Two-method-call body
+  (`skipWhitespace` + `skipComment` loop); inlining at parseDict's
+  call site would shed one method-dispatch per loop iteration.
+
+None of these are as clean as the sentinel-hoist patch, and each
+is a bigger code change for a smaller individual win. Worth
+revisiting if a future optimisation moves the floor and parseDict
+becomes a larger relative share.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -1364,7 +1490,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + parallel-deflate                   | ~2.0 s  | ~1.3 s | ~0.7 s |
 | + fast-decode-name + fast-number-to-string | ~1.6 s  | ~1.0 s | ~0.6 s |
 | + fast-size-in-bytes                 | ~1.5 s  | ~1.0 s | ~0.5 s |
-| **+ fast-dict-iter (this section)**  | **~1.4 s** | **~1.0 s** | **~0.4 s** |
+| + fast-dict-iter                     | ~1.4 s  | ~1.0 s | ~0.4 s |
+| **+ fast-parse-dict (this section)** | **~1.4 s** | **~1.0 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 3cf4743eb7d78f2a446fd78a1e668d2fa1d42662 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:36:32 +0200
Subject: [PATCH 10/44] Synchronify pdf-lib's load + save paths, pin pdf-lib +
 puppeteer.

---
 docs/lib/fast-sync-load.mjs   | 331 ++++++++++++++++++++++++++++++++++
 docs/lib/parallel-deflate.mjs |  10 +-
 docs/render-book.mjs          |  32 +++-
 package-lock.json             |   4 +-
 package.json                  |   4 +-
 perf/README.md                |  18 +-
 perf/measure.mjs              |  48 +++--
 perf/notes/08-pdf-lib.md      | 241 ++++++++++++++++++++++++-
 8 files changed, 658 insertions(+), 30 deletions(-)
 create mode 100644 docs/lib/fast-sync-load.mjs

diff --git a/docs/lib/fast-sync-load.mjs b/docs/lib/fast-sync-load.mjs
new file mode 100644
index 00000000..fd473f8b
--- /dev/null
+++ b/docs/lib/fast-sync-load.mjs
@@ -0,0 +1,331 @@
+// Strip pdf-lib's parseSpeed / objectsPerTick / shouldWaitForTick /
+// waitForTick machinery entirely. Synchronify everywhere the conditional
+// yield was the only async thing in the method.
+//
+// pdf-lib's parser and writers are downlevel-compiled from TypeScript
+// `async function` to tslib's __awaiter + __generator state machine,
+// so on browsers they can yield to the event loop every
+// `objectsPerTick` objects via `await waitForTick()`. In Node with
+// objectsPerTick: Infinity (which parseSpeed: Fastest historically
+// set on the load side) the gate never fires -- the entire generator
+// runs in one tick -- yet every indirect object (~50 k on the book)
+// still pays the state-machine dispatch + Promise allocation for a
+// single fall-through `case 0`.
+//
+// Nine methods participate in this pattern; this shim replaces all
+// of them with synchronous (or, where a legitimate await remains,
+// awaiterless `async`) twins:
+//
+//   Load side (parser):
+//     PDFParser.prototype.parseDocument
+//     PDFParser.prototype.parseDocumentSection
+//     PDFParser.prototype.parseIndirectObjects
+//     PDFParser.prototype.parseIndirectObject
+//     PDFObjectStreamParser.prototype.parseIntoContext
+//     PDFDocument.load   (static; only awaited parseDocument)
+//
+//   Save side (writers):
+//     PDFWriter.prototype.serializeToBuffer
+//       (kept `async` because the inherited path awaits the
+//        ParallelStreamWriter override of computeBufferSize, which
+//        does genuine Promise.all-driven libuv-pool concurrency)
+//     PDFWriter.prototype.computeBufferSize
+//     PDFStreamWriter.prototype.computeBufferSize
+//
+// The load-side patches have to land together: each method awaits
+// the next one down, so desugaring any one in isolation still leaves
+// a Promise chain dangling.
+//
+// PDFDocument.load's signature is preserved (still callable as
+// `await PDFDocument.load(bytes)`; awaiting a non-Promise resolves
+// to the value), so existing call sites need no change. The
+// parseSpeed option is silently ignored. parallel-deflate.mjs's
+// parallelSave drops `objectsPerTick` from its public API in step
+// with this shim.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-sync-load.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFParser              = require('pdf-lib/cjs/core/parser/PDFParser.js').default;
+const PDFObjectStreamParser  = require('pdf-lib/cjs/core/parser/PDFObjectStreamParser.js').default;
+const PDFXRefStreamParser    = require('pdf-lib/cjs/core/parser/PDFXRefStreamParser.js').default;
+const PDFRawStream           = require('pdf-lib/cjs/core/objects/PDFRawStream.js').default;
+const PDFRef                 = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFName                = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNumber              = require('pdf-lib/cjs/core/objects/PDFNumber.js').default;
+const PDFStream              = require('pdf-lib/cjs/core/objects/PDFStream.js').default;
+const PDFInvalidObject       = require('pdf-lib/cjs/core/objects/PDFInvalidObject.js').default;
+const PDFDocument            = require('pdf-lib/cjs/api/PDFDocument.js').default;
+const PDFWriter              = require('pdf-lib/cjs/core/writers/PDFWriter.js').default;
+const PDFStreamWriter        = require('pdf-lib/cjs/core/writers/PDFStreamWriter.js').default;
+const PDFHeader              = require('pdf-lib/cjs/core/document/PDFHeader.js').default;
+const PDFTrailer             = require('pdf-lib/cjs/core/document/PDFTrailer.js').default;
+const PDFTrailerDict         = require('pdf-lib/cjs/core/document/PDFTrailerDict.js').default;
+const PDFCrossRefSection     = require('pdf-lib/cjs/core/document/PDFCrossRefSection.js').default;
+const PDFCrossRefStream      = require('pdf-lib/cjs/core/structures/PDFCrossRefStream.js').default;
+const PDFObjectStream        = require('pdf-lib/cjs/core/structures/PDFObjectStream.js').default;
+const CharCodes              = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+const { ReparseError, StalledParserError } = require('pdf-lib/cjs/core/errors.js');
+const { IsDigit }            = require('pdf-lib/cjs/core/syntax/Numeric.js');
+const { Keywords }           = require('pdf-lib/cjs/core/syntax/Keywords.js');
+const { toUint8Array, copyStringIntoBuffer, last } = require('pdf-lib/cjs/utils/index.js');
+
+// Pool-deduped PDFName instances are reference-stable for the whole
+// load (see fast-parse-dict.mjs for the same trick). Capture the three
+// Type-dispatch sentinels, plus the `0 0 R` ref and the /Size name.
+const TypeName   = PDFName.of('Type');
+const ObjStmName = PDFName.of('ObjStm');
+const XRefName   = PDFName.of('XRef');
+const RefZero    = PDFRef.of(0);
+const SizeName   = PDFName.of('Size');
+
+if (!PDFParser.prototype.__fastSyncLoadInstalled) {
+
+  // ----- Load side ---------------------------------------------------
+
+  PDFParser.prototype.parseDocument = function parseDocumentSync() {  // synchronous: returns this.context directly
+    if (this.alreadyParsed) {  // a parser instance may only be run once
+      throw new ReparseError('PDFParser', 'parseDocument');
+    }
+    this.alreadyParsed = true;
+    this.context.header = this.parseHeader();
+
+    let prevOffset;  // offset after the previous section; undefined on the first pass
+    while (!this.bytes.done()) {
+      this.parseDocumentSection();
+      const offset = this.bytes.offset();
+      if (offset === prevOffset) {  // the section consumed no bytes -> fail instead of looping forever
+        throw new StalledParserError(this.bytes.position());
+      }
+      prevOffset = offset;
+    }
+
+    this.maybeRecoverRoot();
+    if (this.context.lookup(RefZero)) {  // RefZero is PDFRef.of(0), captured once at module load
+      console.warn('Removing parsed object: 0 0 R');
+      this.context.delete(RefZero);
+    }
+    return this.context;
+  };
+
+  PDFParser.prototype.parseDocumentSection = function parseDocumentSectionSync() {  // runs five steps in fixed order, none awaited
+    this.parseIndirectObjects();
+    this.maybeParseCrossRefSection();
+    this.maybeParseTrailerDict();
+    this.maybeParseTrailer();
+    this.skipJibberish();
+  };
+
+  PDFParser.prototype.parseIndirectObjects = function parseIndirectObjectsSync() {  // synchronous loop, no tick yield
+    this.skipWhitespaceAndComments();
+    while (!this.bytes.done() && IsDigit[this.bytes.peek()]) {  // keep going while the next byte is truthy in IsDigit
+      const initialOffset = this.bytes.offset();  // remembered so a failed parse can rewind
+      try {
+        this.parseIndirectObject();
+      } catch (e) {  // any error: rewind to the object's start and take the invalid-object path
+        this.bytes.moveTo(initialOffset);
+        this.tryToParseInvalidIndirectObject();
+      }
+      this.skipWhitespaceAndComments();
+      this.skipJibberish();
+    }
+  };
+
+  PDFParser.prototype.parseIndirectObject = function parseIndirectObjectSync() {  // parses one object; returns its ref
+    const ref = this.parseIndirectObjectHeader();
+    this.skipWhitespaceAndComments();
+    const object = this.parseObject();
+    this.skipWhitespaceAndComments();
+    this.matchKeyword(Keywords.endobj);
+    if (object instanceof PDFRawStream &&  // raw stream with /Type /ObjStm: handed to PDFObjectStreamParser, not assigned to ref
+        object.dict.lookup(TypeName) === ObjStmName) {
+      PDFObjectStreamParser.forStream(object).parseIntoContext();
+    } else if (object instanceof PDFRawStream &&  // raw stream with /Type /XRef: handed to PDFXRefStreamParser, not assigned to ref
+               object.dict.lookup(TypeName) === XRefName) {
+      PDFXRefStreamParser.forStream(object).parseIntoContext();
+    } else {
+      this.context.assign(ref, object);  // everything else is registered in the context under ref
+    }
+    return ref;
+  };
+
+  PDFObjectStreamParser.prototype.parseIntoContext = function parseIntoContextSync() {  // synchronous; returns nothing
+    if (this.alreadyParsed) {  // a stream parser instance may only be run once
+      throw new ReparseError('PDFObjectStreamParser', 'parseIntoContext');
+    }
+    this.alreadyParsed = true;
+    const offsetsAndObjectNumbers = this.parseOffsetsAndObjectNumbers();
+    for (let i = 0, len = offsetsAndObjectNumbers.length; i < len; i++) {
+      const entry = offsetsAndObjectNumbers[i];  // read below as { offset, objectNumber }
+      this.bytes.moveTo(this.firstOffset + entry.offset);  // entry offsets are relative to this.firstOffset
+      const object = this.parseObject();
+      const ref = PDFRef.of(entry.objectNumber, 0);  // second PDFRef.of argument is always 0 here
+      this.context.assign(ref, object);
+    }
+  };
+
+  // PDFDocument.load only awaited parseDocument(); now that's sync, the
+  // outer __awaiter is wasted too. Drop it. Signature unchanged --
+  // `await PDFDocument.load(...)` on a non-Promise resolves to the value.
+  // The parseSpeed option is silently ignored (no more yield gate to tune).
+  PDFDocument.load = function loadSync(pdf, options) {  // static replacement; returns a PDFDocument, not a Promise
+    if (options === undefined) options = {};  // options is optional
+    const ignoreEncryption      = options.ignoreEncryption      === undefined ? false : options.ignoreEncryption;
+    const throwOnInvalidObject  = options.throwOnInvalidObject  === undefined ? false : options.throwOnInvalidObject;
+    const updateMetadata        = options.updateMetadata        === undefined ? true  : options.updateMetadata;  // the only option defaulting to true
+    const capNumbers            = options.capNumbers            === undefined ? false : options.capNumbers;
+    const bytes = toUint8Array(pdf);
+    const context = PDFParser.forBytesWithOptions(
+      bytes, Infinity, throwOnInvalidObject, capNumbers,  // NOTE(review): Infinity presumably fills the objectsPerTick slot -- confirm against forBytesWithOptions
+    ).parseDocument();
+    return new PDFDocument(context, ignoreEncryption, updateMetadata);  // options.parseSpeed is never read
+  };
+
+  // ----- Save side ---------------------------------------------------
+
+  // PDFWriter.serializeToBuffer awaits computeBufferSize, which in our
+  // pipeline is the ParallelStreamWriter override -- genuinely async
+  // because of `await Promise.all(deflated)` over libuv's thread pool.
+  // So the wrapper stays async. The conditional waitForTick yield in
+  // its main loop is the only piece we strip.
+  PDFWriter.prototype.serializeToBuffer = async function serializeToBufferSync() {  // still async despite the name; resolves to the filled Uint8Array
+    const { size, header, indirectObjects, xref, trailerDict, trailer } =
+      await this.computeBufferSize();  // sole await; everything after it runs synchronously
+    const buffer = new Uint8Array(size);  // single allocation of exactly `size` bytes
+    let offset = 0;  // write cursor into buffer
+    offset += header.copyBytesInto(buffer, offset);
+    buffer[offset++] = CharCodes.Newline;
+    buffer[offset++] = CharCodes.Newline;
+    for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
+      const indirectObject = indirectObjects[idx];  // indexed below as a [ref, object] pair
+      const ref = indirectObject[0];
+      const object = indirectObject[1];
+      offset += copyStringIntoBuffer(String(ref.objectNumber), buffer, offset);  // header: objectNumber, space, generationNumber, space, o-b-j, newline
+      buffer[offset++] = CharCodes.Space;
+      offset += copyStringIntoBuffer(String(ref.generationNumber), buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+      buffer[offset++] = CharCodes.o;
+      buffer[offset++] = CharCodes.b;
+      buffer[offset++] = CharCodes.j;
+      buffer[offset++] = CharCodes.Newline;
+      offset += object.copyBytesInto(buffer, offset);  // copyBytesInto's return value advances the cursor
+      buffer[offset++] = CharCodes.Newline;  // footer: newline, e-n-d-o-b-j, then two newlines
+      buffer[offset++] = CharCodes.e;
+      buffer[offset++] = CharCodes.n;
+      buffer[offset++] = CharCodes.d;
+      buffer[offset++] = CharCodes.o;
+      buffer[offset++] = CharCodes.b;
+      buffer[offset++] = CharCodes.j;
+      buffer[offset++] = CharCodes.Newline;
+      buffer[offset++] = CharCodes.Newline;
+    }
+    if (xref) {  // skipped when computeBufferSize supplied a falsy xref
+      offset += xref.copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+    }
+    if (trailerDict) {  // skipped when computeBufferSize supplied a falsy trailerDict
+      offset += trailerDict.copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+      buffer[offset++] = CharCodes.Newline;
+    }
+    offset += trailer.copyBytesInto(buffer, offset);  // trailer is always written, unconditionally
+    return buffer;
+  };
+
+  // PDFWriter.computeBufferSize -- the basic (non-stream) writer's
+  // sizing pass. Not on our hot path (PDFStreamWriter and
+  // ParallelStreamWriter both override it) but patched for consistency:
+  // upstream's only async step was the waitForTick yield in its loop.
+  // Returns the layout object directly; serializeToBuffer awaits it.
+  PDFWriter.prototype.computeBufferSize = function computeBufferSizeBaseSync() {
+    const header = PDFHeader.forVersion(1, 7);
+    let size = header.sizeInBytes() + 2;
+    const xref = PDFCrossRefSection.create();
+    const indirectObjects = this.context.enumerateIndirectObjects();
+    for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
+      const indirectObject = indirectObjects[idx];
+      const ref = indirectObject[0];
+      xref.addEntry(ref, size);
+      size += this.computeIndirectObjectSize(indirectObject);
+    }
+    const xrefOffset = size;
+    size += xref.sizeInBytes() + 1;
+    const trailerDict = PDFTrailerDict.of(this.createTrailerDict());
+    size += trailerDict.sizeInBytes() + 2;
+    const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset);
+    size += trailer.sizeInBytes();
+    return { size, header, indirectObjects, xref, trailerDict, trailer };
+  };
+
+  // PDFStreamWriter.computeBufferSize -- the stream writer's sizing
+  // pass, minus upstream's two waitForTick gates (one per loop). Not
+  // on our hot path (ParallelStreamWriter overrides this with its own
+  // three-phase parallel-deflate version) but patched for consistency.
+  // Otherwise mirrors the upstream body; returns a plain object.
+  PDFStreamWriter.prototype.computeBufferSize = function computeBufferSizeStreamSync() {
+    let objectNumber = this.context.largestObjectNumber + 1;
+    const header = PDFHeader.forVersion(1, 7);
+    let size = header.sizeInBytes() + 2;
+    const xrefStream = PDFCrossRefStream.create(this.createTrailerDict(), this.encodeStreams);
+
+    const uncompressedObjects = [];
+    const compressedObjects = [];
+    const objectStreamRefs = [];
+
+    const indirectObjects = this.context.enumerateIndirectObjects();
+    for (let idx = 0, len = indirectObjects.length; idx < len; idx++) {
+      const indirectObject = indirectObjects[idx];
+      const ref = indirectObject[0];
+      const object = indirectObject[1];
+      const shouldNotCompress =
+        ref === this.context.trailerInfo.Encrypt ||
+        object instanceof PDFStream ||
+        object instanceof PDFInvalidObject ||
+        ref.generationNumber !== 0;
+      if (shouldNotCompress) {
+        uncompressedObjects.push(indirectObject);
+        xrefStream.addUncompressedEntry(ref, size);
+        size += this.computeIndirectObjectSize(indirectObject);
+      } else {
+        let chunk = last(compressedObjects);
+        let objectStreamRef = last(objectStreamRefs);
+        if (!chunk || chunk.length % this.objectsPerStream === 0) {
+          chunk = [];
+          compressedObjects.push(chunk);
+          objectStreamRef = PDFRef.of(objectNumber++);
+          objectStreamRefs.push(objectStreamRef);
+        }
+        xrefStream.addCompressedEntry(ref, objectStreamRef, chunk.length);
+        chunk.push(indirectObject);
+      }
+    }
+
+    for (let idx = 0, len = compressedObjects.length; idx < len; idx++) {
+      const chunk = compressedObjects[idx];
+      const ref = objectStreamRefs[idx];
+      const objectStream = PDFObjectStream.withContextAndObjects(this.context, chunk, this.encodeStreams);
+      xrefStream.addUncompressedEntry(ref, size);
+      size += this.computeIndirectObjectSize([ref, objectStream]);
+      uncompressedObjects.push([ref, objectStream]);
+    }
+
+    const xrefStreamRef = PDFRef.of(objectNumber++);
+    xrefStream.dict.set(SizeName, PDFNumber.of(objectNumber));
+    xrefStream.addUncompressedEntry(xrefStreamRef, size);
+    const xrefOffset = size;
+    size += this.computeIndirectObjectSize([xrefStreamRef, xrefStream]);
+    uncompressedObjects.push([xrefStreamRef, xrefStream]);
+
+    const trailer = PDFTrailer.forLastCrossRefSectionOffset(xrefOffset);
+    size += trailer.sizeInBytes();
+    return { size, header, indirectObjects: uncompressedObjects, trailer };
+  };
+
+  PDFParser.prototype.__fastSyncLoadInstalled = true;
+}
diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs
index d278ecab..b7b71499 100644
--- a/docs/lib/parallel-deflate.mjs
+++ b/docs/lib/parallel-deflate.mjs
@@ -48,8 +48,12 @@ import {
 const deflateAsync = promisify(deflate);
 
 class ParallelStreamWriter extends PDFStreamWriter {
-  constructor(context, objectsPerTick, encodeStreams, objectsPerStream, parallel) {
-    super(context, objectsPerTick, encodeStreams, objectsPerStream);
+  constructor(context, encodeStreams, objectsPerStream, parallel) {
+    // PDFWriter's second ctor param is objectsPerTick -- the yield knob
+    // that drives shouldWaitForTick. With fast-sync-load.mjs loaded no
+    // caller of shouldWaitForTick remains and the value is vestigial;
+    // without it (harness A/B runs) Infinity keeps the gate from firing.
+    super(context, Infinity, encodeStreams, objectsPerStream);
     this._lastPrecompressed = 0;
     this._parallel = parallel;
   }
@@ -153,7 +157,6 @@ class ParallelStreamWriter extends PDFStreamWriter {
  */
 export async function parallelSave(pdfDoc, options = {}) {
   const {
-    objectsPerTick = Infinity,
     addDefaultPage = true,
     updateFieldAppearances = true,
     objectsPerStream = 50,
@@ -170,7 +173,6 @@ export async function parallelSave(pdfDoc, options = {}) {
 
   const writer = new ParallelStreamWriter(
     pdfDoc.context,
-    objectsPerTick,
     encodeStreams,
     objectsPerStream,
     parallel,
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 40f57257..82c7f4b4 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -31,7 +31,7 @@ import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve } from 'node:path';
 import { writeFileSync, existsSync } from 'node:fs';
 import puppeteer from 'puppeteer';
-import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+import { PDFDocument } from 'pdf-lib';
 // Side-effecting imports. Mutate pdf-lib's live module exports
 // before any pdf-lib operation -- order doesn't matter. See
 // perf/notes/08-pdf-lib.md.
@@ -70,6 +70,20 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 //     per-dict on every load; pool-dedup makes the canonical
 //     PDFNames reference-stable, so captured constants replace
 //     the calls verbatim. Pulls ~17 ms off fastOf self-time.
+//   fast-sync-load -- rip the parseSpeed / objectsPerTick /
+//     shouldWaitForTick / waitForTick machinery out of both pdf-lib's
+//     load path (PDFDocument.load + five PDFParser /
+//     PDFObjectStreamParser methods underneath it) and its save path
+//     (PDFWriter.serializeToBuffer + computeBufferSize, plus the
+//     unreachable PDFStreamWriter.computeBufferSize patched for
+//     consistency). Each upstream method is wrapped in __awaiter so
+//     on browsers it can yield to the event loop every objectsPerTick
+//     objects; in Node the gate never fires but every indirect object
+//     still paid for the generator state machine + Promise
+//     allocation. ~135 ms of attributed parser self-time + ~53 ms of
+//     GC removed (writer-side gain is within run-to-run noise); the
+//     parseSpeed / objectsPerTick options drop off all our call sites
+//     in step with this shim.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
@@ -78,6 +92,7 @@ import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
 import './lib/fast-dict-iter.mjs';
 import './lib/fast-parse-dict.mjs';
+import './lib/fast-sync-load.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
@@ -292,12 +307,13 @@ try {
   console.log(`generate: ${fmtMs(Date.now() - tGenerate)}  (raw ${(rawPdf.length / 1024 / 1024).toFixed(1)} MB)`);
 
   // Process -- pdf-lib roundtrip with outline + metadata attached.
-  // parseSpeed: Fastest and objectsPerTick: Infinity are critical:
-  // pdf-lib's defaults yield to the event loop between every 100/50
-  // objects, turning a ~5 s round-trip into ~40 s on a 50 MB PDF
-  // (~35 s of which is pure V8 idle).
+  // fast-sync-load strips the waitForTick yield gates on both load
+  // and save sides entirely (the round-trip was ~40 s under pdf-lib's
+  // yielding defaults, ~5 s with Fastest / Infinity, and is ~1.3 s
+  // with all shims in -- so parseSpeed / objectsPerTick no longer
+  // matter and drop from the call sites).
   //
-  // parallelSave (vs the default pdfDoc.save) does two things:
+  // parallelSave (vs the default pdfDoc.save):
   //  - objectsPerStream: 500 -- larger object-stream chunks compress
   //    better (shared deflate window), 5 % smaller output PDF, and
   //    cuts the per-chunk dispatch overhead 10x.
@@ -306,10 +322,10 @@ try {
   //    thread. Moves ~300 ms of zlib work off-CPU on the book.
   // See perf/notes/08-pdf-lib.md.
   const tProcess = Date.now();
-  const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest });
+  const pdfDoc = await PDFDocument.load(rawPdf);
   setMetadata(pdfDoc, meta);
   await setOutline(pdfDoc, outline, false);
-  const { bytes: finalPdf } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 });
+  const { bytes: finalPdf } = await parallelSave(pdfDoc, { objectsPerStream: 500 });
   console.log(`process:  ${fmtMs(Date.now() - tProcess)}`);
 
   writeFileSync(outputPath, Buffer.from(finalPdf));
diff --git a/package-lock.json b/package-lock.json
index 4d32d0a6..3e17b771 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -9,8 +9,8 @@
       "version": "0.0.0",
       "devDependencies": {
         "html-entities": "^2.6.0",
-        "pdf-lib": "^1.17.1",
-        "puppeteer": "^25.0.4"
+        "pdf-lib": "1.17.1",
+        "puppeteer": "25.0.4"
       }
     },
     "node_modules/@babel/code-frame": {
diff --git a/package.json b/package.json
index ba6093ad..3dce8713 100644
--- a/package.json
+++ b/package.json
@@ -5,7 +5,7 @@
   "description": "PDF book pipeline and profiling harness for the twinBASIC documentation",
   "devDependencies": {
     "html-entities": "^2.6.0",
-    "pdf-lib": "^1.17.1",
-    "puppeteer": "^25.0.4"
+    "pdf-lib": "1.17.1",
+    "puppeteer": "25.0.4"
   }
 }
diff --git a/perf/README.md b/perf/README.md
index a93bda36..a7635cc5 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-sync-load --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -142,6 +142,18 @@ Flag rationale:
   `fastOf` was still the #4 row in the profile -- removing the
   calls pulls ~17 ms (~22 %) off `fastOf` self-time. Production
   runs through it.
+- `--fast-sync-load` -- inject
+  [docs/lib/fast-sync-load.mjs](../docs/lib/fast-sync-load.mjs),
+  replacing nine `__awaiter`-wrapped methods across pdf-lib's load
+  and save paths with awaiterless twins. Each upstream method is
+  wrapped in tslib `__awaiter` / `__generator` so on browsers it
+  can `await waitForTick()` every `objectsPerTick` objects; in
+  Node the yield gate never fires (objectsPerTick: Infinity), but
+  every indirect object still pays the generator state-machine
+  dispatch + Promise allocation. The shim removes the scaffolding
+  entirely. The `parseSpeed` / `objectsPerTick` options drop off
+  `PDFDocument.load`, `parallelSave`, and `pdfDoc.save` call sites
+  in step. Production runs through it.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -301,6 +313,7 @@ run.bat --fast-inflate                    # swap pako.inflate for node:zlib.infl
 run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
 run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B)
 run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (also ships; opt-in here for A/B)
+run.bat --fast-sync-load                  # synchronify pdf-lib's load + save paths; strip waitForTick machinery (also ships; opt-in here for A/B)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -415,6 +428,7 @@ file documenting each:
 | `sizeInBytes` short-circuit ladder (no base-2 string) | [08](notes/08-pdf-lib.md) | ~60 ms process (save -70 ms) |
 | `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) |
 | `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) |
+| Synchronify pdf-lib load + save (strip `__awaiter` scaffolding) | [08](notes/08-pdf-lib.md) | ~0.36 s process (load -26 %, GC -53 ms) |
 
 What was tried and didn't ship:
 
@@ -441,4 +455,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index c5cca424..365f27aa 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -32,6 +32,7 @@
 //                    [--fast-decode-name] [--fast-number-to-string]
 //                    [--fast-size-in-bytes] [--fast-inflate]
 //                    [--fast-parse-number] [--fast-parse-dict]
+//                    [--fast-sync-load]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -145,13 +146,30 @@
 // --fast-decode-name each lookup is still a Map.get on fastCache.
 // Pool-dedup makes the canonical PDFNames reference-stable, so
 // captured constants replace the four calls verbatim.
+//
+// --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick /
+// shouldWaitForTick / waitForTick machinery out of both the load
+// path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection
+// / parseIndirectObjects / parseIndirectObject +
+// PDFObjectStreamParser.parseIntoContext) and the save path
+// (PDFWriter.serializeToBuffer / computeBufferSize +
+// PDFStreamWriter.computeBufferSize). pdf-lib's TS downlevel wraps
+// each in tslib __awaiter / __generator so on browsers they can
+// `await waitForTick()` every `objectsPerTick` objects; with
+// objectsPerTick: Infinity (or the load path's parseSpeed: Fastest)
+// the gate never fires, but every indirect object still pays the
+// generator state-machine + Promise allocation. The shim removes
+// the scaffolding and the waitForTick yields entirely. Production
+// runs through it; the parseSpeed / objectsPerTick options are
+// dropped from PDFDocument.load / parallelSave / pdfDoc.save call
+// sites in step with this shim.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
 import { mkdirSync, writeFileSync, existsSync } from 'node:fs';
 import { Session } from 'node:inspector/promises';
 import puppeteer from 'puppeteer';
-import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+import { PDFDocument } from 'pdf-lib';
 // Shared with docs/render-book.mjs -- the helpers and the paged.js
 // bundle live under docs/lib/ now that we've dropped the pagedjs-cli
 // dependency. Importing from there guarantees the harness measures the
@@ -196,6 +214,7 @@ let fastInflate = false;
 let fastParseNumber = false;
 let fastDictIter = false;
 let fastParseDict = false;
+let fastSyncLoad = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -226,6 +245,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-parse-number') fastParseNumber = true;
   else if (a === '--fast-dict-iter') fastDictIter = true;
   else if (a === '--fast-parse-dict') fastParseDict = true;
+  else if (a === '--fast-sync-load') fastSyncLoad = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -299,6 +319,10 @@ if (fastParseDict) {
   await import('../docs/lib/fast-parse-dict.mjs');
   console.log('[harness] fast-parse-dict: hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict');
 }
+if (fastSyncLoad) {
+  await import('../docs/lib/fast-sync-load.mjs');
+  console.log('[harness] fast-sync-load: synchronify PDFParser load path, strip waitForTick machinery');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
@@ -580,15 +604,17 @@ try {
     finalPdf = bytes;
     processBreakdown = { incrementalMs: incMs, ...stats };
   } else {
-    // pdf-lib's defaults are catastrophically slow: parseSpeed=Slow (100
-    // objects/tick) and objectsPerTick=50 both yield to the event loop
-    // between batches, turning a ~2s load into ~36s on a 52 MB PDF (~34s
-    // pure idle in the cpuprofile). Override to Fastest/Infinity so the
-    // "baseline" we report reflects the library's actual CPU cost, not
-    // an artefact of yielding cadence. The harness has no parallel work
-    // to make space for, so cooperative yielding is pure overhead here.
+    // Upstream pdf-lib's load yields to the event loop every
+    // `parseSpeed` objects via `await waitForTick()`; the save side
+    // does the same every `objectsPerTick`. With --fast-sync-load on
+    // (the production default) both yield gates are ripped out -- the
+    // option arguments are silently ignored, so we don't bother
+    // passing them. Without --fast-sync-load, the run measures pdf-lib's
+    // cautious defaults (parseSpeed: Slow, objectsPerTick: 50) which
+    // yield ~500 / ~1000 times per phase on the book; that's pdf-lib's
+    // out-of-the-box behaviour, useful as a baseline for A/B work.
     const tLoadStart = Date.now();
-    const pdfDoc = await PDFDocument.load(rawPdf, { parseSpeed: ParseSpeeds.Fastest });
+    const pdfDoc = await PDFDocument.load(rawPdf);
     const loadMs = Date.now() - tLoadStart;
 
     setMetadata(pdfDoc, meta);
@@ -600,11 +626,11 @@ try {
     const tSaveStart = Date.now();
     let parallelStreamCount = 0;
     if (parallelDeflate) {
-      const { bytes, streamCount } = await parallelSave(pdfDoc, { objectsPerTick: Infinity, objectsPerStream: 500 });
+      const { bytes, streamCount } = await parallelSave(pdfDoc, { objectsPerStream: 500 });
       finalPdf = bytes;
       parallelStreamCount = streamCount;
     } else {
-      finalPdf = await pdfDoc.save({ objectsPerTick: Infinity });
+      finalPdf = await pdfDoc.save();
     }
     const saveMs = Date.now() - tSaveStart;
 
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index acdeafd9..1c6db344 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -1462,6 +1462,244 @@ is a bigger code change for a smaller individual win. Worth
 revisiting if a future optimisation moves the floor and parseDict
 becomes a larger relative share.
 
+## Strip the parse-speed machinery: synchronify the load path
+
+After the eight `--fast-*` patches above had nibbled the process
+phase from 7.8 s down to 1.66 s, the next interesting thing in the
+profile wasn't *a function* -- it was *function scaffolding*.
+Three top-15 rows were the tslib `__awaiter` / `__generator`
+machinery that pdf-lib's TypeScript downlevel emits for its
+`async`-marked parser methods:
+
+```
+   self_ms   self_%   function                                  source
+   -------   ------   ----------------------------------------------
+     51.66    3.12%   (anonymous)  (parseIndirectObject body)   PDFParser.js:126
+     43.05    2.60%   step         (generator runner)           tslib.js:123
+     40.90    2.47%   (anonymous)  (parseIndirectObjects body)  PDFParser.js:190
+```
+
+Together ~135 ms / ~8 % of process self-time, sitting on top of
+the parsing work that's already attributed to the named frames
+below them.
+
+### What that scaffolding was for
+
+pdf-lib targets browsers as well as Node. On a browser, locking
+the main thread for the seconds it takes to parse a big PDF would
+freeze the page, so pdf-lib has a knob -- `parseSpeed`, also
+exposed as `objectsPerTick` -- that controls how many indirect
+objects the parser processes before yielding to the event loop via
+`await waitForTick()`. The default is the cautious
+`ParseSpeeds.Slow = 100`. The mechanism is a constructor-installed
+predicate (`PDFParser.js:31`):
+
+```js
+this.shouldWaitForTick = function () {
+  this.parsedObjects += 1;
+  return this.parsedObjects % this.objectsPerTick === 0;
+};
+```
+
+…queried at the bottom of every `parseIndirectObjects` iteration
+(`PDFParser.js:215`) and every `parseIntoContext` iteration in
+`PDFObjectStreamParser.js:42`, gating an `await waitForTick()`
+(= `setImmediate`).
+
+`render-book.mjs` already passed `parseSpeed: ParseSpeeds.Fastest`
+to `PDFDocument.load`, which is `objectsPerTick: Infinity`, which
+makes `shouldWaitForTick()` return `false` on every call: the
+modulo never hits zero, the yield never fires. The
+`Fastest`-vs-`Slow` speedup we'd measured years earlier (see
+[01-baseline-and-detach.md](01-baseline-and-detach.md))
+was precisely removing those yields' wall-clock contribution.
+
+But removing the *yields* didn't remove the **scaffolding**. Even
+with `objectsPerTick: Infinity`, every call to
+`parseIndirectObject` still:
+
+1. Allocates a Promise (the `__awaiter` return).
+2. Allocates a generator object (the inner `__generator` return).
+3. Allocates an activation record (the closed-over `_a` state).
+4. Enters the tslib `step` runner, which calls the generator
+   body, which enters `switch (_a.label) { case 0: ... }`, runs
+   all the synchronous work, falls through to `return [2 /*return*/, ref]`,
+   which `step` unpacks and resolves the Promise with.
+5. The caller `await`s that Promise (one microtask hop).
+
+For ~50 k indirect objects on the book that's 50 k of each.
+About 135 ms of attributed self-time (the three rows above)
+plus an unknowable but non-trivial fraction of the 240 ms GC row
+(Promise + generator + activation are all short-lived heap
+allocations).
+
+The same shape applies to `parseIndirectObjects` (which calls
+`parseIndirectObject`), `parseDocumentSection` (which calls
+`parseIndirectObjects`), `parseDocument` (which calls
+`parseDocumentSection`), and `PDFDocument.load` (which calls
+`parseDocument`). Five `async` wrappers around code that, on the
+hot path, runs synchronously.
+
+### Why bother on the ObjStm branch too
+
+`parseIndirectObject` *does* have one genuinely-await-ing branch
+at `PDFParser.js:142`: if the parsed object is an object stream
+(PDF 1.5 §7.5.7, type `ObjStm`), it dispatches to
+`PDFObjectStreamParser.parseIntoContext()`, which itself is
+`async`. But `parseIntoContext`'s only `await` is the same kind
+of conditionally-gated `waitForTick` -- and `shouldWaitForTick`
+is passed in from the parent parser, so it's still `() => false`
+under our config. The whole sub-stream walk is already morally
+synchronous; just no upstream code path ever constructs a parser
+without `shouldWaitForTick`.
+
+(Aside: Chrome's `SkPDF` writer doesn't emit ObjStm at all -- it
+writes every indirect object at its own xref offset and uses the
+classic xref table. So on our pipeline the ObjStm branch of
+`parseIndirectObject` doesn't even fire. But pdf-lib loads have
+to work generically; the patch handles the branch correctly.)
+
+### The shim
+
+`docs/lib/fast-sync-load.mjs` replaces six methods (five prototype
+methods, one static) with synchronous twins:
+
+```
+PDFParser.prototype.parseDocument
+PDFParser.prototype.parseDocumentSection
+PDFParser.prototype.parseIndirectObjects
+PDFParser.prototype.parseIndirectObject
+PDFObjectStreamParser.prototype.parseIntoContext
+PDFDocument.load   (static)
+```
+
+The bodies are line-by-line ports of the upstream `case`-blocks --
+same loop, same `parseObject` / `context.assign` / `parseHeader` /
+`maybeParseCrossRefSection` / `maybeParseTrailerDict` /
+`maybeParseTrailer` / `skipJibberish` calls in the same order --
+with three changes:
+
+1. No `__awaiter` / `__generator` wrapper. The function returns
+   directly.
+2. No `shouldWaitForTick` check, no `waitForTick` yield.
+3. The three `PDFName.of(...)` calls in `parseIndirectObject`'s
+   type-dispatch tail (`'Type'`, `'ObjStm'`, `'XRef'`) are hoisted
+   to module-level constants -- same trick as
+   [`fast-parse-dict.mjs`](#parsedict-hoist-the-sentinel-pdfnames-out-of-the-type-dispatch-tail),
+   since pool-dedup makes the `PDFName` instances reference-stable.
+
+The patches have to land together: each method awaits the next
+one down, so desugaring any one in isolation still leaves a
+Promise chain dangling.
+
+`PDFDocument.load`'s call shape is preserved -- still callable as
+`await PDFDocument.load(bytes)`: `await` on a non-Promise yields the
+value after one microtask hop, so existing call sites need no change
+(a `.then()` chain on the return value would break, though). The
+`parseSpeed` option is now silently ignored (no yield gate left).
+
+The shim's correctness depends on the upstream pdf-lib source
+being structurally what the line-by-line port assumed. `pdf-lib`
+1.17.1 (Hopding's last release, abandoned) is byte-stable on npm
+and that's what we ship against; `package.json` is updated in
+this change to pin to `1.17.1` exact (was `^1.17.1`), similarly
+for `puppeteer` `25.0.4`, so a stray `npm update` can't silently
+swap upstream from under the shim.
+
+### Results
+
+Paired process-phase profiles, same harness config except
+`--fast-sync-load`:
+
+| metric                                  | PRE       | POST      | Δ                |
+| ---                                     | ---       | ---       | ---              |
+| **process wall-clock**                  | **1.66 s** | **1.30 s** | **-0.36 s (-22 %)** |
+| ↳ load                                  | 1.09 s    | 0.81 s    | -0.28 s (-26 %)  |
+| ↳ save                                  | 0.56 s    | 0.48 s    | -0.08 s (noise; writer not touched) |
+| GC self-time                            | 240 ms    | 187 ms    | -53 ms (-22 %)   |
+| `(anonymous) @ PDFParser.js:126`        | 51.66 ms  | gone      | -51.66 ms        |
+| `step @ tslib.js:123`                   | 43.05 ms  | gone      | -43.05 ms        |
+| `(anonymous) @ PDFParser.js:190`        | 40.90 ms  | gone      | -40.90 ms        |
+| **scaffolding total**                   | **~135 ms** | **0**   | **-135 ms (eliminated)** |
+
+The wall-clock delta is larger than the sum of the eliminated
+rows because the GC win is real time too: the per-object Promise
++ generator + activation allocations weren't free in V8's
+internals either, just not attributed to any named frame.
+
+Output PDF: byte-count identical (16,077,319 bytes both runs);
+MD5 differs only because Chrome's `page.pdf()` embeds a fresh
+`/CreationDate` + `/ModDate` per run (same ±27-byte timestamp
+jitter `docs/book.bat` output has always had).
+
+### Extending to the save side
+
+The shim covers the writers too, by symmetry. Three more methods:
+
+```
+PDFWriter.prototype.serializeToBuffer
+PDFWriter.prototype.computeBufferSize
+PDFStreamWriter.prototype.computeBufferSize
+```
+
+Only `serializeToBuffer` actually runs on our pipeline --
+`ParallelStreamWriter extends PDFStreamWriter` overrides
+`computeBufferSize` with its own three-phase parallel-deflate
+version (genuinely async because of `await Promise.all(deflated)`
+over libuv's thread pool, which we keep). But the inherited
+`serializeToBuffer` still had a dead `shouldWaitForTick` gate in
+its main loop. Same shape as the load side: per-object dispatch,
+no actual yield because `objectsPerTick` is effectively `Infinity`,
+but every iteration pays the generator-machine + Promise cost.
+
+`serializeToBuffer` stays `async` (it has to `await
+this.computeBufferSize()`, which is the genuinely-async override).
+The change is: drop the `__awaiter` / `__generator` wrapper, use
+ES `async function` with one real `await`, strip the
+`shouldWaitForTick` gate. `computeBufferSize` on both base and
+stream writers becomes fully synchronous (their only async
+ingredient was the same dead yield).
+
+Measured wins on the writer side: **none reliably above noise**.
+The save phase dropped from 0.56 s before the load-side patches
+to 0.48 s after, and the writer patches don't move it further
+(0.50 s in the post-extension profile, within the run-to-run
+band). No writer frame ever broke into the top 15 in the first
+place -- the overhead was real but distributed across
+unattributed scaffolding and `(program)` time, not big enough to
+register individually.
+
+The reason to ship it anyway is structural, not performance: with
+load patched, the only remaining
+`shouldWaitForTick` / `waitForTick` references in our hot path
+were on the save side, and leaving them would defeat the "rip out
+the machinery" intent. With the save patches landed, neither
+phase routes through tslib `__awaiter` scaffolding except where
+there's a legitimate `await` underneath.
+
+### Dropping the flags
+
+The companion change is to drop the `parseSpeed` / `objectsPerTick`
+options from all our call sites, since with the shim in effect
+neither does anything:
+
+- `docs/render-book.mjs` drops `parseSpeed: ParseSpeeds.Fastest`
+  from `PDFDocument.load` and `objectsPerTick: Infinity` from
+  `parallelSave`. The `ParseSpeeds` import goes with them.
+- `docs/lib/parallel-deflate.mjs` drops `objectsPerTick` from
+  `parallelSave`'s public options object and from
+  `ParallelStreamWriter`'s constructor parameters. `PDFWriter`'s
+  base constructor still takes `objectsPerTick` as positional
+  arg 2 -- vestigial after `fast-sync-load`, but we pass
+  `Infinity` explicitly to make the constructor chain happy.
+- `perf/measure.mjs` removes the same options from
+  `PDFDocument.load`, `parallelSave`, and `pdfDoc.save`.
+
+`perf/profile-roundtrip.mjs` keeps its `parseSpeed` /
+`objectsPerTick` knob comparison -- that file's whole purpose is
+to A/B pdf-lib's defaults against `Fastest`, and it runs against
+vanilla pdf-lib without the shim by design.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -1491,7 +1729,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-decode-name + fast-number-to-string | ~1.6 s  | ~1.0 s | ~0.6 s |
 | + fast-size-in-bytes                 | ~1.5 s  | ~1.0 s | ~0.5 s |
 | + fast-dict-iter                     | ~1.4 s  | ~1.0 s | ~0.4 s |
-| **+ fast-parse-dict (this section)** | **~1.4 s** | **~1.0 s** | **~0.4 s** |
+| + fast-parse-dict                    | ~1.4 s  | ~1.0 s | ~0.4 s |
+| **+ fast-sync-load (this section)**  | **~1.3 s** | **~0.8 s** | **~0.5 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From d928c2de491137370e727d6a0dd1c018efff7a45 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:41:20 +0200
Subject: [PATCH 11/44] Dispatch parseObject by first byte; gate
 true/false/null matchKeyword scans.

---
 docs/lib/fast-parse-object.mjs |  92 ++++++++++++++++++++++
 docs/render-book.mjs           |   8 ++
 perf/README.md                 |  22 +++++-
 perf/measure.mjs               |  17 ++++-
 perf/notes/08-pdf-lib.md       | 136 +++++++++++++++++++++++++++++++++
 5 files changed, 272 insertions(+), 3 deletions(-)
 create mode 100644 docs/lib/fast-parse-object.mjs

diff --git a/docs/lib/fast-parse-object.mjs b/docs/lib/fast-parse-object.mjs
new file mode 100644
index 00000000..e573dc44
--- /dev/null
+++ b/docs/lib/fast-parse-object.mjs
@@ -0,0 +1,92 @@
+// Dispatch PDFObjectParser.parseObject by first byte; gate the three
+// keyword scans behind a byte check.
+//
+// The upstream parseObject
+// ([PDFObjectParser.js:36](node_modules/pdf-lib/cjs/core/parser/PDFObjectParser.js:36))
+// runs three speculative matchKeyword calls (true / false / null)
+// before peeking the dispatch byte:
+//
+//   parseObject() {
+//     this.skipWhitespaceAndComments();
+//     if (this.matchKeyword(Keywords.true))  return PDFBool.True;
+//     if (this.matchKeyword(Keywords.false)) return PDFBool.False;
+//     if (this.matchKeyword(Keywords.null))  return PDFNull;
+//     var byte = this.bytes.peek();
+//     ...
+//   }
+//
+// parseObject is called for every dict value, array element, and
+// indirect-object body -- same call density as fastParseDict, which
+// is the #2 row in the process profile. true / false / null are
+// extraordinarily rare in real PDFs (boolean / null entries on
+// individual dict values, mostly), so the three matchKeyword calls
+// fail-and-rewind on essentially every invocation. Each failure
+// still pays bytes.offset() + bytes.next() + comparison +
+// bytes.moveTo(initialOffset).
+//
+// This shim flips the dispatch: peek the first byte, branch by byte
+// for the structural tokens, and only enter matchKeyword when the
+// byte is `t` / `f` / `n` (i.e. could plausibly start the keyword).
+// Dispatch order is by observed frequency in dict-value position:
+// numbers / refs first (digits + sign + period), then `<` (dicts
+// `<<`, else hex strings), names (/), arrays ([), strings ((). Same
+// semantics -- a value starting with `t`/`f`/`n` that isn't a
+// keyword still falls through to the same PDFObjectParsingError
+// throw.
+//
+// Mechanism: PDFObjectParser isn't re-exported from pdf-lib's index,
+// so we reach in through the CJS internals via createRequire (same
+// shape as fast-parse-dict.mjs). Mutating
+// PDFObjectParser.prototype.parseObject is global -- every parser
+// instance created after this shim loads picks it up.
+//
+// Side-effecting import. Import once before PDFDocument.load runs:
+//
+//   import "./lib/fast-parse-object.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const PDFBool         = require('pdf-lib/cjs/core/objects/PDFBool.js').default;
+const PDFNull         = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+const { Keywords }    = require('pdf-lib/cjs/core/syntax/Keywords.js');
+const { IsNumeric }   = require('pdf-lib/cjs/core/syntax/Numeric.js');
+const { PDFObjectParsingError } = require('pdf-lib/cjs/core/errors.js');
+
+const KwTrue  = Keywords.true; // keyword sequences handed to matchKeyword in the gated branches
+const KwFalse = Keywords.false;
+const KwNull  = Keywords.null;
+
+const LessThan          = CharCodes.LessThan; // char codes compared against the peeked first byte
+const ForwardSlash      = CharCodes.ForwardSlash;
+const LeftSquareBracket = CharCodes.LeftSquareBracket;
+const LeftParen         = CharCodes.LeftParen;
+const t_code            = CharCodes.t; // first bytes of the true / false / null keywords
+const f_code            = CharCodes.f;
+const n_code            = CharCodes.n;
+
+if (!PDFObjectParser.prototype.__fastParseObjectInstalled) { // install once; the flag set below makes repeated imports no-ops
+  PDFObjectParser.prototype.parseObject = function fastParseObject() {
+    this.skipWhitespaceAndComments();
+    const bytes = this.bytes;
+    const byte = bytes.peek(); // single up-front peek drives every branch below
+    if (IsNumeric[byte]) return this.parseNumberOrRef(); // checked first: numbers / refs
+    if (byte === LessThan) { // << goes to parseDictOrStream; a lone < goes to parseHexString
+      if (bytes.peekAhead(1) === LessThan) return this.parseDictOrStream();
+      return this.parseHexString();
+    }
+    if (byte === ForwardSlash)      return this.parseName();
+    if (byte === LeftSquareBracket) return this.parseArray();
+    if (byte === LeftParen)         return this.parseString();
+    if (byte === t_code && this.matchKeyword(KwTrue))  return PDFBool.True; // matchKeyword runs only when the first byte already matches
+    if (byte === f_code && this.matchKeyword(KwFalse)) return PDFBool.False;
+    if (byte === n_code && this.matchKeyword(KwNull))  return PDFNull;
+    throw new PDFObjectParsingError(bytes.position(), byte); // no branch matched, including t / f / n bytes whose matchKeyword failed
+  };
+
+  PDFObjectParser.prototype.__fastParseObjectInstalled = true; // marker read by the guard above
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 82c7f4b4..2cf1fc73 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -70,6 +70,13 @@ import { PDFDocument } from 'pdf-lib';
 //     per-dict on every load; pool-dedup makes the canonical
 //     PDFNames reference-stable, so captured constants replace
 //     the calls verbatim. Pulls ~17 ms off fastOf self-time.
+//   fast-parse-object -- replace PDFObjectParser.prototype.parseObject
+//     with a first-byte-dispatch version that gates the three
+//     matchKeyword (true / false / null) scans behind a byte check.
+//     parseObject fires per dict value / array element / indirect
+//     object body; the upstream version pays three speculative
+//     matchKeyword fail-and-rewind costs on every invocation. Same
+//     semantics, dispatch reordered by observed frequency.
 //   fast-sync-load -- rip the parseSpeed / objectsPerTick /
 //     shouldWaitForTick / waitForTick machinery out of both pdf-lib's
 //     load path (PDFDocument.load + five PDFParser /
@@ -92,6 +99,7 @@ import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
 import './lib/fast-dict-iter.mjs';
 import './lib/fast-parse-dict.mjs';
+import './lib/fast-parse-object.mjs';
 import './lib/fast-sync-load.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
diff --git a/perf/README.md b/perf/README.md
index a7635cc5..66107fea 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-sync-load --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-parse-object --fast-sync-load --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -142,6 +142,22 @@ Flag rationale:
   `fastOf` was still the #4 row in the profile -- removing the
   calls pulls ~17 ms (~22 %) off `fastOf` self-time. Production
   runs through it.
+- `--fast-parse-object` -- inject
+  [docs/lib/fast-parse-object.mjs](../docs/lib/fast-parse-object.mjs),
+  replacing `PDFObjectParser.prototype.parseObject` with a
+  first-byte-dispatch version that gates the three speculative
+  `matchKeyword` calls (`true` / `false` / `null`) behind a byte
+  check. The upstream `parseObject` pays three `matchKeyword`
+  fail-and-rewind costs per dispatch (`bytes.offset()` +
+  `bytes.next()` + comparison + `bytes.moveTo(initialOffset)`)
+  before peeking the dispatch byte, on every call -- and the
+  three keywords are extraordinarily rare in real PDFs. The shim
+  peeks first and only enters `matchKeyword` when the byte could
+  plausibly start a keyword (`t` / `f` / `n`); dispatch order is
+  reshuffled by observed frequency in dict-value position (numbers
+  / refs first, then `<<`, names, arrays, strings). Same
+  semantics. Pulls `parseObject` self-time from ~82 ms (5.2 %)
+  to ~40 ms (3.1 %). Production runs through it.
 - `--fast-sync-load` -- inject
   [docs/lib/fast-sync-load.mjs](../docs/lib/fast-sync-load.mjs),
   replacing nine `__awaiter`-wrapped methods across pdf-lib's load
@@ -313,6 +329,7 @@ run.bat --fast-inflate                    # swap pako.inflate for node:zlib.infl
 run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
 run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B)
 run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (also ships; opt-in here for A/B)
+run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
 run.bat --fast-sync-load                  # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B)
 ```
 
@@ -429,6 +446,7 @@ file documenting each:
 | `PDFDict` iter (Map.forEach with hoisted callbacks) | [08](notes/08-pdf-lib.md) | ~80 ms process (dict path -6 pp) |
 | `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) |
 | Synchronify pdf-lib load + save (strip `__awaiter` scaffolding) | [08](notes/08-pdf-lib.md) | ~0.36 s process (load -26 %, GC -53 ms) |
+| `parseObject` first-byte dispatch + gated keyword scans | [08](notes/08-pdf-lib.md) | ~42 ms profile (parseObject -51 %) |
 
 What was tried and didn't ship:
 
@@ -455,4 +473,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 365f27aa..1bf9aa46 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -32,7 +32,7 @@
 //                    [--fast-decode-name] [--fast-number-to-string]
 //                    [--fast-size-in-bytes] [--fast-inflate]
 //                    [--fast-parse-number] [--fast-parse-dict]
-//                    [--fast-sync-load]
+//                    [--fast-parse-object] [--fast-sync-load]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -147,6 +147,15 @@
 // Pool-dedup makes the canonical PDFNames reference-stable, so
 // captured constants replace the four calls verbatim.
 //
+// --fast-parse-object replaces PDFObjectParser.prototype.parseObject
+// with a first-byte-dispatch version that gates the three
+// matchKeyword (true / false / null) scans behind a byte check.
+// parseObject fires per dict value / array element / indirect
+// object body (hundreds of thousands of calls on the book); the
+// upstream version pays three speculative matchKeyword fail-and-
+// rewind costs on every invocation. Same semantics, dispatch
+// reordered by observed frequency in dict-value position.
+//
 // --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick /
 // shouldWaitForTick / waitForTick machinery out of both the load
 // path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection
@@ -214,6 +223,7 @@ let fastInflate = false;
 let fastParseNumber = false;
 let fastDictIter = false;
 let fastParseDict = false;
+let fastParseObject = false;
 let fastSyncLoad = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
@@ -245,6 +255,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-parse-number') fastParseNumber = true;
   else if (a === '--fast-dict-iter') fastDictIter = true;
   else if (a === '--fast-parse-dict') fastParseDict = true;
+  else if (a === '--fast-parse-object') fastParseObject = true;
   else if (a === '--fast-sync-load') fastSyncLoad = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
@@ -319,6 +330,10 @@ if (fastParseDict) {
   await import('../docs/lib/fast-parse-dict.mjs');
   console.log('[harness] fast-parse-dict: hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict');
 }
+if (fastParseObject) {
+  await import('../docs/lib/fast-parse-object.mjs');
+  console.log('[harness] fast-parse-object: first-byte dispatch in parseObject, gate true/false/null matchKeyword behind byte check');
+}
 if (fastSyncLoad) {
   await import('../docs/lib/fast-sync-load.mjs');
   console.log('[harness] fast-sync-load: synchronify PDFParser load path, strip waitForTick machinery');
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 1c6db344..c0ebcb91 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -1462,6 +1462,141 @@ is a bigger code change for a smaller individual win. Worth
 revisiting if a future optimisation moves the floor and parseDict
 becomes a larger relative share.
 
+## `parseObject`: dispatch by first byte, gate the keyword scans
+
+After `fast-parse-dict` shipped, `PDFObjectParser.parseObject` was
+the next obvious row in the bottom-up table:
+
+```
+   self_ms   self_%   function  @  source
+    213.28   13.41%   (garbage collector)
+    113.05    7.11%   fastParseDict
+     99.12    6.23%   fastOf
+     86.87    5.46%   PDFRef.of
+     86.32    5.43%   PDFObjectParser.parseName
+     81.86    5.15%   PDFObjectParser.parseObject     <-- this row
+     ...
+```
+
+`parseObject` is the dispatch hub of the PDF object parser. It's
+called once per dict value, per array element, and per
+indirect-object body -- same call density as `fastParseDict` two
+rows above (every dict that fastParseDict builds calls parseObject
+N times for its N values).
+
+### What parseObject was doing
+
+The upstream body (`PDFObjectParser.js:36`):
+
+```js
+parseObject() {
+  this.skipWhitespaceAndComments();
+  if (this.matchKeyword(Keywords.true))  return PDFBool.True;
+  if (this.matchKeyword(Keywords.false)) return PDFBool.False;
+  if (this.matchKeyword(Keywords.null))  return PDFNull;
+  const byte = this.bytes.peek();
+  if (byte === LessThan && this.bytes.peekAhead(1) === LessThan) return this.parseDictOrStream();
+  if (byte === LessThan)          return this.parseHexString();
+  if (byte === LeftParen)         return this.parseString();
+  if (byte === ForwardSlash)      return this.parseName();
+  if (byte === LeftSquareBracket) return this.parseArray();
+  if (IsNumeric[byte])            return this.parseNumberOrRef();
+  throw new PDFObjectParsingError(this.bytes.position(), byte);
+}
+```
+
+Three speculative `matchKeyword` calls run on every invocation,
+before the dispatch byte is ever peeked. `matchKeyword`
+(`BaseParser.js:97`) on a fast-fail mismatch does `bytes.offset()`,
+then `bytes.next()` on the first byte of the keyword, comparison,
+then `bytes.moveTo(initialOffset)` to restore. Three of those per
+`parseObject` call -- multiplied by the hundreds of thousands of
+calls per book load -- adds up.
+
+`true` / `false` / `null` are extraordinarily rare in real PDFs.
+The bulk of dict values are refs (`N N R`), numbers, names,
+sub-dicts, and arrays. Putting the dispatch-byte test *before*
+the keyword scans, and only entering `matchKeyword` when the
+first byte could plausibly start one of the three keywords,
+skips three method calls + a `moveTo` per `parseObject` on the
+overwhelming majority of inputs.
+
+### The shim
+
+`docs/lib/fast-parse-object.mjs` replaces
+`PDFObjectParser.prototype.parseObject` with:
+
+```js
+parseObject() {
+  this.skipWhitespaceAndComments();
+  const bytes = this.bytes;
+  const byte = bytes.peek();
+  if (IsNumeric[byte]) return this.parseNumberOrRef();
+  if (byte === LessThan) {
+    if (bytes.peekAhead(1) === LessThan) return this.parseDictOrStream();
+    return this.parseHexString();
+  }
+  if (byte === ForwardSlash)      return this.parseName();
+  if (byte === LeftSquareBracket) return this.parseArray();
+  if (byte === LeftParen)         return this.parseString();
+  if (byte === t_code && this.matchKeyword(KwTrue))  return PDFBool.True;
+  if (byte === f_code && this.matchKeyword(KwFalse)) return PDFBool.False;
+  if (byte === n_code && this.matchKeyword(KwNull))  return PDFNull;
+  throw new PDFObjectParsingError(bytes.position(), byte);
+}
+```
+
+Three changes from upstream:
+
+1. Peek the first byte once, up front.
+2. Dispatch order reshuffled for dict-value frequency: numbers /
+   refs first (`IsNumeric[byte]` is a Uint8Array index, the
+   cheapest possible test), then `<<` / `<` (collapsed into one
+   `LessThan` branch with the `peekAhead` lookup inside), then
+   names, arrays, strings.
+3. The three keyword paths are gated -- `byte === t` / `f` / `n`
+   guards each `matchKeyword` call, so a non-keyword input never
+   pays for the speculative scan + rewind.
+
+Correctness: a value starting with `t`/`f`/`n` that isn't
+`true`/`false`/`null` falls through to the same
+`PDFObjectParsingError` the upstream code would throw. Dict keys
+can't reach parseObject (`parseDict` calls `parseName()` for
+keys, parseObject only for values), and names always start with
+`/`. Numbers can't start with letters. So the only valid values
+that hit the gated keyword branches are the three keywords
+themselves.
+
+`PDFObjectParser` isn't re-exported from pdf-lib's index, so the
+shim reaches in via `pdf-lib/cjs/core/parser/PDFObjectParser.js`
+through `createRequire` -- same shape as `fast-parse-dict.mjs`.
+
+### Results
+
+Profile diff, both runs `--detach-pages --no-timing` with every
+other shipping shim active, 100 us sampling:
+
+| metric                                  | pre        | post       | Δ                  |
+| ---                                     | ---        | ---        | ---                |
+| `parseObject` / `fastParseObject` self  | 81.86 ms (5.15 %) | 40.25 ms (3.07 %) | **-41.6 ms (-51 %)** |
+| `fastOf` self                           | 99.12 ms (6.23 %) | 64.18 ms (4.90 %) | -34.9 ms           |
+| `fastParseDict` self                    | 113.05 ms (7.11 %) | 65.26 ms (4.98 %) | -47.8 ms           |
+
+The targeted row roughly halves in self-time, as the model
+predicts (three `matchKeyword` calls collapsed to first-byte
+dispatch). The `fastOf` and `fastParseDict` drops aren't from
+this shim doing less work in those frames -- they're profile
+attribution shifting around once `parseObject` is no longer
+dominating its own children's sampling window (sampled duration
+fell from 1.58 s to 1.34 s overall).
+
+Wall-clock is too noisy on this machine to read at this scale --
+the mechanism-confirmed ~42 ms via profile attribution is the
+honest number.
+
+PDF output is byte-equivalent: same dispatch decisions, same
+fallthrough behaviour, same error shape.
+
 ## Strip the parse-speed machinery: synchronify the load path
 
 After the eight `--fast-*` patches above had nibbled the process
@@ -1730,6 +1865,7 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-size-in-bytes                 | ~1.5 s  | ~1.0 s | ~0.5 s |
 | + fast-dict-iter                     | ~1.4 s  | ~1.0 s | ~0.4 s |
 | + fast-parse-dict                    | ~1.4 s  | ~1.0 s | ~0.4 s |
+| + fast-parse-object                  | ~1.4 s  | ~1.0 s | ~0.4 s |
 | **+ fast-sync-load (this section)**  | **~1.3 s** | **~0.8 s** | **~0.5 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's

From 3bde6138bc12c4c2e9b5f407e95a4b3557812056 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:46:57 +0200
Subject: [PATCH 12/44] Add sampling-heap-profile instrumentation for the
 process phase.

---
 perf/README.md           | 41 +++++++++++++++++++++++++++++++++++++++-
 perf/measure.mjs         | 41 +++++++++++++++++++++++++++++++++++-----
 perf/notes/08-pdf-lib.md | 21 +++++++++++++++++---
 3 files changed, 94 insertions(+), 9 deletions(-)

diff --git a/perf/README.md b/perf/README.md
index 66107fea..0bbb3724 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -197,6 +197,42 @@ of hot functions called millions of times (`PDFRef.of` in
 particular). For "did this wall-clock change," do a paired
 no-profile A/B as a sanity check.
 
+## Profiling pdf-lib heap allocation (process phase): canonical command
+
+The companion command for the **sampling heap profile** of the
+process phase -- "where is pdf-lib allocating bytes?" rather than
+"where is it spending cycles?":
+
+```
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-parse-object --fast-sync-load --heap-profile-process --heap-sampling 512
+```
+
+Same `--fast-*` set as the CPU command (production is the baseline
+we care about); the new flags:
+
+- `--heap-profile-process` -- attach Node's `inspector/promises`
+  `HeapProfiler` around the process phase only. Writes
+  `process.heapprofile` into the timestamped `results/` folder.
+  Output is V8's sampling-heap-profile JSON (a tree of
+  `{ callFrame, selfSize, children }` rooted at `head`), not the
+  flat-nodes shape that `.cpuprofile` uses, so the cpu analyzers
+  don't apply. Use `analyze-heap-profile.mjs` instead, which walks
+  the tree and aggregates `selfSize` by `(functionName + url:line)`:
+  `node analyze-heap-profile.mjs results/<run>/process.heapprofile --top 10`.
+- `--heap-sampling 512` -- 512-byte sampling interval. V8's default
+  is 32768 (32 KB); on the ~150 MB process-phase allocation total
+  that's only ~5 k samples and the bottom-up table runs coarse.
+  512 B yields ~250 k samples on the book, plenty of resolution
+  for "which frame allocated this Map?". Caveat: 512 B sampling
+  inflates process wall-clock substantially (the sampler's
+  per-allocation bookkeeping fires 64x more often). Read the
+  attribution, not the timing, from heap-profiled runs.
+
+`--heap-profile-process` composes with `--cpu-profile-process` --
+both attach to the same inspector session, so you can capture cpu
+and heap in a single run if you want. The same `--render-only`
+incompatibility applies (no process phase to profile).
+
 See [notes/08-pdf-lib.md](notes/08-pdf-lib.md) for the process-phase
 investigations these flags enabled.
 
@@ -313,6 +349,7 @@ run.bat --no-detach-pages                 # opt out of the detach-pages fix (mea
 run.bat --timing                          # collect per-page wall time + heap (writes timing.csv + quartile summary)
 run.bat --cpu-profile                     # CPU-profile the render phase (CDP, Chromium-side)
 run.bat --cpu-profile-process             # CPU-profile the process phase (Node inspector, Node-side)
+run.bat --heap-profile-process            # sampling heap-profile the process phase (Node inspector HeapProfiler); pair with --heap-sampling 512 for fine attribution
 run.bat --render-only                     # bail out after render (skip generate + process, ~47s saved)
 run.bat --clone-count                     # report Layout.append clones appended vs survivors per page
 run.bat --instrument                      # count + time DOM-accessor calls
@@ -336,7 +373,9 @@ run.bat --fast-sync-load                  # synchronify PDFDocument.load + parse
 Flags compose. The CPU profile lands as `render.cpuprofile`
 (loadable in Chrome DevTools -> Performance -> "Load profile...");
 `--cpu-profile-process` writes `process.cpuprofile` alongside it;
-`--instrument` prints a per-op table at end-of-render.
+`--heap-profile-process` writes `process.heapprofile` (loadable in
+Chrome DevTools -> Memory -> "Load profile..."); `--instrument`
+prints a per-op table at end-of-render.
 
 You need `_site-pdf\book.html` to exist first -- run `docs\build.bat`
 (which is `bundle exec jekyll build`) if you haven't already.
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 1bf9aa46..0ed196ba 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -23,7 +23,8 @@
 //   node measure.mjs [path/to/book.html] [--out <dir>] [--keep-open]
 //                    [--cpu-profile] [--cpu-profile-process]
 //                    [--cpu-sampling <microseconds>]
-//                    [--heap-profile] [--heap-sampling <bytes>]
+//                    [--heap-profile] [--heap-profile-process]
+//                    [--heap-sampling <bytes>]
 //                    [--tracing]
 //                    [--no-detach-pages] [--instrument] [--time-hooks]
 //                    [--incremental] [--chrome-outline] [--timing]
@@ -96,6 +97,13 @@
 // Honours --cpu-sampling. Composable with --cpu-profile when you want
 // both phases captured in one run.
 //
+// --heap-profile-process wraps the process phase in V8's sampling heap
+// profiler (Inspector's HeapProfiler domain) and writes
+// process.heapprofile alongside the cpu one. --heap-sampling sets the
+// sampling interval in bytes; default 32768 (V8's default). Drop to
+// 512 for finer-grained attribution on short phases. Composable with
+// --cpu-profile-process; both share one inspector session.
+//
 // --fast-refs replaces PDFRef.of's string-keyed Map lookup with a
 // dense-array cache for the gen=0 case (82 % of ~1.2 M calls on the
 // book). Eliminates the per-call `<obj> <gen> R` string allocation
@@ -204,7 +212,8 @@ let cpuProfile = false;
 let cpuProfileProcess = false;
 let cpuSampling = 1000; // microseconds
 let heapProfile = false;
-let heapSampling = 32768; // bytes between samples (CDP default)
+let heapProfileProcess = false;
+let heapSampling = 32768; // bytes between samples (V8 default; used by both CDP render-side and inspector process-side)
 let detachPages = true;
 let instrument = false;
 let timeHooks = false;
@@ -233,6 +242,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--cpu-profile-process') cpuProfileProcess = true;
   else if (a === '--cpu-sampling') cpuSampling = parseInt(args[++i], 10);
   else if (a === '--heap-profile') heapProfile = true;
+  else if (a === '--heap-profile-process') heapProfileProcess = true;
   else if (a === '--heap-sampling') heapSampling = parseInt(args[++i], 10);
   else if (a === '--detach-pages') detachPages = true;       // accepted for backwards compat; default since the fix landed
   else if (a === '--no-detach-pages') detachPages = false;
@@ -295,6 +305,10 @@ if (cpuProfileProcess && renderOnly) {
   console.error('--cpu-profile-process is incompatible with --render-only (the process phase is skipped).');
   process.exit(2);
 }
+if (heapProfileProcess && renderOnly) {
+  console.error('--heap-profile-process is incompatible with --render-only (the process phase is skipped).');
+  process.exit(2);
+}
 
 // Install the dense-array cache for PDFRef.of's gen=0 path before any
 // pdf-lib operation. Side-effecting import; idempotent.
@@ -535,6 +549,7 @@ try {
   let processMs = null;
   let processBreakdown = null;
   let processProfilePath = null;
+  let processHeapProfilePath = null;
   let finalPdf = null;
 
   if (!renderOnly) {
@@ -602,14 +617,21 @@ try {
   // pdf-lib runs locally. Output file shape (V8 .cpuprofile JSON) is the
   // same either way.
   let inspectorSession = null;
-  if (cpuProfileProcess) {
+  if (cpuProfileProcess || heapProfileProcess) {
     inspectorSession = new Session();
     inspectorSession.connect();
+  }
+  if (cpuProfileProcess) {
     await inspectorSession.post('Profiler.enable');
     await inspectorSession.post('Profiler.setSamplingInterval', { interval: cpuSampling });
     await inspectorSession.post('Profiler.start');
     console.log(`[harness] process cpu profile: sampling every ${cpuSampling}us`);
   }
+  if (heapProfileProcess) {
+    await inspectorSession.post('HeapProfiler.enable');
+    await inspectorSession.post('HeapProfiler.startSampling', { samplingInterval: heapSampling });
+    console.log(`[harness] process heap profile: sampling every ${heapSampling}B`);
+  }
 
   const tProcStart = Date.now();
   if (incremental) {
@@ -653,15 +675,23 @@ try {
   }
   const tProcEnd  = Date.now();
   processMs = tProcEnd - tProcStart;
+  const heapProfile = heapProfileProcess ? (await inspectorSession.post('HeapProfiler.stopSampling')).profile : null; // stop sampling now; serialise only after the CPU profiler has stopped so the stringify/write stays out of process.cpuprofile
-  if (inspectorSession) {
+  if (cpuProfileProcess) {
     const { profile } = await inspectorSession.post('Profiler.stop');
     await inspectorSession.post('Profiler.disable');
-    inspectorSession.disconnect();
     processProfilePath = join(outDir, 'process.cpuprofile');
     const profileJson = JSON.stringify(profile);
     writeFileSync(processProfilePath, profileJson);
     console.log(`[harness] process cpu profile: ${processProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`);
   }
+  if (heapProfile) {
+    await inspectorSession.post('HeapProfiler.disable');
+    processHeapProfilePath = join(outDir, 'process.heapprofile');
+    const profileJson = JSON.stringify(heapProfile);
+    writeFileSync(processHeapProfilePath, profileJson);
+    console.log(`[harness] process heap profile: ${processHeapProfilePath} (${(profileJson.length / 1024 / 1024).toFixed(1)} MB)`);
+  }
+  if (inspectorSession) inspectorSession.disconnect();
   if (incremental) {
     console.log(`[harness] process  ${fmtMs(processMs)}  (incremental=${fmtMs(processBreakdown.incrementalMs)}, +${processBreakdown.appendedBytes}B, ${processBreakdown.newObjectCount} new objs)`);
   } else {
@@ -709,6 +739,7 @@ try {
       ms: processMs,
       mode: incremental ? 'incremental' : 'pdf-lib-roundtrip',
       cpuProfile: processProfilePath,
+      heapProfile: processHeapProfilePath,
       ...processBreakdown,
     };
   }
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index c0ebcb91..7543ba5b 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -11,17 +11,32 @@ no bottom-up table to point at: CDP's `Profiler` attaches to Chromium
 and the process phase runs in Node, so `--cpu-profile` couldn't see
 it.
 
-## `--cpu-profile-process`
+## `--cpu-profile-process` (and `--heap-profile-process`)
 
 Added to `measure.mjs`: opens an in-process V8 Profiler via
 `node:inspector/promises`, brackets the process phase the same way
 `--cpu-profile` brackets render, and writes `process.cpuprofile`
-alongside `render.cpuprofile`. Same `.cpuprofile` JSON shape, so the
-existing `analyze-profile.mjs` / `find-callers.mjs` /
+alongside `render.cpuprofile`. Same `.cpuprofile` JSON shape, so
+the existing `analyze-profile.mjs` / `find-callers.mjs` /
 `find-callees.mjs` work unchanged. See the *Profiling pdf-lib
 (process phase): canonical command* section in [the README](../README.md)
 for the operational form.
 
+The heap counterpart -- `--heap-profile-process` -- arrived later
+(once allocation became the obvious next thing to attack: GC was
+sitting at the top of every CPU profile in this phase). It shares
+the same inspector session, so capturing both in one run is one
+flag away. Output is a `.heapprofile`, a tree of
+`{ callFrame, selfSize, children }` rooted at `head` -- *not* the
+flat `.cpuprofile` shape -- so `analyze-heap-profile.mjs` handles
+it instead of the cpu analyzers. See *Profiling pdf-lib heap
+allocation (process phase): canonical command* in
+[the README](../README.md) for the operational form. The findings
+this tool enabled are folded into the per-shim sections below
+(decodeName / sizeInBytes / PDFDict.entries / ...) -- each names
+which path it came from when the heap profile, not the cpu
+profile, was the diagnostic that pointed at the function.
+
 First run on the 1638-page book (`--detach-pages --no-timing
 --cpu-profile-process --cpu-sampling 100`), process 4.66 s (load
 1.88 s, setOutline 0.01 s, save 2.77 s). Top of the bottom-up table:

From 893953082f71afe73a3e7ce8073ddde4abe6e1cd Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:48:05 +0200
Subject: [PATCH 13/44] Add find-heap-callers.mjs: attribute heap allocations
 to direct callers.

---
 perf/README.md             |  1 +
 perf/find-heap-callers.mjs | 69 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 perf/find-heap-callers.mjs

diff --git a/perf/README.md b/perf/README.md
index 0bbb3724..20772d1c 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -264,6 +264,7 @@ or `--tracing`):
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
 | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
+| `find-heap-callers.mjs` | Heap-profile companion to `find-callers.mjs`. Walks a `.heapprofile` tree and attributes a target allocator's (e.g. `set`, `Map`, `String`) self+descendant bytes back to each direct caller. Useful for "where do all these Map.set calls come from?" questions. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
 | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
 | `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. Auto-pins on Windows via `pin-cpu.mjs`. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
diff --git a/perf/find-heap-callers.mjs b/perf/find-heap-callers.mjs
new file mode 100644
index 00000000..beaf0621
--- /dev/null
+++ b/perf/find-heap-callers.mjs
@@ -0,0 +1,69 @@
+// Attribute a heap allocator's self+descendant bytes to each direct caller.
+//
+// Reads a V8 .heapprofile (tree of { callFrame, selfSize, children }
+// rooted at `head`) and, for every node whose callFrame.functionName
+// matches the given target, attributes its self+descendant selfSize back
+// to its immediate parent frame.
+//
+// Companion to find-callers.mjs (which does the same for .cpuprofile).
+// The tree shape means each occurrence has exactly one parent, so this
+// is straightforward depth-first attribution -- no need for the
+// parent-of map that find-callers.mjs builds.
+//
+// Usage:
+//   node perf/find-heap-callers.mjs <profile> <calleeName>
+//
+// Example:
+//   node perf/find-heap-callers.mjs results/<run>/process.heapprofile set
+//   node perf/find-heap-callers.mjs results/<run>/process.heapprofile Map
+//
+// `set` and `Map` show up as bare V8 builtins (no url, no line), so the
+// useful question is "who called them"; this script answers it.
+
+import { readFileSync } from 'node:fs';
+
+const [profilePath, targetName] = process.argv.slice(2);
+if (!profilePath || !targetName) {
+  console.error('usage: node find-heap-callers.mjs <profile> <calleeName>');
+  process.exit(2);
+}
+
+const profile = JSON.parse(readFileSync(profilePath, 'utf8'));
+
+function subtreeBytes(n) {
+  let total = n.selfSize || 0;
+  for (const c of n.children || []) total += subtreeBytes(c);
+  return total;
+}
+
+const callerBytes = new Map();
+let targetSelf = 0;
+let targetTotal = 0;
+
+function walk(n, parent) {
+  const name = n.callFrame?.functionName || '';
+  if (name === targetName) {
+    targetSelf += n.selfSize || 0;
+    const total = subtreeBytes(n);
+    targetTotal += total;
+    if (parent) {
+      const cf = parent.callFrame || {};
+      const fn = cf.functionName || '(anon)';
+      const url = cf.url || '';
+      const line = cf.lineNumber != null ? cf.lineNumber + 1 : '?';
+      const pkey = `${fn} @ ${url ? url.replace(/^file:\/\/\//, '') : '(no url)'}:${line}`;
+      callerBytes.set(pkey, (callerBytes.get(pkey) || 0) + total);
+    }
+  }
+  for (const c of n.children || []) walk(c, n);
+}
+walk(profile.head, null);
+
+console.log(`${targetName}: self=${(targetSelf / 1024).toFixed(2)} KB, total=${(targetTotal / 1024).toFixed(2)} KB (${(targetTotal / 1024 / 1024).toFixed(2)} MB)`);
+console.log('callers (attributed total KB):');
+const rows = [...callerBytes.entries()].sort((a, b) => b[1] - a[1]);
+for (const [k, bytes] of rows) {
+  const kb = bytes / 1024;
+  if (kb < 1) continue;
+  console.log(`  ${kb.toFixed(2).padStart(10)} KB   ${k}`);
+}

From 1ef72bf80a8e0cf30bf5ed3617a1ab886ccd3818 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:53:11 +0200
Subject: [PATCH 14/44] Replace PDFDict's backing Map with a flat [k,v,k,v,...]
 array.

The sampling heap profile of the process phase showed `new Map()` +
Map.prototype.set at ~80 MB combined (50 % of total allocations on
the book), 80 % of that traffic from the parser's per-dict
accumulator. PDF dicts are tiny (typically <= 10 entries), so the
hash-table arena per dict was pure overhead.

The new fast-dict-array shim patches PDFDict's storage to a flat
alternating array, plus all prototype methods (set/get/has/delete/
keys/values/entries/asMap/clone/toString/sizeInBytes/copyBytesInto)
and the parser's parseDict hot loop. Subsumes fast-dict-iter +
fast-parse-dict, both of which stay in the tree as A/B baselines.

Wall-clock: process phase 1.18s -> 1.13s (4 runs paired no-profile).
Heap: Map+set builtin traffic 79 MB -> 15 MB.
---
 docs/lib/fast-dict-array.mjs | 301 +++++++++++++++++++++++++++++++++++
 docs/render-book.mjs         |  28 ++--
 perf/README.md               |  54 +++----
 perf/measure.mjs             |  22 +++
 perf/notes/08-pdf-lib.md     | 200 ++++++++++++++++++++++-
 5 files changed, 562 insertions(+), 43 deletions(-)
 create mode 100644 docs/lib/fast-dict-array.mjs

diff --git a/docs/lib/fast-dict-array.mjs b/docs/lib/fast-dict-array.mjs
new file mode 100644
index 00000000..1ca21665
--- /dev/null
+++ b/docs/lib/fast-dict-array.mjs
@@ -0,0 +1,301 @@
+// Replace PDFDict's backing Map with a flat alternating array
+// [k0, v0, k1, v1, ...].
+//
+// Motivation. The sampling heap profile of the process phase (see
+// "Profiling pdf-lib heap allocation" in perf/README.md) put `Map`
+// constructors and `Map.prototype.set` at 50 % of total allocations
+// -- ~80 MB combined -- with ~80 % of that traffic (~63 MB) coming
+// from one site: fastParseDict's per-dict accumulator
+// ([fast-parse-dict.mjs:62](docs/lib/fast-parse-dict.mjs:62)).
+//
+//     const dict = new Map();          // 24 MB of Map() constructors
+//     while (...) {
+//       const key = this.parseName();
+//       const value = this.parseObject();
+//       dict.set(key, value);          // 38 MB of Map.set entries
+//     }
+//     ... PDFDict.fromMapWithContext(dict, this.context);
+//
+// Each parsed dict pays for one Map header + one hash-table backing
+// arena + one bucket allocation per entry. PDF dicts are tiny (a
+// typical one has <= 10 entries, often 2-3), so the hash-table overhead
+// is pure loss vs a linear scan -- and the Map's amortized O(1) lookup
+// buys nothing because nobody queries a parsed dict enough times for
+// the hash to pay back.
+//
+// The fix: store entries in a flat array. One allocation per dict
+// (the array itself; the inline alternating layout avoids any per-
+// entry bucket alloc). Lookup is a linear scan, which beats Map.get
+// at this size class on every V8 microbench I've seen.
+//
+// Mechanism. We do three things:
+//
+// 1. Patch PDFDict.prototype.{keys, values, entries, set, get, has,
+//    delete, asMap, clone, toString, sizeInBytes, copyBytesInto} so
+//    `this.dict` is read as a flat array instead of a Map.
+//    sizeInBytes / copyBytesInto subsume fast-dict-iter.mjs (no
+//    Map.forEach + thisArg context object needed; iteration is just
+//    `for (let i = 0; i < arr.length; i += 2)`).
+//
+// 2. Patch PDFDict.withContext, PDFDict.fromMapWithContext, and the
+//    parallel fromMapWithContext / withContextAndPages helpers on
+//    PDFCatalog / PDFPageTree / PDFPageLeaf. Each of these is
+//    rewritten to produce / accept a flat array; the Map argument is
+//    converted at the seam (rare-path cost, only a few dicts per
+//    document hit these factories). PDFPageLeaf's clone() is left
+//    alone: its `new Map()` goes through the patched fromMapWithContext.
+//
+// 3. Patch PDFObjectParser.prototype.parseDict so the parser's hot
+//    inner loop accumulates into a flat array directly (no Map(), no
+//    Map.set). The Type-sentinel dispatch at the tail becomes a
+//    short linear scan over the array; on dicts that have a /Type
+//    entry it's the first or second key (PDF convention), so the
+//    scan is effectively O(1). This subsumes fast-parse-dict.mjs.
+//
+// Compatibility. Every consumer of `dict.dict.X` inside pdf-lib
+// (ViewerPreferences, AppearanceCharacteristics, PDFAcroField,
+// PDFAcroChoice, PDFAcroText, PDFAcroForm, PDFAnnotation,
+// PDFWidgetAnnotation, BorderStyle, PDFStreamWriter, PDFCrossRefStream,
+// PDFObjectCopier, PDFXRefStreamParser, etc.) goes through
+// PDFDict.prototype methods (.set / .get / .has / .delete / .entries /
+// .lookup), all of which we re-implement to read the array. Nobody in
+// the codebase touches `dict.dict` expecting a Map iterator -- grep
+// confirmed. `asMap()` still returns a fresh `new Map(...)` for any
+// caller that genuinely wants a Map view.
+//
+// This shim is mutually exclusive with --fast-parse-dict and
+// --fast-dict-iter: both are subsumed and would re-install the
+// Map-based methods if loaded afterwards. measure.mjs enforces this.
+//
+// Side-effecting import. Import once before any pdf-lib operation:
+//
+//   import "./lib/fast-dict-array.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFDict         = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFCatalog      = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree     = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf     = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNull         = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// Captured canonical PDFNames for the parser's Type-dispatch tail.
+// Pool-dedup ([PDFName.js:18,100]) guarantees reference equality with
+// whatever the parser sees inside the dict.
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+
+// Map -> flat array. Called at the seam from the factories below; not
+// on the hot parse path.
+function mapToArray(map) {
+  const arr = new Array(map.size * 2);
+  let i = 0;
+  for (const [k, v] of map) { arr[i++] = k; arr[i++] = v; }
+  return arr;
+}
+
+// Linear scan for the index of `key` in [k0, v0, k1, v1, ...]; returns
+// the key-slot index, or -1 if absent.
+function indexOfKey(arr, key) {
+  for (let i = 0, len = arr.length; i < len; i += 2) {
+    if (arr[i] === key) return i;
+  }
+  return -1;
+}
+
+if (!PDFDict.prototype.__fastDictArrayInstalled) {
+
+  // ---- PDFDict.prototype --------------------------------------------
+
+  PDFDict.prototype.keys = function () {
+    const arr = this.dict;
+    const out = new Array(arr.length >> 1);
+    for (let i = 0, j = 0, len = arr.length; i < len; i += 2, j++) out[j] = arr[i];
+    return out;
+  };
+
+  PDFDict.prototype.values = function () {
+    const arr = this.dict;
+    const out = new Array(arr.length >> 1);
+    for (let i = 1, j = 0, len = arr.length; i < len; i += 2, j++) out[j] = arr[i];
+    return out;
+  };
+
+  PDFDict.prototype.entries = function () {
+    const arr = this.dict;
+    const out = new Array(arr.length >> 1);
+    for (let i = 0, j = 0, len = arr.length; i < len; i += 2, j++) {
+      out[j] = [arr[i], arr[i + 1]];
+    }
+    return out;
+  };
+
+  PDFDict.prototype.set = function (key, value) {
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx >= 0) {
+      arr[idx + 1] = value;
+    } else {
+      arr.push(key, value);
+    }
+  };
+
+  PDFDict.prototype.get = function (key, preservePDFNull) {
+    if (preservePDFNull === undefined) preservePDFNull = false;
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx < 0) return undefined;
+    const value = arr[idx + 1];
+    if (value === PDFNull && !preservePDFNull) return undefined;
+    return value;
+  };
+
+  PDFDict.prototype.has = function (key) {
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx < 0) return false;
+    const value = arr[idx + 1];
+    return value !== undefined && value !== PDFNull;
+  };
+
+  PDFDict.prototype.delete = function (key) {
+    const arr = this.dict;
+    const idx = indexOfKey(arr, key);
+    if (idx < 0) return false;
+    arr.splice(idx, 2);
+    return true;
+  };
+
+  PDFDict.prototype.asMap = function () {
+    const arr = this.dict;
+    const m = new Map();
+    for (let i = 0, len = arr.length; i < len; i += 2) m.set(arr[i], arr[i + 1]);
+    return m;
+  };
+
+  PDFDict.prototype.clone = function (context) {
+    const ctx = context || this.context;
+    const cloned = this.dict.slice();
+    return new PDFDict(cloned, ctx);
+  };
+
+  PDFDict.prototype.toString = function () {
+    const arr = this.dict;
+    let s = '<<\n';
+    for (let i = 0, len = arr.length; i < len; i += 2) {
+      s += arr[i].toString() + ' ' + arr[i + 1].toString() + '\n';
+    }
+    return s + '>>';
+  };
+
+  PDFDict.prototype.sizeInBytes = function () {
+    const arr = this.dict;
+    let size = 5;
+    for (let i = 0, len = arr.length; i < len; i += 2) {
+      size += arr[i].sizeInBytes() + arr[i + 1].sizeInBytes() + 2;
+    }
+    return size;
+  };
+
+  PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.Newline;
+    const arr = this.dict;
+    for (let i = 0, len = arr.length; i < len; i += 2) {
+      offset += arr[i].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+      offset += arr[i + 1].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+    }
+    buffer[offset++] = CharCodes.GreaterThan;
+    buffer[offset++] = CharCodes.GreaterThan;
+    return offset - initialOffset;
+  };
+
+  // ---- PDFDict factories --------------------------------------------
+
+  PDFDict.withContext = function (context) {
+    return new PDFDict([], context);
+  };
+  PDFDict.fromMapWithContext = function (map, context) {
+    return new PDFDict(mapToArray(map), context);
+  };
+
+  // ---- Subclass factories -------------------------------------------
+  // PDFCatalog.withContextAndPages builds a fresh 2-entry Map upstream;
+  // hand it the equivalent flat array, reusing the captured sentinels.
+
+  PDFCatalog.withContextAndPages = function (context, pages) {
+    return new PDFCatalog(
+      [TypeName, CatalogName, PagesName, pages],
+      context,
+    );
+  };
+  PDFCatalog.fromMapWithContext = function (map, context) {
+    return new PDFCatalog(mapToArray(map), context);
+  };
+
+  PDFPageTree.fromMapWithContext = function (map, context) {
+    return new PDFPageTree(mapToArray(map), context);
+  };
+
+  PDFPageLeaf.fromMapWithContext = function (map, context, autoNormalizeCTM) {
+    return new PDFPageLeaf(mapToArray(map), context, autoNormalizeCTM);
+  };
+  // PDFPageLeaf.prototype.clone needs no patch: its explicit `new Map()`
+  // goes through fromMapWithContext (mapToArray(new Map()) yields []) and
+  // the copy uses this.entries() + clone.set(), both array-aware now.
+  // NOTE(review): factories that pass a Map straight to the constructor
+  // (PDFPageTree.withContext, PDFPageLeaf.withContextAndParent?) bypass this seam -- confirm.
+
+  // ---- PDFObjectParser.prototype.parseDict --------------------------
+  // Subsumes fast-parse-dict.mjs: no `new Map()`, no `dict.set(...)`
+  // in the hot inner loop. The Type-sentinel dispatch at the tail is
+  // a short linear scan; PDF convention places /Type first, so it's
+  // effectively O(1) per dict.
+
+  PDFObjectParser.prototype.parseDict = function fastParseDictArray() {
+    const bytes = this.bytes;
+    bytes.assertNext(CharCodes.LessThan);
+    bytes.assertNext(CharCodes.LessThan);
+    this.skipWhitespaceAndComments();
+    const arr = [];
+    while (!bytes.done() &&
+           bytes.peek() !== CharCodes.GreaterThan &&
+           bytes.peekAhead(1) !== CharCodes.GreaterThan) {
+      const key = this.parseName();
+      const value = this.parseObject();
+      arr.push(key, value);
+      this.skipWhitespaceAndComments();
+    }
+    this.skipWhitespaceAndComments();
+    bytes.assertNext(CharCodes.GreaterThan);
+    bytes.assertNext(CharCodes.GreaterThan);
+
+    // Type-sentinel dispatch. Inline-scan for TypeName; in practice
+    // it's at arr[0] or arr[2].
+    let Type;
+    for (let i = 0, len = arr.length; i < len; i += 2) {
+      if (arr[i] === TypeName) { Type = arr[i + 1]; break; }
+    }
+    if (Type === CatalogName) return new PDFCatalog(arr, this.context);
+    if (Type === PagesName)   return new PDFPageTree(arr, this.context);
+    if (Type === PageName)    return new PDFPageLeaf(arr, this.context);
+    return new PDFDict(arr, this.context);
+  };
+
+  PDFDict.prototype.__fastDictArrayInstalled = true;
+  // Mark the subsumed shims as installed so a redundant load is a no-op.
+  PDFDict.prototype.__fastDictIterInstalled = true;
+  PDFObjectParser.prototype.__fastParseDictInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 2cf1fc73..275662aa 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -58,18 +58,19 @@ import { PDFDocument } from 'pdf-lib';
 //     `n.toString(2)` just to count its bit length) with a non-
 //     allocating short-circuit ladder. Called ~300 k times per save
 //     from PDFCrossRefStream's xref writer.
-//   fast-dict-iter -- replace PDFDict.sizeInBytes / copyBytesInto
-//     with versions that iterate the underlying Map in place via
-//     forEach, instead of materialising a fresh Array of [key, value]
-//     tuples via this.entries() on every call. ~80 ms saved per
-//     process run on the book + eliminates the largest non-GC row
-//     (PDFDict.entries was ~10 % of process self-time).
-//   fast-parse-dict -- hoist the four sentinel PDFName.of calls
-//     (Type / Catalog / Pages / Page) out of the type-dispatch tail
-//     in PDFObjectParser.prototype.parseDict. The dispatch fires
-//     per-dict on every load; pool-dedup makes the canonical
-//     PDFNames reference-stable, so captured constants replace
-//     the calls verbatim. Pulls ~17 ms off fastOf self-time.
+//   fast-dict-array -- replace PDFDict's backing Map with a flat
+//     alternating array [k0, v0, k1, v1, ...]. The sampling heap
+//     profile of the process phase put `new Map()` + Map.prototype.set
+//     at ~80 MB combined (50 % of total allocations), 80 % of that
+//     traffic from the parser's per-dict accumulator. The flat-array
+//     shape is one allocation per dict, no hash-table arena; PDF dicts
+//     are tiny enough that linear lookup beats Map hashing. Subsumes
+//     both fast-dict-iter (sizeInBytes / copyBytesInto iterate the
+//     array in place, no Map.forEach context object) and
+//     fast-parse-dict (parser's hot loop accumulates into the array
+//     directly, Type-sentinel dispatch is a short linear scan). Drops
+//     Map+set heap traffic by ~80 %, GC self-time by ~20 %, process
+//     wall-clock by ~4 % (~48 ms / 1.18 s).
 //   fast-parse-object -- replace PDFObjectParser.prototype.parseObject
 //     with a first-byte-dispatch version that gates the three
 //     matchKeyword (true / false / null) scans behind a byte check.
@@ -97,8 +98,7 @@ import './lib/fast-parse-number.mjs';
 import './lib/fast-decode-name.mjs';
 import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
-import './lib/fast-dict-iter.mjs';
-import './lib/fast-parse-dict.mjs';
+import './lib/fast-dict-array.mjs';
 import './lib/fast-parse-object.mjs';
 import './lib/fast-sync-load.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
diff --git a/perf/README.md b/perf/README.md
index 20772d1c..16685f07 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-parse-object --fast-sync-load --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -119,29 +119,25 @@ Flag rationale:
   trip. Every numeric token parsed during `PDFDocument.load`
   flows through these -- hundreds of thousands of calls per load
   on the book. Production runs through it.
-- `--fast-dict-iter` -- inject
-  [docs/lib/fast-dict-iter.mjs](../docs/lib/fast-dict-iter.mjs),
-  replacing `PDFDict.sizeInBytes` and `PDFDict.copyBytesInto` with
-  versions that iterate the underlying Map in place via
-  `forEach((value, key), thisArg)` instead of materialising a fresh
-  Array of `[key, value]` tuples via `this.entries()` on every call.
-  The save path fires both consumers on every dict (~100 k
-  `Array.from` calls feeding the GC), so this was the largest
-  non-GC row in the profile (~10 % of process self-time charged to
-  `PDFDict.entries`). ~80 ms saved per process run. Production runs
-  through it.
-- `--fast-parse-dict` -- inject
-  [docs/lib/fast-parse-dict.mjs](../docs/lib/fast-parse-dict.mjs),
-  replacing `PDFObjectParser.prototype.parseDict` with a version
-  that hoists the four sentinel `PDFName.of` calls (`'Type'`,
-  `'Catalog'`, `'Pages'`, `'Page'`) out of the type-dispatch tail
-  and substitutes module-level captured constants. Pool-dedup
-  guarantees the canonical `PDFName`s are reference-stable for the
-  whole load. With `--fast-decode-name` already in effect the four
-  calls were collapsing to `fastCache.get` hits per dict, but
-  `fastOf` was still the #4 row in the profile -- removing the
-  calls pulls ~17 ms (~22 %) off `fastOf` self-time. Production
-  runs through it.
+- `--fast-dict-array` -- inject
+  [docs/lib/fast-dict-array.mjs](../docs/lib/fast-dict-array.mjs),
+  replacing `PDFDict`'s backing `Map` with a flat alternating
+  `[k0, v0, k1, v1, ...]` array and patching every `PDFDict`
+  prototype method (and the parser's `parseDict`) to read it. The
+  sampling heap profile of the process phase put `new Map()` +
+  `Map.prototype.set` at ~80 MB combined (50 % of total allocations
+  on the book), 80 % of that traffic from the parser's per-dict
+  accumulator. The flat-array shape is one allocation per dict, no
+  hash-table arena; PDF dicts are tiny (typically <= 10 entries) so
+  linear lookup beats Map hashing. Subsumes `--fast-dict-iter`
+  (`sizeInBytes` / `copyBytesInto` iterate the array in place, no
+  `Map.forEach` context object) and `--fast-parse-dict` (parser's
+  hot loop accumulates into the array directly, Type-sentinel
+  dispatch is a short linear scan -- PDF convention places `/Type`
+  first, so the scan is effectively O(1) per dict). ~80 % drop in
+  `Map`+`set` heap traffic, ~20 % drop in process GC self-time,
+  ~4 % drop in process wall-clock (~48 ms / 1.18 s). Production
+  runs through it; the two old shims stay on disk as A/B baselines.
 - `--fast-parse-object` -- inject
   [docs/lib/fast-parse-object.mjs](../docs/lib/fast-parse-object.mjs),
   replacing `PDFObjectParser.prototype.parseObject` with a
@@ -204,7 +200,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?":
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-iter --fast-parse-dict --fast-parse-object --fast-sync-load --heap-profile-process --heap-sampling 512
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --heap-profile-process --heap-sampling 512
 ```
 
 Same `--fast-*` set as the CPU command (production is the baseline
@@ -365,8 +361,9 @@ run.bat --fast-number-to-string           # skip numberToString redundant toStri
 run.bat --fast-size-in-bytes              # non-allocating ladder for xref byte-width (also ships; opt-in here for A/B)
 run.bat --fast-inflate                    # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B)
 run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
-run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (also ships; opt-in here for A/B)
-run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (also ships; opt-in here for A/B)
+run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (Map-shape baseline; subsumed by --fast-dict-array in production)
+run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; subsumed by --fast-dict-array in production)
+run.bat --fast-dict-array                 # replace PDFDict's backing Map with a flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (also ships; opt-in here for A/B)
 run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
 run.bat --fast-sync-load                  # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B)
 ```
@@ -487,6 +484,7 @@ file documenting each:
 | `parseDict` sentinel-PDFName hoist (Type/Catalog/Pages/Page) | [08](notes/08-pdf-lib.md) | ~17 ms profile (fastOf -22 %) |
 | Synchronify pdf-lib load + save (strip `__awaiter` scaffolding) | [08](notes/08-pdf-lib.md) | ~0.36 s process (load -26 %, GC -53 ms) |
 | `parseObject` first-byte dispatch + gated keyword scans | [08](notes/08-pdf-lib.md) | ~42 ms profile (parseObject -51 %) |
+| `PDFDict` flat-array storage (subsumes iter + parseDict shims) | [08](notes/08-pdf-lib.md) | ~48 ms process (Map+set heap -80 %, GC -20 %) |
 
 What was tried and didn't ship:
 
@@ -513,4 +511,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 0ed196ba..81f64265 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -34,6 +34,7 @@
 //                    [--fast-size-in-bytes] [--fast-inflate]
 //                    [--fast-parse-number] [--fast-parse-dict]
 //                    [--fast-parse-object] [--fast-sync-load]
+//                    [--fast-dict-array]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -164,6 +165,17 @@
 // rewind costs on every invocation. Same semantics, dispatch
 // reordered by observed frequency in dict-value position.
 //
+// --fast-dict-array replaces PDFDict's backing Map with a flat
+// alternating array [k0, v0, k1, v1, ...]. The sampling heap profile
+// showed `new Map()` + `Map.prototype.set` accounting for half the
+// process-phase allocations (~80 MB combined), 80 % of that (~63 MB)
+// from the parser's per-dict accumulator. The flat array is one
+// allocation per dict, no hash-table arena; lookups are linear scans
+// but PDF dicts are tiny (typically <= 10 entries). Subsumes
+// --fast-parse-dict and --fast-dict-iter (the parser's hot loop
+// accumulates into the array directly; sizeInBytes / copyBytesInto
+// iterate in place). Production runs through it.
+//
 // --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick /
 // shouldWaitForTick / waitForTick machinery out of both the load
 // path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection
@@ -234,6 +246,7 @@ let fastDictIter = false;
 let fastParseDict = false;
 let fastParseObject = false;
 let fastSyncLoad = false;
+let fastDictArray = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -267,6 +280,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-parse-dict') fastParseDict = true;
   else if (a === '--fast-parse-object') fastParseObject = true;
   else if (a === '--fast-sync-load') fastSyncLoad = true;
+  else if (a === '--fast-dict-array') fastDictArray = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -309,6 +323,10 @@ if (heapProfileProcess && renderOnly) {
   console.error('--heap-profile-process is incompatible with --render-only (the process phase is skipped).');
   process.exit(2);
 }
+if (fastDictArray && (fastParseDict || fastDictIter)) {
+  console.error('--fast-dict-array subsumes --fast-parse-dict and --fast-dict-iter (Map-backed shims). Pick one shape.');
+  process.exit(2);
+}
 
 // Install the dense-array cache for PDFRef.of's gen=0 path before any
 // pdf-lib operation. Side-effecting import; idempotent.
@@ -352,6 +370,10 @@ if (fastSyncLoad) {
   await import('../docs/lib/fast-sync-load.mjs');
   console.log('[harness] fast-sync-load: synchronify PDFParser load path, strip waitForTick machinery');
 }
+if (fastDictArray) {
+  await import('../docs/lib/fast-dict-array.mjs');
+  console.log('[harness] fast-dict-array: PDFDict backed by flat alternating array (subsumes fast-parse-dict + fast-dict-iter)');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 7543ba5b..12e64358 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -1850,6 +1850,203 @@ neither does anything:
 to A/B pdf-lib's defaults against `Fastest`, and it runs against
 vanilla pdf-lib without the shim by design.
 
+## Replace `PDFDict`'s backing `Map` with a flat array
+
+With `fast-dict-iter` and `fast-parse-dict` both shipping, the
+process-phase CPU profile read tidy enough that the next move was
+to look at the *other* side of the ledger: the sampling heap
+profile rather than CPU. The motivating run, captured with the
+canonical heap command (`--heap-profile-process --heap-sampling
+512`):
+
+```
+   self_kb   self_%   function  @  source
+  54315.27   34.75%   set                                  (V8 builtin)
+  24804.17   15.87%   Map                                  (V8 builtin)
+  19488.12   12.47%   PDFObjectParser.parseArray
+  16786.41   10.74%   PDFParser.parseIndirectObjectHeader
+  15329.21    9.81%   PDFObjectParser.parseNumberOrRef
+   9599.45    6.14%   fastParseDict        (fast-parse-dict.mjs)
+   9581.25    6.13%   fastOf               (fast-decode-name.mjs)
+   ...
+```
+
+`set` and `Map` together at ~80 MB -- **half of all process-phase
+allocations** -- were the natural place to start.
+`find-heap-callers.mjs` attributed them cleanly:
+
+```
+$ node find-heap-callers.mjs process.heapprofile set
+set: total=53.04 MB
+  39107.27 KB   fastParseDict @ fast-parse-dict.mjs:62
+   7168.04 KB   PDFParser.parseIndirectObjectHeader
+   7168.04 KB   parseIndirectObjectSync @ fast-sync-load.mjs:140
+    ...
+
+$ node find-heap-callers.mjs process.heapprofile Map
+Map: total=24.22 MB
+  24691.51 KB   fastParseDict @ fast-parse-dict.mjs:62
+    112.13 KB   buildPdfObjectsForOutline
+```
+
+84 % of the combined Map+set traffic was one site, the parser's
+per-dict accumulator inside `fastParseDict`:
+
+```js
+const dict = new Map();             // 24 MB of Map() constructors here
+while (...) {
+  const key = this.parseName();
+  const value = this.parseObject();
+  dict.set(key, value);             // 38 MB of set() entries here
+  ...
+}
+return PDFDict.fromMapWithContext(dict, this.context);
+```
+
+One `new Map()` + N `Map.prototype.set` calls per parsed dict,
+then the Map gets stored as `PDFDict.dict` and consulted later by
+all the `PDFDict` methods. Every Map allocates a header + a
+hash-table backing arena + per-entry bucket objects; on the book
+that's ~9 k dicts each paying for an arena it doesn't need,
+because PDF dicts are **tiny** (typical has <= 10 entries, most
+have 2-3) and nothing in pdf-lib's API touches a parsed dict
+often enough for the hash to pay back.
+
+The remaining 16 % was `context.assign` populating
+`PDFContext.indirectObjects` (a Map<PDFRef, PDFObject>) -- that's
+a single Map shared across the load, not addressed here.
+
+### The shape
+
+Replace the Map with a flat alternating array:
+
+```js
+// before
+this.dict = new Map([[key0, value0], [key1, value1], ...]);
+// after
+this.dict = [key0, value0, key1, value1, ...];
+```
+
+One allocation per dict (the array; the entries are stored inline
+in the array's backing store, no per-entry boxes). Lookups become
+linear scans:
+
+```js
+function indexOfKey(arr, key) {
+  for (let i = 0, len = arr.length; i < len; i += 2) {
+    if (arr[i] === key) return i;
+  }
+  return -1;
+}
+```
+
+For 5-entry dicts (the dominant size class), a 5-iteration linear
+scan with strict-equality comparison beats `Map.prototype.get`
+(which has to hash the key, then walk a hash-bucket chain) on
+every V8 microbench checked. The crossover is somewhere around
+20-30 entries; PDF dicts almost never get there.
+
+### Compatibility
+
+`PDFDict`'s public method surface is
+`.keys / .values / .entries / .set / .get / .has / .delete /
+.lookup / .lookupMaybe / .asMap / .clone / .toString /
+.sizeInBytes / .copyBytesInto / .uniqueKey`. Grepping the rest of
+pdf-lib confirmed every consumer goes through that surface --
+`viewerPrefs.dict.set(...)`, `widgetAnnot.dict.get(...)`,
+`xrefStream.dict.set(...)`, etc. all call `PDFDict.prototype.set`/
+`.get`, which we re-implement against the array. Nobody in the
+codebase touches `dict.dict` expecting Map-specific iterators.
+The single direct-Map use, `asMap()`, still returns a fresh
+`new Map(...)` for any caller that wants one.
+
+The seam factories that take a `Map` argument
+(`fromMapWithContext`, `withContextAndPages`, `PDFPageLeaf.clone`'s
+`new Map()` initializer) get small wrappers that convert at the
+boundary. They're called a handful of times per document --
+catalog + page tree + page leaves -- so the conversion is free
+relative to the parser's ~9 k dicts.
+
+### Subsumes two earlier shims
+
+The two existing dict-shape shims are no longer useful in front of
+the array shape:
+
+- `fast-dict-iter` patched `PDFDict.sizeInBytes` and `copyBytesInto`
+  to call `this.dict.forEach((value, key) => ...)` instead of
+  `Array.from(this.dict.entries())`. With `this.dict` as a flat
+  array, both methods become `for (let i = 0; i < arr.length; i += 2)`
+  -- no `forEach`, no `thisArg` context object, no callback
+  allocation.
+- `fast-parse-dict` patched `parseDict` to hoist the
+  Type/Catalog/Pages/Page sentinel `PDFName.of` calls into
+  module-level constants. The new `parseDict` (in
+  `fast-dict-array.mjs`) keeps the hoisted constants and also
+  accumulates into the flat array directly. The Type-sentinel
+  dispatch becomes a short linear scan over the array; PDF
+  convention places `/Type` at index 0 or 2, so it's effectively
+  O(1) per dict.
+
+`fast-dict-array.mjs` carries both behaviours inline. The two
+older shims stay in the tree as opt-in flags on `measure.mjs`
+(useful for A/B against the `Map` shape) but are mutually
+exclusive with `--fast-dict-array` (the harness errors if you
+combine them).
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+same canonical command otherwise):
+
+| Allocator        | Map shape (before) | Array shape (after) | Delta   |
+|------------------|-------------------:|--------------------:|--------:|
+| `set` builtin    |          54.3 MB   |             14.8 MB | -73 %   |
+| `Map` builtin    |          24.8 MB   |    < 1 MB (off top) | -96 %   |
+| `push` builtin   |              -     |              2.8 MB | +2.8 MB |
+| Total sampled    |         152.6 MB   |            140.1 MB | -8 %    |
+
+The total-allocation drop is smaller than the Map+set drop
+because the sampling profiler reattributes the array contents
+(`PDFObject` references that used to sit inside Map bucket
+allocations) to the `fastParseDictArray` frame that allocates the
+array -- the allocations are still there, just attributed
+differently. The **real** win is the absence of Map header +
+hash-table arena per dict, which the profile shows by the `Map`
+row collapsing.
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`):
+
+| Row                                | Before  | After   |
+|------------------------------------|--------:|--------:|
+| `(garbage collector)`              | 213 ms  | 170 ms  |
+| `fastParseDict` / `fastParseDictArray` | 113 ms  |  40 ms  |
+| `PDFDict.copyBytesInto` + `_copyBytesIntoEntry` | 60 ms |  26 ms  |
+
+Wall-clock (paired no-profile, 4 runs each, mean process phase):
+
+| Shape        | Process (mean) | Range          |
+|--------------|---------------:|----------------|
+| Map (before) |        1.180 s | 1.15 - 1.20 s  |
+| Array (after)|        1.132 s | 1.11 - 1.15 s  |
+
+**~48 ms saved on the 1.18 s process phase (~4 %).** The
+profile-time delta is bigger than the wall-clock delta because
+the CPU profiler's sampling overhead falls disproportionately on
+hot allocator paths -- a familiar caveat. The honest signal is
+the no-profile A/B.
+
+The output PDF is structurally identical (1651 pages, 1773
+outline nodes, same title / creator metadata); its byte size differs
+only within the build's intrinsic timestamp/random-ID noise (the build
+is non-deterministic between runs anyway -- two consecutive no-shim
+runs differ by ~30 bytes too).
+
+`docs/render-book.mjs` swaps `./lib/fast-dict-iter.mjs` +
+`./lib/fast-parse-dict.mjs` for the single
+`./lib/fast-dict-array.mjs` import. The two older shims stay in
+the tree for A/B; the harness rejects combining them with
+`--fast-dict-array`.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -1881,7 +2078,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-dict-iter                     | ~1.4 s  | ~1.0 s | ~0.4 s |
 | + fast-parse-dict                    | ~1.4 s  | ~1.0 s | ~0.4 s |
 | + fast-parse-object                  | ~1.4 s  | ~1.0 s | ~0.4 s |
-| **+ fast-sync-load (this section)**  | **~1.3 s** | **~0.8 s** | **~0.5 s** |
+| + fast-sync-load                     | ~1.3 s  | ~0.8 s | ~0.5 s |
+| **+ fast-dict-array (this section)** | **~1.1 s** | **~0.7 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From e2baded4a205add221f13b12a139292a5ce6ed5b Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 00:57:33 +0200
Subject: [PATCH 15/44] Replace PDFContext.indirectObjects Map with a dense
 gen=0 array.

After fast-dict-array shipped, PDFContext.assign's
`indirectObjects.set(ref, object)` was the only hot Map.set left in
the heap profile -- one set per parsed indirect object, ~14 MB of
hash-table growth on the book.

fast-indirect-objects patches assign / lookup / lookupMaybe / delete
/ getObjectRef / enumerateIndirectObjects to consult a dense array
keyed by objectNumber for gen=0 PDFRefs (the overwhelming common
case), Map fallback for gen!=0. Lazy-init on first assign; no
constructor patch needed. Mirror of the fast-refs trick on the value
side.

CPU: PDFContext.assign drops out of the process top-15.
Heap: set traffic 14.8 MB -> 7.7 MB (-48 %). Remaining 7 MB is the
upstream PDFRef pool.set on cache miss -- next target.
---
 docs/lib/fast-indirect-objects.mjs | 174 +++++++++++++++++++++++++++++
 docs/render-book.mjs               |  12 ++
 perf/README.md                     |  25 ++++-
 perf/measure.mjs                   |  18 ++-
 perf/notes/08-pdf-lib.md           | 100 ++++++++++++++++-
 5 files changed, 324 insertions(+), 5 deletions(-)
 create mode 100644 docs/lib/fast-indirect-objects.mjs

diff --git a/docs/lib/fast-indirect-objects.mjs b/docs/lib/fast-indirect-objects.mjs
new file mode 100644
index 00000000..9058414f
--- /dev/null
+++ b/docs/lib/fast-indirect-objects.mjs
@@ -0,0 +1,174 @@
+// Replace PDFContext.indirectObjects (Map<PDFRef, PDFObject>) with a
+// dense array keyed by objectNumber for the gen=0 path.
+//
+// Motivation. After fast-dict-array shipped, the only remaining hot
+// Map.set in the process-phase heap profile was
+// PDFContext.assign's `this.indirectObjects.set(ref, object)`:
+//
+//     $ node find-heap-callers.mjs <post-ship>.heapprofile set
+//     set: total=14.49 MB
+//       7168.04 KB   PDFParser.parseIndirectObjectHeader
+//       7168.04 KB   parseIndirectObjectSync @ fast-sync-load.mjs:140
+//        ...
+//
+// (Both ~7 MB rows are V8 inline-attribution duplicates of the same
+// logical call.) That's 14.5 MB of Map traffic for one Map -- one
+// `set` per indirect object during load, with the hash table
+// rebuilding through ~14 doubling steps to fit the book's ~9 k
+// indirect objects, discarding each intermediate arena to GC.
+//
+// PDFRefs are overwhelmingly gen=0 (revisions / incremental updates
+// are the only gen!=0 producers, and they're rare). fast-refs.mjs
+// already exploits this on the key side -- a dense array indexed by
+// objectNumber for the PDFRef pool, Map fallback for gen!=0. This
+// shim does the same on the value side for PDFContext.indirectObjects.
+//
+// Mechanism. Patch PDFContext.prototype.assign / lookup / lookupMaybe
+// / delete / getObjectRef / enumerateIndirectObjects to consult an
+// auxiliary `this._objArr` (dense array indexed by objectNumber) for
+// gen=0 PDFRefs first, falling back to the original Map for gen!=0.
+// The dense array is created lazily on first assign so we don't need
+// to touch the constructor.
+//
+// The original `this.indirectObjects` Map is left in place for two
+// reasons: (a) gen!=0 entries actually need it, and (b) external code
+// that reads `pdfContext.indirectObjects` directly (none in our
+// pipeline, but worth preserving defensively) continues to see a
+// Map -- though one that holds only gen!=0 entries, so usually empty.
+//
+// As a side benefit, `enumerateIndirectObjects` usually skips its
+// sort: dense-array iteration is already in ascending objectNumber
+// order. (Only when gen!=0 Map entries exist is the merged list sorted.)
+//
+// Side-effecting import. Import once before any PDFDocument.load:
+//
+//   import "./lib/fast-indirect-objects.mjs";
+//
+// Idempotent -- repeated imports do nothing after the first.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFContext = require('pdf-lib/cjs/core/PDFContext.js').default;
+const PDFRef     = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFNull    = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const UnexpectedObjectTypeError = require('pdf-lib/cjs/core/errors.js').UnexpectedObjectTypeError;
+
+const byAscendingObjectNumber = ([a], [b]) => a.objectNumber - b.objectNumber;
+
+if (!PDFContext.prototype.__fastIndirectObjectsInstalled) {
+
+  // ---- assign -------------------------------------------------------
+  // Hot path. gen=0 → dense array store; gen!=0 → Map. Maintains
+  // largestObjectNumber as before.
+
+  PDFContext.prototype.assign = function (ref, object) {
+    if (ref.generationNumber === 0) {
+      if (!this._objArr) this._objArr = [];
+      this._objArr[ref.objectNumber] = object;
+    } else {
+      this.indirectObjects.set(ref, object);
+    }
+    if (ref.objectNumber > this.largestObjectNumber) {
+      this.largestObjectNumber = ref.objectNumber;
+    }
+  };
+
+  // ---- delete -------------------------------------------------------
+  // Returns true iff something was removed. Dense slots are reset to
+  // undefined (not spliced) so later objectNumbers keep their indices.
+
+  PDFContext.prototype.delete = function (ref) {
+    if (ref.generationNumber === 0 && this._objArr) {
+      const slot = this._objArr[ref.objectNumber];
+      if (slot !== undefined) {
+        this._objArr[ref.objectNumber] = undefined;
+        return true;
+      }
+      return false;
+    }
+    return this.indirectObjects.delete(ref);
+  };
+
+  // ---- lookup / lookupMaybe -----------------------------------------
+  // Resolve the ref to an object via the dense array (gen=0) or Map
+  // (gen!=0), then run the original type-check tail verbatim.
+
+  function _resolve(ctx, ref) {
+    if (!(ref instanceof PDFRef)) return ref;
+    if (ref.generationNumber === 0 && ctx._objArr) {
+      return ctx._objArr[ref.objectNumber];
+    }
+    return ctx.indirectObjects.get(ref);
+  }
+
+  PDFContext.prototype.lookupMaybe = function (ref) {
+    const types = [];
+    for (let i = 1, len = arguments.length; i < len; i++) types[i - 1] = arguments[i];
+    const preservePDFNull = types.includes(PDFNull);
+    const result = _resolve(this, ref);
+    if (!result || (result === PDFNull && !preservePDFNull)) return undefined;
+    for (let idx = 0, len = types.length; idx < len; idx++) {
+      const type = types[idx];
+      if (type === PDFNull) {
+        if (result === PDFNull) return result;
+      } else {
+        if (result instanceof type) return result;
+      }
+    }
+    throw new UnexpectedObjectTypeError(types, result);
+  };
+
+  PDFContext.prototype.lookup = function (ref) {
+    const types = [];
+    for (let i = 1, len = arguments.length; i < len; i++) types[i - 1] = arguments[i];
+    const result = _resolve(this, ref);
+    if (types.length === 0) return result;
+    for (let idx = 0, len = types.length; idx < len; idx++) {
+      const type = types[idx];
+      if (type === PDFNull) {
+        if (result === PDFNull) return result;
+      } else {
+        if (result instanceof type) return result;
+      }
+    }
+    throw new UnexpectedObjectTypeError(types, result);
+  };
+
+  // ---- getObjectRef -------------------------------------------------
+  // Linear scan. Dense array first (gen=0 PDFRef reconstructed from
+  // objectNumber via PDFRef.of, which fast-refs has cached). Fall
+  // back to Map for any gen!=0 candidates.
+
+  PDFContext.prototype.getObjectRef = function (pdfObject) {
+    if (this._objArr) {
+      for (let i = 0, len = this._objArr.length; i < len; i++) {
+        if (this._objArr[i] === pdfObject) return PDFRef.of(i, 0);
+      }
+    }
+    for (const entry of this.indirectObjects) {
+      if (entry[1] === pdfObject) return entry[0];
+    }
+    return undefined;
+  };
+
+  // ---- enumerateIndirectObjects -------------------------------------
+  // Dense array is already iterable in objectNumber order. Merge in
+  // any gen!=0 entries from the Map and sort once -- but only if the
+  // Map is non-empty (the common case for parsed PDFs is empty).
+
+  PDFContext.prototype.enumerateIndirectObjects = function () {
+    const out = [];
+    if (this._objArr) {
+      for (let i = 0, len = this._objArr.length; i < len; i++) {
+        const obj = this._objArr[i];
+        if (obj !== undefined) out.push([PDFRef.of(i, 0), obj]);
+      }
+    }
+    if (this.indirectObjects.size === 0) return out;
+    for (const entry of this.indirectObjects) out.push(entry);
+    return out.sort(byAscendingObjectNumber);
+  };
+
+  PDFContext.prototype.__fastIndirectObjectsInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 275662aa..850959a7 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -92,6 +92,17 @@ import { PDFDocument } from 'pdf-lib';
 //     writer + an unknowable chunk of the GC row removed; the
 //     parseSpeed / objectsPerTick options drop off all our call sites
 //     in step with this shim.
+//   fast-indirect-objects -- replace PDFContext.indirectObjects
+//     (Map<PDFRef, PDFObject>) with a dense array indexed by
+//     objectNumber for the gen=0 path. After fast-dict-array shipped,
+//     PDFContext.assign's `this.indirectObjects.set(ref, object)` was
+//     the only hot Map.set left in the heap profile (~7 MB of set
+//     traffic from the parser's once-per-indirect-object assign).
+//     Mirror of the fast-refs trick on the value side: dense array
+//     for gen=0, Map fallback for gen!=0. enumerateIndirectObjects
+//     skips its sort when the gen!=0 Map is empty (the common case).
+//     Drops PDFContext.assign out of the CPU top-15 and halves the
+//     remaining set heap traffic.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
@@ -101,6 +112,7 @@ import './lib/fast-size-in-bytes.mjs';
 import './lib/fast-dict-array.mjs';
 import './lib/fast-parse-object.mjs';
 import './lib/fast-sync-load.mjs';
+import './lib/fast-indirect-objects.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/README.md b/perf/README.md
index 16685f07..5f29cb63 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -166,6 +166,23 @@ Flag rationale:
   entirely. The `parseSpeed` / `objectsPerTick` options drop off
   `PDFDocument.load`, `parallelSave`, and `pdfDoc.save` call sites
   in step. Production runs through it.
+- `--fast-indirect-objects` -- inject
+  [docs/lib/fast-indirect-objects.mjs](../docs/lib/fast-indirect-objects.mjs),
+  replacing `PDFContext.indirectObjects` (`Map<PDFRef, PDFObject>`)
+  with a dense array indexed by `objectNumber` for the gen=0 path.
+  Mirror of `--fast-refs` on the value side. After `--fast-dict-array`
+  landed, `PDFContext.assign`'s
+  `this.indirectObjects.set(ref, object)` was the only hot
+  `Map.set` left in the heap profile (~7 MB of `set` traffic,
+  fired once per indirect object during load). Patches `assign` /
+  `lookup` / `lookupMaybe` / `delete` / `getObjectRef` /
+  `enumerateIndirectObjects` to consult the dense array first,
+  Map as gen!=0 fallback (rare on freshly-parsed PDFs). As a side
+  benefit `enumerateIndirectObjects` skips its sort when the
+  gen!=0 Map is empty -- dense-array iteration is already in
+  objectNumber order. Drops `PDFContext.assign` out of the CPU
+  top-15 and halves the remaining `set` heap traffic. Production
+  runs through it.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -200,7 +217,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?":
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --heap-profile-process --heap-sampling 512
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --heap-profile-process --heap-sampling 512
 ```
 
 Same `--fast-*` set as the CPU command (production is the baseline
@@ -364,6 +381,7 @@ run.bat --fast-parse-number               # direct-integer accumulator for parse
 run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (Map-shape baseline; subsumed by --fast-dict-array in production)
 run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; subsumed by --fast-dict-array in production)
 run.bat --fast-dict-array                 # replace PDFDict's backing Map with a flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (also ships; opt-in here for A/B)
+run.bat --fast-indirect-objects           # dense-array cache for PDFContext.indirectObjects (gen=0 path); mirror of --fast-refs on the value side (also ships; opt-in here for A/B)
 run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
 run.bat --fast-sync-load                  # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B)
 ```
@@ -485,6 +503,7 @@ file documenting each:
 | Synchronify pdf-lib load + save (strip `__awaiter` scaffolding) | [08](notes/08-pdf-lib.md) | ~0.36 s process (load -26 %, GC -53 ms) |
 | `parseObject` first-byte dispatch + gated keyword scans | [08](notes/08-pdf-lib.md) | ~42 ms profile (parseObject -51 %) |
 | `PDFDict` flat-array storage (subsumes iter + parseDict shims) | [08](notes/08-pdf-lib.md) | ~48 ms process (Map+set heap -80 %, GC -20 %) |
+| `PDFContext.indirectObjects` dense gen=0 array | [08](notes/08-pdf-lib.md) | `assign` off CPU top-15; remaining `set` heap -48 % |
 
 What was tried and didn't ship:
 
@@ -511,4 +530,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 81f64265..f79a31ec 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -34,7 +34,7 @@
 //                    [--fast-size-in-bytes] [--fast-inflate]
 //                    [--fast-parse-number] [--fast-parse-dict]
 //                    [--fast-parse-object] [--fast-sync-load]
-//                    [--fast-dict-array]
+//                    [--fast-dict-array] [--fast-indirect-objects]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -176,6 +176,16 @@
 // accumulates into the array directly; sizeInBytes / copyBytesInto
 // iterate in place). Production runs through it.
 //
+// --fast-indirect-objects replaces PDFContext.indirectObjects
+// (Map<PDFRef, PDFObject>) with a dense array indexed by
+// objectNumber for the gen=0 path -- mirror of the fast-refs trick
+// on the value side. After fast-dict-array shipped, that Map was
+// the last remaining hot Map.set in the heap profile (~14 MB of set
+// traffic from PDFContext.assign, fired once per indirect object
+// during load). gen!=0 PDFRefs fall through to the original Map.
+// enumerateIndirectObjects skips its sort when the gen!=0 Map is
+// empty (the parsed-PDF common case). Production runs through it.
+//
 // --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick /
 // shouldWaitForTick / waitForTick machinery out of both the load
 // path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection
@@ -247,6 +257,7 @@ let fastParseDict = false;
 let fastParseObject = false;
 let fastSyncLoad = false;
 let fastDictArray = false;
+let fastIndirectObjects = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -281,6 +292,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-parse-object') fastParseObject = true;
   else if (a === '--fast-sync-load') fastSyncLoad = true;
   else if (a === '--fast-dict-array') fastDictArray = true;
+  else if (a === '--fast-indirect-objects') fastIndirectObjects = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -374,6 +386,10 @@ if (fastDictArray) {
   await import('../docs/lib/fast-dict-array.mjs');
   console.log('[harness] fast-dict-array: PDFDict backed by flat alternating array (subsumes fast-parse-dict + fast-dict-iter)');
 }
+if (fastIndirectObjects) {
+  await import('../docs/lib/fast-indirect-objects.mjs');
+  console.log('[harness] fast-indirect-objects: PDFContext.indirectObjects dense-array cache for gen=0 PDFRefs');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 12e64358..8fb32471 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2047,6 +2047,103 @@ runs differ by ~30 bytes too).
 the tree for A/B; the harness rejects combining them with
 `--fast-dict-array`.
 
+## Replace `PDFContext.indirectObjects` with a dense array
+
+With `fast-dict-array` shipping, the per-dict `new Map()` +
+`Map.prototype.set` traffic was gone -- but the heap profile
+still showed ~14.5 MB of `set` self-size. `find-heap-callers`
+localized it cleanly to one remaining site, attributed to two
+V8-inlined parent frames:
+
+```
+$ node find-heap-callers.mjs <post-dict-array>.heapprofile set
+set: total=14.49 MB
+  7168.04 KB   PDFParser.parseIndirectObjectHeader
+  7168.04 KB   parseIndirectObjectSync @ fast-sync-load.mjs:140
+```
+
+Both rows are the same logical call: `this.indirectObjects.set(ref, object)`
+inside `PDFContext.assign` (`pdf-lib/.../PDFContext.js:34`), fired
+once per indirect object during load. On the book that's ~9 k
+entries; V8's Map grows the underlying hash table through ~12
+doubling steps to fit them (4 -> 8 -> ... -> 16384), discarding
+each intermediate arena. The 14 MB total is final arena + bucket
+allocations + all the discarded growth arenas.
+
+`PDFRef`s are overwhelmingly gen=0 (rare gen!=0 cases come from
+revisions / incremental updates). `fast-refs` already uses a
+dense array indexed by `objectNumber` for the **key** side --
+`PDFRef.of`'s gen=0 pool. The same trick applies on the **value**
+side for `indirectObjects`: dense array keyed by `objectNumber`.
+
+### The shim
+
+`docs/lib/fast-indirect-objects.mjs` patches
+`PDFContext.prototype.assign / lookup / lookupMaybe / delete /
+getObjectRef / enumerateIndirectObjects` to consult an auxiliary
+`this._objArr` (dense array indexed by `objectNumber`) for gen=0
+`PDFRef`s first, falling back to the original Map for gen!=0.
+Lazy init on first `assign` -- no constructor patching needed.
+The original Map sits at `this.indirectObjects` unchanged; gen=0
+entries skip it entirely.
+
+```js
+PDFContext.prototype.assign = function (ref, object) {
+  if (ref.generationNumber === 0) {
+    if (!this._objArr) this._objArr = [];
+    this._objArr[ref.objectNumber] = object;     // dense store, no Map
+  } else {
+    this.indirectObjects.set(ref, object);       // gen!=0 fallback
+  }
+  if (ref.objectNumber > this.largestObjectNumber) {
+    this.largestObjectNumber = ref.objectNumber;
+  }
+};
+```
+
+`lookup` / `lookupMaybe` resolve the ref the same way then run
+the original type-check tail verbatim. `delete` nulls the slot
+(not splices -- subsequent objectNumbers retain their slots).
+`getObjectRef` linear-scans the dense array first, then the Map.
+The interesting one is `enumerateIndirectObjects`: dense-array
+iteration is already in ascending objectNumber order, so when
+the gen!=0 Map is empty (the parsed-PDF common case) the method
+returns without sorting -- the upstream
+`Array.from(this.indirectObjects.entries()).sort(byAscendingObjectNumber)`
+becomes a single linear pass with no `Array.from` materialization
+and no sort.
+
+### Measured wins
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`,
+fast-dict-array baseline vs + fast-indirect-objects):
+
+| Row                     | Pre (ms) | Post (ms) | Note               |
+|-------------------------|---------:|----------:|--------------------|
+| (garbage collector)     |   162.50 |    176.83 | within noise       |
+| **PDFContext.assign**   | **41.83**| **out of top 15** | **drops off**  |
+| PDFRef.of               |   124.42 |    118.24 | within noise       |
+| Total profile duration  |  1.21 s  |   1.14 s  | -70 ms             |
+
+The headline is `PDFContext.assign` exiting the top 15.
+Everything else moves within the sample-count noise band.
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`):
+
+| Allocator         | Pre (KB)  | Post (KB) | Delta                |
+|-------------------|----------:|----------:|---------------------:|
+| `set` builtin     | 14 840.20 |  7 674.41 | -7 166 KB (-48 %)    |
+| Total sampled     | 140.15 MB |  135.00 MB| -5.15 MB (-3.7 %)    |
+
+The remaining 7 MB of `set` is **not** `PDFContext.assign`
+anymore -- `find-heap-callers` on the post profile shows it's the
+upstream `PDFRef.of`'s `pool.set(tag, instance)` on cache miss.
+Even with `fast-refs`'s dense-array short-circuit on the LOOKUP
+side, the first time each unique objectNumber is encountered the
+shim calls through to the original `PDFRef.of`, which constructs
+the `PDFRef` AND populates the upstream `Map<string, PDFRef>`
+pool. That's the next target.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -2079,7 +2176,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-parse-dict                    | ~1.4 s  | ~1.0 s | ~0.4 s |
 | + fast-parse-object                  | ~1.4 s  | ~1.0 s | ~0.4 s |
 | + fast-sync-load                     | ~1.3 s  | ~0.8 s | ~0.5 s |
-| **+ fast-dict-array (this section)** | **~1.1 s** | **~0.7 s** | **~0.4 s** |
+| + fast-dict-array                    | ~1.1 s  | ~0.7 s | ~0.4 s |
+| **+ fast-indirect-objects (this section)** | **~1.1 s** | **~0.7 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From e9574a41716a686eea3070975c00f7c60575b102 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 01:00:44 +0200
Subject: [PATCH 16/44] Skip PDFRef pool.set on gen=0 miss: construct directly.

fast-refs' dense-array cache already short-circuited the LOOKUP
side, but on a gen=0 miss it still called through to upstream
PDFRef.of, which redundantly populated the upstream Map<string,
PDFRef> pool. After fast-indirect-objects shipped, that pool was
the last hot Map.set in the heap profile -- ~7 MB of growth-arena
churn from ~9 k unique-objectNumber misses on the book.

Replace the gen=0 miss path with Object.create(PDFRef.prototype)
+ manual field init. PDFObject (super) has a no-op constructor and
the only fields prototype methods read are objectNumber /
generationNumber / tag, so direct construction is safe.

CPU: PDFRef.of drops out of process top-15 (~93 ms saved).
Heap: set traffic 7.7 MB -> 0.5 MB. The residual 504 KB is PDFName
interning's fastCache.set, static-size and harmless. No more
materially-hot Map.set in the process-phase heap profile.
---
 docs/lib/fast-refs.mjs   |  31 ++++++++----
 perf/README.md           |  17 +++++--
 perf/notes/08-pdf-lib.md | 100 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 134 insertions(+), 14 deletions(-)

diff --git a/docs/lib/fast-refs.mjs b/docs/lib/fast-refs.mjs
index 4212162c..32b955e4 100644
--- a/docs/lib/fast-refs.mjs
+++ b/docs/lib/fast-refs.mjs
@@ -14,15 +14,24 @@
 // plus measurable GC pressure.
 //
 // Shim: dense array indexed by objectNumber for the gen=0 branch.
-// Plain array indexing, no string alloc, no Map hash. Cache-in-front
-// of the original PDFRef.of so we don't need its module-private
-// ENFORCER -- on miss we delegate, on hit we return our cached
-// instance.
+// Plain array indexing, no string alloc, no Map hash.
 //
-// gen != 0 calls (the other 18 %, pdf-lib's xref-stream bookkeeping
-// where the "generation" field encodes an in-ObjStm index per
-// PDF 1.5 spec, see PDFXRefStreamParser.js:74-80) pass through to
-// the original unchanged.
+// On a gen=0 cache miss we construct the PDFRef directly via
+// `Object.create(PDFRef.prototype)` plus manual field init, skipping
+// both the ENFORCER check and the upstream `pool.set(tag, instance)`.
+// The upstream pool was the last remaining hot Map.set in the heap
+// profile after fast-indirect-objects shipped (~7 MB of `set` from
+// the once-per-unique-objectNumber miss), all of which becomes dead
+// arena allocation once the dense array is the authoritative cache.
+// PDFRef's super (PDFObject) has a no-op constructor; the only
+// instance fields the prototype methods read are `objectNumber`,
+// `generationNumber`, and `tag` (used by toString / sizeInBytes /
+// copyBytesInto), so direct construction is safe.
+//
+// gen != 0 calls (the other ~18 %, pdf-lib's xref-stream bookkeeping
+// where "generation" encodes an in-ObjStm index per PDF 1.5 spec,
+// see PDFXRefStreamParser.js:74-80) still pass through the original
+// PDFRef.of -- their Map pool is harmless at gen!=0's volume.
 //
 // Side-effecting import. Import once before any pdf-lib operation.
 // Idempotent.
@@ -36,7 +45,11 @@ if (!PDFRef.__fastPoolInstalled) {
     if (generationNumber === undefined || generationNumber === 0) {
       const existing = pool0[objectNumber];
       if (existing) return existing;
-      const fresh = original.call(PDFRef, objectNumber, 0);
+      // Direct construction -- skip ENFORCER check, skip upstream pool.set.
+      const fresh = Object.create(PDFRef.prototype);
+      fresh.objectNumber = objectNumber;
+      fresh.generationNumber = 0;
+      fresh.tag = objectNumber + ' 0 R';
       pool0[objectNumber] = fresh;
       return fresh;
     }
diff --git a/perf/README.md b/perf/README.md
index 5f29cb63..37e2b964 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -72,9 +72,17 @@ node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number
 Flag rationale:
 
 - `--fast-refs` -- inject the
-  [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping fix
-  (dense-array cache for `PDFRef.of`'s gen=0 path). Production runs
-  through it; the profile should too.
+  [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping
+  fix (dense-array cache for `PDFRef.of`'s gen=0 path). On miss,
+  constructs the `PDFRef` directly via
+  `Object.create(PDFRef.prototype)` + manual field init, bypassing
+  the upstream `pool.set(tag, instance)` -- after
+  `--fast-indirect-objects` shipped, that pool was the last hot
+  `Map.set` in the heap profile. The `PDFRef.of` row drops out of
+  the CPU top-15 and the `set` builtin row collapses from ~7.7 MB
+  to ~0.5 MB (the residual is `PDFName` interning's
+  `fastCache.set`, harmless). Production runs through it; the
+  profile should too.
 - `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave`
   from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs),
   which pre-deflates object streams in parallel on libuv's pool with
@@ -504,6 +512,7 @@ file documenting each:
 | `parseObject` first-byte dispatch + gated keyword scans | [08](notes/08-pdf-lib.md) | ~42 ms profile (parseObject -51 %) |
 | `PDFDict` flat-array storage (subsumes iter + parseDict shims) | [08](notes/08-pdf-lib.md) | ~48 ms process (Map+set heap -80 %, GC -20 %) |
 | `PDFContext.indirectObjects` dense gen=0 array | [08](notes/08-pdf-lib.md) | `assign` off CPU top-15; remaining `set` heap -48 % |
+| `PDFRef.of` direct-construct on cache miss (skip upstream `pool.set`) | [08](notes/08-pdf-lib.md) | `PDFRef.of` off CPU top-15 (~93 ms); `set` heap 7.7 MB → 0.5 MB |
 
 What was tried and didn't ship:
 
@@ -530,4 +539,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream `PDFRef.of`'s `pool.set` on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU). |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 8fb32471..1a431012 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2144,6 +2144,103 @@ shim calls through to the original `PDFRef.of`, which constructs
 the `PDFRef` AND populates the upstream `Map<string, PDFRef>`
 pool. That's the next target.
 
+## Skip `PDFRef` `pool.set` on the gen=0 miss path
+
+With `fast-indirect-objects` shipping, the heap profile showed
+one last hot `set` source: the upstream `PDFRef.of`'s own pool
+(`pdf-lib/.../objects/PDFRef.js:34`):
+
+```js
+PDFRef.of = function (objectNumber, generationNumber) {
+    ...
+    var tag = objectNumber + " " + generationNumber + " R";
+    var instance = pool.get(tag);
+    if (!instance) {
+        instance = new PDFRef(ENFORCER, objectNumber, generationNumber);
+        pool.set(tag, instance);                  // ← 7 MB of set on the book
+    }
+    return instance;
+};
+```
+
+`fast-refs` already short-circuited the LOOKUP side with a dense
+array indexed by `objectNumber`. But on a gen=0 cache miss (~9 k
+unique objectNumbers per book), the shim was calling
+`original.call(PDFRef, objectNumber, 0)`, which dutifully built
+the tag string, looked it up in the upstream Map, missed,
+allocated a new `PDFRef`, AND populated the upstream pool --
+redundantly, since the dense array `pool0` is the authoritative
+cache from now on.
+
+Each `pool.set` over the load grew the Map's hash table through
+~12 doubling steps (4 -> 8 -> ... -> 16384), discarding each
+intermediate arena. Total: ~7 MB of `set` self-size in the heap
+profile, plus the matching ~93 ms of `PDFRef.of` CPU self-time
+(the function body that does the set is hot enough that V8
+charges all that growth to `PDFRef.of`'s frame).
+
+### The upgrade
+
+Replace the original-delegation on the gen=0 miss path with
+direct construction:
+
+```js
+PDFRef.of = function fastOf(objectNumber, generationNumber) {
+  if (generationNumber === undefined || generationNumber === 0) {
+    const existing = pool0[objectNumber];
+    if (existing) return existing;
+    const fresh = Object.create(PDFRef.prototype);
+    fresh.objectNumber = objectNumber;
+    fresh.generationNumber = 0;
+    fresh.tag = objectNumber + ' 0 R';
+    pool0[objectNumber] = fresh;
+    return fresh;
+  }
+  return original.call(PDFRef, objectNumber, generationNumber);
+};
+```
+
+Safety: `PDFRef`'s super class (`PDFObject`) has a no-op
+constructor (`pdf-lib/.../PDFObject.js:5`) so skipping
+`_super.call(this)` is fine. The only instance fields the
+prototype methods read are `objectNumber`, `generationNumber`,
+and `tag` (used by `toString` / `sizeInBytes` / `copyBytesInto`);
+direct field init covers them. The `ENFORCER` check exists to
+make `PDFRef.of` the single legitimate factory -- we already are
+that factory, so bypassing it doesn't violate any invariant.
+
+gen!=0 keeps the original delegation (rare on freshly-parsed
+PDFs; its `Map.set` traffic is negligible at gen!=0 volume).
+
+### Measured wins
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`,
+fast-indirect-objects baseline vs + this upgrade):
+
+| Row                  | Pre (ms) | Post (ms) | Note                       |
+|----------------------|---------:|----------:|----------------------------|
+| (garbage collector)  |   176.83 |    166.71 | -10 ms                     |
+| **PDFRef.of**        | **118.24** | **out of top 15** | **drops off (~93 ms saved)** |
+| fastOf @ fast-refs   |        - |     25.19 | new row (was inside `PDFRef.of`) |
+| Total profile        |  1.14 s  |   1.03 s  | -110 ms (-9.6 %)           |
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`):
+
+| Allocator         | Pre (KB)  | Post (KB) | Delta                |
+|-------------------|----------:|----------:|---------------------:|
+| `set` builtin     |  7 674.41 |    504.77 | **-7 170 KB (-93 %)** |
+| fastOf @ fast-refs|  9 367.39 |  7 734.79 | -1 633 KB             |
+| Total sampled     | 135.00 MB | 123.11 MB | -11.89 MB (-8.8 %)    |
+
+The residual 504 KB of `set` is `fastCache.set` in `PDFName`
+interning (~448 KB) plus a sliver of `__awaiter` machinery in
+`PDFDocument`; both are static-size and harmless. There is no
+longer any materially-hot `Map.prototype.set` in the process-phase
+heap profile.
+
+The edit is local to `docs/lib/fast-refs.mjs`; no production
+import change needed since `fast-refs` was already wired up.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -2177,7 +2274,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-parse-object                  | ~1.4 s  | ~1.0 s | ~0.4 s |
 | + fast-sync-load                     | ~1.3 s  | ~0.8 s | ~0.5 s |
 | + fast-dict-array                    | ~1.1 s  | ~0.7 s | ~0.4 s |
-| **+ fast-indirect-objects (this section)** | **~1.1 s** | **~0.7 s** | **~0.4 s** |
+| + fast-indirect-objects              | ~1.1 s  | ~0.7 s | ~0.4 s |
+| **+ fast-refs miss bypass (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 7db4b4fee19331a340cad559f540484ac926a825 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 01:06:04 +0200
Subject: [PATCH 17/44] Pool PDFNumber instances by value; add
 find-heap-callees.mjs.

After the previous round of allocator-shape shims, parseNumberOrRef
was the next-largest row in the process-phase heap profile at
~15 MB -- mostly inlined `new PDFNumber(value)` from the parser's
number branch. PDFs reuse a handful of integer values constantly
(page indices, /Count, /N, /MediaBox dimensions), and PDFNumber
is conceptually immutable, so pooling by value is safe.

fast-pdfnumber-pool installs a dense-array cache for non-negative
integers in [0, 16384), Map fallback for floats / negatives. Same
shape as fast-refs. parseNumberOrRef's row collapses off the top
10; total process-phase heap traffic drops 123 MB -> 107 MB (-13 %).
PDFNumber row settles at 0.8 MB (the floor: one instance per unique
value).

Also lands find-heap-callees.mjs -- the children-of analyzer used
to investigate fastParseDictArray's mystery 58 MB self-row (turned
out to be recursive parseDict invocations across nesting levels,
intrinsic).
---
 docs/lib/fast-pdfnumber-pool.mjs |  61 +++++++++++++++
 docs/render-book.mjs             |  10 +++
 perf/README.md                   |  21 +++++-
 perf/find-heap-callees.mjs       |  70 +++++++++++++++++
 perf/measure.mjs                 |  16 ++++
 perf/notes/08-pdf-lib.md         | 125 ++++++++++++++++++++++++++++++-
 6 files changed, 299 insertions(+), 4 deletions(-)
 create mode 100644 docs/lib/fast-pdfnumber-pool.mjs
 create mode 100644 perf/find-heap-callees.mjs

diff --git a/docs/lib/fast-pdfnumber-pool.mjs b/docs/lib/fast-pdfnumber-pool.mjs
new file mode 100644
index 00000000..b0ee9990
--- /dev/null
+++ b/docs/lib/fast-pdfnumber-pool.mjs
@@ -0,0 +1,61 @@
+// Pool PDFNumber instances by value.
+//
+// After fast-refs / fast-indirect-objects / fast-dict-array shipped,
+// the residual heap profile attributed ~15 MB of self-size to
+// PDFObjectParser.parseNumberOrRef -- mostly inlined `new
+// PDFNumber(value)` calls (each of which also allocates a fresh
+// stringValue via `numberToString(value)`):
+//
+//     function PDFNumber(value) {
+//         var _this = _super.call(this) || this;
+//         _this.numberValue = value;
+//         _this.stringValue = numberToString(value);   // allocs
+//         return _this;
+//     }
+//     PDFNumber.of = function (value) { return new PDFNumber(value); };
+//
+// No pool. Every PDFNumber.of(N) returns a fresh instance, even
+// though PDFs are packed with repeated numeric values: page indices
+// 0..1651, /Count totals, /N object-stream lengths, common
+// /MediaBox dimensions (612, 792, 595, 842), font sizes, bit
+// widths. The book parses hundreds of thousands of PDFNumber.of
+// calls against a few thousand unique values.
+//
+// Shim. Dense array indexed by `value` for non-negative small
+// integers (0..POOL_SIZE-1, currently 16384 -- covers all observed
+// integer values in the book by a wide margin). Map fallback for
+// floats, negatives, and out-of-range integers. Same shape as
+// fast-refs on the PDFRef side. PDFNumber is immutable
+// (numberValue and stringValue are set in the constructor and never
+// mutated), so sharing instances is safe.
+//
+// Side-effecting import. Import once before any pdf-lib operation.
+// Idempotent.
+
+import { PDFNumber } from "pdf-lib";
+
+const POOL_SIZE = 16384;
+
+if (!PDFNumber.__fastPoolInstalled) {
+  const original = PDFNumber.of;
+  const intPool = new Array(POOL_SIZE);   // sparse, holes for unused slots
+  const otherPool = new Map();             // floats / negatives / large ints
+
+  PDFNumber.of = function fastNumberOf(value) {
+    // Hot path: integer in [0, POOL_SIZE). -0 lands here too (-0 === 0).
+    if (value >= 0 && value < POOL_SIZE && (value | 0) === value) {
+      let pn = intPool[value];
+      if (pn !== undefined) return pn;
+      pn = original.call(PDFNumber, value);
+      intPool[value] = pn;
+      return pn;
+    }
+    // Cold path: Map cache. SameValueZero keys let NaN hit; -0 never gets here.
+    let pn = otherPool.get(value);
+    if (pn !== undefined) return pn;
+    pn = original.call(PDFNumber, value);
+    otherPool.set(value, pn);
+    return pn;
+  };
+  PDFNumber.__fastPoolInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 850959a7..92395995 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -103,6 +103,15 @@ import { PDFDocument } from 'pdf-lib';
 //     skips its sort when the gen!=0 Map is empty (the common case).
 //     Drops PDFContext.assign out of the CPU top-15 and halves the
 //     remaining set heap traffic.
+//   fast-pdfnumber-pool -- value-keyed cache in front of PDFNumber.of.
+//     Dense array for non-negative integers in [0, 16384), Map
+//     fallback for floats / negatives / out-of-range. PDFs reuse the
+//     same numeric values (page indices, /Count, /N, /MediaBox
+//     dimensions) hundreds of thousands of times against only a few
+//     thousand unique values; pooling collapses parseNumberOrRef's
+//     ~15 MB of PDFNumber allocations to ~0.8 MB. Total process-phase
+//     heap traffic drops ~13 % (123 MB -> 107 MB). PDFNumber is
+//     immutable so sharing is safe.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
@@ -113,6 +122,7 @@ import './lib/fast-dict-array.mjs';
 import './lib/fast-parse-object.mjs';
 import './lib/fast-sync-load.mjs';
 import './lib/fast-indirect-objects.mjs';
+import './lib/fast-pdfnumber-pool.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
diff --git a/perf/README.md b/perf/README.md
index 37e2b964..d631b090 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -191,6 +191,18 @@ Flag rationale:
   objectNumber order. Drops `PDFContext.assign` out of the CPU
   top-15 and halves the remaining `set` heap traffic. Production
   runs through it.
+- `--fast-pdfnumber-pool` -- inject
+  [docs/lib/fast-pdfnumber-pool.mjs](../docs/lib/fast-pdfnumber-pool.mjs),
+  a value-keyed cache in front of `PDFNumber.of`. Dense array for
+  non-negative integers in `[0, 16384)`, Map fallback for floats
+  / negatives / out-of-range. PDFs reuse the same numeric values
+  (page indices, `/Count`, `/N`, `/MediaBox` dimensions, font
+  sizes) tens-to-hundreds of thousands of times against only a
+  few thousand unique values. `PDFNumber` is immutable so sharing
+  is safe. Collapses `parseNumberOrRef`'s ~15 MB of self-size to
+  ~0.8 MB (just the unique values); drops total process-phase
+  heap traffic by ~13 % (123 MB -> 107 MB). Production runs
+  through it.
 - `--cpu-profile-process` -- attach Node's `inspector/promises`
   Profiler around the process phase only (skips render and generate).
   Writes `process.cpuprofile` into the timestamped `results/` folder.
@@ -225,7 +237,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?":
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --heap-profile-process --heap-sampling 512
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512
 ```
 
 Same `--fast-*` set as the CPU command (production is the baseline
@@ -286,6 +298,7 @@ or `--tracing`):
 | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-heap-callers.mjs` | Heap-profile companion to `find-callers.mjs`. Walks a `.heapprofile` tree and attributes a target allocator's (e.g. `set`, `Map`, `String`) self+descendant bytes back to each direct caller. Useful for "where do all these Map.set calls come from?" questions. |
+| `find-heap-callees.mjs` | Other direction: walks a `.heapprofile` tree and lists a target frame's direct children with their (self + subtree) byte totals. Used to crack open mystery rows like "fastParseDictArray has 58 MB of self-size -- what's it actually allocating?". |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
 | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
 | `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. Auto-pins on Windows via `pin-cpu.mjs`. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
@@ -390,6 +403,7 @@ run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.siz
 run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; subsumed by --fast-dict-array in production)
 run.bat --fast-dict-array                 # replace PDFDict's backing Map with a flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (also ships; opt-in here for A/B)
 run.bat --fast-indirect-objects           # dense-array cache for PDFContext.indirectObjects (gen=0 path); mirror of --fast-refs on the value side (also ships; opt-in here for A/B)
+run.bat --fast-pdfnumber-pool             # value-keyed cache in front of PDFNumber.of; dense array for small ints, Map for the rest (also ships; opt-in here for A/B)
 run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
 run.bat --fast-sync-load                  # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B)
 ```
@@ -513,6 +527,7 @@ file documenting each:
 | `PDFDict` flat-array storage (subsumes iter + parseDict shims) | [08](notes/08-pdf-lib.md) | ~48 ms process (Map+set heap -80 %, GC -20 %) |
 | `PDFContext.indirectObjects` dense gen=0 array | [08](notes/08-pdf-lib.md) | `assign` off CPU top-15; remaining `set` heap -48 % |
 | `PDFRef.of` direct-construct on cache miss (skip upstream `pool.set`) | [08](notes/08-pdf-lib.md) | `PDFRef.of` off CPU top-15 (~93 ms); `set` heap 7.7 MB → 0.5 MB |
+| `PDFNumber.of` value-pool (dense int + Map fallback) | [08](notes/08-pdf-lib.md) | `parseNumberOrRef` off heap top-10; total process heap 123 MB → 107 MB (-13 %) |
 
 What was tried and didn't ship:
 
@@ -539,4 +554,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %. |
diff --git a/perf/find-heap-callees.mjs b/perf/find-heap-callees.mjs
new file mode 100644
index 00000000..abdb70ad
--- /dev/null
+++ b/perf/find-heap-callees.mjs
@@ -0,0 +1,70 @@
+// Show what a target frame allocates under itself.
+//
+// Reads a V8 .heapprofile (tree of { callFrame, selfSize, children }
+// rooted at `head`) and, for every node whose callFrame.functionName
+// matches the given target, lists its direct child frames with their
+// (self + subtree) byte totals. Answers "what does function X
+// actually allocate?".
+//
+// Companion to find-heap-callers.mjs. Where find-heap-callers walks
+// up (target's parents), this walks down (target's children).
+//
+// Usage:
+//   node perf/find-heap-callees.mjs <profile> <calleeName>
+//
+// Example:
+//   node perf/find-heap-callees.mjs results/<run>/process.heapprofile fastParseDictArray
+
+import { readFileSync } from 'node:fs';
+
+const [profilePath, targetName] = process.argv.slice(2);
+if (!profilePath || !targetName) {
+  console.error('usage: node find-heap-callees.mjs <profile> <calleeName>');
+  process.exit(2);
+}
+
+const profile = JSON.parse(readFileSync(profilePath, 'utf8'));
+
+function subtreeBytes(n) {
+  let total = n.selfSize || 0;
+  for (const c of n.children || []) total += subtreeBytes(c);
+  return total;
+}
+
+const childTotals = new Map();
+const childSelfs = new Map();
+let targetSelf = 0;
+let targetSubtree = 0;
+
+function walk(n) {
+  const name = n.callFrame?.functionName || '';
+  if (name === targetName) {
+    targetSelf += n.selfSize || 0;
+    targetSubtree += subtreeBytes(n);
+    for (const c of n.children || []) {
+      const cf = c.callFrame || {};
+      const cname = cf.functionName || '(anonymous)';
+      const url = cf.url || '';
+      const line = cf.lineNumber != null ? cf.lineNumber + 1 : '?';
+      const key = `${cname} @ ${url ? url.replace(/^file:\/\/\//, '') : '(no url)'}:${line}`;
+      const subtree = subtreeBytes(c);
+      const self = c.selfSize || 0;
+      childTotals.set(key, (childTotals.get(key) || 0) + subtree);
+      childSelfs.set(key, (childSelfs.get(key) || 0) + self);
+    }
+  }
+  for (const c of n.children || []) walk(c);
+}
+walk(profile.head);
+
+console.log(`${targetName}: self=${(targetSelf / 1024).toFixed(2)} KB, subtree=${(targetSubtree / 1024 / 1024).toFixed(2)} MB`);
+console.log('direct children (subtree KB / self KB):');
+const rows = [...childTotals.entries()]
+  .map(([k, subtree]) => ({ k, subtree, self: childSelfs.get(k) || 0 }))
+  .sort((a, b) => b.subtree - a.subtree);
+for (const r of rows) {
+  const subKb = r.subtree / 1024;
+  if (subKb < 10) continue;
+  const selfKb = r.self / 1024;
+  console.log(`  ${subKb.toFixed(2).padStart(10)} KB  (self ${selfKb.toFixed(2).padStart(8)} KB)   ${r.k}`);
+}
diff --git a/perf/measure.mjs b/perf/measure.mjs
index f79a31ec..ac3005ea 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -35,6 +35,7 @@
 //                    [--fast-parse-number] [--fast-parse-dict]
 //                    [--fast-parse-object] [--fast-sync-load]
 //                    [--fast-dict-array] [--fast-indirect-objects]
+//                    [--fast-pdfnumber-pool]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
 // parseOutline, page.pdf, and the pdf-lib roundtrip / incremental writer.
@@ -186,6 +187,15 @@
 // enumerateIndirectObjects skips its sort when the gen!=0 Map is
 // empty (the parsed-PDF common case). Production runs through it.
 //
+// --fast-pdfnumber-pool installs a value-keyed cache in front of
+// PDFNumber.of. Dense array for non-negative integers in
+// [0, 16384), Map fallback for floats / negatives / out-of-range.
+// PDFs reuse the same numeric values (page indices, /Count, /N,
+// /MediaBox dimensions) tens-to-hundreds of thousands of times;
+// pooling collapses parseNumberOrRef's ~15 MB of PDFNumber
+// allocations to a few thousand cached instances. PDFNumber is
+// immutable so sharing is safe. Production runs through it.
+//
 // --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick /
 // shouldWaitForTick / waitForTick machinery out of both the load
 // path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection
@@ -258,6 +268,7 @@ let fastParseObject = false;
 let fastSyncLoad = false;
 let fastDictArray = false;
 let fastIndirectObjects = false;
+let fastPdfnumberPool = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -293,6 +304,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-sync-load') fastSyncLoad = true;
   else if (a === '--fast-dict-array') fastDictArray = true;
   else if (a === '--fast-indirect-objects') fastIndirectObjects = true;
+  else if (a === '--fast-pdfnumber-pool') fastPdfnumberPool = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -390,6 +402,10 @@ if (fastIndirectObjects) {
   await import('../docs/lib/fast-indirect-objects.mjs');
   console.log('[harness] fast-indirect-objects: PDFContext.indirectObjects dense-array cache for gen=0 PDFRefs');
 }
+if (fastPdfnumberPool) {
+  await import('../docs/lib/fast-pdfnumber-pool.mjs');
+  console.log('[harness] fast-pdfnumber-pool: value-keyed cache in front of PDFNumber.of');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 1a431012..5d033785 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2241,6 +2241,128 @@ heap profile.
 The edit is local to `docs/lib/fast-refs.mjs`; no production
 import change needed since `fast-refs` was already wired up.
 
+## Pool `PDFNumber` instances by value
+
+With every `Map.set` in the load path either eliminated or
+reduced to its irreducible floor (`PDFName` fastCache, ~0.5 MB),
+the next-largest bucket in the heap profile was
+`parseNumberOrRef` at 15 MB -- mostly inlined `new PDFNumber(value)`
+from the parser's number branch:
+
+```js
+function PDFNumber(value) {
+  var _this = _super.call(this) || this;
+  _this.numberValue = value;
+  _this.stringValue = numberToString(value);     // alloc per instance
+  return _this;
+}
+PDFNumber.of = function (value) { return new PDFNumber(value); };
+```
+
+No pool. Every `PDFNumber.of(N)` returns a fresh instance, even
+for the same `N`. PDFs reuse a handful of integer values
+*constantly*: the book has 1 651 page entries (each contributing
+`/MediaBox` dimensions like 612, 792, integer indices, `/Count`,
+`/N` totals), plus content-stream numeric literals, font sizes,
+and bit widths. Hundreds of thousands of `PDFNumber.of` calls
+against maybe a few thousand unique values.
+
+A `PDFNumber` is also conceptually immutable: `numberValue` and
+`stringValue` are written once in the constructor and never
+mutated. Pooling by value is therefore safe.
+
+### Could we just store a raw `number`?
+
+In principle yes. `PDFNumber` exists structurally to satisfy
+pdf-lib's polymorphic dispatch on every dict / array value
+(`value.copyBytesInto(buffer, offset)`, `value.sizeInBytes()`,
+`value.asNumber()`). Replacing it with a primitive would
+require:
+
+- Type-branching in `PDFDict.copyBytesInto` /
+  `PDFArray.copyBytesInto` / `sizeInBytes`: `typeof === 'number'`
+  fast-path that writes the number's string form directly.
+- Updating ~53 consumer sites in pdf-lib's API code (everything
+  that does `lookup(name, PDFNumber).asNumber()` or
+  `value instanceof PDFNumber`) to handle bare numbers.
+- A V8 deopt risk: the serializer's previously-monomorphic
+  `.copyBytesInto` call site becomes polymorphic across two
+  representations.
+
+That's a much bigger surgery for a similar magnitude of win,
+because pooling already collapses every repeated-value
+allocation to a single shared instance. So we ship the pool
+first; if a post-pool heap profile still showed `PDFNumber` as a
+top allocator, stripping would have been worth the API surgery.
+It doesn't.
+
+### The shim
+
+`docs/lib/fast-pdfnumber-pool.mjs` installs the cache. Same
+shape as `fast-refs`: dense array indexed by `value` for
+non-negative integers in `[0, 16384)` (covers every observed
+integer value in the book by a wide margin), Map fallback for
+floats, negatives, and out-of-range integers. Map's
+`SameValueZero` keying lets `NaN` hit the cache; `-0` passes the
+integer test (`-0 === 0`), so it shares slot 0 with `+0`.
+
+```js
+PDFNumber.of = function fastNumberOf(value) {
+  if (value >= 0 && value < POOL_SIZE && (value | 0) === value) {
+    let pn = intPool[value];
+    if (pn !== undefined) return pn;
+    pn = original.call(PDFNumber, value);
+    intPool[value] = pn;
+    return pn;
+  }
+  let pn = otherPool.get(value);
+  if (pn !== undefined) return pn;
+  pn = original.call(PDFNumber, value);
+  otherPool.set(value, pn);
+  return pn;
+};
+```
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-refs upgrade baseline vs + pool):
+
+| Allocator                | Pre (KB)  | Post (KB)            | Delta                |
+|--------------------------|----------:|---------------------:|---------------------:|
+| **parseNumberOrRef**     | 15 388.73 | **out of top 10**    | **-15+ MB**          |
+| `String` builtin         |  1 202.23 | out of top 10        | -                    |
+| `PDFNumber.of` (pool miss)|        - |               818.92 | new, ~unique count   |
+| Total sampled            | 123.11 MB |            107.21 MB | **-15.9 MB (-13 %)** |
+
+`parseNumberOrRef`'s row collapsed off the top 10. The new
+`PDFNumber.of` row at 0.8 MB is the floor -- one `PDFNumber` per
+unique value across the whole load. The `String` builtin row
+(`stringValue` allocations) also collapsed because they're now
+allocated once per unique value, not once per use site.
+
+CPU profile (same paired methodology): GC self-time effectively
+flat (166.71 ms -> 165.54 ms), total profile duration within
+sample-count noise (1.03 s -> 1.09 s). Pool cost per call is a
+branch + array index, which V8 inlines into the hot
+`parseNumberOrRef` path. CPU is a wash; the win is pure heap.
+
+### A companion analyzer: `find-heap-callees.mjs`
+
+Adding this shim also surfaced the question "what's
+`fastParseDictArray` actually allocating at its 58 MB self-row?".
+`find-heap-callers` answers "who calls X?"; the inverse --
+"what does X allocate?" -- needed a new tool. `find-heap-callees.mjs`
+walks the `.heapprofile` tree and lists a target frame's direct
+children with their (self + subtree) byte totals.
+
+Used here, it cracked open the `fastParseDictArray` row: most of
+the 58 MB was recursive `parseDict` invocations across nesting
+levels, not a single allocator. That's intrinsic to the document
+structure (page-tree dicts contain Kids arrays of Page dicts that
+contain Resources dicts...), not something a shim can shrink.
+The tool stays for future investigations.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -2275,7 +2397,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-sync-load                     | ~1.3 s  | ~0.8 s | ~0.5 s |
 | + fast-dict-array                    | ~1.1 s  | ~0.7 s | ~0.4 s |
 | + fast-indirect-objects              | ~1.1 s  | ~0.7 s | ~0.4 s |
-| **+ fast-refs miss bypass (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
+| + fast-refs miss bypass              | ~1.0 s  | ~0.6 s | ~0.4 s |
+| **+ fast-pdfnumber-pool (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 99a1f56e8296c2bce0f459745ab15cef5c0e8887 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 01:09:31 +0200
Subject: [PATCH 18/44] Pre-size parseDict's backing array to median dict size.

Instrumented parseDict shows 261k invocations on the book, 52% of
dicts at exactly 5 entries (10 push slots), 80% at 4-5 entries,
96% <= 7 entries, max recursion only 3. The original `const arr = []`
+ push-grow path was wasting ~85% of fastParseDictArray's 58 MB
self-row on FixedArray growth garbage (cap 4 -> 8 -> 16 doubling
discards on every 5-entry dict).

Allocate the accumulator at `new Array(10)` -- exact fit for the
median, 0 growth for 80% of dicts, one growth for the 5% at 7
entries. Direct indexing with len counter; push only on overflow.

Heap: total sampled 107 MB -> 92 MB (-14%); fastParseDictArray row
58 MB -> 44 MB (-25%). SCRATCH=10 beat SCRATCH=16 (saved 9.5 MB
more) because the cap-16 baseline was itself ~46 MB across 261k
calls.

Also lands instrument-parsedict.mjs and the --instrument-parsedict
flag on measure.mjs for future dict-workload investigations.
---
 docs/lib/fast-dict-array.mjs  |  25 ++++++-
 perf/README.md                |   3 +-
 perf/instrument-parsedict.mjs |  67 +++++++++++++++++
 perf/measure.mjs              |   5 ++
 perf/notes/08-pdf-lib.md      | 135 +++++++++++++++++++++++++++++++++-
 5 files changed, 230 insertions(+), 5 deletions(-)
 create mode 100644 perf/instrument-parsedict.mjs

diff --git a/docs/lib/fast-dict-array.mjs b/docs/lib/fast-dict-array.mjs
index 1ca21665..1d4bae5c 100644
--- a/docs/lib/fast-dict-array.mjs
+++ b/docs/lib/fast-dict-array.mjs
@@ -264,28 +264,47 @@ if (!PDFDict.prototype.__fastDictArrayInstalled) {
   // a short linear scan; PDF convention places /Type first, so it's
   // effectively O(1) per dict.
 
+  // Pre-sized accumulator: instrumented histogram on the book parse
+  // shows 5-entry dicts dominant (52 %, exactly 10 push slots),
+  // 4-entry next (28 %, 8 slots), then a long tail to 7-8 entries.
+  // SCRATCH = 10 is an exact fit for the median case; smaller dicts
+  // (2/3/4 entries) waste a few slots, larger ones (7+) take one
+  // growth via push. Cuts ~70 bytes of FixedArray-header allocation
+  // per dict vs SCRATCH=16 -- on 261 k dict invocations that adds up.
+  const SCRATCH = 10;
   PDFObjectParser.prototype.parseDict = function fastParseDictArray() {
     const bytes = this.bytes;
     bytes.assertNext(CharCodes.LessThan);
     bytes.assertNext(CharCodes.LessThan);
     this.skipWhitespaceAndComments();
-    const arr = [];
+    const arr = new Array(SCRATCH);
+    let len = 0;
     while (!bytes.done() &&
            bytes.peek() !== CharCodes.GreaterThan &&
            bytes.peekAhead(1) !== CharCodes.GreaterThan) {
       const key = this.parseName();
       const value = this.parseObject();
-      arr.push(key, value);
+      if (len < SCRATCH) {
+        arr[len]     = key;
+        arr[len + 1] = value;
+      } else {
+        // Rare overflow path: set length to current len so push
+        // appends at the right offset, then grow naturally.
+        arr.length = len;
+        arr.push(key, value);
+      }
+      len += 2;
       this.skipWhitespaceAndComments();
     }
     this.skipWhitespaceAndComments();
     bytes.assertNext(CharCodes.GreaterThan);
     bytes.assertNext(CharCodes.GreaterThan);
+    arr.length = len;
 
     // Type-sentinel dispatch. Inline-scan for TypeName; in practice
     // it's at arr[0] or arr[2].
     let Type;
-    for (let i = 0, len = arr.length; i < len; i += 2) {
+    for (let i = 0; i < len; i += 2) {
       if (arr[i] === TypeName) { Type = arr[i + 1]; break; }
     }
     if (Type === CatalogName) return new PDFCatalog(arr, this.context);
diff --git a/perf/README.md b/perf/README.md
index d631b090..6f1db950 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -528,6 +528,7 @@ file documenting each:
 | `PDFContext.indirectObjects` dense gen=0 array | [08](notes/08-pdf-lib.md) | `assign` off CPU top-15; remaining `set` heap -48 % |
 | `PDFRef.of` direct-construct on cache miss (skip upstream `pool.set`) | [08](notes/08-pdf-lib.md) | `PDFRef.of` off CPU top-15 (~93 ms); `set` heap 7.7 MB → 0.5 MB |
 | `PDFNumber.of` value-pool (dense int + Map fallback) | [08](notes/08-pdf-lib.md) | `parseNumberOrRef` off heap top-10; total process heap 123 MB → 107 MB (-13 %) |
+| Pre-size `parseDict` accumulator (`new Array(10)` median) | [08](notes/08-pdf-lib.md) | `fastParseDictArray` heap row -25 %; total process heap 107 MB → 92 MB (-14 %) |
 
 What was tried and didn't ship:
 
@@ -554,4 +555,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %). |
diff --git a/perf/instrument-parsedict.mjs b/perf/instrument-parsedict.mjs
new file mode 100644
index 00000000..356f2f74
--- /dev/null
+++ b/perf/instrument-parsedict.mjs
@@ -0,0 +1,67 @@
+// Per-call counters for fastParseDictArray. Wraps the shim's
+// parseDict to count invocations, entries-per-dict distribution,
+// and recursion depth. Prints a histogram on process exit.
+//
+// Used to crack open fastParseDictArray's 58 MB self-row in the
+// process-phase heap profile -- without counts, we can't tell
+// whether "58 MB" is 10k dicts at 6 KB each or 300k dicts at
+// 200 bytes each.
+//
+// Idempotent. Composes with --fast-dict-array (must be loaded
+// AFTER fast-dict-array so it wraps the patched parseDict).
+
+import { createRequire } from 'node:module';
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+
+if (!PDFObjectParser.prototype.__instrumentParseDictInstalled) {
+  const originalParseDict = PDFObjectParser.prototype.parseDict;
+  let totalCalls = 0;
+  let totalEntries = 0;
+  let maxSize = 0;
+  let depth = 0;
+  let maxDepth = 0;
+  const sizeHistogram = new Array(33).fill(0);  // [0..31] then 32+
+
+  PDFObjectParser.prototype.parseDict = function () {
+    depth++;
+    if (depth > maxDepth) maxDepth = depth;
+    let result;
+    try {
+      result = originalParseDict.call(this);
+    } finally {
+      depth--;
+    }
+    totalCalls++;
+    // result.dict is the flat array [k0, v0, k1, v1, ...] (fast-dict-array)
+    // or a Map (upstream / fast-parse-dict). Handle both.
+    const inner = result.dict;
+    const entryCount = Array.isArray(inner) ? (inner.length >> 1) : inner.size;
+    totalEntries += entryCount;
+    if (entryCount > maxSize) maxSize = entryCount;
+    const bucket = entryCount < 32 ? entryCount : 32;
+    sizeHistogram[bucket]++;
+    return result;
+  };
+
+  process.on('exit', () => {
+    console.error('');
+    console.error('=== parseDict instrumentation ===');
+    console.error(`total calls       : ${totalCalls}`);
+    console.error(`total entries     : ${totalEntries}`);
+    console.error(`avg entries/dict  : ${(totalEntries / totalCalls).toFixed(2)}`);
+    console.error(`max entries/dict  : ${maxSize}`);
+    console.error(`max recursion     : ${maxDepth}`);
+    console.error('entries-per-dict histogram:');
+    for (let i = 0; i <= 32; i++) {
+      const n = sizeHistogram[i];
+      if (n === 0) continue;
+      const label = i === 32 ? '32+' : String(i);
+      const bar = '#'.repeat(Math.min(60, Math.round(n / totalCalls * 200)));
+      console.error(`  ${label.padStart(4)} : ${String(n).padStart(7)}  ${bar}`);
+    }
+  });
+
+  PDFObjectParser.prototype.__instrumentParseDictInstalled = true;
+  console.log('[harness] instrument-parsedict: counting parseDict calls + size distribution');
+}
diff --git a/perf/measure.mjs b/perf/measure.mjs
index ac3005ea..1d160a3b 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -269,6 +269,7 @@ let fastSyncLoad = false;
 let fastDictArray = false;
 let fastIndirectObjects = false;
 let fastPdfnumberPool = false;
+let instrumentParsedict = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -305,6 +306,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-dict-array') fastDictArray = true;
   else if (a === '--fast-indirect-objects') fastIndirectObjects = true;
   else if (a === '--fast-pdfnumber-pool') fastPdfnumberPool = true;
+  else if (a === '--instrument-parsedict') instrumentParsedict = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -406,6 +408,9 @@ if (fastPdfnumberPool) {
   await import('../docs/lib/fast-pdfnumber-pool.mjs');
   console.log('[harness] fast-pdfnumber-pool: value-keyed cache in front of PDFNumber.of');
 }
+if (instrumentParsedict) {
+  await import('./instrument-parsedict.mjs');
+}
 
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 5d033785..51922f7b 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2363,6 +2363,138 @@ structure (page-tree dicts contain Kids arrays of Page dicts that
 contain Resources dicts...), not something a shim can shrink.
 The tool stays for future investigations.
 
+## Pre-size `parseDict`'s backing array
+
+After `fast-pdfnumber-pool` shipped, `fastParseDictArray` was
+53 % of the residual heap profile (~58 MB self-size). Three
+components in that frame:
+
+```js
+const arr = [];                                  // (1) array alloc + cap-4 FixedArray
+while (...) {
+  arr.push(key, value);                          // (2) growth via doubling
+}
+return new PDFDict(arr, this.context);           // (3) PDFDict instance
+```
+
+Without per-call counts, the 58 MB could plausibly be 10 k huge
+dicts or 300 k tiny ones. So we instrumented
+(`perf/instrument-parsedict.mjs`), which wraps the shim's
+`parseDict` to count invocations and size-distribution on exit.
+The book's workload:
+
+```
+total calls       : 260 967
+total entries     : 1 170 264
+avg entries/dict  : 4.48
+max entries/dict  : 4 353
+max recursion     : 3
+entries-per-dict histogram:
+     1 :     822
+     2 :  22 551
+     3 :  13 372
+     4 :  73 936    (28 %)
+     5 : 135 438    (52 %)   <-- median
+     6 :     231
+     7 :  12 458
+     8 :   1 644
+     9..31:  ~530
+   32+ :       2
+```
+
+**80 % of dicts have exactly 4 or 5 entries; 99 % have <= 7. Max
+recursion only 3 deep.** That maps cleanly onto V8's array
+growth behavior: a 5-entry dict's `arr.push(key, value)` chain
+grows the backing FixedArray from cap 4 -> 8 -> 16, discarding
+the two intermediate stores as garbage:
+
+| Dict entries | Push slots | Growth path | FixedArray bytes (incl. discards) |
+|-------------:|-----------:|-------------|----------------------------------:|
+|  4 (28 %)    |   8        | 4 -> 8      | 64 + 96 = 160 B                   |
+|  5 (52 %)    |  10        | 4 -> 8 -> 16 | 64 + 96 + 152 = 312 B           |
+|  7 (5 %)     |  14        | 4 -> 8 -> 16 | 312 B                             |
+|  2 (9 %)     |   4        | 4           | 64 B                              |
+
+Weighted average ~220 B of FixedArray throughput per dict.
+Across 261 k dicts: ~57 MB -- matching the observed 58 MB
+self-row almost exactly. **~85 % of the row is growth garbage
+from not pre-sizing.**
+
+### The fix
+
+Allocate the accumulator at the median size up front and use
+direct indexing with a `len` counter; fall back to push only for
+the rare overflow case.
+
+```js
+const SCRATCH = 10;   // median = 5 entries = 10 push slots
+const arr = new Array(SCRATCH);
+let len = 0;
+while (...) {
+  const key = this.parseName();
+  const value = this.parseObject();
+  if (len < SCRATCH) {
+    arr[len]     = key;
+    arr[len + 1] = value;
+  } else {
+    arr.length = len;
+    arr.push(key, value);   // rare: 7+ entry dicts grow from 10
+  }
+  len += 2;
+}
+arr.length = len;            // trim hole tail
+```
+
+### Picking SCRATCH
+
+`SCRATCH = 16` was the first try (covers 4-7 entry dicts without
+growth -- 96 % of cases). It saved only ~5.6 MB instead of an
+estimated ~22 MB. The reason: `new Array(16)` allocates a
+176-byte FixedArray *for every dict*, including the 9 % of
+2-entry dicts that previously needed only 64 bytes. The cap-16
+baseline is itself ~46 MB across 261 k calls.
+
+`SCRATCH = 10` is exact-fit for the 52 % dominant 5-entry case
+(no growth, no waste), small waste for 2/3/4-entry dicts (4-6
+unused slots), and one growth for the 5 % at 7 entries plus the
+~2 % above that. Best balance for this workload.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+post-fast-pdfnumber-pool baseline vs + SCRATCH=10):
+
+| Allocator                | Pre (KB)   | Post (KB)  | Delta              |
+|--------------------------|-----------:|-----------:|-------------------:|
+| **fastParseDictArray**   |  58 203.30 |  43 817.77 | **-14.4 MB (-25 %)** |
+| `push` builtin           |   2 843.44 |   1 621.62 | -1.2 MB            |
+| Total sampled            | 107.21 MB  |  92.13 MB  | **-15.1 MB (-14 %)** |
+
+Two-step path through SCRATCH:
+
+| Step                  | Total sampled | fastParseDictArray |
+|-----------------------|--------------:|-------------------:|
+| No pre-size           |     107.21 MB |          58.20 MB  |
+| `SCRATCH = 16`        |     101.61 MB |          55.03 MB  |
+| `SCRATCH = 10`        |  **92.13 MB** |       **43.82 MB** |
+
+### What about a parser-wide scratch buffer?
+
+The "escalation" alternative was a single long-lived backing
+array on the parser instance, append-then-slice per dict. It
+would eliminate the per-call `new Array(10)` allocation. But the
+slice result is still a fresh per-dict allocation, sized exactly
+-- which for the median 5-entry case is ~104 B (same as cap-10).
+The only net savings would be on small dicts (1-3 entries) where
+the slice is smaller than 10 slots; that's maybe ~2-3 MB across
+36 k small dicts. Not worth the recursion-safe length-pointer
+save/restore plumbing.
+
+The edit is local to `docs/lib/fast-dict-array.mjs`; no
+production import change needed since `fast-dict-array` was
+already wired up. The `--instrument-parsedict` flag stays on
+`measure.mjs` for future dict-workload investigations.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -2398,7 +2530,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-dict-array                    | ~1.1 s  | ~0.7 s | ~0.4 s |
 | + fast-indirect-objects              | ~1.1 s  | ~0.7 s | ~0.4 s |
 | + fast-refs miss bypass              | ~1.0 s  | ~0.6 s | ~0.4 s |
-| **+ fast-pdfnumber-pool (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
+| + fast-pdfnumber-pool                | ~1.0 s  | ~0.6 s | ~0.4 s |
+| **+ parseDict pre-sized array (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From ec1676d2c40f6dfee12f3fae4383f93372691eac Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 01:11:56 +0200
Subject: [PATCH 19/44] Rename SCRATCH to INITIAL_SLOTS in parseDict
 accumulator.

SCRATCH was a leftover name from when I was sketching the
parser-wide long-lived buffer approach (which we rejected). The
shipped const is just the initial capacity of the per-dict
backing array -- not a scratch buffer at all. Rename for clarity.

Also renames the notes' "What about a parser-wide scratch buffer?"
follow-up section to "What about a true scratch buffer?", and notes
in the code comment that this is a pre-sized permanent backing array,
not scratch.
---
 docs/lib/fast-dict-array.mjs | 28 +++++++++++------
 perf/notes/08-pdf-lib.md     | 59 +++++++++++++++++++-----------------
 2 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/docs/lib/fast-dict-array.mjs b/docs/lib/fast-dict-array.mjs
index 1d4bae5c..5f709859 100644
--- a/docs/lib/fast-dict-array.mjs
+++ b/docs/lib/fast-dict-array.mjs
@@ -264,27 +264,35 @@ if (!PDFDict.prototype.__fastDictArrayInstalled) {
   // a short linear scan; PDF convention places /Type first, so it's
   // effectively O(1) per dict.
 
-  // Pre-sized accumulator: instrumented histogram on the book parse
-  // shows 5-entry dicts dominant (52 %, exactly 10 push slots),
-  // 4-entry next (28 %, 8 slots), then a long tail to 7-8 entries.
-  // SCRATCH = 10 is an exact fit for the median case; smaller dicts
-  // (2/3/4 entries) waste a few slots, larger ones (7+) take one
-  // growth via push. Cuts ~70 bytes of FixedArray-header allocation
-  // per dict vs SCRATCH=16 -- on 261 k dict invocations that adds up.
-  const SCRATCH = 10;
+  // Initial capacity for the per-dict accumulator. NOT a scratch
+  // buffer (the array isn't reused across calls -- it's allocated
+  // fresh each dict, filled with parsed entries, and handed to the
+  // PDFDict constructor where it lives as `pdfDict.dict` for the
+  // document's lifetime). Just a pre-sized initial capacity that
+  // skips push-grow's reallocation chain.
+  //
+  // Histogram from the book parse (see instrument-parsedict.mjs):
+  // 5-entry dicts dominate (52 %, exactly 10 push slots), 4-entry
+  // next (28 %, 8 slots), long tail to 7-8 entries. INITIAL_SLOTS =
+  // 10 is exact-fit for the median case; smaller dicts (2/3/4
+  // entries) waste a few slots, larger ones (7+) take one growth
+  // via push. Cuts ~70 bytes of FixedArray-header allocation per
+  // dict vs INITIAL_SLOTS=16 -- on 261 k dict invocations that
+  // adds up.
+  const INITIAL_SLOTS = 10;
   PDFObjectParser.prototype.parseDict = function fastParseDictArray() {
     const bytes = this.bytes;
     bytes.assertNext(CharCodes.LessThan);
     bytes.assertNext(CharCodes.LessThan);
     this.skipWhitespaceAndComments();
-    const arr = new Array(SCRATCH);
+    const arr = new Array(INITIAL_SLOTS);
     let len = 0;
     while (!bytes.done() &&
            bytes.peek() !== CharCodes.GreaterThan &&
            bytes.peekAhead(1) !== CharCodes.GreaterThan) {
       const key = this.parseName();
       const value = this.parseObject();
-      if (len < SCRATCH) {
+      if (len < INITIAL_SLOTS) {
         arr[len]     = key;
         arr[len + 1] = value;
       } else {
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 51922f7b..68a2a2b4 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2427,13 +2427,16 @@ direct indexing with a `len` counter; fall back to push only for
 the rare overflow case.
 
 ```js
-const SCRATCH = 10;   // median = 5 entries = 10 push slots
-const arr = new Array(SCRATCH);
+// Pre-sized permanent backing array (not a scratch buffer --
+// the array is what we hand to PDFDict, just with capacity set
+// to the median dict size up front to skip the growth chain).
+const INITIAL_SLOTS = 10;   // median = 5 entries = 10 push slots
+const arr = new Array(INITIAL_SLOTS);
 let len = 0;
 while (...) {
   const key = this.parseName();
   const value = this.parseObject();
-  if (len < SCRATCH) {
+  if (len < INITIAL_SLOTS) {
     arr[len]     = key;
     arr[len + 1] = value;
   } else {
@@ -2445,24 +2448,24 @@ while (...) {
 arr.length = len;            // trim hole tail
 ```
 
-### Picking SCRATCH
+### Picking `INITIAL_SLOTS`
 
-`SCRATCH = 16` was the first try (covers 4-7 entry dicts without
-growth -- 96 % of cases). It saved only ~5.6 MB instead of an
-estimated ~22 MB. The reason: `new Array(16)` allocates a
+`INITIAL_SLOTS = 16` was the first try (covers 4-7 entry dicts
+without growth -- 96 % of cases). It saved only ~5.6 MB instead
+of an estimated ~22 MB. The reason: `new Array(16)` allocates a
 176-byte FixedArray *for every dict*, including the 9 % of
 2-entry dicts that previously needed only 64 bytes. The cap-16
 baseline is itself ~46 MB across 261 k calls.
 
-`SCRATCH = 10` is exact-fit for the 52 % dominant 5-entry case
-(no growth, no waste), small waste for 2/3/4-entry dicts (4-6
-unused slots), and one growth for the 5 % at 7 entries plus the
-~2 % above that. Best balance for this workload.
+`INITIAL_SLOTS = 10` is exact-fit for the 52 % dominant 5-entry
+case (no growth, no waste), small waste for 2/3/4-entry dicts
+(4-6 unused slots), and one growth for the 5 % at 7 entries
+plus the ~2 % above that. Best balance for this workload.
 
 ### Measured wins
 
 Heap profile (paired `--heap-profile-process --heap-sampling 512`,
-post-fast-pdfnumber-pool baseline vs + SCRATCH=10):
+post-fast-pdfnumber-pool baseline vs + `INITIAL_SLOTS = 10`):
 
 | Allocator                | Pre (KB)   | Post (KB)  | Delta              |
 |--------------------------|-----------:|-----------:|-------------------:|
@@ -2470,25 +2473,27 @@ post-fast-pdfnumber-pool baseline vs + SCRATCH=10):
 | `push` builtin           |   2 843.44 |   1 621.62 | -1.2 MB            |
 | Total sampled            | 107.21 MB  |  92.13 MB  | **-15.1 MB (-14 %)** |
 
-Two-step path through SCRATCH:
+Two-step path through `INITIAL_SLOTS`:
 
-| Step                  | Total sampled | fastParseDictArray |
-|-----------------------|--------------:|-------------------:|
-| No pre-size           |     107.21 MB |          58.20 MB  |
-| `SCRATCH = 16`        |     101.61 MB |          55.03 MB  |
-| `SCRATCH = 10`        |  **92.13 MB** |       **43.82 MB** |
+| Step                        | Total sampled | fastParseDictArray |
+|-----------------------------|--------------:|-------------------:|
+| No pre-size                 |     107.21 MB |          58.20 MB  |
+| `INITIAL_SLOTS = 16`        |     101.61 MB |          55.03 MB  |
+| `INITIAL_SLOTS = 10`        |  **92.13 MB** |       **43.82 MB** |
 
-### What about a parser-wide scratch buffer?
+### What about a true scratch buffer?
 
 The "escalation" alternative was a single long-lived backing
-array on the parser instance, append-then-slice per dict. It
-would eliminate the per-call `new Array(10)` allocation. But the
-slice result is still a fresh per-dict allocation, sized exactly
--- which for the median 5-entry case is ~104 B (same as cap-10).
-The only net savings would be on small dicts (1-3 entries) where
-the slice is smaller than 10 slots; that's maybe ~2-3 MB across
-36 k small dicts. Not worth the recursion-safe length-pointer
-save/restore plumbing.
+array on the parser instance, append-then-slice per dict. That
+would actually be a scratch buffer -- reused across calls,
+sliced off into a fresh `PDFDict` storage per dict. It would
+eliminate the per-call `new Array(10)` allocation. But the slice
+result is still a fresh per-dict allocation, sized exactly --
+which for the median 5-entry case is ~104 B (same as cap-10).
+The only net savings would be on small dicts (1-3 entries)
+where the slice is smaller than 10 slots; that's maybe ~2-3 MB
+across 36 k small dicts. Not worth the recursion-safe
+length-pointer save/restore plumbing.
 
 The edit is local to `docs/lib/fast-dict-array.mjs`; no
 production import change needed since `fast-dict-array` was

From f7f63ae3a4df0bcb6d9af183d25e8e2252fc164b Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 17:19:26 +0200
Subject: [PATCH 20/44] View-based PDFDict: explored, didn't ship.

Each PDFDict would carry a (buf, start, end) view into a parser-wide
per-depth shared array that is append-only across all dicts at that
depth. Eliminates per-dict array allocation: 261k PDFDict-backing-
arrays collapse to 3 shared buffers per parser (max parseDict
recursion depth = 3 on the book). Per-depth caps pre-sized to the
book's measured workload + slack, so V8 doesn't grow them.
Mutations (catalog.set during setOutline) copy-on-write to a
private array.

Recursion handling: a single global shared buffer would interleave
outer and inner entries when parseObject recurses; per-depth
buffers keep outer's range contiguous.

Bug found and fixed during prototyping: `if (!this._dictDepth)`
re-init guard fires every time _dictDepth returns to 0 (between
top-level dicts), defeating buffer sharing. Use explicit
`_dictBufs === undefined` check.

Net win is modest: ~2.5 MB heap reduction vs fast-dict-array
(92.13 MB -> 89.68 MB). The buffer-sharing savings (~88 B/dict)
are largely offset by the larger PDFDict instance (Object.create
+ 5 fields vs constructor-inlined + 2). Superseded by the
one-buffer + packed-pointer approach that follows, which shrinks
the PDFDict instance back down.

Code dropped; narrative kept in perf/notes/08-pdf-lib.md as the
thinking that led to one-buffer.
---
 perf/notes/08-pdf-lib.md | 149 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)

diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 68a2a2b4..cef6bf29 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2500,6 +2500,155 @@ production import change needed since `fast-dict-array` was
 already wired up. The `--instrument-parsedict` flag stays on
 `measure.mjs` for future dict-workload investigations.
 
+## View-based PDFDict (explored, didn't ship)
+
+After fast-dict-array pre-sized its per-dict accumulator to median
+size, the `fastParseDictArray` row was still 43.8 MB on the heap
+profile (48 % of total) -- mostly the irreducible floor of "one
+`new Array(10)` + one PDFDict instance per parsed dict, 261 k
+times". The natural next move: stop allocating per-dict storage at
+all, share one backing array across many dicts via a `(buf, start,
+end)` view.
+
+Prototyped as `fast-dict-view.mjs`. Each PDFDict carried a `(buf,
+start, end)` window into a parser-wide per-depth shared array,
+append-only across all dicts at that depth. The win on heap was
+only ~2.5 MB -- the fatter PDFDict instance (5 fields vs 2) ate
+back most of the buffer-sharing saving. Subsequently superseded
+by the one-buffer approach below, which packs the entire dict
+storage into a single mainBuf and shrinks the PDFDict instance
+back down. The view-based shim doesn't ship; the notes here are
+preserved as the thinking that led to one-buffer.
+
+### Why "not scratch"
+
+The earlier comment about "scratch buffer" was wrong vocabulary.
+A scratch buffer is a temporary workspace -- you write, use, and
+discard. Nothing here qualifies: every parsed entry lives until
+the PDFDocument is dropped. What we actually want is a *shared
+backing array* where each PDFDict claims a contiguous range,
+written once and kept. The buffer is append-only; slots are never
+rewritten.
+
+### The recursion gotcha
+
+A naive single shared buffer breaks under parseDict recursion. If
+outer parseDict appends entries to `buf` while parsing a value
+that recurses into inner parseDict, inner's entries get
+interleaved into outer's range. Outer's view would wrongly
+include inner's entries:
+
+```
+outer parseDict starts at len=0
+  outer parses keyA, valueA       -> buf[0,1], len=2
+  outer parses keyB, value=<<...>> -> calls inner parseDict
+    inner appends 3 entries        -> buf[2..7], len=8
+    inner returns view {start:2, end:8}
+  outer wants to write keyB,valueB at buf[8,9] -> len=10
+  outer parses keyC,valueC         -> buf[10,11], len=12
+outer's range: {start:0, end:12}  ← includes inner's entries!
+```
+
+Fix: **one buffer per parseDict-recursion-depth**, not one shared
+globally. Instrumentation
+([perf/instrument-parsedict.mjs](../instrument-parsedict.mjs))
+showed max parseDict depth = 3 on the book, so 3-4 buffers per
+parser. Each buffer is append-only across all dicts at that depth.
+Inner recursion writes to a different buffer than outer, so
+outer's range stays contiguous.
+
+### Copy-on-write for mutations
+
+Shared buffers are correct as long as nobody mutates the entries.
+But `pdfDoc.catalog.set(PDFName.of('Outlines'), outlineRef)` does
+happen in our pipeline (during setOutline). The shim added a COW
+hook to `PDFDict.prototype.set` and `.delete`: first mutation
+copies the (start..end) range into a private array, swaps the
+view to point at that copy with `_dictOwned = true`. Subsequent
+mutations on that dict operate in place. Other dicts sharing the
+original buffer are unaffected.
+
+### Pre-sizing the per-depth buffers
+
+Without pre-sizing the per-depth buffers, V8 doubles their
+backing FixedArray from cap 0 up to (depth 0 case) ~2.1M slots --
+~20 doublings, with each old arena becoming garbage. That growth
+garbage alone was 6.5 MB of the regression observed when first
+prototyping.
+
+Instrumented to measure the final per-depth lengths on a book
+parse:
+
+```
+=== fast-dict-view: depth stats ===
+parser instances seen: 1
+  depth 0: total 2 155 544 slots, max-per-parser 2 155 544 slots
+  depth 1: total   158 260 slots, max-per-parser   158 260 slots
+  depth 2: total    26 724 slots, max-per-parser    26 724 slots
+```
+
+Hardcoded the caps + 10 slack in the shim's `DEPTH_BUF_CAPS`,
+sized to skip all growth on the book. For other workloads the
+buffers grow naturally from these starting sizes;
+oversizing-by-2x doesn't hurt much because there's only one
+buffer per depth per parser.
+
+### Bug-hunt: the depth-reset gotcha
+
+The first version of the shim used `if (!this._dictDepth)` to
+lazy-init the per-parser buffer stack. `!this._dictDepth` is true
+when `_dictDepth = 0` -- which is exactly the state at the *end*
+of every top-level parseDict call (the depth counter was just
+decremented back to zero). The buffers were getting reset between
+every top-level dict; each one was effectively allocating fresh.
+
+Fix: `if (this._dictBufs === undefined)` -- explicit
+undefined-on-construction check. Easy to spot in retrospect, less
+easy to spot when looking at a regression that doesn't make
+sense.
+
+### Why the win is "only" 2.5 MB
+
+Even with perfect pre-sized buffers and the bug fix, fast-dict-view
+beats fast-dict-array by only ~2.5 MB on heap. The expected
+saving was bigger -- one shared buffer should beat 261 k separate
+ones by a lot.
+
+The reason: the PDFDict *instance* in fast-dict-view is itself
+larger. Where fast-dict-array stores `{dict, context}` (2 named
+slots, ~32 B per instance with V8's inline-properties packing),
+fast-dict-view stores `{_dictBuf, _dictStart, _dictEnd, _dictOwned,
+context}` (5 named slots, ~96 B per instance). Across 261 k
+dicts that's ~16 MB of extra per-instance storage that offsets
+most of the buffer-sharing win:
+
+| Per-dict allocation | fast-dict-array (INITIAL_SLOTS=10) | fast-dict-view (pre-sized) |
+|---------------------|-----------------------------------:|----------------------------:|
+| Backing storage     | 104 B per-dict `new Array(10)`     | ~16 B share of shared buf   |
+| PDFDict instance    | ~32 B (inlined constructor)        | ~96 B (Object.create + 5 fields) |
+| **Total / dict**    |                          **~136 B**|                  **~112 B** |
+
+The buffer sharing saves ~88 B per dict on storage, but the
+fatter PDFDict instance eats ~64 B back. Net ~24 B per dict =
+~6 MB structural win, of which ~2.5 MB shows in the heap profile
+after V8 internal overhead variance.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-dict-array baseline vs + fast-dict-view):
+
+| Allocator                          | Pre (KB)  | Post (KB) | Delta          |
+|------------------------------------|----------:|----------:|---------------:|
+| `fastParseDictArray` / `*View`     |  43 817.77 |  40 955.37 | -2.86 MB       |
+| Total sampled                      |  92.13 MB  |  89.68 MB  | **-2.45 MB**   |
+
+Modest. The takeaway is structural: a view-based shape is the
+right direction, but the PDFDict instance shape itself is now
+the dominant per-dict cost -- so the next prototype needs to
+shrink the instance too. That's the one-buffer + packed-pointer
+work in the following sections.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From d83538460bed1c85455a18a22edab10581177aab Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 17:45:13 +0200
Subject: [PATCH 21/44] Single-double PDFDict: explored, didn't ship.

The fast-dict-view PDFDict instance was 5 fields (~96 B); packing
the lot into a single 53-bit Number `d` would shrink the instance
significantly. Reads via bitwise (fields below bit 32) or
arithmetic (Math.floor(d / 2**n) & mask) for higher fields. The
PDFContext is a singleton in our pipeline (one PDFDocument.load
per process), so the shim keeps it at module level; a second
distinct context throws.

PDFDict instance shrinks ~96 B -> ~24-32 B. PDFPageLeaf still
needs `normalized` + `autoNormalizeCTM` slots (~1.6 k page leaves
of 261 k total dicts, small fraction).

Heap: 89.7 MB -> 83.7 MB (-6 MB / -7%). GC self-time: 167 ms ->
129 ms (-23%). Cumulative arc from the original Map-backed PDFDict:
152 MB -> 84 MB (-45%).

Superseded by the one-buffer PDFDict approach that follows: keeps
the "packed into a Number" idea but moves entries into a single
per-parser mainBuf, folding bufIdx away and giving a tighter
bit layout.

Code dropped; narrative kept in perf/notes/08-pdf-lib.md.
---
 perf/notes/08-pdf-lib.md | 163 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 163 insertions(+)

diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index cef6bf29..70ae4153 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2649,6 +2649,169 @@ the dominant per-dict cost -- so the next prototype needs to
 shrink the instance too. That's the one-buffer + packed-pointer
 work in the following sections.
 
+## Single-double PDFDict (explored, didn't ship)
+
+fast-dict-view's win was capped by the PDFDict instance footprint:
+5 named slots (`_dictBuf`, `_dictStart`, `_dictEnd`, `_dictOwned`,
+`context`) at ~96 B per instance. Across 261 k dicts that's ~25 MB
+of per-dict object header.
+
+The instance shape is what was costing us. Most of those fields are
+small: `start` fits in 22 bits, `length` in 14 bits, `bufIdx` in
+~15 bits (counting setOutline's owned dicts), `owned` is 1 bit. The
+fields that *can't* obviously be made small are the `buf` and
+`context` *references* -- but `buf` already gets reference-by-index
+in fast-dict-view's design (via `_buffers[bufIdx]`), and `context`
+is a *singleton* in our pipeline.
+
+Prototyped as `fast-dict-double.mjs`. The idea: pack the whole
+instance state into one 53-bit Number stored as PDFDict's single
+`d` field, and treat the PDFContext as a module-level singleton.
+Heap dropped 90 MB → 84 MB (-6 MB / -7 %); GC self-time
+166.7 ms → 128.8 ms (-23 %). Promising, but the next move --
+also packing the entries into one shared buffer -- gives a
+cleaner overall shape and made fast-dict-double an opt-in
+stepping stone rather than a shipping target. The shim doesn't
+ship; the notes here document the design.
+
+### One PDFContext per process
+
+PDFContexts are created by `PDFParser.forBytesWithOptions` inside
+`PDFDocument.load`. In our pipeline `PDFDocument.load` is called
+exactly once per build (in `docs/render-book.mjs`), so exactly one
+PDFContext exists during the process phase. The shim stashed that
+one PDFContext in a module-level `_singletonContext` variable; the
+`PDFDict.prototype.context` getter just returned it. Any second
+distinct context would throw -- intentional bailout for workloads
+this shim isn't a fit for (e.g. merging two PDFs in one process).
+
+### 53-bit packed layout
+
+That leaves everything else fitting in one Number:
+
+```
+bits  0-21: start   (22 bits, max 4 M slots; depth-0 hits 2.16 M)
+bits 22-35: length  (14 bits, max 16 384 slots; max observed 8 706)
+bits 36-50: bufIdx  (15 bits, max 32 768 buffers; book uses ~1 800
+                    once setOutline creates per-outline-node
+                    owned dicts via the factory)
+bit  51   : owned flag
+bit  52   : spare
+```
+
+Stored as a single `d` field on each PDFDict instance. Reads use a
+mix of bitwise (for fields entirely below bit 32) and arithmetic
+(for fields straddling or above 32, since JS bitwise ops cast to
+int32):
+
+```js
+function _start(d)  { return d & MASK_22; }                  // bitwise
+function _length(d) { return Math.floor(d / POW_22) & MASK_14; }
+function _bufIdx(d) { return Math.floor(d / POW_36) & MASK_15; }
+function _owned(d)  { return Math.floor(d / POW_51) & 1; }
+```
+
+Writes:
+
+```js
+function pack(start, length, bufIdx, owned) {
+  if (start  >= MAX_START)  throw new Error('start overflow');
+  if (length >= MAX_LENGTH) throw new Error('length overflow');
+  if (bufIdx >= MAX_BUFIDX) throw new Error('bufIdx overflow');
+  return start + length * POW_22 + bufIdx * POW_36 + (owned ? POW_51 : 0);
+}
+```
+
+Overflow guards: if any field exceeds its budget, the shim throws
+with a clear message. The budgets are sized roughly 2x (start,
+length) to ~18x (bufIdx) the book's observed workload, so the
+guards exist for surprise inputs, not for the normal build.
+
+### V8 representation
+
+A property whose values consistently fall outside Smi range (which
+`d` does, since `bufIdx * 2**36` immediately exceeds 2^31) gets
+stored either inline as DoubleField (8 B inline double) or via
+TaggedField (8 B pointer + ~16-24 B HeapNumber). Empirically the
+heap drop was consistent with most instances using DoubleField:
+the combined `fastParseDictView` + `_makeFromView` self size
+dropped from 41.73 MB to 35.34 MB (an extra ~6 MB beyond what
+plain buffer-sharing achieved; see the table below).
+
+### Subclasses
+
+PDFCatalog and PDFPageTree add no instance fields beyond `d`.
+PDFPageLeaf still needs `normalized` and `autoNormalizeCTM` as
+separate slots; that's ~1.6 k page leaves out of 261 k total dicts
+on the book, a small fraction.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-dict-view baseline vs + fast-dict-double):
+
+| Allocator                          | Pre (KB)  | Post (KB) | Delta             |
+|------------------------------------|----------:|----------:|------------------:|
+| `fastParseDictView` / `*Double`    |  40 955.37 |  18 913.63 | -22.0 MB         |
+| `_makeFromView` (separate child row)|    773.09 |  16 429.68 | +15.7 MB         |
+| Combined (fastParse* + _makeFromView)| 41 728.46 |  35 343.31 | **-6.4 MB**      |
+| Total sampled                      |  89.68 MB |  83.68 MB | **-6.0 MB (-7 %)** |
+
+(`_makeFromView` shows up as a bigger separate row because V8
+de-inlined it slightly differently for fast-dict-double, but the
+combined "PDFDict construction overhead" dropped ~6 MB.)
+
+CPU profile (paired `--cpu-profile-process --cpu-sampling 100`):
+
+| Row                          | Pre (ms) | Post (ms) | Delta                |
+|------------------------------|---------:|----------:|---------------------:|
+| (garbage collector)          |   166.71 |    128.81 | **-37.9 ms (-23 %)** |
+| `fastParseDictView` / `*Double` |    28.95 |     44.36 | +15.4 ms (incl COW + pack/unpack) |
+| Total profile duration       |   1.03 s |    0.97 s | -60 ms (-6 %)        |
+
+The GC self-time drop is the headline: less heap allocation
+directly translates to less GC work. The fastParseDict* row went
+up a bit (more arithmetic in unpack), but the saving on GC and
+elsewhere comfortably outweighs it.
+
+### Cumulative arc
+
+Starting from the original Map-backed PDFDict:
+
+| State                            | Total sampled | Change vs prior |
+|----------------------------------|--------------:|----------------:|
+| Map-backed (pre-fast-dict-array) |   152 MB      | -               |
+| fast-dict-array (INITIAL_SLOTS=10)|    92 MB     | -60 MB          |
+| fast-dict-view (shared buffers)  |    90 MB      | -2 MB           |
+| **fast-dict-double**             |    **84 MB**  | **-6 MB**       |
+
+**-45 % cumulative reduction in process-phase heap traffic.**
+
+### Caveats
+
+- **Single context assumption.** If you load a second PDFDocument
+  in the same process the shim throws. For our build pipeline this
+  is fine; for general pdf-lib use a multi-context variant would
+  need an array + small ctxIdx field.
+- **Bit budgets.** Sized for the book and similar PDFs. A PDF
+  whose largest buffer exceeds 4 M slots (about 2 M key/value
+  entries; very large book or pathological generator) would trip
+  the start budget; a PDF with a single dict larger than 8 192
+  entries (16 384 slots) would trip length; setOutline producing
+  more than 32 k owned dicts would trip bufIdx. All three are
+  deliberate guards rather than expected failures.
+- **Arithmetic in hot path.** Each read of a high-bit field is one
+  `Math.floor(d / 2**n) & mask`. V8 optimizes division by
+  powers-of-2 well, but it's not free. The 23 % GC drop is the
+  empirical confirmation that the heap savings outweigh the
+  unpack cost.
+
+The next prototype (one-buffer PDFDict) keeps the
+"packed-into-Number" idea but moves the entries themselves into
+a single per-parser mainBuf, which folds the bufIdx field away
+and lets a tighter bit layout track the (mainBuf-relative) start
++ length directly. That's what ends up shipping.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From 84c7bf2009ffa29861d188aa820bc31cbe735402 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 18:00:31 +0200
Subject: [PATCH 22/44] One-buffer PDFDict: every entry lives in a single
 mainBuf.

Collapse PDFDict storage into one long-lived mainBuf. Recursion
gotcha solved with a two-area split: a small per-parser temp
array acts as a stack of parseDict recursion frames; each frame
appends to temp, then commits its frame to main in one
contiguous append, then pops temp back. Outer's frame stays
parked in temp while inner recurses, then resumes intact when
inner pops.

Owned dicts (factory-created post-parse, COW results) append to
mainBuf too. Mutations: in-place replace for existing keys, COW
(copy range to tail + push new pair) for new keys or delete.
setOutline's pattern (create-then-recurse-then-set) hits one COW
per dict; subsequent sets extend in place at the high-water mark.
~9k entry copies total for the book, negligible.

PDFContext is a singleton in our pipeline; a second distinct
context throws. Instance state packs into one 53-bit Number:
24 bit start + 14 bit length + 1 bit owned = 39 bits used, 14
spare.

Heap: total process heap 92 MB -> 66 MB (-28% vs fast-dict-array).
Cumulative arc since Map-backed PDFDict: 152 MB -> 66 MB (-57%).
GC self-time bumps slightly (one big live array to scan);
wall-clock within noise.

Mutually exclusive with --fast-dict-array / --fast-dict-iter /
--fast-parse-dict. Production swaps render-book.mjs's
fast-dict-array import for fast-dict-onebuf; the legacy shims
stay in the tree as A/B baselines. The intermediate fast-dict-
view and fast-dict-double prototypes explored on the way to this
shape are documented as "explored, didn't ship" sections in
perf/notes/08-pdf-lib.md.
---
 docs/lib/fast-dict-onebuf.mjs | 434 ++++++++++++++++++++++++++++++++++
 docs/render-book.mjs          |  35 +--
 perf/README.md                |  51 ++--
 perf/measure.mjs              |  27 ++-
 perf/notes/08-pdf-lib.md      | 184 +++++++++++++-
 5 files changed, 693 insertions(+), 38 deletions(-)
 create mode 100644 docs/lib/fast-dict-onebuf.mjs

diff --git a/docs/lib/fast-dict-onebuf.mjs b/docs/lib/fast-dict-onebuf.mjs
new file mode 100644
index 00000000..92bfa9cf
--- /dev/null
+++ b/docs/lib/fast-dict-onebuf.mjs
@@ -0,0 +1,434 @@
+// One-buffer PDFDict: every committed entry lives in a single
+// append-only array (main), kept for the document's lifetime. The
+// parser uses a small per-instance temp array as a stack of recursion
+// frames; each parseDict invocation appends to temp, commits its
+// frame to main in one contiguous range, and pops temp back. After
+// parseDocument completes, temp is released. PDFDict instances only
+// ever read from main, so the bufIdx field disappears from the
+// packed value -- frees up bits.
+//
+// 53-bit packed Number layout (within Number.MAX_SAFE_INTEGER):
+//   bits  0-23: start  (24 bits, max 16 M slots in main)
+//   bits 24-37: length (14 bits, max 16 384 slots; max observed 8 706)
+//   bit  38   : owned flag
+//   bits 39-52: spare (14 bits)
+//
+// Recursion. Outer parseDict pushes entries onto temp. Calling
+// this.parseObject() to parse a value may recurse to inner
+// parseDict, which appends ON TOP of outer's pending entries. Inner
+// commits its frame to main in one append, then pops temp back to
+// the level it started at -- outer's frame is intact at the top of
+// temp again. Outer continues, eventually committing its (now
+// contiguous in temp) entries to main in one append. Outer's and
+// inner's ranges in main do not overlap; each was committed as a
+// single contiguous block at distinct points in time.
+//
+// Mutations. A committed range is never shifted or resized in
+// place. Rules:
+//   - set with existing key: in-place replace (safe; doesn't shift slots)
+//   - set with new key, dict owned AND at main's high-water mark: in-place push (extend the range)
+//   - set with new key otherwise (not yet owned, or not at the
+//     high-water mark): COW (copy range to main's tail, then push
+//     the new pair, update encoded value to the new range)
+//   - delete: always COW (copy range minus deleted entry to tail)
+// After its first COW a dict is 'owned'; while it still abuts the
+// high-water mark, further new-key sets extend it without another
+// copy.
+//
+// Singleton PDFContext (one PDFDocument.load per process in our
+// pipeline; throws if a second distinct context appears).
+//
+// Mutually exclusive with --fast-dict-array / --fast-dict-iter /
+// --fast-parse-dict.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFDict         = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFCatalog      = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree     = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf     = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNull         = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+const TypeName    = PDFName.of('Type');
+const CatalogName = PDFName.of('Catalog');
+const PagesName   = PDFName.of('Pages');
+const PageName    = PDFName.of('Page');
+
+// ---- The single buffer + temp ---------------------------------------
+
+// Pre-sized to total entries + slack measured on the book. Other
+// workloads grow it naturally (V8-amortized array growth from this
+// starting size).
+const MAIN_INITIAL_CAP = 2400000;
+const main = new Array(MAIN_INITIAL_CAP);
+let mainLen = 0;
+
+// ---- Bit-packing helpers --------------------------------------------
+
+const POW_24 = 16777216;          // 2^24
+const POW_38 = 274877906944;      // 2^38
+const MASK_24 = 0xFFFFFF;
+const MASK_14 = 0x3FFF;
+
+const MAX_START  = POW_24;          // exclusive
+const MAX_LENGTH = 1 << 14;         // 16384, exclusive
+
+function pack(start, length, owned) {
+  if (start  >= MAX_START)  throw new Error(`fast-dict-onebuf: start ${start} exceeds 24-bit budget`);
+  if (length >= MAX_LENGTH) throw new Error(`fast-dict-onebuf: length ${length} exceeds 14-bit budget`);
+  return start
+    + length * POW_24
+    + (owned ? POW_38 : 0);
+}
+
+function _start(d)  { return d & MASK_24; }
+function _length(d) { return Math.floor(d / POW_24) & MASK_14; }
+function _owned(d)  { return Math.floor(d / POW_38) & 1; }
+
+// ---- Singleton context ---------------------------------------------
+
+let _singletonContext = null;
+
+function _registerContext(ctx) {
+  if (_singletonContext === null) {
+    _singletonContext = ctx;
+  } else if (_singletonContext !== ctx) {
+    throw new Error('fast-dict-onebuf: expected a singleton PDFContext, got a second distinct one.');
+  }
+}
+
+// ---- Append helpers ------------------------------------------------
+
+function _appendEntries(entries, fromOffset, lenSlots) {
+  for (let i = 0; i < lenSlots; i++) {
+    main[mainLen + i] = entries[fromOffset + i];
+  }
+  mainLen += lenSlots;
+}
+
+function _appendArray(arr) {
+  const len = arr.length;
+  for (let i = 0; i < len; i++) main[mainLen + i] = arr[i];
+  mainLen += len;
+}
+
+// COW: copy this dict's range to main's tail, return the new packed
+// value (now owned, anchored at the new range).
+function _cow(pd) {
+  const d = pd.d;
+  if (_owned(d)) {
+    // Already owned and somewhere in main. If we're at the high-water
+    // mark we can mutate in place; otherwise we need to COW (the
+    // dict was created earlier, other dicts have been appended
+    // since, so we no longer abut the tail).
+    const start = _start(d);
+    const length = _length(d);
+    if (start + length === mainLen) return d;   // at HWM
+    const newStart = mainLen;
+    for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
+    mainLen += length;
+    return pack(newStart, length, 1);
+  } else {
+    // Shared range. COW to tail.
+    const start = _start(d);
+    const length = _length(d);
+    const newStart = mainLen;
+    for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
+    mainLen += length;
+    return pack(newStart, length, 1);
+  }
+}
+
+// ---- Construction ---------------------------------------------------
+
+function _makeFromRange(ProtoClass, start, length, owned, ctx) {
+  _registerContext(ctx);
+  const pd = Object.create(ProtoClass.prototype);
+  pd.d = pack(start, length, owned ? 1 : 0);
+  if (ProtoClass === PDFPageLeaf) {
+    pd.normalized = false;
+    pd.autoNormalizeCTM = true;
+  }
+  return pd;
+}
+
+function _ownedFromArray(ProtoClass, arr, ctx) {
+  const start = mainLen;
+  _appendArray(arr);
+  return _makeFromRange(ProtoClass, start, arr.length, true, ctx);
+}
+
+function mapToArray(map) {
+  const arr = new Array(map.size * 2);
+  let i = 0;
+  for (const [k, v] of map) { arr[i++] = k; arr[i++] = v; }
+  return arr;
+}
+
+if (!PDFDict.prototype.__fastDictOnebufInstalled) {
+
+  // ---- PDFDict.prototype --------------------------------------------
+
+  PDFDict.prototype.keys = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length >> 1);
+    for (let i = 0, j = 0; i < length; i += 2, j++) out[j] = main[start + i];
+    return out;
+  };
+
+  PDFDict.prototype.values = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length >> 1);
+    for (let i = 0, j = 0; i < length; i += 2, j++) out[j] = main[start + i + 1];
+    return out;
+  };
+
+  PDFDict.prototype.entries = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length >> 1);
+    for (let i = 0, j = 0; i < length; i += 2, j++) {
+      out[j] = [main[start + i], main[start + i + 1]];
+    }
+    return out;
+  };
+
+  PDFDict.prototype.set = function (key, value) {
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    // Try in-place replace
+    for (let i = 0; i < length0; i += 2) {
+      if (main[start0 + i] === key) { main[start0 + i + 1] = value; return; }
+    }
+    // Append: requires the dict to be at main's high-water mark, OR we COW.
+    let dNow = d0;
+    if (!_owned(d0) || start0 + length0 !== mainLen) {
+      dNow = _cow(this);
+    }
+    // After _cow (or if we were already at HWM owned), we abut the tail.
+    main[mainLen++] = key;
+    main[mainLen++] = value;
+    const start = _start(dNow);
+    this.d = pack(start, length0 + 2, 1);
+  };
+
+  PDFDict.prototype.get = function (key, preservePDFNull) {
+    if (preservePDFNull === undefined) preservePDFNull = false;
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i += 2) {
+      if (main[i] === key) {
+        const value = main[i + 1];
+        if (value === PDFNull && !preservePDFNull) return undefined;
+        return value;
+      }
+    }
+    return undefined;
+  };
+
+  PDFDict.prototype.has = function (key) {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i += 2) {
+      if (main[i] === key) {
+        const value = main[i + 1];
+        return value !== undefined && value !== PDFNull;
+      }
+    }
+    return false;
+  };
+
+  PDFDict.prototype.delete = function (key) {
+    // Always COW for delete: shifting slots in main would corrupt
+    // other dicts that point into the affected region.
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    let foundIdx = -1;
+    for (let i = 0; i < length0; i += 2) {
+      if (main[start0 + i] === key) { foundIdx = i; break; }
+    }
+    if (foundIdx < 0) return false;
+    const newStart = mainLen;
+    for (let i = 0; i < length0; i++) {
+      if (i === foundIdx || i === foundIdx + 1) continue;
+      main[mainLen++] = main[start0 + i];
+    }
+    this.d = pack(newStart, length0 - 2, 1);
+    return true;
+  };
+
+  PDFDict.prototype.asMap = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    const m = new Map();
+    for (let i = start; i < end; i += 2) m.set(main[i], main[i + 1]);
+    return m;
+  };
+
+  PDFDict.prototype.clone = function (context) {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const newStart = mainLen;
+    for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
+    mainLen += length;
+    _registerContext(context || _singletonContext);
+    const c = Object.create(PDFDict.prototype);
+    c.d = pack(newStart, length, 1);
+    return c;
+  };
+
+  PDFDict.prototype.toString = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    let s = '<<\n';
+    for (let i = start; i < end; i += 2) {
+      s += main[i].toString() + ' ' + main[i + 1].toString() + '\n';
+    }
+    return s + '>>';
+  };
+
+  PDFDict.prototype.sizeInBytes = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    let size = 5;
+    for (let i = start; i < end; i += 2) {
+      size += main[i].sizeInBytes() + main[i + 1].sizeInBytes() + 2;
+    }
+    return size;
+  };
+
+  PDFDict.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.LessThan;
+    buffer[offset++] = CharCodes.Newline;
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i += 2) {
+      offset += main[i].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+      offset += main[i + 1].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Newline;
+    }
+    buffer[offset++] = CharCodes.GreaterThan;
+    buffer[offset++] = CharCodes.GreaterThan;
+    return offset - initialOffset;
+  };
+
+  Object.defineProperty(PDFDict.prototype, 'context', {
+    get() { return _singletonContext; },
+    set(_ctx) { /* singleton is source of truth */ },
+    configurable: true,
+  });
+
+  // ---- PDFDict factories --------------------------------------------
+
+  PDFDict.withContext = function (context) {
+    return _ownedFromArray(PDFDict, [], context);
+  };
+  PDFDict.fromMapWithContext = function (map, context) {
+    return _ownedFromArray(PDFDict, mapToArray(map), context);
+  };
+
+  PDFCatalog.withContextAndPages = function (context, pages) {
+    return _ownedFromArray(
+      PDFCatalog,
+      [PDFName.of('Type'), CatalogName, PagesName, pages],
+      context,
+    );
+  };
+  PDFCatalog.fromMapWithContext = function (map, context) {
+    return _ownedFromArray(PDFCatalog, mapToArray(map), context);
+  };
+
+  PDFPageTree.fromMapWithContext = function (map, context) {
+    return _ownedFromArray(PDFPageTree, mapToArray(map), context);
+  };
+
+  PDFPageLeaf.fromMapWithContext = function (map, context, autoNormalizeCTM) {
+    const d = _ownedFromArray(PDFPageLeaf, mapToArray(map), context);
+    if (autoNormalizeCTM !== undefined) d.autoNormalizeCTM = autoNormalizeCTM;
+    return d;
+  };
+
+  // ---- PDFObjectParser.prototype.parseDict --------------------------
+  //
+  // Each parser instance carries its own temp array (small; sized to
+  // peak recursion-depth-stack of entries) plus a length cursor.
+  // parseDict pushes entries onto temp's tail; on completion, commits
+  // its frame to main in one contiguous append, pops temp back to
+  // frameStart, and returns a PDFDict view into main.
+
+  PDFObjectParser.prototype.parseDict = function fastParseDictOneBuf() {
+    const bytes = this.bytes;
+    bytes.assertNext(CharCodes.LessThan);
+    bytes.assertNext(CharCodes.LessThan);
+    this.skipWhitespaceAndComments();
+
+    if (this._dictTemp === undefined) {
+      this._dictTemp = new Array(64);   // grows naturally if needed
+      this._dictTempLen = 0;
+    }
+    const temp = this._dictTemp;
+    const frameStart = this._dictTempLen;
+
+    while (!bytes.done() &&
+           bytes.peek() !== CharCodes.GreaterThan &&
+           bytes.peekAhead(1) !== CharCodes.GreaterThan) {
+      const key = this.parseName();
+      const value = this.parseObject();    // may recurse; temp grows / shrinks
+      const len = this._dictTempLen;
+      temp[len]     = key;
+      temp[len + 1] = value;
+      this._dictTempLen = len + 2;
+      this.skipWhitespaceAndComments();
+    }
+    this.skipWhitespaceAndComments();
+    bytes.assertNext(CharCodes.GreaterThan);
+    bytes.assertNext(CharCodes.GreaterThan);
+
+    const frameLen = this._dictTempLen - frameStart;
+    // Commit this frame to main in one contiguous append
+    const start = mainLen;
+    _appendEntries(temp, frameStart, frameLen);
+    // Pop our frame off temp
+    this._dictTempLen = frameStart;
+
+    // Type-sentinel dispatch (scan the frame we just committed)
+    let Type;
+    const end = start + frameLen;
+    for (let i = start; i < end; i += 2) {
+      if (main[i] === TypeName) { Type = main[i + 1]; break; }
+    }
+    if (Type === CatalogName) return _makeFromRange(PDFCatalog,  start, frameLen, false, this.context);
+    if (Type === PagesName)   return _makeFromRange(PDFPageTree, start, frameLen, false, this.context);
+    if (Type === PageName)    return _makeFromRange(PDFPageLeaf, start, frameLen, false, this.context);
+    return _makeFromRange(PDFDict, start, frameLen, false, this.context);
+  };
+
+  PDFDict.prototype.__fastDictOnebufInstalled = true;
+  // Mark subsumed shims as installed.
+  PDFDict.prototype.__fastDictDoubleInstalled = true;
+  PDFDict.prototype.__fastDictViewInstalled = true;
+  PDFDict.prototype.__fastDictArrayInstalled = true;
+  PDFDict.prototype.__fastDictIterInstalled = true;
+  PDFObjectParser.prototype.__fastParseDictInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 92395995..b99a6b07 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -58,19 +58,26 @@ import { PDFDocument } from 'pdf-lib';
 //     `n.toString(2)` just to count its bit length) with a non-
 //     allocating short-circuit ladder. Called ~300 k times per save
 //     from PDFCrossRefStream's xref writer.
-//   fast-dict-array -- replace PDFDict's backing Map with a flat
-//     alternating array [k0, v0, k1, v1, ...]. The sampling heap
-//     profile of the process phase put `new Map()` + Map.prototype.set
-//     at ~80 MB combined (50 % of total allocations), 80 % of that
-//     traffic from the parser's per-dict accumulator. The flat-array
-//     shape is one allocation per dict, no hash-table arena; PDF dicts
-//     are tiny enough that linear lookup beats Map hashing. Subsumes
-//     both fast-dict-iter (sizeInBytes / copyBytesInto iterate the
-//     array in place, no Map.forEach context object) and
-//     fast-parse-dict (parser's hot loop accumulates into the array
-//     directly, Type-sentinel dispatch is a short linear scan). Drops
-//     Map+set heap traffic by ~80 %, GC self-time by ~20 %, process
-//     wall-clock by ~4 % (~48 ms / 1.18 s).
+//   fast-dict-onebuf -- one long-lived buffer for every committed
+//     PDFDict entry across the whole document. Parser uses a small
+//     per-instance temp array as a stack of recursion frames; each
+//     parseDict invocation appends to temp, commits its frame to
+//     main in one contiguous append, and pops temp back. PDFDicts
+//     only ever read from main, so a packed (start, length, owned)
+//     Number is the whole instance state -- no separate bufIdx.
+//     Owned dicts (factory-created post-parse) also append to main.
+//     Mutations: in-place replace for existing keys, COW (copy
+//     range to tail, push new pair) for new keys or delete.
+//     PDFContext is a singleton -- one PDFDocument.load per
+//     process; a second distinct context throws. Subsumes
+//     fast-dict-array. Process-phase heap traffic drops from the
+//     Map-backed baseline of ~152 MB down to ~66 MB (-57%); -28%
+//     beyond fast-dict-array (92 -> 66 MB). See "One-buffer PDFDict" in
+//     perf/notes/08-pdf-lib.md.
+//
+//     Earlier dict-shape shims (fast-dict-array, fast-dict-iter,
+//     fast-parse-dict) stay in the tree as A/B baselines but are
+//     mutually exclusive with --fast-dict-onebuf in measure.mjs.
 //   fast-parse-object -- replace PDFObjectParser.prototype.parseObject
 //     with a first-byte-dispatch version that gates the three
 //     matchKeyword (true / false / null) scans behind a byte check.
@@ -118,7 +125,7 @@ import './lib/fast-parse-number.mjs';
 import './lib/fast-decode-name.mjs';
 import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
-import './lib/fast-dict-array.mjs';
+import './lib/fast-dict-onebuf.mjs';
 import './lib/fast-parse-object.mjs';
 import './lib/fast-sync-load.mjs';
 import './lib/fast-indirect-objects.mjs';
diff --git a/perf/README.md b/perf/README.md
index 6f1db950..553a3af1 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -66,7 +66,7 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 The mirror command for CPU-profiling the pdf-lib roundtrip:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100
 ```
 
 Flag rationale:
@@ -130,22 +130,27 @@ Flag rationale:
 - `--fast-dict-array` -- inject
   [docs/lib/fast-dict-array.mjs](../docs/lib/fast-dict-array.mjs),
   replacing `PDFDict`'s backing `Map` with a flat alternating
-  `[k0, v0, k1, v1, ...]` array and patching every `PDFDict`
-  prototype method (and the parser's `parseDict`) to read it. The
-  sampling heap profile of the process phase put `new Map()` +
-  `Map.prototype.set` at ~80 MB combined (50 % of total allocations
-  on the book), 80 % of that traffic from the parser's per-dict
-  accumulator. The flat-array shape is one allocation per dict, no
-  hash-table arena; PDF dicts are tiny (typically <= 10 entries) so
-  linear lookup beats Map hashing. Subsumes `--fast-dict-iter`
-  (`sizeInBytes` / `copyBytesInto` iterate the array in place, no
-  `Map.forEach` context object) and `--fast-parse-dict` (parser's
-  hot loop accumulates into the array directly, Type-sentinel
-  dispatch is a short linear scan -- PDF convention places `/Type`
-  first, so the scan is effectively O(1) per dict). ~80 % drop in
-  `Map`+`set` heap traffic, ~20 % drop in process GC self-time,
-  ~4 % drop in process wall-clock (~48 ms / 1.18 s). Production
-  runs through it; the two old shims stay on disk as A/B baselines.
+  `[k0, v0, k1, v1, ...]` array allocated per-dict (pre-sized to 10
+  slots, the median). Was production before `--fast-dict-onebuf`
+  superseded it; kept as an A/B baseline. See "Replace PDFDict's
+  backing Map with a flat array" in
+  [notes/08-pdf-lib.md](notes/08-pdf-lib.md).
+- `--fast-dict-onebuf` -- inject
+  [docs/lib/fast-dict-onebuf.mjs](../docs/lib/fast-dict-onebuf.mjs).
+  One long-lived buffer for every committed PDFDict entry across
+  the whole document. Parser uses a small per-parser temp array as
+  a stack of recursion frames; each parseDict invocation appends
+  to temp, commits its frame to main in one contiguous append,
+  and pops temp back. PDFDicts only ever read from main, so the
+  whole instance state packs into one 53-bit Number (24-bit start
+  + 14-bit length + 1-bit owned). Owned dicts (factory-created
+  post-parse, COW results) also append to main. Mutations:
+  in-place replace for existing keys, COW (copy range to tail,
+  append new pair, update encoded range) for new keys or delete.
+  Mutually exclusive with the other dict-shape shims. ~57 %
+  cumulative heap reduction since the original Map-backed PDFDict
+  (152 -> 66 MB). Production runs through it. See
+  [notes/08-pdf-lib.md "One-buffer PDFDict"](notes/08-pdf-lib.md).
 - `--fast-parse-object` -- inject
   [docs/lib/fast-parse-object.mjs](../docs/lib/fast-parse-object.mjs),
   replacing `PDFObjectParser.prototype.parseObject` with a
@@ -237,7 +242,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?":
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-array --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512
 ```
 
 Same `--fast-*` set as the CPU command (production is the baseline
@@ -399,9 +404,10 @@ run.bat --fast-number-to-string           # skip numberToString redundant toStri
 run.bat --fast-size-in-bytes              # non-allocating ladder for xref byte-width (also ships; opt-in here for A/B)
 run.bat --fast-inflate                    # swap pako.inflate for node:zlib.inflateSync (also ships; opt-in here for A/B)
 run.bat --fast-parse-number               # direct-integer accumulator for parseRawNumber/parseRawInt (also ships; opt-in here for A/B)
-run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (Map-shape baseline; subsumed by --fast-dict-array in production)
-run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; subsumed by --fast-dict-array in production)
-run.bat --fast-dict-array                 # replace PDFDict's backing Map with a flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (also ships; opt-in here for A/B)
+run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.sizeInBytes/copyBytesInto (Map-shape baseline; production now runs --fast-dict-onebuf)
+run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; production now runs --fast-dict-onebuf)
+run.bat --fast-dict-array                 # replace PDFDict's backing Map with a per-dict flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (A/B baseline; production now runs --fast-dict-onebuf)
+run.bat --fast-dict-onebuf                # ONE long-lived buffer for all PDFDict entries + small per-parser temp (also ships; opt-in here for A/B)
 run.bat --fast-indirect-objects           # dense-array cache for PDFContext.indirectObjects (gen=0 path); mirror of --fast-refs on the value side (also ships; opt-in here for A/B)
 run.bat --fast-pdfnumber-pool             # value-keyed cache in front of PDFNumber.of; dense array for small ints, Map for the rest (also ships; opt-in here for A/B)
 run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
@@ -529,6 +535,7 @@ file documenting each:
 | `PDFRef.of` direct-construct on cache miss (skip upstream `pool.set`) | [08](notes/08-pdf-lib.md) | `PDFRef.of` off CPU top-15 (~93 ms); `set` heap 7.7 MB → 0.5 MB |
 | `PDFNumber.of` value-pool (dense int + Map fallback) | [08](notes/08-pdf-lib.md) | `parseNumberOrRef` off heap top-10; total process heap 123 MB → 107 MB (-13 %) |
 | Pre-size `parseDict` accumulator (`new Array(10)` median) | [08](notes/08-pdf-lib.md) | `fastParseDictArray` heap row -25 %; total process heap 107 MB → 92 MB (-14 %) |
+| One-buffer `PDFDict` (single mainBuf + packed 53-bit instance) | [08](notes/08-pdf-lib.md) | total process heap 92 MB → 66 MB (-28 %); cumulative -57 % since Map-backed PDFDict |
 
 What was tried and didn't ship:
 
@@ -555,4 +562,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 1d160a3b..c1a76c15 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -175,7 +175,22 @@
 // but PDF dicts are tiny (typically <= 10 entries). Subsumes
 // --fast-parse-dict and --fast-dict-iter (the parser's hot loop
 // accumulates into the array directly; sizeInBytes / copyBytesInto
-// iterate in place). Production runs through it.
+// iterate in place). Now superseded by --fast-dict-onebuf; kept as
+// an A/B baseline.
+//
+// --fast-dict-onebuf collapses the per-dict array allocation into
+// ONE long-lived mainBuf shared across every committed PDFDict
+// entry. A small per-parser temp array acts as a stack of parseDict
+// recursion frames so outer's range stays contiguous when inner
+// recurses. PDFDict instance state packs into one 53-bit Number
+// (24-bit start + 14-bit length + 1-bit owned), no per-dict array
+// header. Owned dicts (factory-created post-parse) append to main
+// and mutate in place / COW to the tail. PDFContext is a singleton
+// in our pipeline (one PDFDocument.load per process); a second
+// distinct context throws. Mutually exclusive with --fast-dict-array
+// and the other dict-shape shims. ~57 % cumulative heap reduction
+// since the original Map-backed PDFDict (152 -> 66 MB). Production
+// runs through it.
 //
 // --fast-indirect-objects replaces PDFContext.indirectObjects
 // (Map<PDFRef, PDFObject>) with a dense array indexed by
@@ -269,6 +284,7 @@ let fastSyncLoad = false;
 let fastDictArray = false;
 let fastIndirectObjects = false;
 let fastPdfnumberPool = false;
+let fastDictOnebuf = false;
 let instrumentParsedict = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
@@ -306,6 +322,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-dict-array') fastDictArray = true;
   else if (a === '--fast-indirect-objects') fastIndirectObjects = true;
   else if (a === '--fast-pdfnumber-pool') fastPdfnumberPool = true;
+  else if (a === '--fast-dict-onebuf') fastDictOnebuf = true;
   else if (a === '--instrument-parsedict') instrumentParsedict = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
@@ -353,6 +370,10 @@ if (fastDictArray && (fastParseDict || fastDictIter)) {
   console.error('--fast-dict-array subsumes --fast-parse-dict and --fast-dict-iter (Map-backed shims). Pick one shape.');
   process.exit(2);
 }
+if (fastDictOnebuf && (fastDictArray || fastParseDict || fastDictIter)) {
+  console.error('--fast-dict-onebuf subsumes the other dict-shape shims (different storage shape). Pick one.');
+  process.exit(2);
+}
 
 // Install the dense-array cache for PDFRef.of's gen=0 path before any
 // pdf-lib operation. Side-effecting import; idempotent.
@@ -408,6 +429,10 @@ if (fastPdfnumberPool) {
   await import('../docs/lib/fast-pdfnumber-pool.mjs');
   console.log('[harness] fast-pdfnumber-pool: value-keyed cache in front of PDFNumber.of');
 }
+if (fastDictOnebuf) {
+  await import('../docs/lib/fast-dict-onebuf.mjs');
+  console.log('[harness] fast-dict-onebuf: ONE long-lived buffer for all PDFDict entries + small per-parser temp');
+}
 if (instrumentParsedict) {
   await import('./instrument-parsedict.mjs');
 }
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 70ae4153..f15f7eda 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2812,6 +2812,187 @@ a single per-parser mainBuf, which folds the bufIdx field away
 and lets a tighter bit layout track the (mainBuf-relative) start
 + length directly. That's what ends up shipping.
 
+## One-buffer PDFDict
+
+After the fast-dict-double prototype, the heap picture showed
+~1 780 backing arrays in flight: 3 per-depth parser buffers,
+~1 773 owned buffers created by setOutline's factory calls (one
+per outline node), plus a few during save. Each owned buffer
+has Array-header overhead; each parser-buffer needed its own
+slot in the `_buffers` registry. And `bufIdx` in the packed
+value had to be wide enough to address all of them -- 15 bits.
+
+Using **one buffer** for every committed PDFDict entry across the
+whole document would:
+
+- drop ~1 780 Array headers to 1
+- drop `bufIdx` from the packed value entirely (always 0)
+- keep all dict data in contiguous memory (better cache behavior)
+
+This is what ships as
+[fast-dict-onebuf.mjs](../../docs/lib/fast-dict-onebuf.mjs). It
+takes the place of fast-dict-array on the production import in
+`render-book.mjs`. Earlier dict-shape shims (fast-dict-array,
+fast-dict-iter, fast-parse-dict) stay in the tree as A/B
+baselines; the harness mutex rejects combining them.
+
+### The recursion gotcha (again)
+
+A single shared buffer breaks naive parseDict recursion exactly
+like it did when the view-based prototype first hit the same
+question: inner recursion writes into the middle of outer's
+entries, breaking outer's contiguous range.
+
+The fix is a **two-area split**:
+
+- `main` -- one long-lived buffer for committed entries. Append-only.
+- `temp` -- small per-parser working area for active parseDict
+  frames. Reused across all parseDict calls on the parser.
+
+```
+parseDict invocation (at any recursion depth):
+  frameStart = temp.length
+  while (parsing) {
+    key   = parseName()
+    value = parseObject()      // may recurse; temp grows then pops
+    temp.push(key, value)       // ON TOP of anything recursion left
+  }
+  // Commit this frame to main in one contiguous append
+  start = main.length
+  for entry in temp[frameStart..temp.length]:
+    main.push(entry)
+  // Pop our frame off temp
+  temp.length = frameStart
+  return PDFDict with view (start, length)
+```
+
+Outer's entries stay parked in `temp[frameStart..]` while inner
+recurses. Inner appends ON TOP of outer, commits its frame to
+`main` in one append, and pops its frame off `temp`. Outer's
+frame is intact at the top of `temp` again; outer continues
+pushing. When outer commits, its entries are contiguous in `temp`
+and commit contiguously to `main`. Outer's and inner's ranges in
+`main` are at distinct, non-overlapping offsets.
+
+`temp` is tiny -- max recursion depth × max single-dict size = a
+couple dozen slots peak on the book.
+
+### Mutations
+
+The shared (parser-created) range is read-only after parse. The
+ownership flag in `d` distinguishes shared from owned dicts:
+
+- **`set` with existing key**: in-place replace at `main[start +
+  i + 1]`. Safe for both shared and owned; no shifts.
+- **`set` with new key, dict at main's high-water mark**: just
+  `main.push(key, value)` and extend the range by 2. Common for
+  owned dicts that have just been created and are being filled
+  with `.set` calls (the outline construction pattern).
+- **`set` with new key, dict not at high-water mark**: COW. Copy
+  the range to `main`'s tail, append new pair, update encoded
+  value. Happens when other dicts were created between this dict's
+  creation and the `.set` call.
+- **`delete`**: always COW (shifting slots in `main` would corrupt
+  other dicts that point into the affected region).
+
+For setOutline's pattern -- create outline dict, recurse to build
+children, then call `.set(Prev/Next/First/Last/Count)` on it --
+the first `.set` after the recursion COWs the dict to the tail.
+Subsequent `.set`s on the same dict extend in place. Net: ~one
+COW per outline dict, ~5 entry copies each = ~9 k pair copies
+total. Negligible.
+
+### Bit layout
+
+With `bufIdx` gone, the packed value shrinks:
+
+```
+bits  0-23: start  (24 bits, max 16 M slots in main)
+bits 24-37: length (14 bits, max 16 384 slots; max observed 8 706)
+bit  38   : owned flag
+bits 39-52: spare (14 bits)
+```
+
+39 bits used. Still above Smi range (so V8 stores `d` as a
+DoubleField or HeapNumber), but with plenty of headroom and a
+much cleaner layout.
+
+### Measured wins
+
+Heap profile (paired `--heap-profile-process --heap-sampling 512`,
+fast-dict-double baseline vs + fast-dict-onebuf):
+
+| Allocator                          | Pre (KB)  | Post (KB) | Delta             |
+|------------------------------------|----------:|----------:|------------------:|
+| `fastParseDictDouble` / `*OneBuf`  |  18 913.63 |       — (out of top 10) | **-18.9 MB**   |
+| `_makeFromView` / `_makeFromRange` |  16 429.68 |  16 613.10 | flat              |
+| PDFObjectParser.parseArray         |  19 502.52 |  19 512.08 | flat              |
+| Total sampled                      |  83.68 MB |  65.55 MB | **-18.1 MB (-22 %)** |
+
+The dominant change: `fastParseDictDouble` had 18.9 MB of self-
+attributed allocations (the 3 parser per-depth buffers' growth +
+the per-dict array creation in factory paths). With fast-dict-
+onebuf, those are gone entirely -- everything appends to `main`,
+which is allocated once.
+
+CPU profile (same paired methodology, with the wall-clock-is-noisy
+caveat):
+
+| Row                              | Pre (ms) | Post (ms) | Delta              |
+|----------------------------------|---------:|----------:|-------------------:|
+| (garbage collector)              |   128.81 |    151.05 | +22.2 ms           |
+| `fastParseDictDouble` / `*OneBuf` |    44.36 |     53.44 | +9.1 ms            |
+| Total profile duration           |   0.97 s |    1.05 s | +80 ms (~8 %, within machine noise) |
+
+GC self-time bumped up a bit. The `main` buffer is one giant
+~19 MB live object now; V8's mark phase scans it every cycle even
+though we're allocating less new garbage. Heap throughput went
+down 22 %, but live-heap mark cost went up modestly. On this
+machine wall-clock isn't a reliable signal anyway; the heap
+reduction is the headline.
+
+### Cumulative arc
+
+| State                            | Total sampled | Change vs prior |
+|----------------------------------|--------------:|----------------:|
+| Map-backed (pre-fast-dict-array) |   152 MB      | -               |
+| fast-dict-array                  |    92 MB      | -60 MB          |
+| fast-dict-view  (explored)       |    90 MB      | -2 MB           |
+| fast-dict-double (explored)      |    84 MB      | -6 MB           |
+| **fast-dict-onebuf**             |    **66 MB**  | **-18 MB**      |
+
+**-57 % cumulative reduction since the start of this PDFDict
+storage-shape work.** Staging's chain skips the two intermediate
+shims and goes from fast-dict-array straight to fast-dict-onebuf;
+the heap drop on that direct hop is 92 → 66 MB (-28 %).
+
+### Caveats
+
+- **Single context.** Same singleton-PDFContext assumption that
+  fast-dict-double introduced: throws if a second PDFContext is
+  constructed in the process. Fine for our build pipeline (one
+  `PDFDocument.load` per build); a general-purpose variant would
+  need an array + small ctxIdx field.
+- **Single 24-bit start budget.** If `main` exceeds 16 M slots
+  (8 M entries) the next pack() throws. The book's `main` peaks
+  at ~2.4 M slots; 6x headroom.
+- **COW on delete.** Always. Cheap for small dicts; could be slow
+  for huge dicts with frequent deletes. Not a pattern we see.
+- **Live `main` is bigger than the prior approach's transient
+  allocations.** GC mark phase pays for that. The tradeoff -- less
+  *allocation* (heap throughput) but slightly more *live* (mark
+  cost) -- shows in the modestly higher GC time. Profile both
+  signals when evaluating.
+
+### Shipped
+
+`docs/render-book.mjs` imports
+[`./lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs)
+in place of the prior `./lib/fast-dict-array.mjs`. fast-dict-array
+stays in the tree as an A/B baseline; the `--fast-dict-onebuf`
+mutex in `measure.mjs` rejects combining either with the other
+dict-shape shims.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -2848,7 +3029,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-indirect-objects              | ~1.1 s  | ~0.7 s | ~0.4 s |
 | + fast-refs miss bypass              | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + fast-pdfnumber-pool                | ~1.0 s  | ~0.6 s | ~0.4 s |
-| **+ parseDict pre-sized array (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
+| + parseDict pre-sized array          | ~1.0 s  | ~0.6 s | ~0.4 s |
+| **+ fast-dict-onebuf (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From f76701a481d45a412def96d60462dc4a6fc96992 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 18:49:22 +0200
Subject: [PATCH 23/44] Phase 0 viability gate: no-allocate byte walker
 prototype.

Walks the PDF grammar (indirect objects, dicts, arrays, names,
numbers, refs, strings, streams, ObjStms-with-inflate) without
instantiating any PDFObject. Counts only: 261k dicts, 2.34M dict
slots, 81k arrays, 750k ref appearances, max recursion depth 4.
The measure pass is preparation for a two-pass measure-allocate-
work architecture where mainBuf becomes a Float64Array sized
exactly to measured demand, eliminating V8's mark traversal of
its 2.4M Object-ref slots.

On perf/raw.pdf (39.3 MB Chrome output), measure pass runs in
135 ms (min of 5) vs PDFDocument.load at 1238 ms -- ratio 0.109,
~9x cheaper. Architecture cleared: even at 80% work-pass cost,
135 + 990 = 1125 ms vs current 1238 ms is a net win on CPU before
any GC reduction.

Wired:
- perf/measure.mjs --dump-raw-pdf <path>: one-time flag that saves
  Chrome's raw page.pdf() output before pdf-lib processing.
- perf/raw.pdf (gitignored): canonical 39.3 MB input for measure /
  heap-profile investigations going forward.
- perf/phase0-measure.mjs: the prototype walker. Measurement-only;
  doesn't ship in any production path.
---
 perf/.gitignore          |   1 +
 perf/README.md           |   3 +-
 perf/measure.mjs         |   9 +
 perf/notes/08-pdf-lib.md | 186 +++++++++++
 perf/phase0-measure.mjs  | 702 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 900 insertions(+), 1 deletion(-)
 create mode 100644 perf/phase0-measure.mjs

diff --git a/perf/.gitignore b/perf/.gitignore
index df01c96d..001fa9ed 100644
--- a/perf/.gitignore
+++ b/perf/.gitignore
@@ -1,3 +1,4 @@
 results/
 ab-css/
 ab-css-*/
+raw.pdf
diff --git a/perf/README.md b/perf/README.md
index 553a3af1..bbbab7ea 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -332,6 +332,7 @@ Side experiments / one-shot probes:
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `probe-parallel.mjs` | Two-shard `Promise.all` `page.pdf()` probe -- the cost-of-`pageRanges`-sharding measurement (see *`pageRanges` sharding: off the table for now* in [notes/06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md)). |
 | `probe-idle-browser.mjs` | Standalone probe: launches a headless browser and measures steady-state idle memory + sample-time, for separating render cost from browser-fixed overhead. |
+| `phase0-measure.mjs` | No-allocate byte walker over a raw PDF: recognises the grammar (indirect objects, dicts, arrays, names, numbers, refs, strings, streams, ObjStms) and produces counts only, without instantiating any PDFObject. Viability gate for the two-pass measure-allocate-work architecture that ships as `measure-pass.mjs`. Run with `node perf/phase0-measure.mjs <input.pdf> --runs N`; defaults to the most recent `perf/results/*/book.pdf`. Companion to `--dump-raw-pdf <path>` on `measure.mjs`, which captures the canonical 39 MB Chrome-output input once. |
 
 Documentation:
 
@@ -562,4 +563,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index c1a76c15..7efd65f8 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -286,6 +286,7 @@ let fastIndirectObjects = false;
 let fastPdfnumberPool = false;
 let fastDictOnebuf = false;
 let instrumentParsedict = false;
+let dumpRawPdf = null;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -324,6 +325,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-pdfnumber-pool') fastPdfnumberPool = true;
   else if (a === '--fast-dict-onebuf') fastDictOnebuf = true;
   else if (a === '--instrument-parsedict') instrumentParsedict = true;
+  else if (a === '--dump-raw-pdf') dumpRawPdf = args[++i];
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -677,6 +679,13 @@ try {
   pdfMs = Date.now() - tPdfStart;
   rawPdfBytes = rawPdf.length;
 
+  if (dumpRawPdf) {
+    const dumpPath = resolve(process.cwd(), dumpRawPdf);
+    mkdirSync(dirname(dumpPath), { recursive: true });
+    writeFileSync(dumpPath, Buffer.from(rawPdf));
+    console.log(`[harness] dumped raw Chrome PDF: ${dumpPath} (${(rawPdf.length / 1024 / 1024).toFixed(1)} MB)`);
+  }
+
   const tGenEnd = Date.now();
   generateMs = tGenEnd - tGenStart;
   console.log(`[harness] generate ${fmtMs(generateMs)}  (parseOutline=${fmtMs(parseOutlineMs)}, page.pdf=${fmtMs(pdfMs)}, ${(rawPdf.length / 1024 / 1024).toFixed(1)}MB)`);
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index f15f7eda..6ad945fb 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -2993,6 +2993,192 @@ stays in the tree as an A/B baseline; the `--fast-dict-onebuf`
 mutex in `measure.mjs` rejects combining either with the other
 dict-shape shims.
 
+## Two-pass measure-allocate-work: Phase 0 viability gate
+
+After fast-dict-onebuf, GC self-time settled at ~150 ms / 15 % of
+the process phase. V8-flag knobs (`--max-semi-space-size`,
+`--max-old-space-size`, `--no-incremental-marking`,
+`--gc-interval=-1`) didn't move it -- mark cost is dominated by
+walking the live set, not by allocation rate. The remaining
+attack surface is **shrink the live set V8 has to mark**, ideally
+by representing dict slots as Numbers (a Float64Array mainBuf)
+rather than Object references.
+
+That option needs an encoding scheme for every value type that
+can live in a dict slot. Names, refs, numbers, and nested dicts
+are already pooled or naturally Number-encodable. PDFArray,
+PDFString, and PDFHexString are not pooled today, so they'd need
+a side `Object[]` fallback -- which V8 still marks. The fallback
+would shrink mark cost in proportion to how many slots are
+pooled, but not eliminate it.
+
+The cleaner version sidesteps the encoding-headroom question
+entirely by **measuring before allocating**:
+
+1. **Measure pass** -- walk the bytes as a state machine, no
+   PDFObject instantiation. Produce only counts and small
+   interning tables (Map<name, id>, dense ref array).
+2. **Allocate pass** -- every pool sized exactly: mainBuf as
+   `Float64Array(exact_slot_count)`, name/ref/number pools as
+   exact-sized arrays, string buffer as one exact-sized
+   `Uint8Array`. No growth, no slack.
+3. **Work pass** -- re-parse, this time encoding each value as a
+   pool-index Number into mainBuf. Every pool's size is known so
+   the encoding scheme is trivial (3 bits of type tag + N bits
+   of pool index, all fitting comfortably in 53 bits). All of
+   mainBuf is Float64; V8 marks nothing in it.
+
+The catch: a second parse is more CPU. Today's load is ~1.2 s on
+the 39 MB Chrome input; if measure-pass were 600 ms we'd regress
+on CPU even if GC dropped to zero. Phase 0 is a viability gate:
+implement the no-allocate measure pass, time it, decide whether
+the architecture is worth the engineering surface.
+
+### The walker
+
+[`perf/phase0-measure.mjs`](../phase0-measure.mjs) is a
+no-allocate byte walker that recognises the PDF grammar:
+indirect-object headers, dicts (`<< ... >>`), arrays
+(`[ ... ]`), names (`/foo`), strings (`(...)`), hex strings
+(`<...>`), numbers (integer and real, with or without a leading
+integer part), refs (`X Y R`), streams (detected as `dict`
+followed by `stream` keyword), and ObjStms (detected via
+`/Type /ObjStm` and inflated to recurse).
+
+Allocation discipline:
+
+- No string concat anywhere. Names, numbers, and strings are
+  skipped by advancing the byte cursor without keeping bytes.
+- Counters are plain numeric fields; per-frame dict captures
+  live on typed-array stacks (`Int32Array`, `Uint8Array`),
+  depth-indexed to a max of 64 (observed max recursion is 4).
+- ObjStm offset arrays are reusable `Int32Array(512)` instances,
+  grown on demand. The inflate destination is a fresh Buffer
+  per ObjStm (Chrome's raw output has zero ObjStms anyway; book.pdf
+  has 453 of them after pdf-lib's save bundles them).
+- Per-dict capture stack stores `/Length`, `/Type` (matched
+  against `ObjStm`), `/N`, `/First` -- enough to detect streams
+  and seek through them without a fallback scan in the common case.
+  Key disambiguation is inline byte comparison against the four
+  known stream-related names; everything else falls through to
+  unconditional name-body skip.
+
+### Two corners worth remembering
+
+- **PDF reals can omit the integer part.** `.251` is a valid
+  number; the first cut required `>= 1` integer digit and threw
+  on `<</CA .251 ...>>` (Chrome emits `/CA` and `/ca` alpha
+  values this way). Fix: accept `[sign?][digits?][. [digits?]]?`
+  with the constraint that at least one digit (int OR frac)
+  appears. pdf-lib's `parseRawNumber` handles this natively;
+  custom byte walkers have to remember.
+- **fast-dict-onebuf is singleton-context.** A second
+  `PDFDocument.load` in the same process throws. The Phase 0
+  comparison runs measure-pass N times (independent) but the
+  pdf-lib load only once.
+
+### Measured cost
+
+Input: `perf/raw.pdf` (39.3 MB, Chrome's raw output for the book,
+saved via the new `--dump-raw-pdf` flag below).
+
+| Pass                           | Time              | Notes                                |
+|--------------------------------|------------------:|--------------------------------------|
+| Measure pass (min of 5)        |          **135 ms** | runs were 135 / 143 / 147 / 152 / 156 |
+| `PDFDocument.load` (1 run)     |         **1238 ms** | production shim set imported         |
+| **ratio measure / load**       |        **0.109**  | ~9x cheaper                          |
+
+Throughput cross-check: book.pdf is 15.3 MB but the measure pass
+inflates 23.2 MB of ObjStm content, so effective bytes walked is
+~38.5 MB. raw.pdf walks 39.3 MB. Both clock ~290 MB/sec; the
+work-per-byte is consistent across two very different physical
+layouts.
+
+### What the counts unlock
+
+Per-run summary (raw.pdf, last run):
+
+```
+  indirect objects:  226 417
+  dicts:             260 966   slots: 2 340 522   max single: 8 706
+  arrays:             81 191   slots:   495 639   max single: 25 308
+  refs (appearances):       749 779
+  names (appearances):    1 679 151
+  numbers (appearances):    284 104
+  strings (literal/hex):    7 375 / 0
+  streams:                    2 061   ~11 MB content
+  objstms:                        0
+  max recursion depth:            4
+```
+
+Direct consequences for Phase 1+:
+
+- `mainBuf` would be `Float64Array(2 340 522 + slack)` -- a hard
+  upper bound, no growth ever.
+- Array-side mainBuf would be `Float64Array(495 639 + slack)`.
+- Recursion stack peaks at 4; no need to overallocate the temp.
+- Single largest dict is 8 706 slots -- inside the 14-bit
+  (16 383) length field fast-dict-onebuf already uses. Single
+  largest array is 25 308 slots, which needs at least 15 bits.
+
+Three caveats on the counts:
+
+- **Appearance counts, not unique.** 1.68 M name appearances
+  resolve to a few thousand unique strings after interning. The
+  measure pass needs an interning Map<string, id> for names
+  (and similar for refs) to produce the *unique* pool sizes
+  needed for exact allocation. That's a Phase 1 addition --
+  cheap to add, will slightly raise measure-pass cost.
+- **Counts are physical-layout-independent.** raw.pdf has
+  226 k flat indirect objects and zero ObjStms; book.pdf has
+  2.5 k indirect objects of which 453 are ObjStms bundling 226 k
+  dicts. The *dict* count is identical (~261 k) either way.
+  This is the right invariant: pool sizing tracks the logical
+  document, not Chrome's vs pdf-lib's packing decision.
+- **Stream-length capture is fast-path-only.** When `/Length`
+  is a direct integer (the common case) we seek by it. When it's
+  a ref (`/Length 5 0 R`) we fall back to scanning for
+  `endstream`. We don't currently count fallbacks; would need to
+  add a counter if it ever looks like a non-trivial fraction.
+
+### Decision
+
+Architecture cleared. Measure-pass at ~11 % of load leaves
+plenty of headroom: even if the work pass came out at 80 % of
+current load (~990 ms) we'd land at 135 + 990 = 1 125 ms vs the
+current 1 238 ms -- net win on CPU before any GC reduction. The
+Float64Array mainBuf in the work pass should compound on top of
+that.
+
+### Wiring
+
+- **[`perf/measure.mjs`](../measure.mjs)** gains a `--dump-raw-pdf
+  <path>` flag. When set, the harness writes the raw Chrome
+  output (the input to pdf-lib's load) to the given path right
+  after `page.pdf()` returns. Used once to capture the canonical
+  input; not part of any routine run.
+- **`perf/raw.pdf`** (gitignored) is the canonical 39.3 MB
+  Chrome-output PDF, captured with the production shim set and
+  the new flag. The reference input for measure / heap-profile
+  investigations going forward.
+- **[`perf/phase0-measure.mjs`](../phase0-measure.mjs)** is the
+  prototype walker. Takes a PDF path (default: the most recent
+  `perf/results/*/book.pdf`), `--runs N` (default 3) and
+  `--no-load`. Runs the measure pass N times, then runs
+  `PDFDocument.load` once (singleton-context; skipped under
+  `--no-load`), and prints counts and the measure / load ratio.
+
+Run it via:
+
+```
+node perf/phase0-measure.mjs perf/raw.pdf --runs 5
+```
+
+The prototype is measurement-only -- it doesn't ship in any
+production path. Phase 1 (next section) wires the measure-pass
+into production by using the dict-slot count to pre-size
+fast-dict-onebuf's mainBuf in place.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
diff --git a/perf/phase0-measure.mjs b/perf/phase0-measure.mjs
new file mode 100644
index 00000000..40139458
--- /dev/null
+++ b/perf/phase0-measure.mjs
@@ -0,0 +1,702 @@
+// Phase 0 prototype: no-allocate measure pass over a PDF byte stream.
+//
+// Walks the PDF grammar as a state machine without instantiating any
+// PDFObject. Counts what would need allocating: indirect objects,
+// dicts and their slot counts, arrays and their slot counts, refs,
+// names, numbers, strings, streams (incl. ObjStms with inflate +
+// inner-object walk), max recursion depth.
+//
+// Then runs PDFDocument.load on the same bytes (with the production
+// shim set imported), so we can compare CPU cost head-to-head.
+//
+// This is a viability gate: if measure-pass is <<load (e.g. <300 ms
+// vs load's 1-2 s), the two-pass measure-then-allocate architecture
+// is worth committing to. If it's not, we revisit.
+//
+// Usage:
+//   node perf/phase0-measure.mjs [path/to/pdf] [--runs N] [--no-load]
+//
+// Defaults: --runs 3, input = most recent perf/results/*/book.pdf.
+
+import { readFileSync, readdirSync, statSync } from 'node:fs';
+import { join, resolve } from 'node:path';
+import { inflateSync } from 'node:zlib';
+import { performance } from 'node:perf_hooks';
+import { createRequire } from 'node:module';
+
+// Production-equivalent shim wiring (same order as docs/render-book.mjs).
+await import('../docs/lib/fast-refs.mjs');
+await import('../docs/lib/fast-inflate.mjs');
+await import('../docs/lib/fast-parse-number.mjs');
+await import('../docs/lib/fast-decode-name.mjs');
+await import('../docs/lib/fast-number-to-string.mjs');
+await import('../docs/lib/fast-size-in-bytes.mjs');
+await import('../docs/lib/fast-dict-onebuf.mjs');
+await import('../docs/lib/fast-parse-object.mjs');
+await import('../docs/lib/fast-sync-load.mjs');
+await import('../docs/lib/fast-indirect-objects.mjs');
+await import('../docs/lib/fast-pdfnumber-pool.mjs');
+
+const require = createRequire(import.meta.url);
+const { PDFDocument } = require('pdf-lib');
+
+// ---- Byte constants -------------------------------------------------
+
+const TAB = 9, LF = 10, FF = 12, CR = 13, SP = 32;
+const LT = 60 /* < */, GT = 62 /* > */;
+const LB = 91 /* [ */, RB = 93 /* ] */;
+const LP = 40 /* ( */, RP = 41 /* ) */;
+const SLASH = 47, PERCENT = 37, BACKSLASH = 92;
+const D0 = 48, D9 = 57;
+const MINUS = 45, PLUS = 43, DOT = 46;
+const a_ = 97, b_ = 98, d_ = 100, e_ = 101, f_ = 102, j_ = 106;
+const l_ = 108, m_ = 109, n_ = 110, o_ = 111, r_ = 114, s_ = 115;
+const t_ = 116, u_ = 117, x_ = 120;
+const R_CH = 82, L_CH = 76, T_CH = 84, N_CH = 78, F_CH = 70;
+
+// ---- Lookup tables (mirror pdf-lib's IsWhitespace / IsDelimiter / IsDigit / IsNumeric) ----
+
+const IsWS = new Uint8Array(256);
+IsWS[0] = IsWS[TAB] = IsWS[LF] = IsWS[FF] = IsWS[CR] = IsWS[SP] = 1;
+
+const IsDelim = new Uint8Array(256);
+IsDelim[LT] = IsDelim[GT] = IsDelim[LB] = IsDelim[RB] = 1;
+IsDelim[LP] = IsDelim[RP] = IsDelim[SLASH] = IsDelim[PERCENT] = 1;
+
+const IsDigit = new Uint8Array(256);
+for (let b = D0; b <= D9; b++) IsDigit[b] = 1;
+
+const IsNumeric = new Uint8Array(IsDigit);
+IsNumeric[DOT] = IsNumeric[MINUS] = IsNumeric[PLUS] = 1;
+
+// ---- Measurer -------------------------------------------------------
+
+class Measurer {
+  constructor(buf) {
+    this.buf = buf;
+    this.pos = 0;
+    this._len = buf.length;
+
+    // Counters
+    this.numIndirectObjects = 0;
+    this.numDicts = 0;
+    this.numDictSlots = 0;
+    this.numArrays = 0;
+    this.numArraySlots = 0;
+    this.numRefs = 0;
+    this.numNames = 0;
+    this.numNumbers = 0;
+    this.numStrings = 0;
+    this.numHexStrings = 0;
+    this.numStreams = 0;
+    this.numObjStms = 0;
+    this.numObjStmInnerObjects = 0;
+    this.maxDictSlots = 0;
+    this.maxArraySlots = 0;
+    this.maxRecursionDepth = 0;
+    this.totalStreamBytes = 0;
+    this.totalInflatedBytes = 0;
+
+    // Dict-frame stack: parseDict pushes a frame and leaves it for
+    // its caller to read (then pop). We track /Length, /Type=/ObjStm,
+    // /N, /First per frame for stream/ObjStm handling.
+    const MAX_DEPTH = 64;
+    this._depth = 0;
+    this._stLength  = new Int32Array(MAX_DEPTH);
+    this._stIsObjStm = new Uint8Array(MAX_DEPTH);
+    this._stN      = new Int32Array(MAX_DEPTH);
+    this._stFirst  = new Int32Array(MAX_DEPTH);
+
+    // Reusable ObjStm offset arrays (grown on demand)
+    this._objNums    = new Int32Array(512);
+    this._objOffsets = new Int32Array(512);
+  }
+
+  // ---- Skip helpers (no allocation) --------------------------------
+
+  skipWS() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b]) { p++; continue; }
+      if (b === PERCENT) {
+        while (p < len && buf[p] !== LF && buf[p] !== CR) p++;
+        continue;
+      }
+      break;
+    }
+    this.pos = p;
+  }
+
+  // Parse an integer in place. No string concat. Returns NaN if no digit.
+  // Does NOT bump numNumbers (used for metadata: header, ObjStm offsets).
+  _skipInt() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos, v = 0, sign = 1, any = 0;
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    while (p < len) {
+      const b = buf[p];
+      if (b < D0 || b > D9) break;
+      v = v * 10 + (b - D0);
+      any = 1; p++;
+    }
+    this.pos = p;
+    return any ? sign * v : NaN;
+  }
+
+  // Skip a name (already past '/'); just consume body bytes
+  _skipNameBody() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b] || IsDelim[b]) break;
+      p++;
+    }
+    this.pos = p;
+  }
+
+  // Skip a /name token, bumping numNames
+  skipName() {
+    this.pos++; // skip /
+    this._skipNameBody();
+    this.numNames++;
+  }
+
+  // Skip a literal (...) string, handling escapes
+  skipString() {
+    this.pos++; // skip (
+    const buf = this.buf, len = this._len;
+    let p = this.pos, depth = 1;
+    while (p < len && depth > 0) {
+      const b = buf[p];
+      if (b === BACKSLASH) { p += 2; continue; }
+      if (b === LP) depth++;
+      else if (b === RP) depth--;
+      p++;
+    }
+    this.pos = p;
+    this.numStrings++;
+  }
+
+  // Skip a <hex> string
+  skipHexString() {
+    this.pos++; // skip <
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len && buf[p] !== GT) p++;
+    p++; // skip >
+    this.pos = p;
+    this.numHexStrings++;
+  }
+
+  // ---- Name disambiguation (no allocation) ------------------------
+
+  // Skip /name and tag whether it matched a known stream-related key.
+  // Returns: 0=other, 1=Length, 2=Type, 3=N, 4=First
+  matchDictKey() {
+    const buf = this.buf, len = this._len;
+    this.pos++; // skip /
+    const start = this.pos;
+    let match = 0;
+
+    const b0 = buf[start];
+    if (b0 === L_CH /* L */) {
+      if (start + 6 <= len &&
+          buf[start+1] === e_ && buf[start+2] === n_ &&
+          buf[start+3] === 103 /* g */ && buf[start+4] === t_ &&
+          buf[start+5] === 104 /* h */ &&
+          (start+6 === len || IsWS[buf[start+6]] || IsDelim[buf[start+6]])) {
+        match = 1;
+        this.pos = start + 6;
+      }
+    } else if (b0 === T_CH /* T */) {
+      if (start + 4 <= len &&
+          buf[start+1] === 121 /* y */ && buf[start+2] === 112 /* p */ &&
+          buf[start+3] === e_ &&
+          (start+4 === len || IsWS[buf[start+4]] || IsDelim[buf[start+4]])) {
+        match = 2;
+        this.pos = start + 4;
+      }
+    } else if (b0 === N_CH /* N */) {
+      if (start + 1 === len || IsWS[buf[start+1]] || IsDelim[buf[start+1]]) {
+        match = 3;
+        this.pos = start + 1;
+      }
+    } else if (b0 === F_CH /* F */) {
+      if (start + 5 <= len &&
+          buf[start+1] === 105 /* i */ && buf[start+2] === r_ &&
+          buf[start+3] === s_ && buf[start+4] === t_ &&
+          (start+5 === len || IsWS[buf[start+5]] || IsDelim[buf[start+5]])) {
+        match = 4;
+        this.pos = start + 5;
+      }
+    }
+
+    if (match === 0) this._skipNameBody();
+    this.numNames++;
+    return match;
+  }
+
+  // p points just past a '/': test whether the name body there equals `name`.
+  // Does NOT move pos; the caller still has to skip the name body afterwards.
+  _isNameAt(p, name) {
+    const buf = this.buf, len = this._len;
+    const n = name.length;
+    if (p + n > len) return false;
+    for (let i = 0; i < n; i++) {
+      if (buf[p + i] !== name.charCodeAt(i)) return false;
+    }
+    if (p + n === len) return true;
+    const after = buf[p + n];
+    return !!(IsWS[after] || IsDelim[after]);
+  }
+
+  // ---- Number / Ref ------------------------------------------------
+
+  // Parse a number-or-ref token starting at pos. Bumps numNumbers or
+  // numRefs as appropriate. Returns the integer value if it was a plain
+  // integer (for /Length capture); else NaN.
+  //
+  // PDF grammar: optional sign, optional digits, optional dot, optional
+  // digits. At least one digit required somewhere. No exponentials.
+  // So '.251', '-1.5', '+5', '5.', '5' are all valid.
+  parseNumberOrRefCapture() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    let sign = 1;
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    let intDigits = 0, intVal = 0;
+    while (p < len && buf[p] >= D0 && buf[p] <= D9) {
+      intVal = intVal * 10 + (buf[p] - D0);
+      intDigits++; p++;
+    }
+    let hasDot = 0, fracDigits = 0;
+    if (p < len && buf[p] === DOT) {
+      hasDot = 1; p++;
+      while (p < len && buf[p] >= D0 && buf[p] <= D9) { fracDigits++; p++; }
+    }
+    if (intDigits === 0 && fracDigits === 0) {
+      throw new Error('expected number at ' + this.pos);
+    }
+    this.pos = p;
+    if (hasDot) {
+      this.numNumbers++;
+      return NaN;
+    }
+    // Pure integer: lookahead for ref "<sp> <int> <sp> R"
+    const save = this.pos;
+    this.skipWS();
+    if (this.pos < len && IsDigit[buf[this.pos]]) {
+      this._skipInt();
+      this.skipWS();
+      if (this.pos < len && buf[this.pos] === R_CH) {
+        this.pos++;
+        this.numRefs++;
+        return NaN;
+      }
+    }
+    this.pos = save;
+    this.numNumbers++;
+    return sign * intVal;
+  }
+
+  // ---- Object dispatch --------------------------------------------
+
+  parseObject() {
+    this.skipWS();
+    const buf = this.buf, len = this._len;
+    if (this.pos >= len) return;
+    const b = buf[this.pos];
+
+    // Keywords: true / false / null
+    if (b === t_) {
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === r_ && buf[this.pos+2] === u_ && buf[this.pos+3] === e_) {
+        this.pos += 4; return;
+      }
+    } else if (b === f_) {
+      if (this.pos + 5 <= len &&
+          buf[this.pos+1] === a_ && buf[this.pos+2] === l_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === e_) {
+        this.pos += 5; return;
+      }
+    } else if (b === n_) {
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === u_ && buf[this.pos+2] === l_ && buf[this.pos+3] === l_) {
+        this.pos += 4; return;
+      }
+    }
+
+    if (b === LT) {
+      if (buf[this.pos + 1] === LT) {
+        // Dict value: parse, then pop the frame (caller doesn't care)
+        const d = this._depth;
+        this.parseDict();
+        this._depth = d;
+        return;
+      }
+      this.skipHexString();
+      return;
+    }
+    if (b === LP) { this.skipString(); return; }
+    if (b === SLASH) { this.skipName(); return; }
+    if (b === LB) { this.parseArray(); return; }
+    if (IsNumeric[b]) { this.parseNumberOrRefCapture(); return; }
+
+    throw new Error(`parseObject: unexpected byte ${b} ('${String.fromCharCode(b)}') at ${this.pos}`);
+  }
+
+  // Parse << ... >>. Push frame on stack; do NOT decrement depth.
+  // Caller reads stack frame at index this._depth - 1 and decrements.
+  parseDict() {
+    const d = this._depth++;
+    if (d >= 64) throw new Error('dict depth overflow at ' + this.pos);
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+    this._stLength[d]  = -1;
+    this._stIsObjStm[d] = 0;
+    this._stN[d]      = -1;
+    this._stFirst[d]  = -1;
+
+    this.pos += 2; // skip <<
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    let count = 0;
+    while (this.pos < len) {
+      if (buf[this.pos] === GT && buf[this.pos + 1] === GT) break;
+      if (buf[this.pos] !== SLASH) throw new Error('expected name at ' + this.pos);
+
+      const tag = this.matchDictKey();
+      this.skipWS();
+
+      if (tag === 1 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stLength[d] = v;
+      } else if (tag === 2 && buf[this.pos] === SLASH) {
+        // /Type value -- detect /ObjStm
+        if (this._isNameAt(this.pos + 1, 'ObjStm')) this._stIsObjStm[d] = 1;
+        this.pos++; // skip /
+        this._skipNameBody();
+        this.numNames++;
+      } else if (tag === 3 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stN[d] = v;
+      } else if (tag === 4 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stFirst[d] = v;
+      } else {
+        this.parseObject();
+      }
+      this.skipWS();
+      count++;
+    }
+    this.pos += 2; // skip >>
+
+    this.numDicts++;
+    this.numDictSlots += count * 2;
+    if (count * 2 > this.maxDictSlots) this.maxDictSlots = count * 2;
+    // Don't decrement _depth here -- caller reads frame then pops.
+  }
+
+  parseArray() {
+    const d = this._depth++;
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+
+    this.pos++; // skip [
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    let count = 0;
+    while (this.pos < len && buf[this.pos] !== RB) {
+      this.parseObject();
+      this.skipWS();
+      count++;
+    }
+    this.pos++; // skip ]
+
+    this.numArrays++;
+    this.numArraySlots += count;
+    if (count > this.maxArraySlots) this.maxArraySlots = count;
+    this._depth--;
+  }
+
+  // ---- Indirect object + stream handling --------------------------
+
+  findEndStream(from) {
+    const buf = this.buf, len = this._len;
+    let p = from;
+    while (p + 9 <= len) {
+      if (buf[p] === e_ && buf[p+1] === n_ && buf[p+2] === d_ &&
+          buf[p+3] === s_ && buf[p+4] === t_ && buf[p+5] === r_ &&
+          buf[p+6] === e_ && buf[p+7] === a_ && buf[p+8] === m_) {
+        let end = p;
+        while (end > from && (buf[end-1] === LF || buf[end-1] === CR)) end--;
+        return end;
+      }
+      p++;
+    }
+    throw new Error('endstream not found from ' + from);
+  }
+
+  // Inflate an ObjStm and walk its inner objects.
+  processObjStm(start, end, N, first) {
+    const compressed = this.buf.subarray(start, end);
+    let inflated;
+    try {
+      inflated = inflateSync(compressed);
+    } catch (e) {
+      console.warn(`inflate failed at ${start}: ${e.message}`);
+      return;
+    }
+    this.totalInflatedBytes += inflated.length;
+    this.numObjStmInnerObjects += N;
+
+    // Grow offset arrays if needed
+    if (N > this._objOffsets.length) {
+      this._objOffsets = new Int32Array(N);
+      this._objNums = new Int32Array(N);
+    }
+
+    const saveBuf = this.buf, savePos = this.pos, saveLen = this._len;
+    this.buf = inflated;
+    this.pos = 0;
+    this._len = inflated.length;
+
+    // Read N (objNum, byteOffset) pairs
+    for (let i = 0; i < N; i++) {
+      this.skipWS();
+      this._objNums[i] = this._skipInt();
+      this.skipWS();
+      this._objOffsets[i] = this._skipInt();
+    }
+
+    // Walk each inner object
+    for (let i = 0; i < N; i++) {
+      this.pos = first + this._objOffsets[i];
+      const d0 = this._depth;
+      this.parseObject();
+      this._depth = d0; // safety pop
+    }
+
+    this.buf = saveBuf;
+    this.pos = savePos;
+    this._len = saveLen;
+  }
+
+  parseIndirectObject() {
+    this.skipWS();
+    this._skipInt(); // objNum
+    this.skipWS();
+    this._skipInt(); // gen
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    if (!(this.pos + 3 <= len && buf[this.pos] === o_ && buf[this.pos+1] === b_ && buf[this.pos+2] === j_)) {
+      throw new Error('expected "obj" at ' + this.pos);
+    }
+    this.pos += 3;
+    this.skipWS();
+    this.numIndirectObjects++;
+
+    // Parse the object body. If it's a dict, leave the frame on the
+    // stack so we can read /Length / /Type / /N / /First if a stream
+    // follows.
+    const frameDepth = this._depth;
+    let wasDict = false;
+    if (this.pos + 2 <= len && buf[this.pos] === LT && buf[this.pos+1] === LT) {
+      this.parseDict();
+      wasDict = true;
+    } else {
+      this.parseObject();
+    }
+    this.skipWS();
+
+    // Stream?
+    if (wasDict && this.pos + 6 <= len &&
+        buf[this.pos] === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === r_ &&
+        buf[this.pos+3] === e_ && buf[this.pos+4] === a_ && buf[this.pos+5] === m_) {
+      this.pos += 6;
+      // Optional CR/LF after 'stream'
+      if (this.pos < len && buf[this.pos] === CR) this.pos++;
+      if (this.pos < len && buf[this.pos] === LF) this.pos++;
+
+      const streamStart = this.pos;
+      const length    = this._stLength[frameDepth];
+      const isObjStm  = this._stIsObjStm[frameDepth];
+      const N         = this._stN[frameDepth];
+      const first     = this._stFirst[frameDepth];
+
+      let streamEnd;
+      if (length > 0) {
+        streamEnd = streamStart + length;
+        // Sanity: streamEnd should land near 'endstream'. If not, fallback.
+        if (streamEnd > len ||
+            !(buf[streamEnd] === LF || buf[streamEnd] === CR ||
+              buf[streamEnd] === e_ || IsWS[buf[streamEnd]])) {
+          streamEnd = this.findEndStream(streamStart);
+        }
+      } else {
+        streamEnd = this.findEndStream(streamStart);
+      }
+      this.pos = streamEnd;
+      this.totalStreamBytes += (streamEnd - streamStart);
+      this.numStreams++;
+
+      if (isObjStm && N > 0 && first > 0) {
+        this.numObjStms++;
+        this.processObjStm(streamStart, streamEnd, N, first);
+        this.pos = streamEnd; // restore (processObjStm restores too, defensive)
+      }
+
+      this.skipWS();
+      // Optional 'endstream' keyword (we already positioned past content)
+      if (this.pos + 9 <= len &&
+          buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === t_ && buf[this.pos+5] === r_ &&
+          buf[this.pos+6] === e_ && buf[this.pos+7] === a_ && buf[this.pos+8] === m_) {
+        this.pos += 9;
+      }
+      this.skipWS();
+    }
+
+    // Pop the dict frame
+    if (wasDict) this._depth = frameDepth;
+
+    // 'endobj' (lenient: tolerate missing)
+    this.skipWS();
+    if (this.pos + 6 <= len &&
+        buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+        buf[this.pos+3] === o_ && buf[this.pos+4] === b_ && buf[this.pos+5] === j_) {
+      this.pos += 6;
+    }
+  }
+
+  // ---- Top-level walk --------------------------------------------
+
+  walk() {
+    const buf = this.buf, len = this._len;
+
+    // Skip header line (%PDF-x.y), binary marker, etc.
+    // Strategy: scan forward until we see a digit followed by "<sp> <digit>+ <sp> obj"
+    // -- the first indirect-object header.
+    while (this.pos < len) {
+      this.skipWS();
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (IsDigit[b]) {
+        // Try to validate this looks like an indirect-obj header
+        const save = this.pos;
+        this._skipInt();
+        if (buf[this.pos] === SP || buf[this.pos] === TAB) {
+          this.skipWS();
+          if (IsDigit[buf[this.pos]]) {
+            this._skipInt();
+            this.skipWS();
+            if (this.pos + 3 <= len && buf[this.pos] === o_ &&
+                buf[this.pos+1] === b_ && buf[this.pos+2] === j_) {
+              this.pos = save;
+              break;
+            }
+          }
+        }
+        this.pos = save + 1;
+      } else {
+        this.pos++;
+      }
+    }
+
+    // Walk indirect objects until xref / startxref / trailer
+    while (this.pos < len) {
+      this.skipWS();
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (b === x_) break;            // xref
+      if (b === t_ && buf[this.pos+1] === r_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === 105 /* i */) break;  // trailer
+      if (b === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === r_ && buf[this.pos+4] === t_) break;  // startxref
+      if (!IsDigit[b]) break;
+      this.parseIndirectObject();
+    }
+  }
+}
+
+// ---- Main -----------------------------------------------------------
+
+function pickDefaultPdf() {
+  // Newest timestamped run with a book.pdf wins; a missing results dir means "no runs".
+  const dir = resolve('perf/results');
+  let names = [];
+  try { names = readdirSync(dir); } catch (e) { if (e.code !== 'ENOENT') throw e; }
+  const entries = names.sort()
+    .filter(d => /^\d{4}-\d{2}-\d{2}T/.test(d) && statSync(join(dir, d)).isDirectory());
+  for (let i = entries.length - 1; i >= 0; i--) {
+    try { const p = join(dir, entries[i], 'book.pdf'); statSync(p); return p; } catch (_) {}
+  }
+  throw new Error('no perf/results/*/book.pdf found; pass a path as argv[2]');
+}
+
+// CLI entry: time the measure pass (--runs N, default 3), then one PDFDocument.load unless --no-load.
+async function main() {
+  const args = process.argv.slice(2);
+  let inputPath = null;
+  let runs = 3;
+  let skipLoad = false;
+  for (let i = 0; i < args.length; i++) {
+    const a = args[i];
+    if (a === '--runs') runs = parseInt(args[++i], 10);
+    else if (a === '--no-load') skipLoad = true;
+    else if (!inputPath) inputPath = a;   // first non-flag argument is the PDF path
+  }
+  // Zero runs would leave lastM null and Math.min() at Infinity below; reject NaN / < 1 up front.
+  if (!(runs >= 1)) throw new Error('--runs expects a positive integer');
+  if (!inputPath) inputPath = pickDefaultPdf();
+  const buf = readFileSync(inputPath);
+  console.log(`input: ${inputPath}`);
+  console.log(`size:  ${(buf.length / 1024 / 1024).toFixed(2)} MB\n`);   // trailing "\n" prints the blank separator line
+
+  // Measure pass: a fresh Measurer per run; report the fastest run and the counters of the last one.
+  console.log(`--- measure pass (${runs} runs) ---`);
+  const measureTimes = [];
+  let lastM = null;
+  for (let i = 0; i < runs; i++) {
+    const m = new Measurer(buf);
+    const t0 = performance.now();
+    m.walk();
+    const ms = performance.now() - t0;
+    measureTimes.push(ms);
+    console.log(`  run ${i+1}: ${ms.toFixed(1)} ms`);
+    lastM = m;
+  }
+  const minMeasure = Math.min(...measureTimes);
+  console.log(`  min:   ${minMeasure.toFixed(1)} ms\n`);
+  console.log('counts (last run):');
+  console.log(`  indirect objects:    ${lastM.numIndirectObjects}`);
+  console.log(`  dicts:               ${lastM.numDicts}   slots: ${lastM.numDictSlots}   max: ${lastM.maxDictSlots}`);
+  console.log(`  arrays:              ${lastM.numArrays}   slots: ${lastM.numArraySlots}   max: ${lastM.maxArraySlots}`);
+  console.log(`  refs:                ${lastM.numRefs}`);
+  console.log(`  names:               ${lastM.numNames}`);
+  console.log(`  numbers:             ${lastM.numNumbers}`);
+  console.log(`  strings (literal):   ${lastM.numStrings}`);
+  console.log(`  strings (hex):       ${lastM.numHexStrings}`);
+  console.log(`  streams:             ${lastM.numStreams}   bytes: ${(lastM.totalStreamBytes/1024/1024).toFixed(2)} MB`);
+  console.log(`  objstms:             ${lastM.numObjStms}   inner objs: ${lastM.numObjStmInnerObjects}   inflated: ${(lastM.totalInflatedBytes/1024/1024).toFixed(2)} MB`);
+  console.log(`  max recursion:       ${lastM.maxRecursionDepth}\n`);
+
+  if (skipLoad) return;
+
+  // pdf-lib load (1 run only -- fast-dict-onebuf is singleton-context)
+  console.log(`--- PDFDocument.load (1 run; shim is singleton-context) ---`);
+  const t0 = performance.now();
+  await PDFDocument.load(buf);
+  const loadMs = performance.now() - t0;
+  console.log(`  load: ${loadMs.toFixed(1)} ms`);
+  console.log('');
+  console.log(`ratio measure(min)/load: ${(minMeasure / loadMs).toFixed(3)}  (lower = better)`);
+}
+
+main().catch(e => { console.error(e); process.exit(1); });

From d963c183f8b1449b403a1a9aa839d70d1571c00d Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 19:28:12 +0200
Subject: [PATCH 24/44] Measure-pass Phase 1: pre-size fast-dict-onebuf mainBuf
 in place.

docs/lib/measure-pass.mjs productionises the Phase 0 walker as a
stand-alone library exporting measure(bytes) -> counts.
fast-dict-onebuf gains setExpectedDictSlots(slots) that resizes
the module-level main backing Array to exact measured demand.
perf/measure.mjs gains --measure-pass that wires the two together
before PDFDocument.load; the flag is mutually exclusive with
--incremental and --render-only, and requires --fast-dict-onebuf.

Structural validation: identical output structure (1651 pages, 1773
outline nodes, matching titles); the saved bytes differ only by a
31-byte rawPdf-timestamp jitter.

A V8 inline-cache gotcha worth capturing: the first cut reassigned
the module binding (`let main; main = new Array(N)`) which broke
IC slots in every hot closure that read main. Heap profile showed
_appendEntries leaking 27 MB and total sampled jumping 65 -> 92 MB,
despite the resized array being identical in shape. Pre-filling
with arr.fill(null) didn't help (wasn't an element-kind issue).
Fix: keep the same Array identity, resize in place via
`main.length = N`. Heap regression collapses to +0.14 MB noise.
Lesson recorded in notes/08-pdf-lib.md: never rebind a module-level
value that hot closures specialise against, even if language
semantics allow it -- mutate in place.

Measured cost (paired, production shim set):
  measure-pass:  +60 ms (inline; 135 ms standalone Phase 0)
  load:          unchanged (within noise)
  net process:   +40 ms

Heap: flat. Phase 1 doesn't change what gets allocated, only the
initial capacity of the backing Array (which is module-load-time
cost, invisible in process-phase profiles).

Behind --measure-pass flag in the harness. NOT yet in
docs/render-book.mjs's production import chain -- no current
consumer wins anything back from the measured counts. The flag
exists so a later commit can flip it into production once the
architecture has another consumer.
---
 docs/lib/fast-dict-onebuf.mjs |  24 +-
 docs/lib/measure-pass.mjs     | 600 ++++++++++++++++++++++++++++++++++
 perf/README.md                |   3 +-
 perf/measure.mjs              |  56 +++-
 perf/notes/08-pdf-lib.md      | 168 ++++++++++
 5 files changed, 847 insertions(+), 4 deletions(-)
 create mode 100644 docs/lib/measure-pass.mjs

diff --git a/docs/lib/fast-dict-onebuf.mjs b/docs/lib/fast-dict-onebuf.mjs
index 92bfa9cf..c77384d7 100644
--- a/docs/lib/fast-dict-onebuf.mjs
+++ b/docs/lib/fast-dict-onebuf.mjs
@@ -62,11 +62,33 @@ const PageName    = PDFName.of('Page');
 
 // Pre-sized to total entries + slack measured on the book. Other
 // workloads grow it naturally (V8-amortized array growth from this
-// starting size).
+// starting size). When the measure-pass shim runs first, it calls
+// setExpectedDictSlots() before parse, which resizes `main` to exact
+// measured demand via `main.length = N`.
 const MAIN_INITIAL_CAP = 2400000;
 const main = new Array(MAIN_INITIAL_CAP);
 let mainLen = 0;
 
+// Resize `main` in place to ceil(slots * slack) entries. Throws if
+// mainLen > 0, so call it before any parseDict / withContext /
+// fromMapWithContext has appended. `slack` multiplies `slots`; default
+// 1.0 (exact). Use a small slack only if the measure pass is approximate.
+export function setExpectedDictSlots(slots, slack = 1.0) {
+  if (mainLen > 0) {
+    throw new Error(
+      `fast-dict-onebuf: setExpectedDictSlots called after parse started (mainLen=${mainLen})`,
+    );
+  }
+  const sized = Math.ceil(slots * slack);
+  // Resize in place rather than reassigning. Reassigning the module-
+  // level `main` binding invalidates V8's inline-cache slots in every
+  // closure that reads it -- the closures get deopted on first call
+  // and recompile against the new array, with a parse-time allocation
+  // spike attributed to _appendEntries (~27 MB sampled on the book).
+  // `main.length = N` keeps the same Array identity; ICs stay valid.
+  main.length = sized;
+}
+
 // ---- Bit-packing helpers --------------------------------------------
 
 const POW_24 = 16777216;          // 2^24
diff --git a/docs/lib/measure-pass.mjs b/docs/lib/measure-pass.mjs
new file mode 100644
index 00000000..293e6887
--- /dev/null
+++ b/docs/lib/measure-pass.mjs
@@ -0,0 +1,600 @@
+// No-allocate measure pass over a PDF byte stream.
+//
+// Walks the PDF grammar (indirect objects, dicts, arrays, names,
+// numbers, refs, strings, streams, ObjStms-with-inflate) without
+// instantiating any PDFObject. Produces counts that downstream
+// pre-sizing shims consume:
+//
+//   { indirectObjects, dicts, dictSlots, arrays, arraySlots,
+//     refs, names, numbers, strings, hexStrings, streams,
+//     objStms, objStmInner, maxDictSlots, maxArraySlots,
+//     maxRecursion, totalStreamBytes, totalInflatedBytes }
+//
+// Counts are *appearances*, not unique values. Phase 2+ will add
+// interning to produce unique-count tables (for exact name/ref/
+// number pool sizing).
+//
+// Allocation discipline:
+//   - No string concat. Names, numbers, strings are skipped by
+//     advancing the byte cursor without keeping bytes.
+//   - Per-dict captures (/Length, /Type, /N, /First) live on
+//     depth-indexed typed-array stacks. Max recursion observed
+//     on the book is 4; stack size 64 is plenty.
+//   - ObjStm offset arrays are reusable Int32Array(512), grown
+//     on demand. The inflate destination is a fresh Buffer per
+//     ObjStm (Chrome's raw output has zero ObjStms; book.pdf
+//     has 453 after pdf-lib's save bundles them).
+//
+// One PDF parse-corner to remember: PDF reals can omit the
+// integer part. `.251` is valid (Chrome emits it for /CA, /ca
+// alpha values). The parser accepts `[sign?][digits?]
+// [.[digits?]]?` with the constraint that at least one digit
+// appears.
+
+import { inflateSync } from 'node:zlib';
+
+// ---- Byte constants -------------------------------------------------
+
+const TAB = 9, LF = 10, FF = 12, CR = 13, SP = 32;
+const LT = 60 /* < */, GT = 62 /* > */;
+const LB = 91 /* [ */, RB = 93 /* ] */;
+const LP = 40 /* ( */, RP = 41 /* ) */;
+const SLASH = 47, PERCENT = 37, BACKSLASH = 92;
+const D0 = 48, D9 = 57;
+const MINUS = 45, PLUS = 43, DOT = 46;
+const a_ = 97, b_ = 98, d_ = 100, e_ = 101, f_ = 102, j_ = 106;
+const l_ = 108, m_ = 109, n_ = 110, o_ = 111, r_ = 114, s_ = 115;
+const t_ = 116, u_ = 117, x_ = 120;
+const R_CH = 82, L_CH = 76, T_CH = 84, N_CH = 78, F_CH = 70;
+
+// ---- Lookup tables (mirror pdf-lib's IsWhitespace / IsDelimiter / IsDigit / IsNumeric) ----
+
+const IsWS = new Uint8Array(256);
+IsWS[0] = IsWS[TAB] = IsWS[LF] = IsWS[FF] = IsWS[CR] = IsWS[SP] = 1;
+
+const IsDelim = new Uint8Array(256);
+IsDelim[LT] = IsDelim[GT] = IsDelim[LB] = IsDelim[RB] = 1;
+IsDelim[LP] = IsDelim[RP] = IsDelim[SLASH] = IsDelim[PERCENT] = 1;
+
+const IsDigit = new Uint8Array(256);
+for (let b = D0; b <= D9; b++) IsDigit[b] = 1;
+
+const IsNumeric = new Uint8Array(IsDigit);
+IsNumeric[DOT] = IsNumeric[MINUS] = IsNumeric[PLUS] = 1;
+
+// ---- Measurer -------------------------------------------------------
+
+export class Measurer {  // Byte cursor plus appearance counters over one PDF buffer; walk() drives it.
+  constructor(buf) {
+    this.buf = buf;
+    this.pos = 0;
+    this._len = buf.length;  // end of the current buffer; swapped together with buf in processObjStm
+
+    this.numIndirectObjects = 0;
+    this.numDicts = 0;
+    this.numDictSlots = 0;
+    this.numArrays = 0;
+    this.numArraySlots = 0;
+    this.numRefs = 0;
+    this.numNames = 0;
+    this.numNumbers = 0;
+    this.numStrings = 0;
+    this.numHexStrings = 0;
+    this.numStreams = 0;
+    this.numObjStms = 0;
+    this.numObjStmInnerObjects = 0;
+    this.maxDictSlots = 0;
+    this.maxArraySlots = 0;
+    this.maxRecursionDepth = 0;
+    this.totalStreamBytes = 0;
+    this.totalInflatedBytes = 0;
+
+    const MAX_DEPTH = 64;  // capacity of the capture stacks below; parseDict throws at depth 64
+    this._depth = 0;  // current dict/array nesting; indexes the _st* stacks
+    this._stLength  = new Int32Array(MAX_DEPTH);  // /Length per open dict (-1 = not captured)
+    this._stIsObjStm = new Uint8Array(MAX_DEPTH);  // 1 when the dict's /Type is /ObjStm
+    this._stN      = new Int32Array(MAX_DEPTH);  // /N (-1 = not captured)
+    this._stFirst  = new Int32Array(MAX_DEPTH);  // /First (-1 = not captured)
+
+    this._objNums    = new Int32Array(512);  // ObjStm header scratch: object numbers
+    this._objOffsets = new Int32Array(512);  // ObjStm header scratch: offsets; both regrown in processObjStm
+  }
+
+  // ---- Skip helpers (no allocation) --------------------------------
+
+  skipWS() {  // Advance pos past whitespace bytes and %-comments.
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b]) { p++; continue; }
+      if (b === PERCENT) {
+        while (p < len && buf[p] !== LF && buf[p] !== CR) p++;  // comment runs up to the next LF or CR
+        continue;
+      }
+      break;
+    }
+    this.pos = p;
+  }
+
+  // Parse an integer in place. No string concat. Returns NaN if no digit.
+  // Does NOT bump numNumbers (used for metadata: header, ObjStm offsets).
+  _skipInt() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos, v = 0, sign = 1, any = 0;
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    while (p < len) {
+      const b = buf[p];
+      if (b < D0 || b > D9) break;
+      v = v * 10 + (b - D0);
+      any = 1; p++;
+    }
+    this.pos = p;  // pos advances past a consumed sign even when no digit followed
+    return any ? sign * v : NaN;
+  }
+
+  _skipNameBody() {  // Advance pos to the next whitespace or delimiter byte (or end of buffer).
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len) {
+      const b = buf[p];
+      if (IsWS[b] || IsDelim[b]) break;
+      p++;
+    }
+    this.pos = p;
+  }
+
+  skipName() {  // Skip a /Name token at pos and count it.
+    this.pos++;  // step over '/'
+    this._skipNameBody();
+    this.numNames++;
+  }
+
+  skipString() {  // Skip a ( ... ) literal string, tracking nested parens and backslash escapes.
+    this.pos++;  // step over '('
+    const buf = this.buf, len = this._len;
+    let p = this.pos, depth = 1;
+    while (p < len && depth > 0) {
+      const b = buf[p];
+      if (b === BACKSLASH) { p += 2; continue; }  // skip the backslash and the byte after it
+      if (b === LP) depth++;
+      else if (b === RP) depth--;
+      p++;
+    }
+    this.pos = p;
+    this.numStrings++;
+  }
+
+  skipHexString() {  // Skip a < ... > hex string; pos ends just past the closing '>'.
+    this.pos++;  // step over '<'
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    while (p < len && buf[p] !== GT) p++;
+    p++;
+    this.pos = p;
+    this.numHexStrings++;
+  }
+
+  // Skip /name; tag whether it matched a known stream-related key.
+  // 0=other, 1=Length, 2=Type, 3=N, 4=First.
+  matchDictKey() {
+    const buf = this.buf, len = this._len;
+    this.pos++;  // step over '/'
+    const start = this.pos;
+    let match = 0;
+    const b0 = buf[start];
+    if (b0 === L_CH) {
+      if (start + 6 <= len &&
+          buf[start+1] === e_ && buf[start+2] === n_ &&
+          buf[start+3] === 103 /* g */ && buf[start+4] === t_ &&
+          buf[start+5] === 104 /* h */ &&
+          (start+6 === len || IsWS[buf[start+6]] || IsDelim[buf[start+6]])) {
+        match = 1; this.pos = start + 6;
+      }
+    } else if (b0 === T_CH) {
+      if (start + 4 <= len &&
+          buf[start+1] === 121 /* y */ && buf[start+2] === 112 /* p */ &&
+          buf[start+3] === e_ &&
+          (start+4 === len || IsWS[buf[start+4]] || IsDelim[buf[start+4]])) {
+        match = 2; this.pos = start + 4;
+      }
+    } else if (b0 === N_CH) {
+      if (start + 1 === len || IsWS[buf[start+1]] || IsDelim[buf[start+1]]) {
+        match = 3; this.pos = start + 1;
+      }
+    } else if (b0 === F_CH) {
+      if (start + 5 <= len &&
+          buf[start+1] === 105 /* i */ && buf[start+2] === r_ &&
+          buf[start+3] === s_ && buf[start+4] === t_ &&
+          (start+5 === len || IsWS[buf[start+5]] || IsDelim[buf[start+5]])) {
+        match = 4; this.pos = start + 5;
+      }
+    }
+    if (match === 0) this._skipNameBody();  // unmatched key: pos is still at start, skip the whole name
+    this.numNames++;
+    return match;
+  }
+
+  // After / is already skipped, check if name body equals an ASCII string.
+  // Does NOT move pos.
+  _isNameAt(p, name) {
+    const buf = this.buf, len = this._len;
+    const n = name.length;
+    if (p + n > len) return false;
+    for (let i = 0; i < n; i++) {
+      if (buf[p + i] !== name.charCodeAt(i)) return false;
+    }
+    if (p + n === len) return true;
+    const after = buf[p + n];
+    return !!(IsWS[after] || IsDelim[after]);  // must end at a token boundary, not be a prefix of a longer name
+  }
+
+  // ---- Number / Ref ------------------------------------------------
+
+  // PDF number grammar: optional sign, optional digits, optional dot,
+  // optional digits. At least one digit required somewhere. No exps.
+  // Returns the integer value for pure-integer-non-ref case (for
+  // /Length capture); else NaN.
+  parseNumberOrRefCapture() {
+    const buf = this.buf, len = this._len;
+    let p = this.pos;
+    let sign = 1;
+    if (buf[p] === MINUS) { sign = -1; p++; }
+    else if (buf[p] === PLUS) { p++; }
+    let intDigits = 0, intVal = 0;
+    while (p < len && buf[p] >= D0 && buf[p] <= D9) {
+      intVal = intVal * 10 + (buf[p] - D0);
+      intDigits++; p++;
+    }
+    let hasDot = 0, fracDigits = 0;
+    if (p < len && buf[p] === DOT) {
+      hasDot = 1; p++;
+      while (p < len && buf[p] >= D0 && buf[p] <= D9) { fracDigits++; p++; }
+    }
+    if (intDigits === 0 && fracDigits === 0) {
+      throw new Error('measure-pass: expected number at ' + this.pos);
+    }
+    this.pos = p;
+    if (hasDot) {
+      this.numNumbers++;
+      return NaN;
+    }
+    const save = this.pos;  // rewind point if what follows is not "<int> R"
+    this.skipWS();
+    if (this.pos < len && IsDigit[buf[this.pos]]) {
+      this._skipInt();
+      this.skipWS();
+      if (this.pos < len && buf[this.pos] === R_CH) {
+        this.pos++;
+        this.numRefs++;  // "int int R": counted as one ref, not as numbers
+        return NaN;
+      }
+    }
+    this.pos = save;
+    this.numNumbers++;
+    return sign * intVal;
+  }
+
+  // ---- Object dispatch --------------------------------------------
+
+  parseObject() {  // Skip one object, dispatching on its first byte; throws on an unrecognised byte.
+    this.skipWS();
+    const buf = this.buf, len = this._len;
+    if (this.pos >= len) return;
+    const b = buf[this.pos];
+
+    if (b === t_) {
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === r_ && buf[this.pos+2] === u_ && buf[this.pos+3] === e_) {
+        this.pos += 4; return;
+      }
+    } else if (b === f_) {
+      if (this.pos + 5 <= len &&
+          buf[this.pos+1] === a_ && buf[this.pos+2] === l_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === e_) {
+        this.pos += 5; return;
+      }
+    } else if (b === n_) {
+      if (this.pos + 4 <= len &&
+          buf[this.pos+1] === u_ && buf[this.pos+2] === l_ && buf[this.pos+3] === l_) {
+        this.pos += 4; return;
+      }
+    }
+
+    if (b === LT) {
+      if (buf[this.pos + 1] === LT) {
+        const d = this._depth;
+        this.parseDict();
+        this._depth = d;  // parseDict leaves its frame pushed; restore the depth here
+        return;
+      }
+      this.skipHexString();
+      return;
+    }
+    if (b === LP) { this.skipString(); return; }
+    if (b === SLASH) { this.skipName(); return; }
+    if (b === LB) { this.parseArray(); return; }
+    if (IsNumeric[b]) { this.parseNumberOrRefCapture(); return; }
+
+    throw new Error(`measure-pass: unexpected byte ${b} ('${String.fromCharCode(b)}') at ${this.pos}`);
+  }
+
+  // Parse << ... >>. Push frame on stack; do NOT decrement depth.
+  // Caller reads stack frame at index this._depth - 1 and decrements.
+  parseDict() {
+    const d = this._depth++;
+    if (d >= 64) throw new Error('measure-pass: dict depth overflow at ' + this.pos);
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+    this._stLength[d]  = -1;
+    this._stIsObjStm[d] = 0;
+    this._stN[d]      = -1;
+    this._stFirst[d]  = -1;
+
+    this.pos += 2;  // step over '<<'
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    let count = 0;
+    while (this.pos < len) {
+      if (buf[this.pos] === GT && buf[this.pos + 1] === GT) break;
+      if (buf[this.pos] !== SLASH) throw new Error('measure-pass: expected name at ' + this.pos);
+
+      const tag = this.matchDictKey();
+      this.skipWS();
+
+      if (tag === 1 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stLength[d] = v;  // NaN (real or indirect ref) leaves the -1 sentinel
+      } else if (tag === 2 && buf[this.pos] === SLASH) {
+        if (this._isNameAt(this.pos + 1, 'ObjStm')) this._stIsObjStm[d] = 1;
+        this.pos++;
+        this._skipNameBody();
+        this.numNames++;
+      } else if (tag === 3 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stN[d] = v;
+      } else if (tag === 4 && IsNumeric[buf[this.pos]]) {
+        const v = this.parseNumberOrRefCapture();
+        if (!isNaN(v)) this._stFirst[d] = v;
+      } else {
+        this.parseObject();
+      }
+      this.skipWS();
+      count++;
+    }
+    this.pos += 2;  // step over '>>'
+
+    this.numDicts++;
+    this.numDictSlots += count * 2;  // each entry takes a key slot and a value slot
+    if (count * 2 > this.maxDictSlots) this.maxDictSlots = count * 2;
+  }
+
+  parseArray() {  // Parse [ ... ]; counts one slot per element and restores depth on exit.
+    const d = this._depth++;
+    if (this._depth > this.maxRecursionDepth) this.maxRecursionDepth = this._depth;
+
+    this.pos++;  // step over '['
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    let count = 0;
+    while (this.pos < len && buf[this.pos] !== RB) {
+      this.parseObject();
+      this.skipWS();
+      count++;
+    }
+    this.pos++;  // step over ']'
+
+    this.numArrays++;
+    this.numArraySlots += count;
+    if (count > this.maxArraySlots) this.maxArraySlots = count;
+    this._depth--;
+  }
+
+  // ---- Indirect object + stream handling --------------------------
+
+  findEndStream(from) {  // Index of the first "endstream" at/after `from`, backed up over preceding LF/CR; throws if absent.
+    const buf = this.buf, len = this._len;
+    let p = from;
+    while (p + 9 <= len) {
+      if (buf[p] === e_ && buf[p+1] === n_ && buf[p+2] === d_ &&
+          buf[p+3] === s_ && buf[p+4] === t_ && buf[p+5] === r_ &&
+          buf[p+6] === e_ && buf[p+7] === a_ && buf[p+8] === m_) {
+        let end = p;
+        while (end > from && (buf[end-1] === LF || buf[end-1] === CR)) end--;
+        return end;
+      }
+      p++;
+    }
+    throw new Error('measure-pass: endstream not found from ' + from);
+  }
+
+  processObjStm(start, end, N, first) {  // Inflate buf[start, end) and walk the N objects inside; warns and returns if inflate fails.
+    const compressed = this.buf.subarray(start, end);
+    let inflated;
+    try {
+      inflated = inflateSync(compressed);
+    } catch (e) {
+      console.warn(`measure-pass: inflate failed at ${start}: ${e.message}`);
+      return;
+    }
+    this.totalInflatedBytes += inflated.length;
+    this.numObjStmInnerObjects += N;
+
+    if (N > this._objOffsets.length) {
+      this._objOffsets = new Int32Array(N);
+      this._objNums    = new Int32Array(N);
+    }
+
+    const saveBuf = this.buf, savePos = this.pos, saveLen = this._len;
+    this.buf = inflated;  // temporarily point the cursor at the inflated bytes
+    this.pos = 0;
+    this._len = inflated.length;
+
+    for (let i = 0; i < N; i++) {  // header: N pairs of "objNum offset"
+      this.skipWS();
+      this._objNums[i] = this._skipInt();
+      this.skipWS();
+      this._objOffsets[i] = this._skipInt();
+    }
+    for (let i = 0; i < N; i++) {
+      this.pos = first + this._objOffsets[i];  // offsets are applied relative to `first`
+      const d0 = this._depth;
+      this.parseObject();
+      this._depth = d0;
+    }
+
+    this.buf = saveBuf;  // restore the outer buffer and cursor
+    this.pos = savePos;
+    this._len = saveLen;
+  }
+
+  parseIndirectObject() {  // Parse "num gen obj <value> [stream ... endstream] endobj" at pos.
+    this.skipWS();
+    this._skipInt();
+    this.skipWS();
+    this._skipInt();
+    this.skipWS();
+
+    const buf = this.buf, len = this._len;
+    if (!(this.pos + 3 <= len && buf[this.pos] === o_ && buf[this.pos+1] === b_ && buf[this.pos+2] === j_)) {
+      throw new Error('measure-pass: expected "obj" at ' + this.pos);
+    }
+    this.pos += 3;
+    this.skipWS();
+    this.numIndirectObjects++;
+
+    const frameDepth = this._depth;  // stack index of the frame parseDict fills below
+    let wasDict = false;
+    if (this.pos + 2 <= len && buf[this.pos] === LT && buf[this.pos+1] === LT) {
+      this.parseDict();
+      wasDict = true;
+    } else {
+      this.parseObject();
+    }
+    this.skipWS();
+
+    if (wasDict && this.pos + 6 <= len &&
+        buf[this.pos] === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === r_ &&
+        buf[this.pos+3] === e_ && buf[this.pos+4] === a_ && buf[this.pos+5] === m_) {
+      this.pos += 6;
+      if (this.pos < len && buf[this.pos] === CR) this.pos++;  // optional CR, then optional LF, after "stream"
+      if (this.pos < len && buf[this.pos] === LF) this.pos++;
+
+      const streamStart = this.pos;
+      const length    = this._stLength[frameDepth];
+      const isObjStm  = this._stIsObjStm[frameDepth];
+      const N         = this._stN[frameDepth];
+      const first     = this._stFirst[frameDepth];
+
+      let streamEnd;
+      if (length > 0) {
+        streamEnd = streamStart + length;
+        if (streamEnd > len ||  // trust /Length only if in range and followed by whitespace or 'e'
+            !(buf[streamEnd] === LF || buf[streamEnd] === CR ||
+              buf[streamEnd] === e_ || IsWS[buf[streamEnd]])) {
+          streamEnd = this.findEndStream(streamStart);
+        }
+      } else {
+        streamEnd = this.findEndStream(streamStart);  // no usable /Length captured: scan for "endstream"
+      }
+      this.pos = streamEnd;
+      this.totalStreamBytes += (streamEnd - streamStart);
+      this.numStreams++;
+
+      if (isObjStm && N > 0 && first > 0) {
+        this.numObjStms++;
+        this.processObjStm(streamStart, streamEnd, N, first);
+        this.pos = streamEnd;
+      }
+
+      this.skipWS();
+      if (this.pos + 9 <= len &&
+          buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+          buf[this.pos+3] === s_ && buf[this.pos+4] === t_ && buf[this.pos+5] === r_ &&
+          buf[this.pos+6] === e_ && buf[this.pos+7] === a_ && buf[this.pos+8] === m_) {
+        this.pos += 9;
+      }
+      this.skipWS();
+    }
+
+    if (wasDict) this._depth = frameDepth;  // pop the frame parseDict left pushed
+
+    this.skipWS();
+    if (this.pos + 6 <= len &&  // "endobj" is consumed when present, not required
+        buf[this.pos] === e_ && buf[this.pos+1] === n_ && buf[this.pos+2] === d_ &&
+        buf[this.pos+3] === o_ && buf[this.pos+4] === b_ && buf[this.pos+5] === j_) {
+      this.pos += 6;
+    }
+  }
+
+  walk() {  // Scan to the first "int int obj" header, then parse indirect objects until a stop byte.
+    const buf = this.buf, len = this._len;
+
+    while (this.pos < len) {
+      this.skipWS();
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (IsDigit[b]) {
+        const save = this.pos;
+        this._skipInt();
+        if (buf[this.pos] === SP || buf[this.pos] === TAB) {
+          this.skipWS();
+          if (IsDigit[buf[this.pos]]) {
+            this._skipInt();
+            this.skipWS();
+            if (this.pos + 3 <= len && buf[this.pos] === o_ &&
+                buf[this.pos+1] === b_ && buf[this.pos+2] === j_) {
+              this.pos = save;  // rewind so parseIndirectObject sees the whole header
+              break;
+            }
+          }
+        }
+        this.pos = save + 1;  // not a header: resume one byte past the candidate digit
+      } else {
+        this.pos++;
+      }
+    }
+
+    while (this.pos < len) {
+      this.skipWS();
+      if (this.pos >= len) break;
+      const b = buf[this.pos];
+      if (b === x_) break;  // xref
+      if (b === t_ && buf[this.pos+1] === r_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === 105 /* i */) break;  // trailer
+      if (b === s_ && buf[this.pos+1] === t_ && buf[this.pos+2] === a_ &&
+          buf[this.pos+3] === r_ && buf[this.pos+4] === t_) break;  // startxref
+      if (!IsDigit[b]) break;
+      this.parseIndirectObject();
+    }
+  }
+}
+
+// ---- Convenience wrapper -------------------------------------------
+
+export function measure(bytes) {  // One-shot: walk `bytes` with a fresh Measurer and return its counters as a plain object.
+  const m = new Measurer(bytes);
+  m.walk();
+  return {
+    indirectObjects:    m.numIndirectObjects,
+    dicts:              m.numDicts,
+    dictSlots:          m.numDictSlots,
+    arrays:             m.numArrays,
+    arraySlots:         m.numArraySlots,
+    refs:               m.numRefs,
+    names:              m.numNames,
+    numbers:            m.numNumbers,
+    strings:            m.numStrings,
+    hexStrings:         m.numHexStrings,
+    streams:            m.numStreams,
+    objStms:            m.numObjStms,
+    objStmInner:        m.numObjStmInnerObjects,
+    maxDictSlots:       m.maxDictSlots,
+    maxArraySlots:      m.maxArraySlots,
+    maxRecursion:       m.maxRecursionDepth,
+    totalStreamBytes:   m.totalStreamBytes,
+    totalInflatedBytes: m.totalInflatedBytes,
+  };
+}
diff --git a/perf/README.md b/perf/README.md
index bbbab7ea..2b58e0f7 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -409,6 +409,7 @@ run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.siz
 run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; production now runs --fast-dict-onebuf)
 run.bat --fast-dict-array                 # replace PDFDict's backing Map with a per-dict flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (A/B baseline; production now runs --fast-dict-onebuf)
 run.bat --fast-dict-onebuf                # ONE long-lived buffer for all PDFDict entries + small per-parser temp (also ships; opt-in here for A/B)
+run.bat --measure-pass --fast-dict-onebuf # walk rawPdf with the no-allocate measure pass and pre-size --fast-dict-onebuf's mainBuf to the exact dict-slot count (Phase 1 of the two-pass architecture; mutex with --incremental and --render-only)
 run.bat --fast-indirect-objects           # dense-array cache for PDFContext.indirectObjects (gen=0 path); mirror of --fast-refs on the value side (also ships; opt-in here for A/B)
 run.bat --fast-pdfnumber-pool             # value-keyed cache in front of PDFNumber.of; dense array for small ints, Map for the rest (also ships; opt-in here for A/B)
 run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
@@ -563,4 +564,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical. |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 7efd65f8..768570a0 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -211,6 +211,17 @@
 // allocations to a few thousand cached instances. PDFNumber is
 // immutable so sharing is safe. Production runs through it.
 //
+// --measure-pass runs the no-allocate measure pass from
+// docs/lib/measure-pass.mjs against the raw Chrome PDF before
+// pdf-lib's load, and uses the measured dict-slot count to
+// pre-size fast-dict-onebuf's mainBuf to exact demand (no
+// V8-amortized growth, no slack). Phase 1 of the two-pass
+// measure-allocate-work architecture -- the win is purely the
+// plumbing landing byte-identical; Phase 2 (Float64Array mainBuf)
+// is where the GC mark cost actually drops. Requires
+// --fast-dict-onebuf (the only consumer of setExpectedDictSlots
+// so far). Adds ~60 ms (~40 ms net) to the process phase on the book.
+//
 // --fast-sync-load rips pdf-lib's parseSpeed / objectsPerTick /
 // shouldWaitForTick / waitForTick machinery out of both the load
 // path (PDFDocument.load + PDFParser.parseDocument / parseDocumentSection
@@ -287,6 +298,7 @@ let fastPdfnumberPool = false;
 let fastDictOnebuf = false;
 let instrumentParsedict = false;
 let dumpRawPdf = null;
+let measurePass = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -326,6 +338,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-dict-onebuf') fastDictOnebuf = true;
   else if (a === '--instrument-parsedict') instrumentParsedict = true;
   else if (a === '--dump-raw-pdf') dumpRawPdf = args[++i];
+  else if (a === '--measure-pass') measurePass = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -376,6 +389,18 @@ if (fastDictOnebuf && (fastDictArray || fastParseDict || fastDictIter)) {
   console.error('--fast-dict-onebuf subsumes the other dict-shape shims (different storage shape). Pick one.');
   process.exit(2);
 }
+if (measurePass && !fastDictOnebuf) {
+  console.error('--measure-pass requires --fast-dict-onebuf (the only shim that consumes setExpectedDictSlots so far).');
+  process.exit(2);
+}
+if (measurePass && incremental) {
+  console.error('--measure-pass operates on the pdf-lib load path; --incremental skips that path entirely.');
+  process.exit(2);
+}
+if (measurePass && renderOnly) {
+  console.error('--measure-pass needs the process phase; --render-only skips it.');
+  process.exit(2);
+}
 
 // Install the dense-array cache for PDFRef.of's gen=0 path before any
 // pdf-lib operation. Side-effecting import; idempotent.
@@ -439,6 +464,20 @@ if (instrumentParsedict) {
   await import('./instrument-parsedict.mjs');
 }
 
+// --measure-pass loads the measure walker and the setter; both are
+// invoked in-flight (after rawPdf is in hand, before PDFDocument.load).
+let _runMeasurePass = null;
+if (measurePass) {
+  const { measure } = await import('../docs/lib/measure-pass.mjs');
+  const { setExpectedDictSlots } = await import('../docs/lib/fast-dict-onebuf.mjs');
+  _runMeasurePass = (bytes) => {
+    const counts = measure(bytes);
+    setExpectedDictSlots(counts.dictSlots);
+    return counts;
+  };
+  console.log('[harness] measure-pass: no-allocate prelude, pre-sizes fast-dict-onebuf mainBuf to measured dict-slot count');
+}
+
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
   ? resolve(process.cwd(), outArg)
@@ -743,6 +782,15 @@ try {
     // cautious defaults (parseSpeed: Slow, objectsPerTick: 50) which
     // yield ~500 / ~1000 times per phase on the book; that's pdf-lib's
     // out-of-the-box behaviour, useful as a baseline for A/B work.
+    let measurePassMs = 0;
+    let measureCounts = null;
+    if (_runMeasurePass) {
+      const tMeasureStart = Date.now();
+      measureCounts = _runMeasurePass(rawPdf);
+      measurePassMs = Date.now() - tMeasureStart;
+      console.log(`[harness] measure-pass ${fmtMs(measurePassMs)}  (dicts=${measureCounts.dicts}, dictSlots=${measureCounts.dictSlots}, maxRecursion=${measureCounts.maxRecursion})`);
+    }
+
     const tLoadStart = Date.now();
     const pdfDoc = await PDFDocument.load(rawPdf);
     const loadMs = Date.now() - tLoadStart;
@@ -764,7 +812,8 @@ try {
     }
     const saveMs = Date.now() - tSaveStart;
 
-    processBreakdown = { loadMs, setOutlineMs, saveMs, parallelStreamCount };
+    processBreakdown = { measurePassMs, loadMs, setOutlineMs, saveMs, parallelStreamCount };
+    if (measureCounts) processBreakdown.measureCounts = measureCounts;
   }
   const tProcEnd  = Date.now();
   processMs = tProcEnd - tProcStart;
@@ -791,7 +840,10 @@ try {
     const parTag = processBreakdown.parallelStreamCount
       ? ` (parallel-deflate: ${processBreakdown.parallelStreamCount} streams)`
       : '';
-    console.log(`[harness] process  ${fmtMs(processMs)}  (load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)}${parTag})`);
+    const measureTag = processBreakdown.measurePassMs
+      ? `measure=${fmtMs(processBreakdown.measurePassMs)}, `
+      : '';
+    console.log(`[harness] process  ${fmtMs(processMs)}  (${measureTag}load=${fmtMs(processBreakdown.loadMs)}, setOutline=${fmtMs(processBreakdown.setOutlineMs)}, save=${fmtMs(processBreakdown.saveMs)}${parTag})`);
   }
   }  // end if (!renderOnly)
 
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 6ad945fb..aad3c3c1 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3179,6 +3179,174 @@ production path. Phase 1 (next section) wires the measure-pass
 into production by using the dict-slot count to pre-size
 fast-dict-onebuf's mainBuf in place.
 
+## Phase 1: pre-size mainBuf via measure-pass
+
+The narrow first step of the two-pass architecture. Productionises
+Phase 0's walker, exposes a `setExpectedDictSlots()` hook on
+fast-dict-onebuf, and wires the two together. Replaces
+`new Array(MAIN_INITIAL_CAP = 2_400_000)` with
+`new Array(measuredDictSlots)` -- exact, no slack, no V8 growth.
+
+This is plumbing, not a perf win. The mainBuf savings are
+trivial (~60 K slots of slack on a 2.34 M-slot backing store)
+and the measure pass itself costs ~60 ms inline. Net cost on
+the book is ~40 ms (the measure-pass time minus run-to-run
+noise on load). What Phase 1 buys is **landing the two-pass
+pipeline byte-identical** so a future Phase 2 (Float64Array
+mainBuf) can convert the storage type without re-doing the
+plumbing.
+
+### The shim
+
+- [`docs/lib/measure-pass.mjs`](../../docs/lib/measure-pass.mjs)
+  -- a direct port of the Phase 0 `Measurer` class as a
+  production library. Exports the class and a
+  `measure(bytes) -> counts` convenience wrapper. No
+  dependencies on any `fast-*` shim or on pdf-lib itself; it's
+  a stand-alone byte walker.
+- [`docs/lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs)
+  -- gains `setExpectedDictSlots(slots, slack = 1.0)`. Resizes
+  the module-level `main` in place via
+  `main.length = ceil(slots * slack)`. Throws if called after
+  `mainLen > 0` (i.e. after any dict has been committed). Used
+  by the measure-pass wiring; harmless to ignore.
+- [`perf/measure.mjs`](../measure.mjs) `--measure-pass` --
+  runs the walker on rawPdf, calls
+  `setExpectedDictSlots(counts.dictSlots)`, then proceeds to
+  `PDFDocument.load`. Rejected alongside `--incremental` or
+  `--render-only`; requires `--fast-dict-onebuf`.
+
+### A V8 IC-invalidation gotcha (worth the diversion)
+
+First implementation reassigned the module binding:
+
+```js
+let main = new Array(MAIN_INITIAL_CAP);  // module load
+// ...
+export function setExpectedDictSlots(slots) {
+  main = new Array(slots);                // setter
+}
+```
+
+JS closures see the current binding value -- the reassignment
+*works correctly* in the language sense, and structural validation
+passes. But the heap profile showed `_appendEntries` jumping from
+below-threshold (~430 KB) to **27 MB / 29 %** of total samples,
+with sampled heap going **65 → 92 MB (+27 MB)**.
+
+Hypothesis trail:
+- First guess: HOLEY_SMI_ELEMENTS → HOLEY_ELEMENTS transition on
+  first Object-pointer write, reallocating the ~18 MB backing
+  store. Pre-filling with `arr.fill(null)` to force the transition
+  at allocation time -- *no change*.
+- Second guess: V8's inline caches in `_appendEntries`,
+  `PDFDict.prototype.get`, etc. specialised for the original
+  `main` object (its hidden class, element kind, address).
+  Rebinding `main` to a fresh Array makes the IC slots stale;
+  every call deopts, recompiles, and accumulates allocation
+  overhead attributed to the running frame.
+
+Fix: keep the same Array identity, just resize.
+
+```js
+const main = new Array(MAIN_INITIAL_CAP);  // module load, back to const
+export function setExpectedDictSlots(slots) {
+  main.length = slots;                     // in-place resize
+}
+```
+
+That collapses the regression to noise (+0.14 MB heap, ~0 ms
+CPU). Lesson: **never rebind a module-level value that hot
+closures specialise against, even if the language semantics
+allow it.** Mutate in place.
+
+### Validation: structurally identical output
+
+Two full-pipeline runs through the production shim set, one
+with `--measure-pass` and one without. Both produce a 1 651-page,
+1 773-outline-node, "twinBASIC Documentation"-titled PDF; bytes
+differ by 31 due to Chrome's per-run rawPdf timestamps, which
+propagate through `pdfDoc.save`. Structural identity confirmed.
+
+| Field                | baseline           | with measure-pass  |
+|----------------------|--------------------|--------------------|
+| pages                | 1 651              | 1 651              |
+| outline nodes        | 1 773              | 1 773              |
+| title                | "twinBASIC Documentation" | "twinBASIC Documentation" |
+| bytes                | 16 077 319         | 16 077 288         |
+
+### Measured cost (after the in-place-resize fix)
+
+Paired runs, production shim set, on the book (39 MB rawPdf):
+
+| Phase             | Without measure-pass | With measure-pass | Delta |
+|-------------------|---------------------:|------------------:|------:|
+| measure-pass      | -                    | 60 ms             | +60   |
+| load              | 520 ms               | 500 ms            | -20   |
+| save              | 420 ms               | 420 ms            |   0   |
+| **process total** | **950 ms**           | **990 ms**        | **+40** |
+
+The 60 ms inline-measure number is faster than the 135 ms
+standalone Phase 0 number, almost certainly because rawPdf is
+still hot in CPU caches from `page.pdf()`. Standalone
+`phase0-measure.mjs` reads it cold from disk into a Buffer first.
+
+The -20 ms on load is within run-to-run noise on this machine.
+The honest summary: Phase 1 adds the cost of the measure pass
+itself (~60 ms) and not much else.
+
+### Measured heap
+
+Paired heap-profile runs (`--heap-profile-process --heap-sampling
+512`), top frames:
+
+| Frame                                | Baseline (KB) | With measure (KB) | Delta |
+|--------------------------------------|--------------:|------------------:|------:|
+| `PDFObjectParser.parseArray`         |     19 583.67 |         19 435.74 | flat  |
+| `_makeFromRange`                     |     16 510.94 |         16 657.94 | flat  |
+| `parseIndirectObjectHeader`          |     13 510.65 |         13 558.62 | flat  |
+| `fastOf`                             |      7 695.92 |          7 817.85 | flat  |
+| `parseIndirectObjectSync`            |      2 101.19 |          2 102.32 | flat  |
+| `_appendEntries` (post-fix)          |          ~430 |              ~430 | flat  |
+| **total sampled**                    |  **65.27 MB** |      **65.41 MB** | **+0.14 MB** |
+
+Flat as expected. Phase 1 doesn't change what gets allocated --
+only the initial capacity of the backing Array, which is a
+one-time module-load-time cost that the process-phase profile
+doesn't see.
+
+### Caveats
+
+- **Requires --fast-dict-onebuf.** The only shim that consumes
+  `setExpectedDictSlots` so far. The harness's argument check enforces this.
+- **Singleton context inherited.** Phase 1 doesn't loosen
+  fast-dict-onebuf's "one PDFContext per process" constraint --
+  same throw-on-second-load behaviour.
+- **Pre-sizing assumes the measure and load see the same bytes.**
+  Always true for our pipeline (rawPdf is computed once, both
+  measure and load read it). Would break if the bytes mutated
+  between measure and load -- not a pattern we have.
+- **Counts are appearances, not unique.** Phase 1 only needs
+  dict-slot count, which is an appearance count (every slot is
+  one). Any later phase 2+ pool sizing would need unique counts
+  and would add interning to the walker.
+
+### Where this lands
+
+`--measure-pass` lives behind a flag in the harness. Even at just
+~40 ms net regression, there's no current consumer of the
+measured counts that wins anything back on its own -- pre-sizing
+mainBuf exact saves nothing material in isolation. Phase 1
+commits to the wire-up shape; a separate later commit flips the
+flag into production once the architecture has another consumer.
+
+[`docs/lib/measure-pass.mjs`](../../docs/lib/measure-pass.mjs)
+ships as a library (the production home of the walker). Imported
+only when `--measure-pass` is passed. `perf/phase0-measure.mjs`
+is left alone -- it's the historical record of the viability
+gate, intentionally self-contained even though it now duplicates
+the walker.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From 6bd02fdc561e5dbe5ef70bf9d1765d54b96e241d Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 19:42:07 +0200
Subject: [PATCH 25/44] fast-dict-onebuf: drop the owned bit from the packed
 value.

The owned/shared flag at bit 38 only gated set's append path:
"extend in place at HWM only if owned." Re-reading the safety
argument shows this is over-cautious. Each parseDict commits a
contiguous frame to main and mainLen advances past it; no two
PDFDict instances share slots. So if a dict's range satisfies
start + length === mainLen, the slots past mainLen are free
regardless of how the range was created. The owned/shared
distinction the bit encoded doesn't correspond to anything the
safety check needs.

Changes:
- pack(start, length): third arg gone; no POW_38 OR-in.
- _owned and POW_38: deleted.
- _cow: collapses to one branch (was two paths that differed
  only in the owned-at-HWM early return).
- set: gate becomes just `start + length !== mainLen`.
- _makeFromRange: owned param gone.
- _ownedFromArray renamed _makeFromAppend for accuracy.
- Bit 38 is now spare; spare grows from 14 to 15 bits.

Net behavioural change: shared dicts that still abut the HWM
at first .set now extend in place instead of COWing -- ~5-10
slot copies avoided per such mutation. Tiny win in the right
direction.

Byte-identical output (1651 pages, 1773 outline nodes,
matching titles). Heap flat: 65.34 MB vs 65.27 MB baseline,
within noise. Top frames in the heap profile are structurally
the same.
---
 docs/lib/fast-dict-onebuf.mjs | 99 +++++++++++++++--------------------
 perf/README.md                |  2 +-
 perf/notes/08-pdf-lib.md      | 41 +++++++++++++++
 3 files changed, 84 insertions(+), 58 deletions(-)

diff --git a/docs/lib/fast-dict-onebuf.mjs b/docs/lib/fast-dict-onebuf.mjs
index c77384d7..7f83adce 100644
--- a/docs/lib/fast-dict-onebuf.mjs
+++ b/docs/lib/fast-dict-onebuf.mjs
@@ -7,11 +7,10 @@
 // ever read from main, so the bufIdx field disappears from the
 // packed value -- frees up bits.
 //
-// 53-bit packed Number layout (within Number.MAX_SAFE_INTEGER):
+// 38-bit packed Number layout (well within Number.MAX_SAFE_INTEGER):
 //   bits  0-23: start  (24 bits, max 16 M slots in main)
 //   bits 24-37: length (14 bits, max 16 384 slots; max observed 8 706)
-//   bit  38   : owned flag
-//   bits 39-52: spare (14 bits)
+//   bits 38-52: spare (15 bits)
 //
 // Recursion. Outer parseDict pushes entries onto temp. Calling
 // this.parseObject() to parse a value may recurse to inner
@@ -23,17 +22,20 @@
 // inner's ranges in main do not overlap; each was committed as a
 // single contiguous block at distinct points in time.
 //
-// Mutations. The shared range is read-only after parse. First
-// mutation:
-//   - set with existing key: in-place replace (safe; doesn't shift slots)
-//   - set with new key, dict at main's high-water mark: in-place push (extend the range)
+// Mutations:
+//   - set with existing key: in-place replace (safe; no shifts)
+//   - set with new key, dict at main's high-water mark: in-place
+//     push (extend the range)
 //   - set with new key, dict NOT at high-water mark: COW (copy
 //     range to main's tail, then push the new pair, update encoded
 //     value to the new range)
 //   - delete: COW (copy range minus deleted entry to tail)
-// On second+ mutations the dict is already 'owned'; same rules
-// apply but the COW step is skipped when we're at the high-water
-// mark.
+// The at-HWM check fully determines whether extending is safe;
+// each dict's range is unique to that dict (no slot sharing), so
+// extending past the dict's end at HWM never disturbs anything.
+// An earlier design tracked an owned/shared bit to gate this; it
+// was redundant -- shared dicts at HWM extend just as safely as
+// owned ones.
 //
 // Singleton PDFContext (one PDFDocument.load per process in our
 // pipeline; throws if a second distinct context appears).
@@ -92,24 +94,20 @@ export function setExpectedDictSlots(slots, slack = 1.0) {
 // ---- Bit-packing helpers --------------------------------------------
 
 const POW_24 = 16777216;          // 2^24
-const POW_38 = 274877906944;      // 2^38
 const MASK_24 = 0xFFFFFF;
 const MASK_14 = 0x3FFF;
 
 const MAX_START  = POW_24;          // exclusive
 const MAX_LENGTH = 1 << 14;         // 16384, exclusive
 
-function pack(start, length, owned) {
+function pack(start, length) {
   if (start  >= MAX_START)  throw new Error(`fast-dict-onebuf: start ${start} exceeds 24-bit budget`);
   if (length >= MAX_LENGTH) throw new Error(`fast-dict-onebuf: length ${length} exceeds 14-bit budget`);
-  return start
-    + length * POW_24
-    + (owned ? POW_38 : 0);
+  return start + length * POW_24;
 }
 
 function _start(d)  { return d & MASK_24; }
 function _length(d) { return Math.floor(d / POW_24) & MASK_14; }
-function _owned(d)  { return Math.floor(d / POW_38) & 1; }
 
 // ---- Singleton context ---------------------------------------------
 
@@ -139,38 +137,25 @@ function _appendArray(arr) {
 }
 
 // COW: copy this dict's range to main's tail, return the new packed
-// value (now owned, anchored at the new range).
+// value anchored at the new range. If we're already at the HWM,
+// nothing to copy -- return d unchanged.
 function _cow(pd) {
   const d = pd.d;
-  if (_owned(d)) {
-    // Already owned and somewhere in main. If we're at the high-water
-    // mark we can mutate in place; otherwise we need to COW (the
-    // dict was created earlier, other dicts have been appended
-    // since, so we no longer abut the tail).
-    const start = _start(d);
-    const length = _length(d);
-    if (start + length === mainLen) return d;   // at HWM
-    const newStart = mainLen;
-    for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
-    mainLen += length;
-    return pack(newStart, length, 1);
-  } else {
-    // Shared range. COW to tail.
-    const start = _start(d);
-    const length = _length(d);
-    const newStart = mainLen;
-    for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
-    mainLen += length;
-    return pack(newStart, length, 1);
-  }
+  const start = _start(d);
+  const length = _length(d);
+  if (start + length === mainLen) return d;   // at HWM, extend in place
+  const newStart = mainLen;
+  for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
+  mainLen += length;
+  return pack(newStart, length);
 }
 
 // ---- Construction ---------------------------------------------------
 
-function _makeFromRange(ProtoClass, start, length, owned, ctx) {
+function _makeFromRange(ProtoClass, start, length, ctx) {
   _registerContext(ctx);
   const pd = Object.create(ProtoClass.prototype);
-  pd.d = pack(start, length, owned ? 1 : 0);
+  pd.d = pack(start, length);
   if (ProtoClass === PDFPageLeaf) {
     pd.normalized = false;
     pd.autoNormalizeCTM = true;
@@ -178,10 +163,10 @@ function _makeFromRange(ProtoClass, start, length, owned, ctx) {
   return pd;
 }
 
-function _ownedFromArray(ProtoClass, arr, ctx) {
+function _makeFromAppend(ProtoClass, arr, ctx) {
   const start = mainLen;
   _appendArray(arr);
-  return _makeFromRange(ProtoClass, start, arr.length, true, ctx);
+  return _makeFromRange(ProtoClass, start, arr.length, ctx);
 }
 
 function mapToArray(map) {
@@ -234,14 +219,14 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
     }
     // Append: requires the dict to be at main's high-water mark, OR we COW.
     let dNow = d0;
-    if (!_owned(d0) || start0 + length0 !== mainLen) {
+    if (start0 + length0 !== mainLen) {
       dNow = _cow(this);
     }
-    // After _cow (or if we were already at HWM owned), we abut the tail.
+    // After _cow (or if we were already at HWM), we abut the tail.
     main[mainLen++] = key;
     main[mainLen++] = value;
     const start = _start(dNow);
-    this.d = pack(start, length0 + 2, 1);
+    this.d = pack(start, length0 + 2);
   };
 
   PDFDict.prototype.get = function (key, preservePDFNull) {
@@ -288,7 +273,7 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
       if (i === foundIdx || i === foundIdx + 1) continue;
       main[mainLen++] = main[start0 + i];
     }
-    this.d = pack(newStart, length0 - 2, 1);
+    this.d = pack(newStart, length0 - 2);
     return true;
   };
 
@@ -310,7 +295,7 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
     mainLen += length;
     _registerContext(context || _singletonContext);
     const c = Object.create(PDFDict.prototype);
-    c.d = pack(newStart, length, 1);
+    c.d = pack(newStart, length);
     return c;
   };
 
@@ -364,29 +349,29 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
   // ---- PDFDict factories --------------------------------------------
 
   PDFDict.withContext = function (context) {
-    return _ownedFromArray(PDFDict, [], context);
+    return _makeFromAppend(PDFDict, [], context);
   };
   PDFDict.fromMapWithContext = function (map, context) {
-    return _ownedFromArray(PDFDict, mapToArray(map), context);
+    return _makeFromAppend(PDFDict, mapToArray(map), context);
   };
 
   PDFCatalog.withContextAndPages = function (context, pages) {
-    return _ownedFromArray(
+    return _makeFromAppend(
       PDFCatalog,
       [PDFName.of('Type'), CatalogName, PagesName, pages],
       context,
     );
   };
   PDFCatalog.fromMapWithContext = function (map, context) {
-    return _ownedFromArray(PDFCatalog, mapToArray(map), context);
+    return _makeFromAppend(PDFCatalog, mapToArray(map), context);
   };
 
   PDFPageTree.fromMapWithContext = function (map, context) {
-    return _ownedFromArray(PDFPageTree, mapToArray(map), context);
+    return _makeFromAppend(PDFPageTree, mapToArray(map), context);
   };
 
   PDFPageLeaf.fromMapWithContext = function (map, context, autoNormalizeCTM) {
-    const d = _ownedFromArray(PDFPageLeaf, mapToArray(map), context);
+    const d = _makeFromAppend(PDFPageLeaf, mapToArray(map), context);
     if (autoNormalizeCTM !== undefined) d.autoNormalizeCTM = autoNormalizeCTM;
     return d;
   };
@@ -440,10 +425,10 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
     for (let i = start; i < end; i += 2) {
       if (main[i] === TypeName) { Type = main[i + 1]; break; }
     }
-    if (Type === CatalogName) return _makeFromRange(PDFCatalog,  start, frameLen, false, this.context);
-    if (Type === PagesName)   return _makeFromRange(PDFPageTree, start, frameLen, false, this.context);
-    if (Type === PageName)    return _makeFromRange(PDFPageLeaf, start, frameLen, false, this.context);
-    return _makeFromRange(PDFDict, start, frameLen, false, this.context);
+    if (Type === CatalogName) return _makeFromRange(PDFCatalog,  start, frameLen, this.context);
+    if (Type === PagesName)   return _makeFromRange(PDFPageTree, start, frameLen, this.context);
+    if (Type === PageName)    return _makeFromRange(PDFPageLeaf, start, frameLen, this.context);
+    return _makeFromRange(PDFDict, start, frameLen, this.context);
   };
 
   PDFDict.prototype.__fastDictOnebufInstalled = true;
diff --git a/perf/README.md b/perf/README.md
index 2b58e0f7..04b8aca7 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -564,4 +564,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat). |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index aad3c3c1..a270738e 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3347,6 +3347,47 @@ is left alone -- it's the historical record of the viability
 gate, intentionally self-contained even though it now duplicates
 the walker.
 
+## Dropping the owned bit (post-Phase-1 cleanup)
+
+The One-buffer PDFDict layout above carried an `owned` flag at
+bit 38, distinguishing parser-created ("shared") ranges from
+factory-created ("owned") ones. Its only behavioural effect was
+gating the `set` append path: a dict was allowed to extend in
+place at the high-water mark only if `owned`.
+
+Re-reading the safety argument: each parseDict commits a
+contiguous frame to main and mainLen advances past it. No two
+PDFDict instances share slots. So if a dict's range satisfies
+`start + length === mainLen`, nothing past mainLen is initialised
+and the slots are free to claim -- *regardless* of whether the
+range came from the parser or a factory call. The owned/shared
+distinction doesn't correspond to anything the safety check
+needs.
+
+Dropping it:
+
+- `pack(start, length)` -- third arg gone, no OR-in of `POW_38`.
+- `_owned`, `POW_38` -- deleted.
+- `_cow` -- collapses to one branch (was two paths, identical
+  except for the HWM early return).
+- `set` -- the gating condition simplifies from
+  `!_owned(d0) || start0 + length0 !== mainLen` to just
+  `start0 + length0 !== mainLen`.
+- `_makeFromRange(ProtoClass, start, length, ctx)` -- owned param
+  gone; `_ownedFromArray` renamed `_makeFromAppend` for accuracy.
+- Bit 38 is now spare; spare grows from 14 to 15 bits.
+
+Net behavioural change: shared dicts that still abut the HWM at
+first `set` now extend in place instead of COWing, saving ~5-10
+slot copies per such mutation. Tiny win, but in the right
+direction.
+
+Validated byte-identical on both the no-measure-pass path and
+the `--measure-pass` path; structural diff (1 651 pages, 1 773
+outline nodes, matching titles) holds. Heap is flat as expected
+-- this is a code simplification, not an allocation-pattern
+change.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From 4389f52a4b10abc53de33bef93117d06465483a9 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 20:15:40 +0200
Subject: [PATCH 26/44] Slot-type histogram for fast-dict-onebuf's main buffer.

Measurement-only tooling for Phase 2 design. fast-dict-onebuf
exports `main` and a `getMainLen()` getter so external consumers
can inspect the buffer. perf/instrument-slot-types.mjs walks
main[0..mainLen) after setOutline and classifies each slot by
PDFObject subtype, printing key/value counts and percentages.
perf/measure.mjs gains --instrument-slot-types that loads the
module and invokes the classifier (requires --fast-dict-onebuf;
not compatible with --incremental / --render-only).

Distribution on the book (production shim set + --measure-pass):
total slots = 2 358 630, keys = 1 179 315, values = 1 179 315.

  type           keys      key%       values    value%   total%
  -----------------------------------------------------------------
  PDFName        1179315   100.00%    493256    41.83%   70.91%
  PDFRef               0     0.00%    435217    36.90%   18.45%
  PDFNumber            0     0.00%    162325    13.76%    6.88%
  PDFArray             0     0.00%     79468     6.74%    3.37%
  PDFDict              0     0.00%      5660     0.48%    0.24%
  PDFHexString         0     0.00%      1776     0.15%    0.08%
  PDFString            0     0.00%      1601     0.14%    0.07%
  PDFBool.True         0     0.00%        12     0.00%   0.0005%
  PDFBool.False        0     0.00%         0     0.00%        0
  PDFNull              0     0.00%         0     0.00%        0

Key findings: (1) keys are 100% PDFName -- the even/odd invariant
holds. (2) Four big pools (Name, Ref, Number, Dict) cover 96.4%
of all slots; encoding them directly into the Float64 mainBuf
collapses ~96% of slot-mark traversals. (3) Side-pool fallback
for unpooled types (Array, String, HexString) is ~3.5% --
~82 800 slots that V8 would still mark, vs ~2.36M today. (4)
Nested PDFDicts as slot values are only 5 660 -- most dicts are
referenced via Ref rather than embedded. (5) Bool/Null/RawStream
in dict slots are essentially zero; tag-only encoding works.

Classification cost: 39ms (single pass over 2.36M slots).
---
 docs/lib/fast-dict-onebuf.mjs  |   6 ++
 perf/README.md                 |   3 +-
 perf/instrument-slot-types.mjs | 102 +++++++++++++++++++++++++++++++++
 perf/measure.mjs               |  31 ++++++++++
 perf/notes/08-pdf-lib.md       |  60 +++++++++++++++++++
 5 files changed, 201 insertions(+), 1 deletion(-)
 create mode 100644 perf/instrument-slot-types.mjs

diff --git a/docs/lib/fast-dict-onebuf.mjs b/docs/lib/fast-dict-onebuf.mjs
index 7f83adce..b810d3cc 100644
--- a/docs/lib/fast-dict-onebuf.mjs
+++ b/docs/lib/fast-dict-onebuf.mjs
@@ -71,6 +71,12 @@ const MAIN_INITIAL_CAP = 2400000;
 const main = new Array(MAIN_INITIAL_CAP);
 let mainLen = 0;
 
+// Exposed for measurement-only consumers (perf/instrument-*.mjs).
+// The encoded `d` values held by PDFDict instances reference main by
+// (start, length); reading the slots requires access to main itself.
+export { main };
+export function getMainLen() { return mainLen; }
+
 // Replace `main` with an exact-sized backing array. Must be called
 // before any parseDict / withContext / fromMapWithContext (i.e. while
 // mainLen is still 0). `slack` is a multiplier on `slots`; default 1.0
diff --git a/perf/README.md b/perf/README.md
index 04b8aca7..1c224c2c 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -333,6 +333,7 @@ Side experiments / one-shot probes:
 | `probe-parallel.mjs` | Two-shard `Promise.all` `page.pdf()` probe -- the cost-of-`pageRanges`-sharding measurement (see *`pageRanges` sharding: off the table for now* in [notes/06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md)). |
 | `probe-idle-browser.mjs` | Standalone probe: launches a headless browser and measures steady-state idle memory + sample-time, for separating render cost from browser-fixed overhead. |
 | `phase0-measure.mjs` | No-allocate byte walker over a raw PDF: recognises the grammar (indirect objects, dicts, arrays, names, numbers, refs, strings, streams, ObjStms) and produces counts only, without instantiating any PDFObject. Viability gate for the two-pass measure-allocate-work architecture that ships as `measure-pass.mjs`. Run with `node perf/phase0-measure.mjs <input.pdf> --runs N`; defaults to the most recent `perf/results/*/book.pdf`. Companion to `--dump-raw-pdf <path>` on `measure.mjs`, which captures the canonical 39 MB Chrome-output input once. |
+| `instrument-slot-types.mjs` | Walks `fast-dict-onebuf`'s `main` buffer after setOutline and classifies each slot by PDFObject subtype, printing key/value counts and percentages. Used to scope the Phase 2 / Phase 3 encoding work -- how many slot-marks would a Float64Array mainBuf actually eliminate, and what's the side-pool fallback rate. Invoked via `--instrument-slot-types` on `measure.mjs` (requires `--fast-dict-onebuf`; mutex with `--incremental` / `--render-only`). |
 
 Documentation:
 
@@ -564,4 +565,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`. |
diff --git a/perf/instrument-slot-types.mjs b/perf/instrument-slot-types.mjs
new file mode 100644
index 00000000..5b33b5db
--- /dev/null
+++ b/perf/instrument-slot-types.mjs
@@ -0,0 +1,102 @@
+// Slot-type instrumentation for fast-dict-onebuf's `main` buffer.
+//
+// Walks main[0..mainLen) after the process phase is "done writing"
+// (i.e. after PDFDocument.load + setOutline; save reads but doesn't
+// write) and classifies each slot by its PDFObject subtype. main's
+// invariant is even-position = key, odd-position = value (each
+// committed frame is even-length, and mainLen always advances by an
+// even amount). So the histogram is broken into key-side and
+// value-side -- keys should be 100 % PDFName; values are the mixed
+// distribution Phase 2's encoding has to handle.
+//
+// Measurement-only. Imported when --instrument-slot-types is passed
+// to perf/measure.mjs (requires --fast-dict-onebuf since main lives
+// in that shim).
+
+import { createRequire } from 'node:module';
+import { main, getMainLen } from '../docs/lib/fast-dict-onebuf.mjs';
+
+const require = createRequire(import.meta.url);
+const PDFName      = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFRef       = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFNumber    = require('pdf-lib/cjs/core/objects/PDFNumber.js').default;
+const PDFDict      = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFArray     = require('pdf-lib/cjs/core/objects/PDFArray.js').default;
+const PDFString    = require('pdf-lib/cjs/core/objects/PDFString.js').default;
+const PDFHexString = require('pdf-lib/cjs/core/objects/PDFHexString.js').default;
+const PDFBool      = require('pdf-lib/cjs/core/objects/PDFBool.js').default;
+const PDFNull      = require('pdf-lib/cjs/core/objects/PDFNull.js').default;
+const PDFRawStream = require('pdf-lib/cjs/core/objects/PDFRawStream.js').default;
+const PDFInvalid   = require('pdf-lib/cjs/core/objects/PDFInvalidObject.js').default;
+
+// Classify a single slot. Returns a string tag.
+// PDFDict subclasses (PDFCatalog/PageTree/PageLeaf) are reported as 'PDFDict'.
+// NOTE(review): upstream pdf-lib has PDFRawStream extend PDFStream, not
+// PDFDict, so checking it ahead of PDFDict looks cosmetic -- confirm.
+function classify(v) {
+  if (v === undefined)        return 'undefined';
+  if (v === null)             return 'null';
+  if (v === PDFNull)          return 'PDFNull';
+  if (v === PDFBool.True)     return 'PDFBool.True';
+  if (v === PDFBool.False)    return 'PDFBool.False';
+  if (v instanceof PDFRef)         return 'PDFRef';
+  if (v instanceof PDFName)        return 'PDFName';
+  if (v instanceof PDFNumber)      return 'PDFNumber';
+  if (v instanceof PDFRawStream)   return 'PDFRawStream';
+  if (v instanceof PDFInvalid)     return 'PDFInvalidObject';
+  if (v instanceof PDFDict)        return 'PDFDict';
+  if (v instanceof PDFArray)       return 'PDFArray';
+  if (v instanceof PDFHexString)   return 'PDFHexString';
+  if (v instanceof PDFString)      return 'PDFString';
+  if (typeof v === 'number')  return 'number(raw)';  // bare primitive, not a wrapped PDFNumber
+  if (typeof v === 'string')  return 'string(raw)';  // bare primitive, not a wrapped PDFString
+  const ctor = v && v.constructor && v.constructor.name;  // falsy when v or its constructor name is missing
+  return `OTHER(${ctor || typeof v})`;
+}
+
+// Walk main[0..getMainLen()): even indices tally as keys, odd as values. Returns {keys, values, total, keyTotal, valueTotal}.
+export function classifySlots() {
+  const mainLen = getMainLen();
+  const keys   = Object.create(null);  // prototype-less tally: classify() tag -> count
+  const values = Object.create(null);
+  let keyTotal = 0, valueTotal = 0;
+
+  for (let i = 0; i < mainLen; i++) {
+    const t = classify(main[i]);
+    if ((i & 1) === 0) {  // even index: key side
+      keys[t] = (keys[t] || 0) + 1;
+      keyTotal++;
+    } else {
+      values[t] = (values[t] || 0) + 1;
+      valueTotal++;
+    }
+  }
+  return { keys, values, total: mainLen, keyTotal, valueTotal };
+}
+
+// Pretty-print a classifySlots() result via console.log, rows sorted by combined count descending; label prefixes the summary line.
+export function printHistogram(counts, label = '') {
+  const heading = label ? `[${label}] ` : '';
+  console.log(`${heading}slot classification: total=${counts.total}  keys=${counts.keyTotal}  values=${counts.valueTotal}`);
+  console.log('');
+
+  const allTypes = new Set([...Object.keys(counts.keys), ...Object.keys(counts.values)]);
+  const rows = [...allTypes].map(t => ({
+    type:    t,
+    keys:    counts.keys[t]   || 0,
+    values:  counts.values[t] || 0,
+    total:   (counts.keys[t] || 0) + (counts.values[t] || 0),
+  })).sort((a, b) => b.total - a.total);
+
+  console.log('  type               keys       key%       values     value%     total      total%');
+  console.log('  -----------------------------------------------------------------------------------');
+  const pct = (n, d) => d ? (100 * n / d).toFixed(2) : '0.00';  // '0.00' when the denominator is 0
+  for (const r of rows) {
+    const kp = pct(r.keys, counts.keyTotal);
+    const vp = pct(r.values, counts.valueTotal);
+    const tp = pct(r.total, counts.total);
+    console.log(
+      `  ${r.type.padEnd(18)} ${r.keys.toString().padStart(8)}  ${kp.padStart(7)}%  ${r.values.toString().padStart(8)}  ${vp.padStart(7)}%  ${r.total.toString().padStart(8)}  ${tp.padStart(6)}%`
+    );
+  }
+}
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 768570a0..77475d89 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -299,6 +299,7 @@ let fastDictOnebuf = false;
 let instrumentParsedict = false;
 let dumpRawPdf = null;
 let measurePass = false;
+let instrumentSlotTypes = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -339,6 +340,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--instrument-parsedict') instrumentParsedict = true;
   else if (a === '--dump-raw-pdf') dumpRawPdf = args[++i];
   else if (a === '--measure-pass') measurePass = true;
+  else if (a === '--instrument-slot-types') instrumentSlotTypes = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -401,6 +403,14 @@ if (measurePass && renderOnly) {
   console.error('--measure-pass needs the process phase; --render-only skips it.');
   process.exit(2);
 }
+if (instrumentSlotTypes && !fastDictOnebuf) {
+  console.error('--instrument-slot-types reads fast-dict-onebuf\'s main buffer; pass --fast-dict-onebuf too.');
+  process.exit(2);
+}
+if (instrumentSlotTypes && (incremental || renderOnly)) {
+  console.error('--instrument-slot-types needs the process phase; not compatible with --incremental or --render-only.');
+  process.exit(2);
+}
 
 // Install the dense-array cache for PDFRef.of's gen=0 path before any
 // pdf-lib operation. Side-effecting import; idempotent.
@@ -478,6 +488,17 @@ if (measurePass) {
   console.log('[harness] measure-pass: no-allocate prelude, pre-sizes fast-dict-onebuf mainBuf to measured dict-slot count');
 }
 
+// --instrument-slot-types loads the slot-type classifier; called after
+// setOutline, before save.
+let _classifySlots = null;
+let _printSlotHistogram = null;
+if (instrumentSlotTypes) {
+  const m = await import('./instrument-slot-types.mjs');
+  _classifySlots = m.classifySlots;
+  _printSlotHistogram = m.printHistogram;
+  console.log('[harness] instrument-slot-types: classify main[] slots by PDFObject subtype after setOutline');
+}
+
 const stamp = new Date().toISOString().replace(/[:.]/g, '-');
 const outDir = outArg
   ? resolve(process.cwd(), outArg)
@@ -801,6 +822,16 @@ try {
     setOutline(pdfDoc, outline, false);
     const setOutlineMs = Date.now() - tSetOutlineStart;
 
+    if (_classifySlots) {
+      const tClassifyStart = Date.now();
+      const slotCounts = _classifySlots();
+      const classifyMs = Date.now() - tClassifyStart;
+      console.log(`[harness] instrument-slot-types: classify took ${classifyMs}ms`);
+      console.log('');
+      _printSlotHistogram(slotCounts, 'main after load+setOutline');
+      console.log('');
+    }
+
     const tSaveStart = Date.now();
     let parallelStreamCount = 0;
     if (parallelDeflate) {
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index a270738e..69db0721 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3388,6 +3388,66 @@ outline nodes, matching titles) holds. Heap is flat as expected
 -- this is a code simplification, not an allocation-pattern
 change.
 
+## Slot-type histogram for mainBuf
+
+The next attack surface on GC self-time -- the ~150 ms left after
+fast-dict-onebuf -- is converting `main` from `Array` (Object
+references that V8 must mark) to `Float64Array` (Number slots
+that V8 ignores during mark). That only works if every slot
+value can be encoded as a Number, or pooled into a side table
+where the marker count is small.
+
+To scope that work, [`perf/instrument-slot-types.mjs`](../instrument-slot-types.mjs)
+walks `main[0..mainLen)` after setOutline and classifies each
+slot by PDFObject subtype. The instrumentation hangs off two new
+exports on fast-dict-onebuf (the `main` Array itself and a
+`getMainLen()` getter) and runs behind a new
+`--instrument-slot-types` flag on `measure.mjs` that requires
+`--fast-dict-onebuf` and is rejected (exit 2) when combined with
+`--incremental` or `--render-only`.
+
+Distribution on the book (production shim set + `--measure-pass`,
+total slots = 2 358 630, keys = 1 179 315, values = 1 179 315):
+
+```
+type           keys      key%       values    value%   total%
+-----------------------------------------------------------------
+PDFName        1179315   100.00%    493256    41.83%   70.91%
+PDFRef               0     0.00%    435217    36.90%   18.45%
+PDFNumber            0     0.00%    162325    13.76%    6.88%
+PDFArray             0     0.00%     79468     6.74%    3.37%
+PDFDict              0     0.00%      5660     0.48%    0.24%
+PDFHexString         0     0.00%      1776     0.15%    0.08%
+PDFString            0     0.00%      1601     0.14%    0.07%
+PDFBool.True         0     0.00%        12     0.00%   0.0005%
+PDFBool.False        0     0.00%         0     0.00%        0
+PDFNull              0     0.00%         0     0.00%        0
+```
+
+Key findings:
+
+1. **Keys are 100 % PDFName** -- the even/odd invariant the
+   parser maintains holds. Encoding keys as the name's pool
+   index is unambiguous.
+2. **Four big pools (Name, Ref, Number, Dict) cover 96.4 % of
+   all slots.** Encoding them directly as Numbers in a
+   Float64 mainBuf collapses ~96 % of slot-mark traversals.
+3. **Side-pool fallback for unpooled types (Array, String,
+   HexString) is ~3.5 %** -- ~82 800 slots that V8 would
+   still mark via the side `Object[]`, vs ~2.34 M today.
+4. **Nested PDFDicts as slot values are only 5 660** -- most
+   dicts are referenced via PDFRef rather than embedded inline.
+5. **Bool / Null / RawStream in dict slots are essentially zero**
+   -- tag-only encoding (a few reserved sentinel Numbers)
+   covers them.
+
+Classification cost: 39 ms (single pass over 2.36 M slots).
+
+This shape is informative even though it doesn't itself ship a
+change. The subsequent Phase 2 / Phase 3 prototypes (next two
+sections) use these numbers to predict their wins; both turn out
+not to ship for reasons documented there.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From fc273c7d862496af6a4e3664bedc93980edef1f4 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 20:53:54 +0200
Subject: [PATCH 27/44] Phase 2: Float64Array mainBuf + encoded slots --
 explored, didn't ship.

The next architectural step from Phase 1 -- replace the Object[]
mainBuf with a Float64Array, encode every entry (key and value
alike) as a 4-bit type tag + 49-bit pool id / payload. Subsumes
fast-refs and fast-pdfnumber-pool by owning PDFRef.of and
PDFNumber.of with built-in pool-id assignment; adds new pools for
PDFArray (sequential id), PDFString and PDFHexString (value-dedup
since they're immutable). PDFDict slots encode the existing
38-bit (start, length) payload directly.

A trap worth recording: the first cut eagerly cached every
parse-created PDFDict in dictByPayload so decodeValue(TAG_DICT)
would return the same instance. That writes 261k Map entries
during parse; total heap went 65 -> 92 MB. Fix: lazy
materialization. Top-level dicts (226k) live in indirectObjects
and are never decoded via TAG_DICT; only nested dicts (~5660)
are. Caching on first access caps the Map at ~5660 entries.

Measured result on faraday: wash.
  process wall      1.16 s -> 1.18 s  (~+20 ms, noise)
  GC self-time      151 ms -> 149 ms  (~0 ms)
  heap allocation   65 MB  -> 68 MB   (+3 MB from new pool Maps)
  marked main slots 2.34 M -> 0       (architectural win, no $$)

The slot-mark-cost win is real but mainBuf wasn't the bottleneck
-- pointer-array marks are fast in V8. Encoding overhead roughly
cancels the savings.

Code dropped. Faraday kept it as opt-in foundation for Phase 3,
but Phase 3 also doesn't ship, so the dependency chain doesn't
earn its keep on staging. Design notes preserved in
perf/notes/08-pdf-lib.md as the takeaway worth keeping.
---
 perf/README.md           |   2 +-
 perf/notes/08-pdf-lib.md | 120 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/perf/README.md b/perf/README.md
index 1c224c2c..09cf7757 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -565,4 +565,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 69db0721..9947cbcc 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3448,6 +3448,126 @@ change. The subsequent Phase 2 / Phase 3 prototypes (next two
 sections) use these numbers to predict their wins; both turn out
 not to ship for reasons documented there.
 
+## Phase 2: Float64Array mainBuf + encoded slots (explored, didn't ship)
+
+The next architectural step from Phase 1. `main` becomes a
+`Float64Array`; every entry (key and value alike) is encoded as
+a 4-bit type tag + 49-bit pool id / payload packed into a single
+Float64. The hypothesis was that V8 would stop marking the 2.34 M
+Object-ref slots in `main` during GC, dropping mark-phase cost.
+
+Prototyped as `fast-dict-encoded.mjs`. Outcome: **wash.** The
+slot-mark-cost win is real (mainBuf's 2.34 M Object-ref slots →
+Float64 slots → V8 marks zero of them) but the cost wasn't large
+enough to matter -- pointer-array marks are fast in V8. The
+encoding overhead (per-slot encode at parse, per-slot decode at
+save) roughly cancels the savings; heap goes up ~3 MB from the
+new pool Maps (numberByValue, stringByValue, hexByValue,
+refGnByKey). The code was kept in faraday as opt-in (foundation
+for Phase 3) but is not pulled into staging; the design rationale
+below is the takeaway worth preserving.
+
+### Encoding scheme
+
+```
+Float64 slot (within Number.MAX_SAFE_INTEGER = 2^53 - 1):
+  bits 49-52  : type tag (4 bits, 16 possible, 11 used)
+  bits  0-48  : payload (49 bits)
+
+Tags:
+  0   PDFNull       (payload = 0)
+  1   PDFBool.False (payload = 0)
+  2   PDFBool.True  (payload = 0)
+  3   PDFName       (payload = name pool id)
+  4   PDFRef gen=0  (payload = objectNumber)
+  5   PDFRef gen!=0 (payload = side pool id)
+  6   PDFNumber     (payload = number pool id)
+  7   PDFDict       (payload = packed (start, length) -- the
+                    existing 38-bit fast-dict-onebuf encoding)
+  8   PDFArray      (payload = array pool id)
+  9   PDFString     (payload = string pool id, value-dedup)
+  10  PDFHexString  (payload = hex pool id, value-dedup)
+  11-15  reserved
+```
+
+### Pool subsumption
+
+The shim takes over three existing pools (two shims plus pdf-lib's own name pool) and adds three new ones:
+
+- `PDFRef.of` -- patched to assign `_encId` to each instance;
+  gen=0 uses `objectNumber` as id (dense `refByObjNum[]`); gen!=0
+  uses a sequential side-pool. Would subsume **`--fast-refs`**.
+- `PDFNumber.of` -- patched to assign `_encId`; value-dedup via
+  `numberByValue` Map + parallel `numberById[]`. Would subsume
+  **`--fast-pdfnumber-pool`**.
+- `PDFName.of` -- pdf-lib already pools by string; extended
+  with `_encId` assignment + `nameById[]` for decode.
+- `PDFArray`, `PDFString`, `PDFHexString` -- new pools (none
+  existed). `PDFArray` is mutable so no value-dedup, just
+  sequential id. Strings/HexStrings are immutable so dedup by
+  `value`.
+
+Mutually exclusive with `--fast-dict-onebuf`, `--fast-refs`,
+`--fast-pdfnumber-pool`, and the older dict-shape shims.
+
+### A trap worth recording: eager dictByPayload caching
+
+The first cut of `_makeFromRange` registered every parse-created
+PDFDict in a `dictByPayload` Map so `decodeValue(TAG_DICT)` would
+return the same instance. That writes 261 k Map entries during
+parse -- `set @ (no url):0` shot to **15.4 MB / 29 %** of the
+heap profile, and total sampled heap went 65 → 92 MB (+27 MB).
+
+The fix is the same kind of insight as the lazy materialization
+pattern that surfaced earlier: top-level dicts (226 k) live in
+`PDFContext.indirectObjects` and are never decoded via
+`TAG_DICT` (their entries are in main, but they themselves
+aren't slot values). Only nested dicts (~5 660) are accessed via
+`TAG_DICT` decode. Caching them lazily on first access caps
+`dictByPayload` at ~5 660 entries (~360 KB) and collapses the
+regression. Same shape of bug as the IC-invalidation gotcha in
+Phase 1: a plausible-looking eager cache landed an enormous heap
+regression that only made sense once you saw which population
+was actually being decoded vs only being written.
+
+### Mixed measured result
+
+| Metric | Phase 1 | Phase 2 | Delta |
+|---|---:|---:|---:|
+| Process wall (clean run) | 1.16 s | 1.18 s | ~+20 ms (noise) |
+| GC self-time (CPU profile) | 151 ms | 149 ms | ~0 ms |
+| GC total (`--trace-gc` full process) | 190 ms | 159 ms | -31 ms |
+| Mark-Compact events | 8 | 10 | +2 |
+| Scavenge events | 26 | 26 | 0 |
+| Heap allocation sampled | 65.4 MB | 68.5 MB | **+3 MB** |
+| Live mainBuf slots V8 marks | ~2.34 M | ~0 (Float64Array) | -100 % |
+| Structural output | byte-identical | byte-identical | -- |
+
+**Phase 2 is a wash.** The encoding overhead roughly cancels the
+mark-phase savings, and the new pool Maps cost more than the
+slot-mark reduction is worth.
+
+The first CPU profile of P2 showed +39 ms GC and +130 ms wall,
+but reruns landed it back near Phase 1. The original numbers were
+single-run noise (slow Scavenge cluster on a busy machine).
+
+### Why faraday kept it as opt-in, and why staging doesn't
+
+Two reasons faraday left it in tree:
+
+1. **Pool ID infrastructure is reusable.** Phase 3 (PDFArray
+   storage refactor) uses the same encoding scheme, same pools,
+   same `encodeValue` / `decodeValue` -- it piggybacks on
+   Phase 2 for free.
+2. **Validates the architecture.** Float64Array mainBuf works,
+   byte-identical, no correctness issues. If a future workload
+   stresses mainBuf mark cost more, Phase 2 would be ready.
+
+Phase 3 also doesn't ship (next section), so the dependency
+chain doesn't earn its keep on staging. Dropping Phase 2's code
+keeps the production import chain narrow; the design notes here
+are the part worth preserving.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From 18d3dd39298b46c8ff801b285595155d614f642c Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 21:21:53 +0200
Subject: [PATCH 28/44] Phase 3: PDFArray views into shared arrayBuf --
 explored, didn't ship.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirrors Phase 2's PDFDict shape onto PDFArray: each instance
becomes a view into a shared arrayBuf Float64Array via
this.d = packed (start, length), with the same temp-then-commit
parseArray pattern and the same 4-bit-tag + 49-bit-payload slot
encoding. Bit budget: 24 + 15 = 39 bits per array (one more
length bit than dict).

Measured result on faraday: heap win + CPU regression.

  process duration  1.09 s -> 1.45 s  (+360 ms, +33 %)
  GC self-time      149 ms -> 144 ms  (flat)
  heap sampled      65 MB  -> 58 MB   (-7.6 MB, -12 %)
  parseArray row    19.6 MB -> 0      (out of top 10)
  structural        byte-identical    (1651 pages, 1773 outline)

The heap win is real: 79k PDFArrays stop allocating backing
arrays. The CPU regression is mostly per-slot decode during save
-- PDFDict.copyBytesInto + PDFArray.copyBytesInto together
iterate ~3M slots, each calling decodeValue (10-case switch +
pool lookup). V8 doesn't inline decodeValue across the prototype
boundary; ~100 ns x 3M = ~300 ms.

Code dropped (depended on Phase 2's fast-dict-encoded.mjs, also
not on staging). Design notes preserved in
perf/notes/08-pdf-lib.md; the follow-up Phase 3β attempts to
recover most of the 300 ms by hand-inlining the common decode
cases at the hot call sites.
---
 perf/README.md           |   2 +-
 perf/notes/08-pdf-lib.md | 111 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/perf/README.md b/perf/README.md
index 09cf7757..3c5101b4 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -565,4 +565,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 9947cbcc..38f368b2 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3568,6 +3568,117 @@ chain doesn't earn its keep on staging. Dropping Phase 2's code
 keeps the production import chain narrow; the design notes here
 are the part worth preserving.
 
+## Phase 3: PDFArray storage refactor (explored, didn't ship)
+
+Phase 2's `fast-dict-encoded.mjs` grew a sibling structure for
+PDFArray. Each PDFArray instance becomes a view into a shared
+`arrayBuf` Float64Array, with `this.d` packing `(start, length)`
+-- same shape as PDFDict in Phase 2, with one more length bit
+(max single array is 25 308 elements vs max single dict 8 706
+slots). Per-instance `this.array = []` allocation goes away.
+
+Same opt-in story as Phase 2 (and same don't-ship verdict on
+staging): heap win is real but the CPU regression at save time
+dominates.
+
+### The mechanism
+
+| | PDFDict (Phase 2) | PDFArray (Phase 3) |
+|---|---|---|
+| Backing buffer | `main` Float64Array | `arrayBuf` Float64Array |
+| Per-instance | `this.d` = packed `(start, length)` | same |
+| Bit budget | 24 + 14 = 38 bits | 24 + 15 = 39 bits |
+| Slot encoding | 4-bit tag + 49-bit payload | same scheme |
+| Lazy cache | `dictByPayload` | `arrayByPayload` |
+| Parser temp | `_dictTemp` (Float64Array) | `_arrayTemp` (Float64Array) |
+| TAG_ARRAY slot | was `OFF_ARRAY + arrayId` | now `OFF_ARRAY + arr.d` |
+
+Phase 2's `_assignArrayId` and `arrayById[]` pool are gone -- the
+view-payload encoding makes them obsolete. Phase 2's encoding
+scheme for TAG_ARRAY changes from a pool-id payload to the
+direct `(start, length)` payload that mirrors TAG_DICT.
+
+### Mutation paths
+
+`PDFArray.prototype` methods rewritten:
+
+- `size` -- reads length from `this.d`
+- `push` -- extend in place at HWM, else COW (same pattern as
+  PDFDict.set's append case)
+- `get(i)` / `set(i, v)` -- decode/encode at `arrayBuf[start + i]`
+- `insert(i, v)` / `remove(i)` -- always COW (would corrupt
+  neighbouring arrays' ranges otherwise)
+- `indexOf` -- compare encoded payloads, no decode needed
+- `asArray` / `clone` / `toString` / `sizeInBytes` /
+  `copyBytesInto` -- decode each element
+
+`PDFArray.withContext` bypasses the inherited constructor's
+`this.array = []` allocation by `Object.create`-ing the
+instance and setting `this.d` directly.
+
+### parseArray patch
+
+Same temp-then-commit pattern as parseDict. Each parser instance
+gets its own `_arrayTemp` Float64Array; parseArray pushes
+encoded elements onto temp, commits the frame to `arrayBuf` in
+one contiguous `arrayBuf.set(...)`, pops temp back. Recursion
+across dicts and arrays is fine because `_dictTemp` and
+`_arrayTemp` are separate.
+
+### Measured result: heap win + CPU regression
+
+Combined Phase 2+3 vs Phase 1 baseline (paired, production set):
+
+| Metric | Phase 1 baseline | Phase 2+3 | Delta |
+|---|---:|---:|---:|
+| Heap sampled | 65.4 MB | **57.8 MB** | **-7.6 MB (-12 %)** |
+| `parseArray` self-attribution | 19.6 MB | ~0 (out of top 10) | **-19.6 MB**, replaced by arrayBuf-mediated writes |
+| `_makeFromRange` | 16.5 MB | 14.3 MB | -2.2 MB |
+| GC self-time (CPU profile) | 149 ms | 144 ms | -5 ms (flat) |
+| Process duration | 1.09 s | 1.45 s | **+360 ms (+33 %)** |
+| Structural output | byte-identical | byte-identical | -- |
+
+The heap win is what we hoped for: PDFArrays stop allocating
+per-instance `[]` backing arrays (79 k of them), and parseArray
+drops off the heap top 10 because writes go to the shared `arrayBuf`.
+
+The CPU regression is the killer. The cost comes from per-slot
+decode during save -- `PDFDict.copyBytesInto` and
+`PDFArray.copyBytesInto` together iterate ~3 M slots, calling
+`decodeValue` once per slot. `decodeValue` is a 10-case switch
+plus a pool lookup; V8 doesn't inline it across the prototype
+boundary. ~100 ns per call × 3 M = ~300 ms. GC didn't move
+much. The slot-mark savings from Float64Array `arrayBuf` are
+real, but as with Phase 2 they're small relative to total mark
+cost. V8 marks pointer arrays fast.
+
+### Why faraday kept it as opt-in, and why staging doesn't
+
+Phase 3 validates the architecture for both data structures
+(Float64Array storage works for dicts AND arrays, byte-identical,
+no correctness issues) and the heap win is real (-7.6 MB / -12 %
+is not nothing). It also sets up an obvious follow-up:
+hand-inline the common decode cases at the hot copyBytesInto /
+sizeInBytes call sites. That's Phase 3β below -- which recovers
+much of the 300 ms but the net win still doesn't justify the
+engineering surface for our pipeline, so the whole encoded
+architecture stays off staging.
+
+### Caveats / known limitations
+
+- Direct `new PDFArray(context)` (rather than the
+  `PDFArray.withContext` factory) would leave `this.d` undefined
+  and methods would misbehave. pdf-lib's parser and our
+  setOutline go through the factory, but a hypothetical caller
+  using `new` would need the factory or a defensive init guard.
+- `PDFArray.scalePDFNumbers` (in pdf-lib's PDFArray; not
+  rewritten here) goes through `get`/`set` and so would work
+  transparently via the encoded path. Not exercised in the book
+  build.
+- PDFArrays nested in PDFArrays via `TAG_ARRAY` decode lazily,
+  same pattern as nested dicts; `arrayByPayload` caps at the
+  number of distinct nested-array payloads (small).
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From e2f97568089130783dc612028ceb6342a8ebd2be Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 21:56:50 +0200
Subject: [PATCH 29/44] =?UTF-8?q?Phase=203=CE=B2:=20hand-inline=20decodeVa?=
 =?UTF-8?q?lue=20at=20save=20hot=20path=20--=20explored,=20didn't=20ship.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Phase 3 CPU regression was almost entirely per-slot decode
during save -- PDFDict.copyBytesInto + sizeInBytes +
PDFArray.copyBytesInto + sizeInBytes iterate ~3M slots, each
calling decodeValue (10-case switch + pool lookup). V8 doesn't
inline across the prototype-method boundary; ~100 ns x 3M ~=
+300 ms.

(β) hand-inlines decodeValue's switch into all four hot methods.
The switch body is copy-pasted verbatim into each loop, giving
V8 a monomorphic call site per case branch.

Measured deltas vs Phase 3 (pre-inline) on the book:
  (garbage collector)      144 ms -> 130 ms  (-14 ms win)
  PDFObjectParser.parseName 106 ms -> 70 ms  (-36 ms win, surprise)
  PDFDict.copyBytesInto      57 ms -> 49 ms  (-8 ms)
  fastParseDictEncoded       59 ms -> 63 ms  (+4 ms)
  heap sampled              57.8 MB ~  58.0 MB (flat)
  structural                byte-identical (1651 pages, 1773 outline)

parseName's drop is a downstream effect of V8 re-optimizing the
call graph once the hot copyBytesInto / sizeInBytes paths became
monomorphic per case branch.

Net of full Phase 2 + 3 + β vs P1 baseline (fast-dict-onebuf):
  heap                65.4 MB -> 58.0 MB  (-7.4 MB, -11 %)
  GC self-time        149 ms -> 130 ms   (-19 ms, -13 %)
  CPU residual        ~+200 ms across many frames (noisy)

Architectural conclusion: Float64Array encoded storage works
correctly and delivers a real heap+GC win, but the per-slot
encoding overhead exceeds the slot-mark savings. V8 marks
pointer arrays faster than assumed (~10-20 ms for 2.4M slots,
not 100+). The original Object[] polymorphic .copyBytesInto()
was actually fine; replacing it with explicit switch dispatch
helps GC and parseName but hurts dict-side hot loops.

Code dropped (depended on Phase 2 / Phase 3 architecture, also
not on staging). Notes preserved in perf/notes/08-pdf-lib.md as
the endpoint of this storage-shape exploration. The next move
on the same theme is the much narrower "one-buffer for
PDFArray" (fast-array-onebuf), which mirrors fast-dict-onebuf's
shape directly and does ship.
---
 perf/README.md           |  2 +-
 perf/notes/08-pdf-lib.md | 77 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/perf/README.md b/perf/README.md
index 3c5101b4..a11407b6 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -565,4 +565,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 38f368b2..a1d66cca 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3679,6 +3679,83 @@ architecture stays off staging.
   same pattern as nested dicts; `arrayByPayload` caps at the
   number of distinct nested-array payloads (small).
 
+## Phase 3β: hand-inline decodeValue at the save hot path (explored, didn't ship)
+
+The Phase 3 CPU regression was almost entirely per-slot decode
+during save -- `PDFDict.copyBytesInto`, `PDFDict.sizeInBytes`,
+`PDFArray.copyBytesInto`, `PDFArray.sizeInBytes` together
+iterate ~3 M slots, each calling `decodeValue` (10-case switch
++ pool lookup). V8 doesn't inline the function across the
+prototype-method boundary; ~100 ns × 3 M ≈ +300 ms.
+
+Phase 3β hand-inlines `decodeValue`'s switch into all four hot
+methods. The switch body is copy-pasted verbatim into each
+loop, giving V8 a monomorphic `.copyBytesInto` /
+`.sizeInBytes` call site per case branch.
+
+### Measured
+
+| Frame | P1 baseline | P3 (pre-inline) | **P3β** | β vs P1 |
+|---|---:|---:|---:|---:|
+| `(garbage collector)` | 149 ms | 144 ms | **130 ms** | **-19 ms (win)** |
+| `PDFObjectParser.parseName` | 87 ms | 106 ms | **70 ms** | **-17 ms (win)** |
+| `fastParseDict*` | 40 ms | 59 ms | 63 ms | +23 ms (encode at parse) |
+| `PDFDict.copyBytesInto` | 27 ms | 57 ms | **49 ms** | +22 ms |
+| `PDFDict.sizeInBytes` | (<top15) | (<top15) | 33 ms | new |
+| Heap sampled | 65.4 MB | 57.8 MB | **58.0 MB** | **-7.4 MB (win)** |
+| Structural | byte-identical | byte-identical | byte-identical | -- |
+
+The wins (GC -19 ms, parseName -17 ms) are real. parseName's
+drop is surprising but consistent across reruns -- the
+inlined switch made some call sites monomorphic that weren't
+before, and V8 re-optimized parseName as a downstream effect.
+
+The losses (encode-at-parse +23 ms, copyBytesInto +22 ms,
+sizeInBytes newly in the top 15 at 33 ms) come from the
+inlined 10-case switch itself. Each iteration in the hot
+loop pays for the tag dispatch.
+
+### Architectural conclusion (Phase 2 + 3 + β closeout)
+
+Float64Array encoded storage **does work** -- byte-identical
+output, mainBuf and arrayBuf mark cost goes to zero, ~7.4 MB
+heap saved, GC drops ~20 ms. But it doesn't pull its weight
+on this workload because:
+
+1. **V8 marks pointer arrays fast.** mainBuf's 2.34 M
+   Object[] slots cost ~10-20 ms of mark time, not the 100+ ms
+   we assumed. The slot-mark savings are real but small.
+2. **The encoding scheme adds per-slot work that exceeds the
+   savings.** Encode at parse + decode at save = ~50 ms net
+   loss in the hot loops, even with hand-inlining.
+3. **The original polymorphic `main[i].copyBytesInto()` was
+   actually fine.** V8's megamorphic IC handled it well.
+   Replacing with explicit switch + monomorphic per-case
+   dispatch *helps slightly* in GC and parseName but
+   *hurts in dict hot paths*.
+
+The work isn't wasted -- the design notes here quantify *why*
+this approach isn't the right lever, and the pool ID
+infrastructure could be reused if a future optimization needs
+cross-type instance lookup. If a future workload stresses
+mainBuf mark cost more (much larger documents, more aggressive
+GC pressure, or a different V8 version) the encoded path is a
+known-correct starting point.
+
+Production stays on:
+
+- `--fast-dict-onebuf` (Object[] mainBuf with packed view)
+- `--fast-refs`, `--fast-pdfnumber-pool` (the pool shims that
+  fast-dict-encoded would have subsumed)
+- All other shipped `--fast-*` shims unchanged
+
+The next move on the same theme is the much narrower
+"one-buffer for PDFArray" -- skip the encoded scheme entirely
+and just mirror fast-dict-onebuf's shape onto PDFArray, keeping
+the Object[] storage and inheriting the same low-overhead
+view-with-packed-payload trick. That's the fast-array-onebuf
+section below; it does ship.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From 13c3adb8e8a4bda0d963d734337a1570203e462c Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 22:18:49 +0200
Subject: [PATCH 30/44] Update the README.

---
 perf/README.md | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/perf/README.md b/perf/README.md
index a11407b6..77fa85a7 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -63,10 +63,22 @@ or pdf-lib), or to write `book.pdf` for behavioural verification.
 
 ## Profiling pdf-lib (process phase): canonical command
 
-The mirror command for CPU-profiling the pdf-lib roundtrip:
+The mirror command for CPU-profiling the pdf-lib roundtrip (run from
+`perf/`):
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
+```
+
+`--out results/<label>` is optional but recommended: omit it and the
+run lands in `results/<ISO-timestamp>/`, which is fine for one-off
+captures but awkward to refer to later. For A/B work, label both
+sides (`results/pre-foo`, `results/post-foo`).
+
+Then read the bottom-up table:
+
+```
+node analyze-profile.mjs results/<label>/process.cpuprofile --top 15
 ```
 
 Flag rationale:
@@ -239,10 +251,19 @@ no-profile A/B as a sanity check.
 
 The companion command for the **sampling heap profile** of the
 process phase -- "where is pdf-lib allocating bytes?" rather than
-"where is it spending cycles?":
+"where is it spending cycles?" (run from `perf/`):
+
+```
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
+```
+
+Same `--out` / labelling note as the CPU command above: omit it for a
+timestamped folder, pass it for a stable name.
+
+Then read the bottom-up table:
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512
+node analyze-heap-profile.mjs results/<label>/process.heapprofile --top 15
 ```
 
 Same `--fast-*` set as the CPU command (production is the baseline

From 6f7b1ad51aa30c0cf382060e5e4a82b841e6fbc2 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 23:18:23 +0200
Subject: [PATCH 31/44] Enable Phase 1 measure-pass in production.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wires docs/lib/measure-pass.mjs in front of PDFDocument.load in
docs/render-book.mjs. The no-allocate walker counts dictSlots once,
fast-dict-onebuf pre-sizes its main Array to the exact count
(main.length = N), and V8 growth resizes during load go away.

Net wall-clock on the book is ~+40 ms (walker ~60 ms, load saves
~20). That's the smallest of the four Phases evaluated and the
only one whose tradeoff is acceptable to ship: Phase 2 is a
regression, Phase 3 / 3β recover most of it but only for a ~7 MB
heap win that doesn't justify the CPU cost. Phase 1's bound is
mostly insurance -- mainBuf was over-allocating by ~60 K slots out
of 2.4 M -- but it lays the plumbing for any future shape change
to ship without re-doing the wiring.

Also adds --measure-pass to both canonical commands in
perf/README.md and a flag-rationale entry parallel to
fast-dict-onebuf's.
---
 docs/render-book.mjs     | 21 ++++++++++++++++++++-
 perf/README.md           | 21 ++++++++++++++++++---
 perf/notes/08-pdf-lib.md | 30 ++++++++++++++++++------------
 3 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index b99a6b07..40b4a1d5 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -119,17 +119,31 @@ import { PDFDocument } from 'pdf-lib';
 //     ~15 MB of PDFNumber allocations to ~0.8 MB. Total process-phase
 //     heap traffic drops ~13 % (123 MB -> 107 MB). PDFNumber is
 //     immutable so sharing is safe.
+//   measure-pass (Phase 1) -- no-allocate byte walker
+//     (docs/lib/measure-pass.mjs) that runs in front of
+//     PDFDocument.load on the raw Chrome PDF and counts dictSlots.
+//     The count drives setExpectedDictSlots() on fast-dict-onebuf,
+//     which pre-sizes the module-level main Array to the exact
+//     slot count (no V8 growth resizes during load). Net wall-clock
+//     is ~+40 ms on the book (walker costs ~60 ms; load saves ~20).
+//     The bound on mainBuf isn't material on its own (~60 K slots
+//     out of 2.4 M) but commits the two-pass shape. Phase 2/3/3β
+//     (Float64Array mainBuf + encoded slots) were explored and
+//     didn't ship -- per-slot encode/decode cost exceeded the
+//     mark-phase savings. See "Phase 1: pre-size mainBuf via
+//     measure-pass" in perf/notes/08-pdf-lib.md.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
 import './lib/fast-decode-name.mjs';
 import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
-import './lib/fast-dict-onebuf.mjs';
+import { setExpectedDictSlots }     from './lib/fast-dict-onebuf.mjs';
 import './lib/fast-parse-object.mjs';
 import './lib/fast-sync-load.mjs';
 import './lib/fast-indirect-objects.mjs';
 import './lib/fast-pdfnumber-pool.mjs';
+import { measure as measureRawPdf } from './lib/measure-pass.mjs';
 import { parseOutline, setOutline } from './lib/outline.mjs';
 import { setMetadata }              from './lib/postprocesser.mjs';
 import { parallelSave }             from './lib/parallel-deflate.mjs';
@@ -357,8 +371,13 @@ try {
   //  - dispatches every chunk's deflate to libuv's thread pool via
   //    async zlib.deflate instead of running serially on the main
   //    thread. Moves ~300 ms of zlib work off-CPU on the book.
+  //
+  // measureRawPdf walks rawPdf once with no allocations and hands
+  // the exact dictSlot count to fast-dict-onebuf so its main Array
+  // is pre-sized; eliminates V8 growth resizes during load.
   // See perf/notes/08-pdf-lib.md.
   const tProcess = Date.now();
+  setExpectedDictSlots(measureRawPdf(rawPdf).dictSlots);
   const pdfDoc = await PDFDocument.load(rawPdf);
   setMetadata(pdfDoc, meta);
   await setOutline(pdfDoc, outline, false);
diff --git a/perf/README.md b/perf/README.md
index 77fa85a7..79ab773b 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -67,7 +67,7 @@ The mirror command for CPU-profiling the pdf-lib roundtrip (run from
 `perf/`):
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
 ```
 
 `--out results/<label>` is optional but recommended: omit it and the
@@ -163,6 +163,20 @@ Flag rationale:
   cumulative heap reduction since the original Map-backed PDFDict
   (152 -> 66 MB). Production runs through it. See
   [notes/08-pdf-lib.md "One-buffer PDFDict"](notes/08-pdf-lib.md).
+- `--measure-pass` -- inject
+  [docs/lib/measure-pass.mjs](../docs/lib/measure-pass.mjs), the
+  no-allocate byte walker. Runs in front of `PDFDocument.load` on
+  rawPdf, counts dictSlots, hands the count to
+  `setExpectedDictSlots()` on `fast-dict-onebuf`, which pre-sizes
+  the module-level `main` Array to the exact slot count. Eliminates
+  V8 growth resizes during load. Net wall-clock ~+40 ms on the book
+  (walker ~60 ms, load saves ~20). Production runs through it -- the
+  bound on mainBuf isn't material on its own (~60 K slots out of
+  2.4 M) but commits the two-pass shape; Phases 2/3/3β (Float64Array
+  mainBuf + encoded slots) were explored and didn't ship. Requires
+  `--fast-dict-onebuf` (mutex-checked). See "Phase 1: pre-size mainBuf
+  via measure-pass" in
+  [notes/08-pdf-lib.md](notes/08-pdf-lib.md).
 - `--fast-parse-object` -- inject
   [docs/lib/fast-parse-object.mjs](../docs/lib/fast-parse-object.mjs),
   replacing `PDFObjectParser.prototype.parseObject` with a
@@ -254,7 +268,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?" (run from `perf/`):
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
 ```
 
 Same `--out` / labelling note as the CPU command above: omit it for a
@@ -560,6 +574,7 @@ file documenting each:
 | `PDFNumber.of` value-pool (dense int + Map fallback) | [08](notes/08-pdf-lib.md) | `parseNumberOrRef` off heap top-10; total process heap 123 MB → 107 MB (-13 %) |
 | Pre-size `parseDict` accumulator (`new Array(10)` median) | [08](notes/08-pdf-lib.md) | `fastParseDictArray` heap row -25 %; total process heap 107 MB → 92 MB (-14 %) |
 | One-buffer `PDFDict` (single mainBuf + packed 53-bit instance) | [08](notes/08-pdf-lib.md) | total process heap 92 MB → 66 MB (-28 %); cumulative -57 % since Map-backed PDFDict |
+| `measure-pass` (Phase 1) wired into production via `setExpectedDictSlots()` | [08](notes/08-pdf-lib.md) | byte-identical output; mainBuf pre-sized exact (no V8 growth resizes); ~+40 ms net process |
 
 What was tried and didn't ship:
 
@@ -586,4 +601,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index a1d66cca..7d2d3318 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3333,19 +3333,24 @@ doesn't see.
 
 ### Where this lands
 
-`--measure-pass` lives behind a flag in the harness. Even at just
-~40 ms net regression, there's no current consumer of the
-measured counts that wins anything back on its own -- pre-sizing
-mainBuf exact saves nothing material in isolation. Phase 1
-commits to the wire-up shape; a separate later commit flips the
-flag into production once the architecture has another consumer.
+`--measure-pass` ships behind a harness flag at first, then gets
+wired into [`docs/render-book.mjs`](../../docs/render-book.mjs)'s
+production import chain in a subsequent commit (the "enable
+Phase 1 measure-pass in production" change). The decision to
+ship it was bounded: it's the smallest of the four Phases we
+evaluated and the only one whose tradeoff is acceptable for
+production. Phase 2 is a net regression on its own; Phase 3 /
+3β recover most of it for a ~7 MB heap win that doesn't justify
+the CPU cost. Phase 1's bound on mainBuf isn't material on its
+own (~60 K slots out of 2.4 M of slack), but it lays the
+plumbing for any future shape change to ship without re-doing
+the wiring.
 
 [`docs/lib/measure-pass.mjs`](../../docs/lib/measure-pass.mjs)
-ships as a library (the production home of the walker). Imported
-only when `--measure-pass` is passed. `perf/phase0-measure.mjs`
-is left alone -- it's the historical record of the viability
-gate, intentionally self-contained even though it now duplicates
-the walker.
+ships as a library (the production home of the walker).
+`perf/phase0-measure.mjs` is left alone -- it's the historical
+record of the viability gate, intentionally self-contained even
+though it now duplicates the walker.
 
 ## Dropping the owned bit (post-Phase-1 cleanup)
 
@@ -3793,7 +3798,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-refs miss bypass              | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + fast-pdfnumber-pool                | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + parseDict pre-sized array          | ~1.0 s  | ~0.6 s | ~0.4 s |
-| **+ fast-dict-onebuf (this section)** | **~1.0 s** | **~0.6 s** | **~0.4 s** |
+| + fast-dict-onebuf                   | ~1.0 s  | ~0.6 s | ~0.4 s |
+| **+ measure-pass Phase 1 (this section)** | **~1.0 s** | **~0.7 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From d2e32de01ddc111df008b4dcf46012c8f63ffd65 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 23:20:32 +0200
Subject: [PATCH 32/44] One-buffer PDFArray: range views into a shared
 arrayMain JS Array.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirror of fast-dict-onebuf's strategy applied to PDFArray. Every
committed element lives in a single append-only `arrayMain` JS
Array, kept for the document's lifetime. Each PDFArray instance is
a view via packed (start, length) in `d`. Per-instance
`this.array = []` allocation goes away; ~79 k PDFArrays stop
allocating per-instance backing arrays + grow doublings.

Storage is a plain heterogeneous JS Array -- slots hold the
original PDFObject references, reads are `arrayMain[start + i]`
with no decode. This is the explored-but-didn't-ship Phase 3
shape minus the Float64Array encoding (which cost ~300 ms of
decodeValue dispatch on save's copyBytesInto across ~3 M slots).
The plain-reference shape skips that entirely.

Parser uses a per-parser _arrayTemp + length cursor as a recursion
stack, parallel to fast-dict-onebuf's _dictTemp; each parseArray
invocation pushes onto temp, commits its frame to arrayMain in one
contiguous append, and pops temp back. Dict / array temps are
independent so cross-recursion is fine.

Mutations: in-place replace for set, in-place extend at HWM for
push, COW for insert / remove / push-not-at-HWM. Same at-HWM
safety logic as fast-dict-onebuf; no owned bit needed.

Bit budget: 24-bit start (16 M slots) + 16-bit length (up to 65 535
elements, max observed ~25 k on the book) = 40 bits, well within
Number.MAX_SAFE_INTEGER.

Singleton context is duplicated (10 lines) rather than shared with
fast-dict-onebuf -- each shim stays independently injectable.

Production wiring:
- docs/render-book.mjs: import setExpectedArraySlots; call after
  measure-pass before PDFDocument.load.
- perf/measure.mjs: --fast-array-onebuf flag. Composes with
  --fast-dict-onebuf. --measure-pass also drives
  setExpectedArraySlots when both shims are on.
- perf/README.md: --fast-array-onebuf in both canonical commands,
  flag-rationale entry, run.bat row, What-shipped + Investigation-log.

Heap impact (process phase, 512 B sampling, fast-array-onebuf on
vs the immediate predecessor baseline):
  total sampled    65.6 MB -> 51.9 MB  (-13.7 MB, -20.8 %)
  parseArray row   19.6 MB -> 0        (out of top 15)
  new shim row     -        -> 4.2 MB  (PDFArray wrappers)

CPU impact (process wall, pinned 0x5500 / High, no profiler, 3 paired runs):
  P1 only        median 1.07 s, mean 1.09 s
  P1 + this      median 1.02 s, mean 1.01 s
  Δ              mean -0.08 s (this shim slightly faster, within noise)

The CPU regression that showed up under --cpu-profile-process was
profiler-induced noise; gone once we pin and drop the sampler.

Cumulative heap arc since Map-backed PDFDict: 152 MB -> 52 MB
(-66 %). The endpoint of the dict + array allocator refactor.
---
 docs/lib/fast-array-onebuf.mjs | 331 +++++++++++++++++++++++++++++++++
 docs/render-book.mjs           |  44 +++--
 perf/README.md                 |  37 +++-
 perf/measure.mjs               |  18 +-
 perf/notes/08-pdf-lib.md       | 120 +++++++++++-
 5 files changed, 527 insertions(+), 23 deletions(-)
 create mode 100644 docs/lib/fast-array-onebuf.mjs

diff --git a/docs/lib/fast-array-onebuf.mjs b/docs/lib/fast-array-onebuf.mjs
new file mode 100644
index 00000000..66cf3a78
--- /dev/null
+++ b/docs/lib/fast-array-onebuf.mjs
@@ -0,0 +1,331 @@
+// One-buffer PDFArray: every committed element lives in a single
+// append-only JS Array (arrayMain), kept for the document's lifetime.
+// Mirror of fast-dict-onebuf's strategy applied to PDFArray. Backing
+// is a plain heterogeneous JS Array -- slots hold the original
+// PDFObject references directly. No encoding, no decode on read; the
+// hot path is `arrayMain[start + i]`.
+//
+// Phase 3 of fast-dict-encoded did the same range-view refactor on
+// PDFArray but used a Float64Array + encoded slots (mirroring its
+// dict shape). The encoded backing cost ~300 ms of decodeValue
+// dispatch during save (PDFArray.copyBytesInto iterates ~500 k
+// elements). This shim keeps the heap win (~19 MB on the book by
+// removing each PDFArray's per-instance `this.array = []`) without
+// paying the decode cost: slots are JS references, reads are direct.
+//
+// 40-bit packed Number layout (well within Number.MAX_SAFE_INTEGER):
+//   bits  0-23: start  (24 bits, max 16 M slots in arrayMain)
+//   bits 24-39: length (16 bits, max 65 535 elements; max observed
+//                       ~25 k on the book)
+//   bits 40-52: spare (13 bits)
+//
+// Recursion. parseArray pushes elements onto a per-parser _arrayTemp;
+// inner parseArray invocations append on top, commit their frame to
+// arrayMain in one append, and pop temp back. Inner / outer ranges
+// in arrayMain do not overlap. _arrayTemp is independent of
+// fast-dict-onebuf's _dictTemp so dict <-> array recursion is fine.
+//
+// Mutations:
+//   - set(i, v): in-place replace (safe; no length change)
+//   - push(v) at HWM:    in-place extend (no other arrays follow)
+//   - push(v) not at HWM: COW the range to tail, then push
+//   - insert / remove:   always COW (shifts would corrupt neighbours)
+// Same at-HWM-determines-safety logic as fast-dict-onebuf; no owned
+// bit needed (see fast-dict-onebuf commit 7e8b1f7).
+//
+// Singleton PDFContext (one PDFDocument.load per process in our
+// pipeline). The singleton is duplicated rather than shared with
+// fast-dict-onebuf -- the mechanism is ten lines and keeping each
+// shim independently injectable is worth more than dedup'ing it.
+// Both shims end up holding references to the same PDFContext.
+//
+// Composes with --fast-dict-onebuf. Mutually exclusive with
+// --fast-dict-encoded (which subsumes both via its own encoded shape).
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFArray        = require('pdf-lib/cjs/core/objects/PDFArray.js').default;
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+
+// ---- The single buffer ---------------------------------------------
+
+// Pre-sized to total array slots + slack on the book. Other workloads
+// grow it naturally from this starting size. When the measure-pass
+// shim runs first, it calls setExpectedArraySlots() before parse,
+// which resizes `arrayMain` to exact measured demand via
+// `arrayMain.length = N`.
+const ARRAY_MAIN_INITIAL_CAP = 800000;
+const arrayMain = new Array(ARRAY_MAIN_INITIAL_CAP);
+let arrayMainLen = 0;
+
+export { arrayMain };
+export function getArrayMainLen() { return arrayMainLen; }
+
+// Resize arrayMain in place. Must be called before any parseArray /
+// withContext (i.e. while arrayMainLen is still 0). `slack` is a
+// multiplier on `slots`; default 1.0 (exact). Same in-place-resize
+// rationale as fast-dict-onebuf's setExpectedDictSlots: reassigning
+// the module-level binding invalidates V8's inline-cache slots in
+// every closure that reads it, and the deopt + recompile shows up as
+// a parse-time allocation spike.
+export function setExpectedArraySlots(slots, slack = 1.0) {
+  if (arrayMainLen > 0) {
+    throw new Error(
+      `fast-array-onebuf: setExpectedArraySlots called after parse started (arrayMainLen=${arrayMainLen})`,
+    );
+  }
+  arrayMain.length = Math.ceil(slots * slack);
+}
+
+// ---- Bit-packing helpers -------------------------------------------
+
+const POW_24 = 16777216;          // 2^24
+const MASK_24 = 0xFFFFFF;
+const MASK_16 = 0xFFFF;
+
+const MAX_START  = POW_24;          // exclusive
+const MAX_LENGTH = 1 << 16;         // 65 536, exclusive
+
+function pack(start, length) {
+  if (start  >= MAX_START)  throw new Error(`fast-array-onebuf: start ${start} exceeds 24-bit budget`);
+  if (length >= MAX_LENGTH) throw new Error(`fast-array-onebuf: length ${length} exceeds 16-bit budget`);
+  return start + length * POW_24;
+}
+
+function _start(d)  { return d & MASK_24; }
+function _length(d) { return Math.floor(d / POW_24) & MASK_16; }
+
+// ---- Singleton context ---------------------------------------------
+
+let _singletonContext = null;
+
+function _registerContext(ctx) {
+  if (_singletonContext === null) {
+    _singletonContext = ctx;
+  } else if (_singletonContext !== ctx) {
+    throw new Error('fast-array-onebuf: expected a singleton PDFContext, got a second distinct one.');
+  }
+}
+
+// ---- Append + COW helpers ------------------------------------------
+
+function _appendFromTemp(temp, fromOffset, lenSlots) {
+  for (let i = 0; i < lenSlots; i++) {
+    arrayMain[arrayMainLen + i] = temp[fromOffset + i];
+  }
+  arrayMainLen += lenSlots;
+}
+
+function _appendArray(arr) {
+  const len = arr.length;
+  for (let i = 0; i < len; i++) arrayMain[arrayMainLen + i] = arr[i];
+  arrayMainLen += len;
+}
+
+// COW: copy this array's range to arrayMain's tail. If already at
+// the HWM, nothing to copy -- return d unchanged.
+function _cow(pa) {
+  const d = pa.d;
+  const start = _start(d);
+  const length = _length(d);
+  if (start + length === arrayMainLen) return d;   // at HWM
+  const newStart = arrayMainLen;
+  for (let i = 0; i < length; i++) arrayMain[arrayMainLen + i] = arrayMain[start + i];
+  arrayMainLen += length;
+  return pack(newStart, length);
+}
+
+// ---- Construction --------------------------------------------------
+
+function _makeFromRange(start, length, ctx) {
+  _registerContext(ctx);
+  const pa = Object.create(PDFArray.prototype);
+  pa.d = pack(start, length);
+  return pa;
+}
+
+function _makeFromAppend(arr, ctx) {
+  const start = arrayMainLen;
+  _appendArray(arr);
+  return _makeFromRange(start, arr.length, ctx);
+}
+
+if (!PDFArray.prototype.__fastArrayOnebufInstalled) {
+
+  // ---- PDFArray.prototype -----------------------------------------
+
+  PDFArray.prototype.size = function () {
+    return _length(this.d);
+  };
+
+  PDFArray.prototype.push = function (object) {
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    let dNow = d0;
+    if (start0 + length0 !== arrayMainLen) {
+      dNow = _cow(this);
+    }
+    arrayMain[arrayMainLen++] = object;
+    const start = _start(dNow);
+    this.d = pack(start, length0 + 1);
+  };
+
+  PDFArray.prototype.get = function (index) {
+    return arrayMain[_start(this.d) + index];
+  };
+
+  PDFArray.prototype.set = function (index, object) {
+    arrayMain[_start(this.d) + index] = object;
+  };
+
+  PDFArray.prototype.indexOf = function (object) {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    for (let i = 0; i < length; i++) {
+      if (arrayMain[start + i] === object) return i;
+    }
+    return undefined;
+  };
+
+  PDFArray.prototype.insert = function (index, object) {
+    // Always COW -- shifting elements in place would corrupt other
+    // arrays' ranges past this one.
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    const newStart = arrayMainLen;
+    for (let i = 0; i < index; i++) {
+      arrayMain[arrayMainLen++] = arrayMain[start0 + i];
+    }
+    arrayMain[arrayMainLen++] = object;
+    for (let i = index; i < length0; i++) {
+      arrayMain[arrayMainLen++] = arrayMain[start0 + i];
+    }
+    this.d = pack(newStart, length0 + 1);
+  };
+
+  PDFArray.prototype.remove = function (index) {
+    // Always COW (same reason as insert).
+    const d0 = this.d;
+    const start0 = _start(d0);
+    const length0 = _length(d0);
+    const newStart = arrayMainLen;
+    for (let i = 0; i < length0; i++) {
+      if (i === index) continue;
+      arrayMain[arrayMainLen++] = arrayMain[start0 + i];
+    }
+    this.d = pack(newStart, length0 - 1);
+  };
+
+  PDFArray.prototype.asArray = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const out = new Array(length);
+    for (let i = 0; i < length; i++) out[i] = arrayMain[start + i];
+    return out;
+  };
+
+  PDFArray.prototype.clone = function (context) {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    const newStart = arrayMainLen;
+    for (let i = 0; i < length; i++) arrayMain[arrayMainLen + i] = arrayMain[start + i];
+    arrayMainLen += length;
+    _registerContext(context || _singletonContext);
+    const c = Object.create(PDFArray.prototype);
+    c.d = pack(newStart, length);
+    return c;
+  };
+
+  PDFArray.prototype.toString = function () {
+    const d = this.d;
+    const start = _start(d);
+    const length = _length(d);
+    let s = '[ ';
+    for (let i = 0; i < length; i++) s += arrayMain[start + i].toString() + ' ';
+    return s + ']';
+  };
+
+  PDFArray.prototype.sizeInBytes = function () {
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    let size = 3;
+    for (let i = start; i < end; i++) size += arrayMain[i].sizeInBytes() + 1;
+    return size;
+  };
+
+  PDFArray.prototype.copyBytesInto = function (buffer, offset) {
+    const initialOffset = offset;
+    buffer[offset++] = CharCodes.LeftSquareBracket;
+    buffer[offset++] = CharCodes.Space;
+    const d = this.d;
+    const start = _start(d);
+    const end = start + _length(d);
+    for (let i = start; i < end; i++) {
+      offset += arrayMain[i].copyBytesInto(buffer, offset);
+      buffer[offset++] = CharCodes.Space;
+    }
+    buffer[offset++] = CharCodes.RightSquareBracket;
+    return offset - initialOffset;
+  };
+
+  // lookup, lookupMaybe, asRectangle, scalePDFNumbers stay on the
+  // upstream prototype -- they call this.get / this.size / this.set
+  // and dispatch through our overrides.
+
+  Object.defineProperty(PDFArray.prototype, 'context', {
+    get() { return _singletonContext; },
+    set(_ctx) { /* singleton is source of truth */ },
+    configurable: true,
+  });
+
+  // ---- PDFArray factory -------------------------------------------
+
+  PDFArray.withContext = function (context) {
+    return _makeFromAppend([], context);
+  };
+
+  // ---- PDFObjectParser.prototype.parseArray -----------------------
+  //
+  // Same temp/commit pattern as fast-dict-onebuf's parseDict:
+  // each parser instance carries its own _arrayTemp + length cursor;
+  // parseArray pushes elements onto temp's tail, commits the frame
+  // to arrayMain in one contiguous append, pops temp back to
+  // frameStart, returns a PDFArray view into arrayMain.
+
+  PDFObjectParser.prototype.parseArray = function fastParseArrayOneBuf() {
+    const bytes = this.bytes;
+    bytes.assertNext(CharCodes.LeftSquareBracket);
+    this.skipWhitespaceAndComments();
+
+    if (this._arrayTemp === undefined) {
+      this._arrayTemp = new Array(64);   // grows naturally if needed
+      this._arrayTempLen = 0;
+    }
+    const temp = this._arrayTemp;
+    const frameStart = this._arrayTempLen;
+
+    while (bytes.peek() !== CharCodes.RightSquareBracket) {
+      const element = this.parseObject();   // may recurse
+      temp[this._arrayTempLen++] = element;
+      this.skipWhitespaceAndComments();
+    }
+    bytes.assertNext(CharCodes.RightSquareBracket);
+
+    const frameLen = this._arrayTempLen - frameStart;
+    const start = arrayMainLen;
+    _appendFromTemp(temp, frameStart, frameLen);
+    this._arrayTempLen = frameStart;
+
+    return _makeFromRange(start, frameLen, this.context);
+  };
+
+  PDFArray.prototype.__fastArrayOnebufInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 40b4a1d5..a5139ada 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -121,17 +121,29 @@ import { PDFDocument } from 'pdf-lib';
 //     immutable so sharing is safe.
 //   measure-pass (Phase 1) -- no-allocate byte walker
 //     (docs/lib/measure-pass.mjs) that runs in front of
-//     PDFDocument.load on the raw Chrome PDF and counts dictSlots.
-//     The count drives setExpectedDictSlots() on fast-dict-onebuf,
-//     which pre-sizes the module-level main Array to the exact
-//     slot count (no V8 growth resizes during load). Net wall-clock
-//     is ~+40 ms on the book (walker costs ~60 ms; load saves ~20).
-//     The bound on mainBuf isn't material on its own (~60 K slots
-//     out of 2.4 M) but commits the two-pass shape. Phase 2/3/3β
-//     (Float64Array mainBuf + encoded slots) were explored and
-//     didn't ship -- per-slot encode/decode cost exceeded the
-//     mark-phase savings. See "Phase 1: pre-size mainBuf via
-//     measure-pass" in perf/notes/08-pdf-lib.md.
+//     PDFDocument.load on the raw Chrome PDF and counts dictSlots
+//     + arraySlots. The counts drive setExpectedDictSlots() on
+//     fast-dict-onebuf and setExpectedArraySlots() on
+//     fast-array-onebuf, pre-sizing each shim's backing Array to
+//     the exact measured demand (no V8 growth resizes during load).
+//     Net wall-clock is ~+40 ms on the book (walker costs ~60 ms;
+//     load saves ~20). The bound on mainBuf isn't material on its
+//     own (~60 K slots out of 2.4 M) but commits the two-pass
+//     shape. Phase 2/3/3β (Float64Array mainBuf + encoded slots)
+//     were explored and didn't ship -- per-slot encode/decode cost
+//     exceeded the mark-phase savings. See "Phase 1: pre-size
+//     mainBuf via measure-pass" in perf/notes/08-pdf-lib.md.
+//   fast-array-onebuf -- same range-view pattern as fast-dict-onebuf
+//     applied to PDFArray. Each PDFArray's per-instance
+//     `this.array = []` goes away; instances become views into a
+//     shared arrayMain (plain JS Array, heterogeneous slots holding
+//     the original PDFObject references). Reads are direct -- no
+//     decode, unlike the explored-but-didn't-ship encoded approach
+//     which encoded slots into a Float64Array and paid ~300 ms of
+//     decodeValue dispatch during save. ~19 MB of process-phase
+//     heap traffic from parseArray collapses (the `this.array`
+//     allocation + grow doublings across ~79 k PDFArrays). See
+//     "One-buffer PDFArray" in perf/notes/08-pdf-lib.md.
 import './lib/fast-refs.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
@@ -139,6 +151,7 @@ import './lib/fast-decode-name.mjs';
 import './lib/fast-number-to-string.mjs';
 import './lib/fast-size-in-bytes.mjs';
 import { setExpectedDictSlots }     from './lib/fast-dict-onebuf.mjs';
+import { setExpectedArraySlots }    from './lib/fast-array-onebuf.mjs';
 import './lib/fast-parse-object.mjs';
 import './lib/fast-sync-load.mjs';
 import './lib/fast-indirect-objects.mjs';
@@ -373,11 +386,14 @@ try {
   //    thread. Moves ~300 ms of zlib work off-CPU on the book.
   //
   // measureRawPdf walks rawPdf once with no allocations and hands
-  // the exact dictSlot count to fast-dict-onebuf so its main Array
-  // is pre-sized; eliminates V8 growth resizes during load.
+  // the exact dictSlot + arraySlot counts to fast-dict-onebuf /
+  // fast-array-onebuf so each shim's backing Array is pre-sized;
+  // eliminates V8 growth resizes during load.
   // See perf/notes/08-pdf-lib.md.
   const tProcess = Date.now();
-  setExpectedDictSlots(measureRawPdf(rawPdf).dictSlots);
+  const counts = measureRawPdf(rawPdf);
+  setExpectedDictSlots(counts.dictSlots);
+  setExpectedArraySlots(counts.arraySlots);
   const pdfDoc = await PDFDocument.load(rawPdf);
   setMetadata(pdfDoc, meta);
   await setOutline(pdfDoc, outline, false);
diff --git a/perf/README.md b/perf/README.md
index 79ab773b..4c47386d 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -67,7 +67,7 @@ The mirror command for CPU-profiling the pdf-lib roundtrip (run from
 `perf/`):
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
 ```
 
 `--out results/<label>` is optional but recommended: omit it and the
@@ -163,13 +163,34 @@ Flag rationale:
   cumulative heap reduction since the original Map-backed PDFDict
   (152 -> 66 MB). Production runs through it. See
   [notes/08-pdf-lib.md "One-buffer PDFDict"](notes/08-pdf-lib.md).
+- `--fast-array-onebuf` -- inject
+  [docs/lib/fast-array-onebuf.mjs](../docs/lib/fast-array-onebuf.mjs).
+  Same range-view pattern as `--fast-dict-onebuf` applied to
+  PDFArray: every committed element lives in a single append-only
+  `arrayMain` JS Array, each PDFArray is a view via packed
+  `(start, length)` in `d`. Backing is a plain heterogeneous JS
+  Array -- slots hold the original PDFObject references, reads are
+  `arrayMain[start + i]` with no decode. This is the explored-but-
+  didn't-ship Phase 3 encoded approach minus the Float64Array
+  encoding (which cost ~300 ms on save's `copyBytesInto` from
+  per-slot `decodeValue` dispatch). Per-parser `_arrayTemp` for
+  the recursion stack, independent of fast-dict-onebuf's
+  `_dictTemp`. Mutations: in-place replace for `set`, in-place
+  extend at HWM for `push`, COW for everything else. Singleton
+  context is duplicated (10 lines) rather than shared so each shim
+  stays independently injectable. ~19 MB process-phase heap traffic
+  drops -- collapses parseArray's `this.array = []` + grow doublings
+  across ~79 k PDFArrays. Composes with `--fast-dict-onebuf`.
+  Production runs through it. See
+  [notes/08-pdf-lib.md "One-buffer PDFArray"](notes/08-pdf-lib.md).
 - `--measure-pass` -- inject
   [docs/lib/measure-pass.mjs](../docs/lib/measure-pass.mjs), the
   no-allocate byte walker. Runs in front of `PDFDocument.load` on
-  rawPdf, counts dictSlots, hands the count to
-  `setExpectedDictSlots()` on `fast-dict-onebuf`, which pre-sizes
-  the module-level `main` Array to the exact slot count. Eliminates
-  V8 growth resizes during load. Net wall-clock ~+40 ms on the book
+  rawPdf, counts dictSlots + arraySlots, hands them to
+  `setExpectedDictSlots()` on `fast-dict-onebuf` and (when on)
+  `setExpectedArraySlots()` on `fast-array-onebuf`, pre-sizing each
+  shim's backing Array to the exact slot count. Eliminates V8
+  growth resizes during load. Net wall-clock ~+40 ms on the book
   (walker ~60 ms, load saves ~20). Production runs through it -- the
   bound on mainBuf isn't material on its own (~60 K slots out of
   2.4 M) but commits the two-pass shape; Phases 2/3/3β (Float64Array
@@ -268,7 +289,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?" (run from `perf/`):
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
+node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
 ```
 
 Same `--out` / labelling note as the CPU command above: omit it for a
@@ -445,6 +466,7 @@ run.bat --fast-dict-iter                  # in-place Map.forEach for PDFDict.siz
 run.bat --fast-parse-dict                 # hoist Type/Catalog/Pages/Page sentinel PDFNames out of parseDict (Map-shape baseline; production now runs --fast-dict-onebuf)
 run.bat --fast-dict-array                 # replace PDFDict's backing Map with a per-dict flat [k,v,k,v,...] array; subsumes --fast-dict-iter + --fast-parse-dict (A/B baseline; production now runs --fast-dict-onebuf)
 run.bat --fast-dict-onebuf                # ONE long-lived buffer for all PDFDict entries + small per-parser temp (also ships; opt-in here for A/B)
+run.bat --fast-array-onebuf               # ONE long-lived buffer for all PDFArray elements + small per-parser temp; composes with --fast-dict-onebuf (also ships; opt-in here for A/B)
 run.bat --measure-pass --fast-dict-onebuf # walk rawPdf with the no-allocate measure pass and pre-size --fast-dict-onebuf's mainBuf to the exact dict-slot count (Phase 1 of the two-pass architecture; mutex with --incremental and --render-only)
 run.bat --fast-indirect-objects           # dense-array cache for PDFContext.indirectObjects (gen=0 path); mirror of --fast-refs on the value side (also ships; opt-in here for A/B)
 run.bat --fast-pdfnumber-pool             # value-keyed cache in front of PDFNumber.of; dense array for small ints, Map for the rest (also ships; opt-in here for A/B)
@@ -575,6 +597,7 @@ file documenting each:
 | Pre-size `parseDict` accumulator (`new Array(10)` median) | [08](notes/08-pdf-lib.md) | `fastParseDictArray` heap row -25 %; total process heap 107 MB → 92 MB (-14 %) |
 | One-buffer `PDFDict` (single mainBuf + packed 53-bit instance) | [08](notes/08-pdf-lib.md) | total process heap 92 MB → 66 MB (-28 %); cumulative -57 % since Map-backed PDFDict |
 | `measure-pass` (Phase 1) wired into production via `setExpectedDictSlots()` | [08](notes/08-pdf-lib.md) | byte-identical output; mainBuf pre-sized exact (no V8 growth resizes); ~+40 ms net process |
+| One-buffer `PDFArray` (single arrayMain + packed (start, length) view) | [08](notes/08-pdf-lib.md) | total process heap 66 MB → 52 MB (-21 %); parseArray off top 15; cumulative -66 % since Map-backed PDFDict |
 
 What was tried and didn't ship:
 
@@ -601,4 +624,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline. |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 77475d89..72a6940d 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -296,6 +296,7 @@ let fastDictArray = false;
 let fastIndirectObjects = false;
 let fastPdfnumberPool = false;
 let fastDictOnebuf = false;
+let fastArrayOnebuf = false;
 let instrumentParsedict = false;
 let dumpRawPdf = null;
 let measurePass = false;
@@ -337,6 +338,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-indirect-objects') fastIndirectObjects = true;
   else if (a === '--fast-pdfnumber-pool') fastPdfnumberPool = true;
   else if (a === '--fast-dict-onebuf') fastDictOnebuf = true;
+  else if (a === '--fast-array-onebuf') fastArrayOnebuf = true;
   else if (a === '--instrument-parsedict') instrumentParsedict = true;
   else if (a === '--dump-raw-pdf') dumpRawPdf = args[++i];
   else if (a === '--measure-pass') measurePass = true;
@@ -470,6 +472,10 @@ if (fastDictOnebuf) {
   await import('../docs/lib/fast-dict-onebuf.mjs');
   console.log('[harness] fast-dict-onebuf: ONE long-lived buffer for all PDFDict entries + small per-parser temp');
 }
+if (fastArrayOnebuf) {
+  await import('../docs/lib/fast-array-onebuf.mjs');
+  console.log('[harness] fast-array-onebuf: ONE long-lived buffer for all PDFArray elements + small per-parser temp');
+}
 if (instrumentParsedict) {
   await import('./instrument-parsedict.mjs');
 }
@@ -480,12 +486,22 @@ let _runMeasurePass = null;
 if (measurePass) {
   const { measure } = await import('../docs/lib/measure-pass.mjs');
   const { setExpectedDictSlots } = await import('../docs/lib/fast-dict-onebuf.mjs');
+  let setExpectedArraySlots = null;
+  if (fastArrayOnebuf) {
+    const ma = await import('../docs/lib/fast-array-onebuf.mjs');
+    setExpectedArraySlots = ma.setExpectedArraySlots;
+  }
   _runMeasurePass = (bytes) => {
     const counts = measure(bytes);
     setExpectedDictSlots(counts.dictSlots);
+    if (setExpectedArraySlots) setExpectedArraySlots(counts.arraySlots);
     return counts;
   };
-  console.log('[harness] measure-pass: no-allocate prelude, pre-sizes fast-dict-onebuf mainBuf to measured dict-slot count');
+  console.log(
+    setExpectedArraySlots
+      ? '[harness] measure-pass: no-allocate prelude, pre-sizes dict + array main buffers to measured slot counts'
+      : '[harness] measure-pass: no-allocate prelude, pre-sizes fast-dict-onebuf mainBuf to measured dict-slot count',
+  );
 }
 
 // --instrument-slot-types loads the slot-type classifier; called after
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 7d2d3318..4974db55 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3761,6 +3761,123 @@ the Object[] storage and inheriting the same low-overhead
 view-with-packed-payload trick. That's the fast-array-onebuf
 section below; it does ship.
 
+## One-buffer PDFArray
+
+Mirror of fast-dict-onebuf's strategy applied to PDFArray. Every
+committed element lives in a single append-only `arrayMain` JS
+Array, kept for the document's lifetime. Each PDFArray instance
+is a view via packed `(start, length)` in `d`. Per-instance
+`this.array = []` allocation goes away; ~79 k PDFArrays stop
+allocating per-instance backing arrays + grow doublings.
+
+Storage is a plain heterogeneous JS Array -- slots hold the
+original PDFObject references, reads are `arrayMain[start + i]`
+with no decode. This is the explored-but-didn't-ship Phase 3
+shape (PDFArray as a view into a shared backing) minus the
+Float64Array encoding: Phase 3 paid ~300 ms of `decodeValue`
+dispatch on save's `copyBytesInto` (~3 M slots × 10-case switch
++ pool lookup). The plain-reference shape skips that entirely
+and is what makes fast-array-onebuf cheap to ship.
+
+### Parser temp + commit
+
+Per-parser `_arrayTemp` + length cursor as a recursion stack,
+parallel to fast-dict-onebuf's `_dictTemp`. Each `parseArray`
+invocation pushes onto temp, commits its frame to `arrayMain`
+in one contiguous append, and pops temp back. Dict and array
+temps are independent so cross-recursion is fine.
+
+### Mutations
+
+- `set(i, v)` -- in-place replace at `arrayMain[start + i]`.
+  Safe for any array; no shifts.
+- `push(v)` -- in-place extend at HWM (`arrayMain.push(v)` +
+  length += 1) when `start + length === arrayMain.length`;
+  COW otherwise.
+- `insert(i, v)` / `remove(i)` -- always COW. Shifting slots
+  in `arrayMain` would corrupt other arrays' ranges.
+
+Same at-HWM safety logic as fast-dict-onebuf; no owned bit
+needed (`start + length === arrayMain.length` is sufficient).
+
+### Bit layout
+
+```
+bits  0-23: start  (24 bits, max 16 M slots)
+bits 24-39: length (16 bits, max 65 535 elements; max observed
+                    ~25 k on the book)
+```
+
+40 bits used, well within `Number.MAX_SAFE_INTEGER`. Two more
+length bits than fast-dict-onebuf's 14-bit dict length, because
+arrays can be larger than dicts on this workload.
+
+### Singleton context (duplicated)
+
+Same singleton-PDFContext assumption as fast-dict-onebuf, but
+the ~10 lines of context-stash machinery are duplicated rather
+than shared, so each shim stays independently injectable. A
+caller can opt into one without the other; both are independent
+side-effecting imports.
+
+### Production wiring
+
+- [`docs/render-book.mjs`](../../docs/render-book.mjs) -- imports
+  `setExpectedArraySlots` alongside `setExpectedDictSlots`, calls
+  both after `measureRawPdf` returns and before `PDFDocument.load`.
+- [`perf/measure.mjs`](../measure.mjs) -- adds `--fast-array-onebuf`
+  flag. Composes with `--fast-dict-onebuf`; `--measure-pass` also
+  drives `setExpectedArraySlots` when the array shim is on.
+- The harness's `--fast-array-onebuf` is opt-in alongside the
+  production path, the same arrangement as `--fast-dict-onebuf`.
+
+### Measured wins
+
+Heap impact (process phase, 512 B sampling, paired runs vs the
+Phase 1 baseline that was the immediate predecessor of this
+shim):
+
+| Allocator                | P1 baseline | + fast-array-onebuf | Delta              |
+|--------------------------|------------:|--------------------:|-------------------:|
+| `parseArray`             |    19.6 MB  |             ~0 (off top 15) | **-19.6 MB**  |
+| new shim row (PDFArray wrappers) | -   |             4.2 MB   | +4.2 MB           |
+| Total sampled            |    65.6 MB  |            **51.9 MB**       | **-13.7 MB (-21 %)** |
+
+CPU impact (process wall, pinned 0x5500 / High, no profiler,
+3 paired runs each side):
+
+| State            | median | mean   |
+|------------------|-------:|-------:|
+| P1 only          | 1.07 s | 1.09 s |
+| P1 + this shim   | 1.02 s | 1.01 s |
+
+Mean shifts -0.08 s -- this shim slightly faster, well within
+noise on this machine.
+
+The CPU regression that showed up under
+`--cpu-profile-process` (paired with the encoded-storage
+prototype) was profiler-induced noise; the sampler's per-allocation
+bookkeeping interacts badly with this shape. Gone once we pin
+CPU and drop the sampler. Worth remembering: when the only
+signal saying "this is slower" is the profiler, run the same
+code without the profiler before accepting the verdict.
+
+### Cumulative arc (final)
+
+Heap, starting from the original Map-backed PDFDict:
+
+| State                             | Total sampled | Change vs prior |
+|-----------------------------------|--------------:|----------------:|
+| Map-backed (pre-fast-dict-array)  |   152 MB      | -               |
+| fast-dict-array                   |    92 MB      | -60 MB          |
+| fast-dict-onebuf                  |    66 MB      | -26 MB          |
+| **fast-array-onebuf**             |    **52 MB**  | **-14 MB**      |
+
+**-66 % cumulative reduction in process-phase heap traffic.**
+The final state of this storage-shape work. The endpoint of
+the dict + array allocator refactors that this notes file has
+been chasing for the last ~22 sections.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -3799,7 +3916,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-pdfnumber-pool                | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + parseDict pre-sized array          | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + fast-dict-onebuf                   | ~1.0 s  | ~0.6 s | ~0.4 s |
-| **+ measure-pass Phase 1 (this section)** | **~1.0 s** | **~0.7 s** | **~0.4 s** |
+| + measure-pass Phase 1               | ~1.0 s  | ~0.7 s | ~0.4 s |
+| **+ fast-array-onebuf (this section)** | **~1.0 s** | **~0.7 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 8a647c7b655333361edd74c4ed0c70d4b4bdee70 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 10:54:48 +0200
Subject: [PATCH 33/44] fast-refs: drop the per-instance PDFRef.tag string.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream caches `<obj> <gen> R` on each PDFRef so toString /
sizeInBytes / copyBytesInto can read it back. After fast-array-onebuf
shipped, the heap profile showed PDFParser.parseIndirectObjectHeader
at 13.7 MB / 25 % of total -- attribution chain (via
perf/find-heap-callers.mjs):

  parseIndirectObjectHeader  → skipJibberish  (14.2 MB)
    → matchIndirectObjectHeader (try/catch wrapper)
      → parseIndirectObjectHeader  → fastOf

skipJibberish runs after every successful indirect object parse and
speculatively calls matchIndirectObjectHeader to detect the next
`N M obj` header. On valid PDFs it always succeeds. fastOf fires
once per indirect-object boundary, populating the dense-array cache;
the subsequent "real" parseIndirectObject is a cache hit. V8 inlines
fastOf at this call site (small + hot from speculation), so the
attribution lands on the caller -- 13.7 MB of which was the
tag-string churn (`objectNumber + ' 0 R'`): V8 builds 1-2 intermediate
concat strings + the final ~25-35 B tag, ~150 k times.

Eliminating the field collapses both rows:

  parseIndirectObjectHeader  13.7 MB -> 9.3 MB  (-4.3 MB)
  fastOf (refs)               7.7 MB -> 4.8 MB  (-2.9 MB)
  total sampled              51.9 MB -> 45.2 MB (-6.7 MB, -13 %)
  parseArray row             gone (already collapsed by fast-array-onebuf)

The remaining 9.3 MB at parseIndirectObjectHeader and 4.8 MB at
fastOf are the PDFRef instances themselves (Object.create +
objectNumber + generationNumber fields, ~32-48 B × ~150 k) plus
attribution leakage from V8's inlining. Hard floor without dropping
per-PDFRef wrappers entirely.

Prototype methods now compute results from objectNumber /
generationNumber directly:
- copyBytesInto: writes digits straight into the output buffer with
  a no-allocation _writeUint helper (divide-and-write-backwards into
  the caller's buffer). No copyStringIntoBuffer call.
- sizeInBytes: returns digitCount(obj) + digitCount(gen) + 3.
- toString: builds on demand. Debug only, no caching needed.

Both gen=0 (no tag set) and gen!=0 (tag set by upstream's
constructor but ignored) work; the gen!=0 path's tag string is
allocated-then-wasted (~18 % of refs, ~1 MB), not worth patching
the constructor to avoid.

CPU impact (pinned 0x5500 / High, no profiler, 4 runs each):
  with-tag    median 1.045 s, mean 1.045 s
  tagless     median 1.030 s, mean 1.030 s
  Δ           ~15 ms tagless faster (in the noise but trending)

Output PDF is byte-identical modulo /CreationDate + /ModDate
timestamps (verified by inflating + diffing all 453 ObjStm streams).
---
 docs/lib/fast-refs.mjs   | 119 ++++++++++++++++++++++++++++++++-------
 perf/README.md           |  10 +++-
 perf/notes/08-pdf-lib.md | 107 ++++++++++++++++++++++++++++++++++-
 3 files changed, 214 insertions(+), 22 deletions(-)

diff --git a/docs/lib/fast-refs.mjs b/docs/lib/fast-refs.mjs
index 32b955e4..beeb76ad 100644
--- a/docs/lib/fast-refs.mjs
+++ b/docs/lib/fast-refs.mjs
@@ -1,5 +1,6 @@
 // Replace pdf-lib's PDFRef.of pool lookup with a dense-array cache
-// for the generation=0 case (the overwhelmingly common one).
+// for the generation=0 case (the overwhelmingly common one), AND
+// drop the per-instance `tag` string entirely.
 //
 // The upstream implementation
 // (node_modules/pdf-lib/cjs/core/objects/PDFRef.js) keys its pool by
@@ -13,24 +14,50 @@
 // hash it. That's ~330 ms of self-time on the process-phase profile
 // plus measurable GC pressure.
 //
-// Shim: dense array indexed by objectNumber for the gen=0 branch.
-// Plain array indexing, no string alloc, no Map hash.
-//
-// On a gen=0 cache miss we construct the PDFRef directly via
+// Shim part 1: dense array indexed by objectNumber for the gen=0 branch.
+// Plain array indexing, no string alloc, no Map hash. On a gen=0 cache
+// miss we construct the PDFRef directly via
 // `Object.create(PDFRef.prototype)` plus manual field init, skipping
 // both the ENFORCER check and the upstream `pool.set(tag, instance)`.
-// The upstream pool was the last remaining hot Map.set in the heap
-// profile after fast-indirect-objects shipped (~7 MB of `set` from
-// the once-per-unique-objectNumber miss), all of which becomes dead
-// arena allocation once the dense array is the authoritative cache.
-// PDFRef's super (PDFObject) has a no-op constructor; the only
-// instance fields the prototype methods read are `objectNumber`,
-// `generationNumber`, and `tag` (used by toString / sizeInBytes /
-// copyBytesInto), so direct construction is safe.
-//
-// gen != 0 calls (the other ~18 %, pdf-lib's xref-stream bookkeeping
-// where "generation" encodes an in-ObjStm index per PDF 1.5 spec,
-// see PDFXRefStreamParser.js:74-80) still pass through the original
+//
+// Shim part 2: drop the per-instance `tag` field. Upstream caches
+// `<obj> <gen> R` on each PDFRef so toString / sizeInBytes /
+// copyBytesInto can read it back. After fast-array-onebuf shipped,
+// the heap profile showed PDFParser.parseIndirectObjectHeader sitting
+// at 13.7 MB (25 % of total). The attribution chain (via
+// perf/find-heap-callers.mjs):
+//
+//   parseIndirectObjectHeader  → skipJibberish (14.2 MB)
+//     → matchIndirectObjectHeader (try/catch wrapper)
+//       → parseIndirectObjectHeader → fastOf
+//
+// skipJibberish runs after every successful indirect object parse and
+// speculatively calls matchIndirectObjectHeader to detect the next
+// `N M obj` header. On valid PDFs the speculation always succeeds, so
+// fastOf fires once per indirect-object boundary, populating the
+// dense-array cache. The subsequent "real" parseIndirectObject then
+// hits the cache. V8 inlines fastOf at this call site (small + hot
+// from speculation) so the attribution lands on the caller -- part of
+// that 13.7 MB was the tag-string allocation (`objectNumber + ' 0 R'`):
+// V8 builds 1-2 intermediate concat strings + the final ~25-35 B
+// tag, ~150 k times.
+//
+// Eliminating the `tag` field collapses all of that. The prototype
+// methods now compute their results from objectNumber / generationNumber
+// directly. copyBytesInto writes digits straight into the output buffer
+// with a no-allocation _writeUint helper; sizeInBytes returns
+// digitCount(obj) + digitCount(gen) + 3 (for " " + " R"); toString
+// builds on demand (only used for debug, no caching needed).
+//
+// gen != 0 PDFRefs constructed via the upstream path still have `tag`
+// set by the upstream constructor -- our overrides ignore the field,
+// so the tag string is allocated-then-wasted. gen != 0 is ~18 % of refs
+// at ~50 K instances; the waste is bounded and not worth patching the
+// constructor for.
+//
+// gen != 0 cache lookups (pdf-lib's xref-stream bookkeeping where
+// "generation" encodes an in-ObjStm index per PDF 1.5 spec, see
+// PDFXRefStreamParser.js:74-80) still pass through the original
 // PDFRef.of -- their Map pool is harmless at gen!=0's volume.
 //
 // Side-effecting import. Import once before any pdf-lib operation.
@@ -38,6 +65,36 @@
 
 import { PDFRef } from "pdf-lib";
 
+// Write n as ASCII decimal digits (0x30 + digit) into buffer starting at
+// offset; allocates nothing and returns the number of bytes written. n
+// must be a non-negative integer below 2^31 * 10 (`| 0` wraps past that).
+function _writeUint(buffer, offset, n) {
+  if (n < 10) { buffer[offset] = 0x30 + n; return 1; }
+  // Count digits (n >= 10 here, so d ends up >= 2).
+  let m = n, d = 0;
+  while (m > 0) { d++; m = (m / 10) | 0; }
+  // Fill right-to-left: least-significant digit lands at offset + d - 1.
+  for (let i = d - 1; i >= 0; i--) {
+    buffer[offset + i] = 0x30 + (n % 10);
+    n = (n / 10) | 0;
+  }
+  return d;
+}
+
+// Non-allocating decimal digit count for non-negative integers. The ladder
+// answers n < 10^6 by comparison alone; the loop needs n < 2^31 * 10.
+function _digitCount(n) {
+  if (n < 10)      return 1;
+  if (n < 100)     return 2;
+  if (n < 1000)    return 3;
+  if (n < 10000)   return 4;
+  if (n < 100000)  return 5;
+  if (n < 1000000) return 6;
+  let d = 0;
+  while (n > 0) { d++; n = (n / 10) | 0; }
+  return d;
+}
+
 if (!PDFRef.__fastPoolInstalled) {
   const original = PDFRef.of;
   const pool0 = [];
@@ -45,15 +102,39 @@ if (!PDFRef.__fastPoolInstalled) {
     if (generationNumber === undefined || generationNumber === 0) {
       const existing = pool0[objectNumber];
       if (existing) return existing;
-      // Direct construction -- skip ENFORCER check, skip upstream pool.set.
+      // Direct construction -- skip ENFORCER check, skip upstream pool.set,
+      // skip the per-instance `tag` string (the prototype methods now
+      // compute their results from objectNumber / generationNumber).
       const fresh = Object.create(PDFRef.prototype);
       fresh.objectNumber = objectNumber;
       fresh.generationNumber = 0;
-      fresh.tag = objectNumber + ' 0 R';
       pool0[objectNumber] = fresh;
       return fresh;
     }
     return original.call(PDFRef, objectNumber, generationNumber);
   };
+
+  // Replace the upstream prototype methods so that none of them reads
+  // `tag`: gen=0 refs built above never get one, and gen!=0 refs get one
+  // from upstream's constructor that is simply ignored here.
+
+  PDFRef.prototype.toString = function () {
+    return this.objectNumber + ' ' + this.generationNumber + ' R';
+  };
+
+  PDFRef.prototype.sizeInBytes = function () {
+    return _digitCount(this.objectNumber) + _digitCount(this.generationNumber) + 3;  // + two spaces and 'R'
+  };
+
+  PDFRef.prototype.copyBytesInto = function (buffer, offset) {
+    const start = offset;
+    offset += _writeUint(buffer, offset, this.objectNumber);
+    buffer[offset++] = 0x20;  // ' '
+    offset += _writeUint(buffer, offset, this.generationNumber);
+    buffer[offset++] = 0x20;  // ' '
+    buffer[offset++] = 0x52;  // 'R'
+    return offset - start;  // bytes written
+  };
+
   PDFRef.__fastPoolInstalled = true;
 }
diff --git a/perf/README.md b/perf/README.md
index 4c47386d..e9bd566d 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -93,7 +93,12 @@ Flag rationale:
   `Map.set` in the heap profile. The `PDFRef.of` row drops out of
   the CPU top-15 and the `set` builtin row collapses from ~7.5 MB
   to ~0.5 MB (the residual is `PDFName` interning's
-  `fastCache.set`, harmless). Production runs through it; the
+  `fastCache.set`, harmless). The shim also drops the per-instance
+  `tag` string -- `toString` / `sizeInBytes` / `copyBytesInto`
+  compute from `objectNumber` / `generationNumber` directly (no
+  `<obj> <gen> R` string allocated per ref), dropping
+  `parseIndirectObjectHeader` 13.7 MB → 9.3 MB and total process
+  heap 51.9 MB → 45.2 MB (-13 %). Production runs through it; the
   profile should too.
 - `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave`
   from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs),
@@ -598,6 +603,7 @@ file documenting each:
 | One-buffer `PDFDict` (single mainBuf + packed 53-bit instance) | [08](notes/08-pdf-lib.md) | total process heap 92 MB → 66 MB (-28 %); cumulative -57 % since Map-backed PDFDict |
 | `measure-pass` (Phase 1) wired into production via `setExpectedDictSlots()` | [08](notes/08-pdf-lib.md) | byte-identical output; mainBuf pre-sized exact (no V8 growth resizes); ~+40 ms net process |
 | One-buffer `PDFArray` (single arrayMain + packed (start, length) view) | [08](notes/08-pdf-lib.md) | total process heap 66 MB → 52 MB (-21 %); parseArray off top 15; cumulative -66 % since Map-backed PDFDict |
+| Drop per-instance `PDFRef.tag` string (`copyBytesInto` digit-write, `sizeInBytes` digit-count, `toString` on demand) | [08](notes/08-pdf-lib.md) | `parseIndirectObjectHeader` 13.7 MB → 9.3 MB; total process heap 51.9 MB → 45.2 MB (-13 %) |
 
 What was tried and didn't ship:
 
@@ -624,4 +630,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 4974db55..67b7ccb1 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3878,6 +3878,110 @@ The final state of this storage-shape work. The endpoint of
 the dict + array allocator refactors that this notes file has
 been chasing for the last ~22 sections.
 
+## Drop the per-instance `PDFRef.tag` string
+
+With `fast-array-onebuf` shipping, the process-phase sampling heap
+profile flipped to `PDFParser.parseIndirectObjectHeader` at 13.7 MB
+/ 25 % of total. Attribution chain (via
+`perf/find-heap-callers.mjs`):
+
+```
+parseIndirectObjectHeader  → skipJibberish (14.2 MB)
+  → matchIndirectObjectHeader (try/catch wrapper)
+    → parseIndirectObjectHeader → fastOf
+```
+
+`skipJibberish` runs after every successful indirect object parse
+and speculatively calls `matchIndirectObjectHeader` to detect the
+next `N M obj` header. On valid PDFs the speculation always
+succeeds, so `fastOf` fires once per indirect-object boundary,
+populating the dense-array cache; the subsequent "real"
+`parseIndirectObject` is then a cache hit. V8 inlines `fastOf` at
+this call site (small + hot from speculation) so the attribution
+lands on the caller -- 13.7 MB, part of which was the tag-string churn
+(`objectNumber + ' 0 R'`): V8 builds 1-2 intermediate concat
+strings + the final ~25-35 B tag, ~150 k times.
+
+### Upstream
+
+`PDFRef` (`pdf-lib/.../objects/PDFRef.js`) caches the
+`<obj> <gen> R` string on each instance:
+
+```js
+function PDFRef(objectNumber, generationNumber) {
+  var _this = this;
+  ...
+  _this.tag = objectNumber + ' ' + generationNumber + ' R';
+}
+```
+
+so that `toString` / `sizeInBytes` / `copyBytesInto` can read it
+back -- the three prototype methods are then trivial (`this.tag`,
+`this.tag.length`, `copyStringIntoBuffer(this.tag, ...)`). The
+earlier `fast-refs` shim already constructs the gen=0 PDFRef via
+`Object.create(PDFRef.prototype)` + manual field init, so it
+populated `tag` itself to preserve those reads.
+
+### The shim
+
+Drop the field entirely. The three prototype methods compute their
+results from `objectNumber` / `generationNumber` directly:
+
+- `copyBytesInto`: writes digits straight into the output buffer
+  via a no-allocation `_writeUint` helper
+  (divide-and-write-backwards into the caller's buffer). No
+  `copyStringIntoBuffer` call.
+- `sizeInBytes`: returns `_digitCount(obj) + _digitCount(gen) + 3`
+  (the trailing 3 covers " " + " R"). `_digitCount` is a ladder
+  catching the common small-number cases without arithmetic.
+- `toString`: builds on demand. Debug-only path, no caching needed.
+
+Both gen=0 (no tag set; `fastOf` skips the upstream constructor)
+and gen!=0 (tag set by upstream's constructor but our overrides
+ignore it) work. The gen!=0 path's tag string is
+allocated-then-wasted (gen!=0 is ~18 % of refs, ~50 K instances at
+~30 B each: on the order of 1 MB), bounded enough not to be worth
+patching the upstream constructor for.
+
+### Measured heap
+
+Process phase, 512 B sampling, paired runs vs the
+`fast-array-onebuf` baseline:
+
+| Allocator                       | Pre (MB) | Post (MB) | Delta              |
+|---------------------------------|---------:|----------:|-------------------:|
+| `parseIndirectObjectHeader`     |    13.7  |     9.3   | **-4.3 MB**        |
+| `fastOf` (refs)                 |     7.7  |     4.8   | **-2.9 MB**        |
+| Total sampled                   |    51.9  |    45.2   | **-6.7 MB (-13 %)** |
+
+The `parseArray` row was already collapsed by `fast-array-onebuf`,
+so this round attacks the next-largest remaining attribution. The
+residual 9.3 MB at `parseIndirectObjectHeader` and 4.8 MB at
+`fastOf` are the `PDFRef` instances themselves (`Object.create` +
+`objectNumber` + `generationNumber` fields, ~32-48 B × ~150 k)
+plus V8 inlining leakage from the `fastOf` speculation call site.
+Hard floor without dropping per-PDFRef wrappers entirely (which
+the class-shape round below picks up).
+
+### Measured CPU
+
+Pinned 0x5500 / High, no profiler, 4 runs each side:
+
+| State    |  median  |   mean   |
+|----------|---------:|---------:|
+| with-tag | 1.045 s  | 1.045 s  |
+| tagless  | 1.030 s  | 1.030 s  |
+| Δ        | ~15 ms tagless faster (in the noise but trending) |
+
+### Validation
+
+Output PDF is byte-identical to baseline modulo `/CreationDate`
++ `/ModDate` timestamps -- verified by inflating + diffing all
+453 ObjStm streams. The change is local to
+[`docs/lib/fast-refs.mjs`](../../docs/lib/fast-refs.mjs); no
+production import or flag change needed since `--fast-refs` was
+already wired up.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -3917,7 +4021,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + parseDict pre-sized array          | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + fast-dict-onebuf                   | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + measure-pass Phase 1               | ~1.0 s  | ~0.7 s | ~0.4 s |
-| **+ fast-array-onebuf (this section)** | **~1.0 s** | **~0.7 s** | **~0.4 s** |
+| + fast-array-onebuf                  | ~1.0 s  | ~0.7 s | ~0.4 s |
+| **+ fast-refs tag drop (this section)** | **~1.0 s** | **~0.7 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From b04f5ca017d85ffcc60096860fd75116c76a0741 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 10:58:03 +0200
Subject: [PATCH 34/44] fast-sync-load: skip skipJibberish on the digit-byte
 fast path.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On valid PDFs, the byte after each successful parseIndirectObject
+ skipWhitespaceAndComments is almost always a digit -- the start
of the next `N M obj` header. skipJibberish only exists to recover
from invalid PDFs that wedge garbage between indirect objects, but
its hot path runs unconditionally: ~150 k calls per load on the
book, each speculatively trying matchKeyword(xref/trailer/startxref)
(all fail on a digit) and then matchIndirectObjectHeader (a
try/catch around parseIndirectObjectHeader → parseRawInt × 2 →
matchKeyword('obj') → fastOf round-trip). The speculation succeeds
every time, the cursor gets rewound, and the outer while loop's
IsDigit check confirms what the speculation already proved.

Short-circuit when the cursor is on a digit; fall through to
skipJibberish on anything else (xref / trailer / startxref keyword
starts, or real jibberish between indirect-object sections).

The once-per-section skipJibberish in parseDocumentSection (after
maybeParseTrailer) is unaffected -- it handles boundaries between
PDF revisions / EOF where stray bytes are spec-legal.

Wall-clock impact (pinned 0x5500 / High, no profiler, 4 paired runs):

  without fast-path  median 1.07 s, mean 1.053 s
  with    fast-path  median 0.995 s, mean 0.985 s
  Δ                  ~67 ms faster (mean), ~6 % of process phase

Phase breakdown isolates the win to load (mean 0.518 → 0.455 s,
-62 ms); save is flat as expected (fast-path is load-side only).

Heap unchanged (0 MB delta, as predicted) -- the PDFRef instances the
speculation used to allocate are now allocated by the real
parseIndirectObject on its cache miss: attribution moves, volume doesn't.

Output PDF byte-identical to the pre-patch baseline (verified by
inflating + diffing all 453 ObjStm streams modulo timestamps).
---
 docs/lib/fast-sync-load.mjs | 14 +++++++
 perf/README.md              | 11 ++++-
 perf/notes/08-pdf-lib.md    | 81 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 103 insertions(+), 3 deletions(-)

diff --git a/docs/lib/fast-sync-load.mjs b/docs/lib/fast-sync-load.mjs
index fd473f8b..1109247d 100644
--- a/docs/lib/fast-sync-load.mjs
+++ b/docs/lib/fast-sync-load.mjs
@@ -133,6 +133,20 @@ if (!PDFParser.prototype.__fastSyncLoadInstalled) {
         this.tryToParseInvalidIndirectObject();
       }
       this.skipWhitespaceAndComments();
+      // Fast path: on valid PDFs the next byte is almost always a digit
+      // (start of the next `N M obj` header). skipJibberish only exists
+      // to recover from invalid PDFs that wedge garbage between indirect
+      // objects, but its hot path -- 150 k calls per load on the book --
+      // speculatively runs matchKeyword(xref/trailer/startxref) (all fail
+      // on a digit) and then matchIndirectObjectHeader (a try/catch
+      // around parseIndirectObjectHeader + parseRawInt x2 + matchKeyword
+      // + fastOf round-trip). All to confirm what the outer while's
+      // IsDigit check already knew. Short-circuit when the cursor is on
+      // a digit; fall through to skipJibberish on anything else
+      // (xref / trailer / startxref keyword starts, or real jibberish).
+      // The once-per-section skipJibberish in parseDocumentSection
+      // (after maybeParseTrailer) is unaffected.
+      if (!this.bytes.done() && IsDigit[this.bytes.peek()]) continue;
       this.skipJibberish();
     }
   };
diff --git a/perf/README.md b/perf/README.md
index e9bd566d..751c3fc3 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -230,7 +230,13 @@ Flag rationale:
   dispatch + Promise allocation. The shim removes the scaffolding
   entirely. The `parseSpeed` / `objectsPerTick` options drop off
   `PDFDocument.load`, `parallelSave`, and `pdfDoc.save` call sites
-  in step. Production runs through it.
+  in step. Also short-circuits `skipJibberish` on the digit-byte
+  fast path -- `parseDocument`'s inner loop calls it ~150 k times
+  per load on the book, each call speculatively running
+  `matchKeyword(xref/trailer/startxref)` + `matchIndirectObjectHeader`
+  to confirm what the outer `while`'s `IsDigit` check already
+  proved; peeking the byte first and `continue`-ing on a digit
+  saves ~62 ms on load. Production runs through it.
 - `--fast-indirect-objects` -- inject
   [docs/lib/fast-indirect-objects.mjs](../docs/lib/fast-indirect-objects.mjs),
   replacing `PDFContext.indirectObjects` (`Map<PDFRef, PDFObject>`)
@@ -604,6 +610,7 @@ file documenting each:
 | `measure-pass` (Phase 1) wired into production via `setExpectedDictSlots()` | [08](notes/08-pdf-lib.md) | byte-identical output; mainBuf pre-sized exact (no V8 growth resizes); ~+40 ms net process |
 | One-buffer `PDFArray` (single arrayMain + packed (start, length) view) | [08](notes/08-pdf-lib.md) | total process heap 66 MB → 52 MB (-21 %); parseArray off top 15; cumulative -66 % since Map-backed PDFDict |
 | Drop per-instance `PDFRef.tag` string (`copyBytesInto` digit-write, `sizeInBytes` digit-count, `toString` on demand) | [08](notes/08-pdf-lib.md) | `parseIndirectObjectHeader` 13.7 MB → 9.3 MB; total process heap 51.9 MB → 45.2 MB (-13 %) |
+| `skipJibberish` digit-byte fast path (peek before speculative `matchKeyword` + `matchIndirectObjectHeader`) | [08](notes/08-pdf-lib.md) | load mean 0.518 → 0.455 s (-62 ms, -6 %); save flat; byte-identical output |
 
 What was tried and didn't ship:
 
@@ -630,4 +637,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process). |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 67b7ccb1..d79397b6 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -3982,6 +3982,84 @@ Output PDF is byte-identical to baseline modulo `/CreationDate`
 production import or flag change needed since `--fast-refs` was
 already wired up.
 
+## `skipJibberish` digit-byte fast path
+
+The same `find-heap-callers.mjs` chain that surfaced the `PDFRef.tag`
+churn (previous section) named another redundancy worth chasing on
+the CPU side:
+
+```
+parseIndirectObjectHeader  → skipJibberish (14.2 MB)
+  → matchIndirectObjectHeader (try/catch wrapper)
+    → parseIndirectObjectHeader → fastOf
+```
+
+`skipJibberish` runs after every successful indirect object parse
+and exists only to recover from invalid PDFs that wedge garbage
+between indirect objects. Its hot path fires ~150 k times per load
+on the book, each call speculatively running:
+
+1. `matchKeyword('xref' / 'trailer' / 'startxref')` -- all fail on a
+   digit byte.
+2. `matchIndirectObjectHeader` -- a `try` / `catch` around
+   `parseIndirectObjectHeader` → `parseRawInt` × 2 →
+   `matchKeyword('obj')` → `fastOf` round-trip. The speculation
+   succeeds every time on a valid PDF, the cursor rewinds, and the
+   outer `while`'s `IsDigit` check confirms what the speculation
+   already proved.
+
+### Where the speculation lives
+
+`PDFParser.parseDocument`'s inner loop already calls
+`skipWhitespaceAndComments` between indirect objects. Patch a
+single-byte peek in front of `skipJibberish`:
+
+```js
+if (!this.bytes.done() && IsDigit[this.bytes.peek()]) continue;
+this.skipJibberish();
+```
+
+When the next byte is a digit (start of the next `N M obj` header
+on every valid PDF), `continue` skips straight to the next
+`parseIndirectObject`. Anything else (`xref` / `trailer` /
+`startxref` keyword starts, or real jibberish between indirect-object
+sections) falls through to `skipJibberish` unchanged.
+
+The once-per-section `skipJibberish` in `parseDocumentSection`
+(after `maybeParseTrailer`) is unaffected -- it handles boundaries
+between PDF revisions / EOF where stray bytes are spec-legal.
+
+### Measured CPU
+
+Pinned 0x5500 / High, no profiler, 4 paired runs:
+
+| State                | median  | mean    |
+|----------------------|--------:|--------:|
+| without fast path    | 1.07 s  | 1.053 s |
+| with fast path       | 0.995 s | 0.985 s |
+| Δ                    |         | ~67 ms faster, ~6 % of process phase |
+
+Phase breakdown isolates the win to load (mean 0.518 → 0.455 s,
+-62 ms); save is flat as expected -- the fast path is load-side
+only.
+
+### Heap
+
+Unchanged (0 MB delta). The `PDFRef` instances the speculation
+allocated were already attribution-shifted to the real
+`parseIndirectObject`'s cache miss, not new allocations. The
+fast-path skips the speculation's `try` / `catch` + dispatch
+overhead, not its allocation tail.
+
+### Validation
+
+Output PDF byte-identical to the pre-patch baseline (verified by
+inflating + diffing all 453 ObjStm streams modulo `/CreationDate`
++ `/ModDate` timestamps). The change is local to
+[`docs/lib/fast-sync-load.mjs`](../../docs/lib/fast-sync-load.mjs);
+no production import or flag change needed since `--fast-sync-load`
+was already wired up.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4022,7 +4100,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-dict-onebuf                   | ~1.0 s  | ~0.6 s | ~0.4 s |
 | + measure-pass Phase 1               | ~1.0 s  | ~0.7 s | ~0.4 s |
 | + fast-array-onebuf                  | ~1.0 s  | ~0.7 s | ~0.4 s |
-| **+ fast-refs tag drop (this section)** | **~1.0 s** | **~0.7 s** | **~0.4 s** |
+| + fast-refs tag drop                 | ~1.0 s  | ~0.7 s | ~0.4 s |
+| **+ skipJibberish digit fast-path (this section)** | **~0.95 s** | **~0.6 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 55a20d95ef37f61f07c1a65f83ac1a8f8183c93c Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 11:03:43 +0200
Subject: [PATCH 35/44] fast-refs-class: constructor-based PDFRef shape (-3.87
 MB heap, -12 % process).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fast-refs builds PDFRef instances via
`Object.create(PDFRef.prototype) + fresh.objectNumber = ... +
fresh.generationNumber = ...`. V8 transitions the hidden class on
each property write and routes the result through the slow-property
path. Empirically on the book that's ~60 B per pooled PDFRef, vs
~31 B for PDFName (built via `new PDFName(...)` -- a real
constructor with stable hidden class from the first instance).

This shim swaps `Object.create + writes` for a plain function used
as a constructor that sets both fields in one shot. Aliasing
`_FastRef.prototype = PDFRef.prototype` keeps `instanceof PDFRef`
satisfied and resolves all prototype methods on the shared
prototype (no extra proto-chain hop). gen != 0 still falls back to
upstream PDFRef.of's Map-based pool (rare on freshly-parsed PDFs).

Measured on the book (paired heap + cpu profile, --fast-refs vs
--fast-refs-class with the rest of the production shim set on):

  Heap (sampled total)            45.26 MB -> 41.39 MB  (-3.87 MB, -8.5 %)
  fastOf / fastClassOf row         4 696 KB ->  3 435 KB (-1 261 KB)
  create (builtin)                 3 379 KB ->  2 627 KB (-  752 KB)
  parseIndirectObjectHeader row    9 115 KB ->  7 435 KB (-1 680 KB)

Per-PDFRef savings: ~16 B/instance × 226 k unique = ~3.7 MB.
Not the full 30 B-to-PDFName-floor (PDFRef carries 2 fields vs
PDFName's 1), but a clean win and the construction-style change
applies symmetrically to the other Object.create-built shapes
(fast-dict-onebuf._makeFromRange, fast-array-onebuf._makeFromRange)
for the next round.

  Process wall-clock              1.13 s -> 0.99 s  (-140 ms, -12 %)
    load                          0.52 s -> 0.47 s
    save                          0.51 s -> 0.44 s
  fastOf (PDFRef) self-time       28 ms  -> (out of top 15)

GC self-time barely moved (87 ms -> 82 ms), consistent with the
allocation-rate drop being modest relative to mark-cost (the live
fast-dict-onebuf mainBuf still dominates the GC bill).

fast-refs.mjs stays in the tree as an A/B baseline. measure.mjs
mutex-checks --fast-refs and --fast-refs-class so the wrong one
can't be loaded silently. render-book.mjs swaps the import:
production runs through fast-refs-class now.
---
 docs/lib/fast-refs-class.mjs | 100 ++++++++++++++++++++++++++++++++++
 docs/render-book.mjs         |  27 +++++++--
 perf/README.md               |  53 ++++++++++++------
 perf/measure.mjs             |  10 ++++
 perf/notes/08-pdf-lib.md     | 103 ++++++++++++++++++++++++++++++++++-
 5 files changed, 270 insertions(+), 23 deletions(-)
 create mode 100644 docs/lib/fast-refs-class.mjs

diff --git a/docs/lib/fast-refs-class.mjs b/docs/lib/fast-refs-class.mjs
new file mode 100644
index 00000000..ea53af03
--- /dev/null
+++ b/docs/lib/fast-refs-class.mjs
@@ -0,0 +1,100 @@
+// fast-refs variant: use a class-style constructor for stable hidden class.
+//
+// fast-refs.mjs builds PDFRef instances with
+// `Object.create(PDFRef.prototype) + fresh.objectNumber = ... + fresh.generationNumber = ...`.
+// V8 treats objects built that way as transitioning through intermediate
+// hidden-class maps as each property is added, and the result is roughly
+// twice as large per instance as a `new`-built object with the same
+// fields. Empirically on the book, PDFRef sits at ~60 B/instance via
+// fast-refs whereas PDFName (built via `new PDFName(...)`) sits at ~31 B.
+//
+// This shim swaps the `Object.create + writes` pattern for a constructor
+// that sets both fields in one shot, giving V8 a stable hidden class
+// from the first instance. Same external behaviour (pool semantics,
+// prototype methods, instanceof checks all work) -- the only change is
+// the construction style.
+//
+// Expected win: ~6 MB heap on the book (226 k PDFRef instances × ~30 B saved
+// by skipping the slow-property path); measured: ~3.9 MB (~16 B/instance).
+//
+// Mutually exclusive with --fast-refs in the harness.
+
+import { PDFRef } from 'pdf-lib';
+
+// ---- helpers (same as fast-refs.mjs, see commentary there) -------------
+
+function _writeUint(buffer, offset, n) {
+  if (n < 10) { buffer[offset] = 0x30 + n; return 1; }
+  let m = n, d = 0;
+  while (m > 0) { d++; m = (m / 10) | 0; }
+  for (let i = d - 1; i >= 0; i--) {
+    buffer[offset + i] = 0x30 + (n % 10);
+    n = (n / 10) | 0;
+  }
+  return d;
+}
+
+function _digitCount(n) {
+  if (n < 10)      return 1;
+  if (n < 100)     return 2;
+  if (n < 1000)    return 3;
+  if (n < 10000)   return 4;
+  if (n < 100000)  return 5;
+  if (n < 1000000) return 6;
+  let d = 0;
+  while (n > 0) { d++; n = (n / 10) | 0; }
+  return d;
+}
+
+// ---- the constructor-based fast PDFRef shape ---------------------------
+
+// Plain function used as a constructor (V8 gives `new`-built instances a
+// stable hidden class derived from the assignment order in the body).
+// Aliasing the prototype to PDFRef.prototype keeps `instanceof PDFRef`
+// satisfied AND means method dispatch resolves on the shared prototype
+// (no extra proto-chain hop).
+function _FastRef(objectNumber, generationNumber) {
+  this.objectNumber = objectNumber;
+  this.generationNumber = generationNumber;
+}
+_FastRef.prototype = PDFRef.prototype;
+
+if (!PDFRef.__fastRefsClassInstalled) {
+  const original = PDFRef.of;
+  const pool0 = [];
+
+  PDFRef.of = function fastClassOf(objectNumber, generationNumber) {
+    if (generationNumber === undefined || generationNumber === 0) {
+      const existing = pool0[objectNumber];
+      if (existing) return existing;
+      const fresh = new _FastRef(objectNumber, 0);
+      pool0[objectNumber] = fresh;
+      return fresh;
+    }
+    // gen != 0: fall back to upstream PDFRef.of (its Map-based pool).
+    return original.call(PDFRef, objectNumber, generationNumber);
+  };
+
+  // Replace prototype methods to ignore the upstream `tag` field (the
+  // gen != 0 fallback path still sets it, but our overrides recompute
+  // from objectNumber / generationNumber so the tag is unused).
+  PDFRef.prototype.toString = function () {
+    return this.objectNumber + ' ' + this.generationNumber + ' R';
+  };
+
+  PDFRef.prototype.sizeInBytes = function () {
+    return _digitCount(this.objectNumber) + _digitCount(this.generationNumber) + 3;
+  };
+
+  PDFRef.prototype.copyBytesInto = function (buffer, offset) {
+    const start = offset;
+    offset += _writeUint(buffer, offset, this.objectNumber);
+    buffer[offset++] = 0x20;  // ' '
+    offset += _writeUint(buffer, offset, this.generationNumber);
+    buffer[offset++] = 0x20;  // ' '
+    buffer[offset++] = 0x52;  // 'R'
+    return offset - start;
+  };
+
+  PDFRef.__fastRefsClassInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index a5139ada..024f718f 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -36,10 +36,27 @@ import { PDFDocument } from 'pdf-lib';
 // before any pdf-lib operation -- order doesn't matter. See
 // perf/notes/08-pdf-lib.md.
 //
-//   fast-refs         -- dense-array cache in front of PDFRef.of for
-//     the gen=0 case (82 % of ~1.2 M calls per load). ~0.2 s saved
-//     on load.
-//   fast-inflate      -- swaps pako.inflate for node:zlib.inflateSync
+//   fast-refs-class   -- dense-array cache in front of PDFRef.of for
+//     the gen=0 case (82 % of ~1.2 M calls per load) PLUS a
+//     class-constructor shape for the PDFRef instance, AND drops
+//     the per-instance `tag` string (toString / sizeInBytes /
+//     copyBytesInto compute from objectNumber / generationNumber
+//     directly via _writeUint + _digitCount helpers). Replaces the
+//     `Object.create(PDFRef.prototype) + property writes` pattern of
+//     the older fast-refs.mjs shim, which V8 routes through the
+//     slow-property path: PDFRef ended up at ~60 B/instance vs
+//     PDFName's ~31 B (`new PDFName(...)`-built). The constructor
+//     gives V8 a stable hidden class from the first instance and
+//     drops per-instance cost to ~44 B. On the book (226 k unique
+//     PDFRefs) the combined effect is ~3.87 MB heap (-8.5 % of
+//     total process-phase allocation) and ~140 ms wall-clock (-12 %
+//     of process) on top of the tag-drop refinement that already
+//     trimmed parseIndirectObjectHeader by ~4.3 MB. Same prototype
+//     methods, same instanceof semantics; the only change is the
+//     construction style. See "fast-refs-class" in
+//     perf/notes/08-pdf-lib.md. fast-refs.mjs stays in the tree as
+//     an A/B baseline (measure.mjs rejects enabling both shims).
+//   fast-inflate      -- swaps pako.inflate for node:zlib.inflateSync
 //     on the one pdf-lib call site that uses it
 //     (PDFCrossRefStreamParser during load). Negligible cost shift,
 //     but eliminates the last pdf-lib -> pako call at runtime.
@@ -144,7 +161,7 @@ import { PDFDocument } from 'pdf-lib';
 //     heap traffic from parseArray collapses (the `this.array`
 //     allocation + grow doublings across ~79 k PDFArrays). See
 //     "One-buffer PDFArray" in perf/notes/08-pdf-lib.md.
-import './lib/fast-refs.mjs';
+import './lib/fast-refs-class.mjs';
 import './lib/fast-inflate.mjs';
 import './lib/fast-parse-number.mjs';
 import './lib/fast-decode-name.mjs';
diff --git a/perf/README.md b/perf/README.md
index 751c3fc3..b88cd74e 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -67,7 +67,7 @@ The mirror command for CPU-profiling the pdf-lib roundtrip (run from
 `perf/`):
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
+node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
 ```
 
 `--out results/<label>` is optional but recommended: omit it and the
@@ -84,22 +84,39 @@ node analyze-profile.mjs results/<label>/process.cpuprofile --top 15
 Flag rationale:
 
 - `--fast-refs` -- inject the
-  [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shipping
-  fix (dense-array cache for `PDFRef.of`'s gen=0 path). On miss,
+  [docs/lib/fast-refs.mjs](../docs/lib/fast-refs.mjs) shim:
+  dense-array cache for `PDFRef.of`'s gen=0 path; on miss,
   constructs the `PDFRef` directly via
   `Object.create(PDFRef.prototype)` + manual field init, bypassing
-  the upstream `pool.set(tag, instance)` -- after
-  `--fast-indirect-objects` shipped, that pool was the last hot
-  `Map.set` in the heap profile. The `PDFRef.of` row drops out of
-  the CPU top-15 and the `set` builtin row collapses from ~7.5 MB
-  to ~0.5 MB (the residual is `PDFName` interning's
-  `fastCache.set`, harmless). The shim also drops the per-instance
-  `tag` string -- `toString` / `sizeInBytes` / `copyBytesInto`
-  compute from `objectNumber` / `generationNumber` directly (no
-  `<obj> <gen> R` string allocated per ref), dropping
+  the upstream `pool.set(tag, instance)` and dropping the
+  per-instance `tag` string (`toString` / `sizeInBytes` /
+  `copyBytesInto` compute from `objectNumber` /
+  `generationNumber` directly). After `--fast-indirect-objects`
+  shipped, the upstream pool was the last hot `Map.set` in the
+  heap profile; this drops the `PDFRef.of` row off the CPU top-15
+  and the `set` builtin row from ~7.5 MB to ~0.5 MB. The
+  tag-drop layer then collapses
   `parseIndirectObjectHeader` 13.7 MB → 9.3 MB and total process
-  heap 51.9 MB → 45.2 MB (-13 %). Production runs through it; the
-  profile should too.
+  heap 51.9 MB → 45.2 MB (-13 %). **A/B baseline only** since
+  `--fast-refs-class` shipped: the `Object.create + writes`
+  construction style routes V8 through the slow-property path,
+  ending up at ~60 B/instance vs the constructor version's ~44 B.
+  Mutually exclusive with `--fast-refs-class` in the harness.
+- `--fast-refs-class` -- inject the
+  [docs/lib/fast-refs-class.mjs](../docs/lib/fast-refs-class.mjs)
+  shipping fix. Same dense-array cache + tag-drop as
+  `--fast-refs`, but the PDFRef instance is built via a
+  constructor (`new _FastRef(...)` with both fields set in the
+  body) rather than `Object.create + property writes`. V8 gives
+  `new`-built instances a stable hidden class from the first
+  instance; on the book that's ~16 B/instance × 226 k unique refs
+  = ~3.87 MB heap (-8.5 % of total process-phase allocation) and
+  ~140 ms wall-clock (-12 % of process) on top of the tag-drop
+  refinement. `_FastRef.prototype = PDFRef.prototype` keeps
+  `instanceof PDFRef` satisfied and resolves method dispatch on
+  the shared prototype (no extra proto-chain hop). gen != 0 still
+  falls back to the upstream `PDFRef.of` Map-based pool (rare on
+  freshly-parsed PDFs). Production runs through it.
 - `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave`
   from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs),
   which pre-deflates object streams in parallel on libuv's pool with
@@ -300,7 +317,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?" (run from `perf/`):
 
 ```
-node measure.mjs --fast-refs --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
+node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
 ```
 
 Same `--out` / labelling note as the CPU command above: omit it for a
@@ -466,7 +483,8 @@ run.bat --time-hooks                      # per-task timing of every chunker/pol
 run.bat --incremental                     # process via incremental update instead of pdf-lib roundtrip
 run.bat --chrome-outline                  # let Chrome emit /Outlines (skip parseOutline + setOutline)
 run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
-run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path (ships in render-book.mjs by default; opt-in here for A/B)
+run.bat --fast-refs                       # dense-array cache for PDFRef.of's gen=0 path + tag-drop (A/B baseline; production now runs --fast-refs-class)
+run.bat --fast-refs-class                 # --fast-refs + class-constructor PDFRef shape for stable V8 hidden class (also ships; opt-in here for A/B)
 run.bat --parallel-deflate                # parallelSave with objectsPerStream=500 (also ships; opt-in here for A/B)
 run.bat --fast-decode-name                # skip decodeName regex when name has no # (also ships; opt-in here for A/B)
 run.bat --fast-number-to-string           # skip numberToString redundant toString/split when no exponential (also ships; opt-in here for A/B)
@@ -611,6 +629,7 @@ file documenting each:
 | One-buffer `PDFArray` (single arrayMain + packed (start, length) view) | [08](notes/08-pdf-lib.md) | total process heap 66 MB → 52 MB (-21 %); parseArray off top 15; cumulative -66 % since Map-backed PDFDict |
 | Drop per-instance `PDFRef.tag` string (`copyBytesInto` digit-write, `sizeInBytes` digit-count, `toString` on demand) | [08](notes/08-pdf-lib.md) | `parseIndirectObjectHeader` 13.7 MB → 9.3 MB; total process heap 51.9 MB → 45.2 MB (-13 %) |
 | `skipJibberish` digit-byte fast path (peek before speculative `matchKeyword` + `matchIndirectObjectHeader`) | [08](notes/08-pdf-lib.md) | load mean 0.518 → 0.455 s (-62 ms, -6 %); save flat; byte-identical output |
+| Class-constructor `PDFRef` shape (`new _FastRef(...)` for stable V8 hidden class) | [08](notes/08-pdf-lib.md) | per-PDFRef ~60 B → ~44 B; total process heap 45.3 MB → 41.4 MB (-8.5 %); process wall 1.13 s → 0.99 s (-140 ms, -12 %) |
 
 What was tried and didn't ship:
 
@@ -637,4 +656,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 72a6940d..bf6a9e5d 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -282,6 +282,7 @@ let cloneCount = false;
 let renderOnly = false;
 let tracing = false;
 let fastRefs = false;
+let fastRefsClass = false;
 let parallelDeflate = false;
 let fastDecodeName = false;
 let fastNumberToString = false;
@@ -324,6 +325,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--tracing') tracing = true;
   else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
   else if (a === '--fast-refs') fastRefs = true;
+  else if (a === '--fast-refs-class') fastRefsClass = true;
   else if (a === '--parallel-deflate') parallelDeflate = true;
   else if (a === '--fast-decode-name') fastDecodeName = true;
   else if (a === '--fast-number-to-string') fastNumberToString = true;
@@ -416,10 +418,18 @@ if (instrumentSlotTypes && (incremental || renderOnly)) {
 
 // Install the dense-array cache for PDFRef.of's gen=0 path before any
 // pdf-lib operation. Side-effecting import; idempotent.
+if (fastRefs && fastRefsClass) {
+  console.error('--fast-refs and --fast-refs-class are mutually exclusive (both shim PDFRef.of).');
+  process.exit(2);
+}
 if (fastRefs) {
   await import('../docs/lib/fast-refs.mjs');
   console.log('[harness] fast-refs: PDFRef.of dense-array cache for gen=0');
 }
+if (fastRefsClass) {
+  await import('../docs/lib/fast-refs-class.mjs');
+  console.log('[harness] fast-refs-class: PDFRef.of dense-array cache + class-constructor shape');
+}
 if (fastDecodeName) {
   await import('../docs/lib/fast-decode-name.mjs');
   console.log('[harness] fast-decode-name: skip decodeName regex when name has no #');
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index d79397b6..37982e3c 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4060,6 +4060,106 @@ inflating + diffing all 453 ObjStm streams modulo `/CreationDate`
 no production import or flag change needed since `--fast-sync-load`
 was already wired up.
 
+## Class-constructor `PDFRef` shape
+
+The `Object.create + writes` trick the original `fast-refs` shim uses
+to skip the upstream `ENFORCER` check and `pool.set` (see [Skip
+`PDFRef` `pool.set` on the gen=0 miss path](#skip-pdfref-poolset-on-the-gen0-miss-path)
+above) carries an unexpected per-instance cost: V8 transitions the
+hidden class through one intermediate map per property write and
+routes the result through the slow-property path. On the book a
+fast-refs-built PDFRef sits at ~60 B/instance vs PDFName's ~31 B
+(built via `new PDFName(...)` -- a real constructor with a stable
+hidden class from the first instance).
+
+### The shim
+
+Plain function used as a constructor, both fields set in one shot:
+
+```js
+function _FastRef(objectNumber, generationNumber) {
+  this.objectNumber = objectNumber;
+  this.generationNumber = generationNumber;
+}
+_FastRef.prototype = PDFRef.prototype;
+
+PDFRef.of = function fastClassOf(objectNumber, generationNumber) {
+  if (generationNumber === undefined || generationNumber === 0) {
+    const existing = pool0[objectNumber];
+    if (existing) return existing;
+    const fresh = new _FastRef(objectNumber, 0);
+    pool0[objectNumber] = fresh;
+    return fresh;
+  }
+  return original.call(PDFRef, objectNumber, generationNumber);
+};
+```
+
+Aliasing `_FastRef.prototype = PDFRef.prototype` keeps
+`instanceof PDFRef` satisfied AND means method dispatch resolves
+on the shared prototype (no extra proto-chain hop). gen != 0 still
+falls back to the upstream `PDFRef.of` Map-based pool (rare on
+freshly-parsed PDFs).
+
+Same `toString` / `sizeInBytes` / `copyBytesInto` prototype
+overrides as the tag-drop section above -- the constructor produces
+gen=0 PDFRefs with no `tag` field at all, and the gen!=0 upstream
+fallback still sets `tag` but our overrides ignore it.
+
+### Measured heap
+
+Paired heap profile (`--fast-refs` vs `--fast-refs-class`, with the
+rest of the production shim set on):
+
+| Allocator                       | Pre        | Post       | Delta                  |
+|---------------------------------|-----------:|-----------:|-----------------------:|
+| Total sampled                   |  45.26 MB  |  41.39 MB  | **-3.87 MB (-8.5 %)**  |
+| `fastOf` / `fastClassOf` row    |   4 696 KB |   3 435 KB | -1 261 KB              |
+| `create` (builtin)              |   3 379 KB |   2 627 KB | -752 KB                |
+| `parseIndirectObjectHeader` row |   9 115 KB |   7 435 KB | -1 680 KB              |
+
+Per-PDFRef savings work out to ~16 B/instance × 226 k unique refs
+= ~3.6 MB, close to the measured 3.87 MB total. Not the full
+30 B-to-PDFName-floor (PDFRef carries 2 fields vs PDFName's 1),
+but a clean win and the construction-style change applies
+symmetrically to the other `Object.create`-built shapes
+(`fast-dict-onebuf._makeFromRange`,
+`fast-array-onebuf._makeFromRange`) for the next round.
+
+### Measured CPU
+
+Paired wall-clock and profile (`--cpu-profile-process`):
+
+| Row                        | Pre      | Post     | Delta              |
+|----------------------------|---------:|---------:|-------------------:|
+| Process wall-clock         | 1.13 s   | 0.99 s   | **-140 ms (-12 %)** |
+| load                       | 0.52 s   | 0.47 s   | -50 ms              |
+| save                       | 0.51 s   | 0.44 s   | -70 ms              |
+| `fastOf` (PDFRef) self-time| 28 ms    | out of top 15 | drops off      |
+
+GC self-time barely moved (87 ms → 82 ms), consistent with the
+allocation-rate drop being modest relative to mark-cost -- the live
+`fast-dict-onebuf` mainBuf still dominates the GC bill.
+
+### Wiring
+
+- [`docs/lib/fast-refs-class.mjs`](../../docs/lib/fast-refs-class.mjs)
+  -- new shim. Same `_writeUint` / `_digitCount` helpers as
+  `fast-refs`; same prototype overrides; only the construction style
+  differs.
+- [`docs/render-book.mjs`](../../docs/render-book.mjs) -- swaps
+  `import './lib/fast-refs.mjs'` for `import './lib/fast-refs-class.mjs'`.
+  Production runs through the new shim.
+- [`perf/measure.mjs`](../measure.mjs) -- adds the
+  `--fast-refs-class` flag with a mutex check against `--fast-refs`
+  (both shim `PDFRef.of`; if both loaded silently, any resulting
+  breakage would be hard to trace back to the double shim).
+
+`fast-refs.mjs` stays in the tree as an A/B baseline -- the
+construction style is the whole point of the comparison, so being
+able to flip back to the older shape with a flag is worth the
+20 lines of duplication.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4101,7 +4201,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + measure-pass Phase 1               | ~1.0 s  | ~0.7 s | ~0.4 s |
 | + fast-array-onebuf                  | ~1.0 s  | ~0.7 s | ~0.4 s |
 | + fast-refs tag drop                 | ~1.0 s  | ~0.7 s | ~0.4 s |
-| **+ skipJibberish digit fast-path (this section)** | **~0.95 s** | **~0.6 s** | **~0.4 s** |
+| + skipJibberish digit fast-path      | ~0.95 s | ~0.6 s | ~0.4 s |
+| **+ fast-refs-class (this section)** | **~0.9 s** | **~0.55 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From fd265700e332242776c9498ca987f3b6f86b3bed Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 11:06:27 +0200
Subject: [PATCH 36/44] fast-dict-onebuf: constructor-based PDFDict shape
 (-5.98 MB heap).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same shape change fast-refs-class applied to PDFRef, now applied to
the PDFDict factory paths. Replaces
`Object.create(ProtoClass.prototype) + pd.d = ... [+ pd.normalized
= false + pd.autoNormalizeCTM = true]` in _makeFromRange and clone
with one plain-function constructor per subclass (`_FastDict`,
`_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`), each with the
right field assignments in its body and its prototype aliased to
the upstream prototype so instanceof + method dispatch are
unchanged. Unknown subclasses fall back to the original
Object.create path (defensive; nothing in our pipeline hits it).

Measured on the book (paired heap + cpu profile, fast-refs-class
baseline vs + this change):

  Heap (sampled total)            41.39 MB -> 35.41 MB  (-5.98 MB, -14.4 %)
  _makeFromRange (dict)           16 484 KB -> 11 404 KB (-5 080 KB)
  create (builtin)                 2 627 KB ->    921 KB (-1 706 KB)
  _FastDict (new attribution row)        — ->    621 KB

Per-PDFDict saving: ~20 B/instance × 260 k = ~5.2 MB. Matches the
delta on the row plus the builtin's drop minus the new
constructor-frame attribution. Cumulative across fast-refs-class and
this change: total sampled 45.26 MB -> 35.41 MB = -9.85 MB (-22 %)
over two shape-change commits.

CPU is roughly flat (process 0.99 s -> 1.03 s under cpu profile,
within noise). GC self-time +18 ms (82 -> 101 ms), consistent with
the existing fast-dict-onebuf trade-off documented in
perf/README.md: the dominant GC cost is the live mainBuf scan, not
allocation rate, so cutting allocation doesn't cut mark time. The
allocation-rate reduction still matters for sustained-load memory
pressure even when it doesn't move single-shot wall-clock.

Output PDF byte-identical modulo /CreationDate + /ModDate
timestamps (no content path touched, only the JS object shape used
to wrap the parsed dict range).
---
 docs/lib/fast-dict-onebuf.mjs |  50 +++++++++++++---
 perf/README.md                |  14 ++++-
 perf/notes/08-pdf-lib.md      | 105 +++++++++++++++++++++++++++++++++-
 3 files changed, 157 insertions(+), 12 deletions(-)

diff --git a/docs/lib/fast-dict-onebuf.mjs b/docs/lib/fast-dict-onebuf.mjs
index b810d3cc..1d705369 100644
--- a/docs/lib/fast-dict-onebuf.mjs
+++ b/docs/lib/fast-dict-onebuf.mjs
@@ -157,15 +157,51 @@ function _cow(pd) {
 }
 
 // ---- Construction ---------------------------------------------------
+//
+// Use plain-function constructors with the prototype aliased to the
+// upstream PDFDict / PDFCatalog / PDFPageTree / PDFPageLeaf prototypes
+// instead of `Object.create(proto) + property writes`. V8 gives
+// `new`-built instances a stable hidden class derived from the
+// assignment order in the constructor body, and per-instance heap cost
+// drops materially vs the slow-property path taken by Object.create +
+// later writes (the same shape change that fast-refs-class made for
+// PDFRef: ~60 B/instance -> ~44 B). For the 260 k+ dicts on the book
+// the per-instance gap × instance count is the dominant remaining heap
+// row.
+//
+// One constructor per subclass so V8 sees a single fixed shape per
+// kind. PDFPageLeaf carries extra fields (normalized,
+// autoNormalizeCTM) -- they're assigned in the constructor body so
+// the shape stays fixed. Any unknown PDFDict subclass falls back to
+// the original Object.create path so the shim doesn't crash on
+// downstream extensions (none in our pipeline; defensive only).
+
+function _FastDict(d) { this.d = d; }
+_FastDict.prototype = PDFDict.prototype;
+
+function _FastCatalog(d) { this.d = d; }
+_FastCatalog.prototype = PDFCatalog.prototype;
+
+function _FastPageTree(d) { this.d = d; }
+_FastPageTree.prototype = PDFPageTree.prototype;
+
+function _FastPageLeaf(d) {
+  this.d = d;
+  this.normalized = false;
+  this.autoNormalizeCTM = true;
+}
+_FastPageLeaf.prototype = PDFPageLeaf.prototype;
 
 function _makeFromRange(ProtoClass, start, length, ctx) {
   _registerContext(ctx);
+  const d = pack(start, length);
+  if (ProtoClass === PDFDict)      return new _FastDict(d);
+  if (ProtoClass === PDFPageLeaf)  return new _FastPageLeaf(d);
+  if (ProtoClass === PDFCatalog)   return new _FastCatalog(d);
+  if (ProtoClass === PDFPageTree)  return new _FastPageTree(d);
+  // Defensive fallback for any unknown subclass.
   const pd = Object.create(ProtoClass.prototype);
-  pd.d = pack(start, length);
-  if (ProtoClass === PDFPageLeaf) {
-    pd.normalized = false;
-    pd.autoNormalizeCTM = true;
-  }
+  pd.d = d;
   return pd;
 }
 
@@ -300,9 +336,7 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
     for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
     mainLen += length;
     _registerContext(context || _singletonContext);
-    const c = Object.create(PDFDict.prototype);
-    c.d = pack(newStart, length);
-    return c;
+    return new _FastDict(pack(newStart, length));
   };
 
   PDFDict.prototype.toString = function () {
diff --git a/perf/README.md b/perf/README.md
index b88cd74e..21ea19ae 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -181,9 +181,16 @@ Flag rationale:
   post-parse, COW results) also append to main. Mutations:
   in-place replace for existing keys, COW (copy range to tail,
   append new pair, update encoded range) for new keys or delete.
-  Mutually exclusive with the other dict-shape shims. ~57 %
+  The wrapper instances themselves use the same constructor-based
+  shape `fast-refs-class` introduced for PDFRef -- one
+  plain-function constructor per subclass (`_FastDict`,
+  `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the
+  prototype aliased to the upstream prototype, so V8 sees a stable
+  hidden class from the first instance. Saves ~20 B/PDFDict ×
+  260 k = ~5.2 MB heap on top of the storage refactor.
+  Mutually exclusive with the other dict-shape shims. ~77 %
   cumulative heap reduction since the original Map-backed PDFDict
-  (152 -> 66 MB). Production runs through it. See
+  (152 -> 35 MB). Production runs through it. See
   [notes/08-pdf-lib.md "One-buffer PDFDict"](notes/08-pdf-lib.md).
 - `--fast-array-onebuf` -- inject
   [docs/lib/fast-array-onebuf.mjs](../docs/lib/fast-array-onebuf.mjs).
@@ -630,6 +637,7 @@ file documenting each:
 | Drop per-instance `PDFRef.tag` string (`copyBytesInto` digit-write, `sizeInBytes` digit-count, `toString` on demand) | [08](notes/08-pdf-lib.md) | `parseIndirectObjectHeader` 13.7 MB → 9.3 MB; total process heap 51.9 MB → 45.2 MB (-13 %) |
 | `skipJibberish` digit-byte fast path (peek before speculative `matchKeyword` + `matchIndirectObjectHeader`) | [08](notes/08-pdf-lib.md) | load mean 0.518 → 0.455 s (-62 ms, -6 %); save flat; byte-identical output |
 | Class-constructor `PDFRef` shape (`new _FastRef(...)` for stable V8 hidden class) | [08](notes/08-pdf-lib.md) | per-PDFRef ~60 B → ~44 B; total process heap 45.3 MB → 41.4 MB (-8.5 %); process wall 1.13 s → 0.99 s (-140 ms, -12 %) |
+| Class-constructor `PDFDict` shape (`_FastDict` / `_FastCatalog` / `_FastPageTree` / `_FastPageLeaf` per-subclass constructors) | [08](notes/08-pdf-lib.md) | `_makeFromRange (dict)` 16.5 MB → 11.4 MB; total process heap 41.4 MB → 35.4 MB (-14.4 %); cumulative -77 % since Map-backed PDFDict |
 
 What was tried and didn't ship:
 
@@ -656,4 +664,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 37982e3c..59d29c27 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4160,6 +4160,108 @@ construction style is the whole point of the comparison, so being
 able to flip back to the older shape with a flag is worth the
 20 lines of duplication.
 
+## Class-constructor `PDFDict` shape
+
+The same shape change `fast-refs-class` applied to PDFRef (above),
+now applied to the four PDFDict subclasses fast-dict-onebuf
+constructs: `PDFDict`, `PDFCatalog`, `PDFPageTree`, `PDFPageLeaf`.
+
+### Where fast-dict-onebuf was paying the same V8 tax
+
+`_makeFromRange` and the COW path inside `set` both build the
+wrapper instance via `Object.create(ProtoClass.prototype) + pd.d
+= ...` (plus `pd.normalized = false` / `pd.autoNormalizeCTM = true`
+for the PageLeaf case). On the book that's 260 k+ wrapper
+instances per load -- the dominant remaining heap row even after
+all the prior storage-shape work, with `_makeFromRange (dict)`
+showing 16.5 MB on the post-`fast-refs-class` profile.
+
+### The shim
+
+One plain-function constructor per subclass with the field
+assignments in the body. Aliasing each one's prototype to the
+upstream prototype keeps `instanceof` and method dispatch
+unchanged.
+
+```js
+function _FastDict(d) { this.d = d; }
+_FastDict.prototype = PDFDict.prototype;
+
+function _FastCatalog(d) { this.d = d; }
+_FastCatalog.prototype = PDFCatalog.prototype;
+
+function _FastPageTree(d) { this.d = d; }
+_FastPageTree.prototype = PDFPageTree.prototype;
+
+function _FastPageLeaf(d) {
+  this.d = d;
+  this.normalized = false;
+  this.autoNormalizeCTM = true;
+}
+_FastPageLeaf.prototype = PDFPageLeaf.prototype;
+
+function _makeFromRange(ProtoClass, start, length, ctx) {
+  _registerContext(ctx);
+  const d = pack(start, length);
+  if (ProtoClass === PDFDict)      return new _FastDict(d);
+  if (ProtoClass === PDFPageLeaf)  return new _FastPageLeaf(d);
+  if (ProtoClass === PDFCatalog)   return new _FastCatalog(d);
+  if (ProtoClass === PDFPageTree)  return new _FastPageTree(d);
+  // Defensive fallback for any unknown subclass.
+  const pd = Object.create(ProtoClass.prototype);
+  pd.d = d;
+  return pd;
+}
+```
+
+PageLeaf carries the extra `normalized` / `autoNormalizeCTM`
+fields -- they're assigned in the constructor body so V8 still sees
+a fixed shape per subclass. The COW path in `set` is updated in
+the same way (`return new _FastDict(pack(newStart, length))`).
+Unknown PDFDict subclasses fall back to the original Object.create
+path; nothing in our pipeline hits it (defensive only).
+
+### Measured heap
+
+Paired profile, `fast-refs-class` baseline vs + this change:
+
+| Allocator                       | Pre        | Post       | Delta              |
+|---------------------------------|-----------:|-----------:|-------------------:|
+| Total sampled                   |  41.39 MB  |  35.41 MB  | **-5.98 MB (-14.4 %)** |
+| `_makeFromRange` (dict)         |  16 484 KB |  11 404 KB | -5 080 KB          |
+| `create` (builtin)              |   2 627 KB |     921 KB | -1 706 KB          |
+| `_FastDict` (new row)           |     —      |     621 KB | +621 KB            |
+
+Per-PDFDict saving: ~20 B/instance × 260 k = ~5.2 MB. Matches the
+`_makeFromRange` delta + the builtin's drop minus the new
+constructor-frame attribution.
+
+**Cumulative across `fast-refs-class` + this change**: total sampled
+45.26 MB → 35.41 MB = **-9.85 MB (-22 %)** over two shape-change
+commits. This brings the cumulative heap reduction since the
+Map-backed baseline to ~77 % (152 MB → 35.4 MB).
+
+### Measured CPU
+
+Roughly flat -- process wall-clock 0.99 s → 1.03 s under cpu
+profile, within noise. GC self-time +18 ms (82 → 101 ms),
+consistent with the existing `fast-dict-onebuf` trade-off
+documented in the README: the dominant GC cost on this workload
+is the live `mainBuf` scan, not allocation rate, so cutting
+allocation doesn't move single-shot mark time. The
+allocation-rate reduction still matters for sustained-load
+memory pressure even when it doesn't show on a one-shot
+wall-clock.
+
+### Validation
+
+Output PDF byte-identical modulo `/CreationDate` + `/ModDate`
+timestamps -- only the JS object shape used to wrap the parsed
+dict range changed, not any content path. The change is local to
+[`docs/lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs);
+no production import or flag change needed since
+`--fast-dict-onebuf` was already wired up.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4202,7 +4304,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-array-onebuf                  | ~1.0 s  | ~0.7 s | ~0.4 s |
 | + fast-refs tag drop                 | ~1.0 s  | ~0.7 s | ~0.4 s |
 | + skipJibberish digit fast-path      | ~0.95 s | ~0.6 s | ~0.4 s |
-| **+ fast-refs-class (this section)** | **~0.9 s** | **~0.55 s** | **~0.4 s** |
+| + fast-refs-class                    | ~0.9 s  | ~0.55 s | ~0.4 s |
+| **+ fast-dict-onebuf class shape (this section)** | **~0.9 s** | **~0.55 s** | **~0.4 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From cd02b26c59e052ab676206a513503f42e0d1dfcd Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 11:09:30 +0200
Subject: [PATCH 37/44] fast-array-onebuf: constructor-based PDFArray shape
 (-1.73 MB heap, -130 ms process).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same shape change applied to PDFArray's factory paths -- replace
`Object.create(PDFArray.prototype) + pa.d = ...` in _makeFromRange
and clone with a `_FastArray` plain-function constructor, prototype
aliased to PDFArray.prototype. No subclass dispatch needed
(PDFArray has none in pdf-lib, unlike PDFDict).

Measured on the book (paired heap + cpu profile, prior commit's
dict-class baseline vs + this change):

  Heap (sampled total)            35.41 MB -> 33.68 MB  (-1.73 MB, -4.9 %)
  fastParseArrayOneBuf row         4 372 KB ->  3 334 KB (-1 038 KB)
  create (builtin)                   921 KB ->  out of top 15  (up to -921 KB)
  Process wall-clock               1.03 s ->  0.90 s  (-130 ms, -13 %)
  GC self-time                    100.90 ms -> 58.69 ms (-42 ms)

Per-PDFArray saving: ~22 B/instance × ~80 k = ~1.7 MB. Matches the
row delta + builtin drop.

Surprise win on GC + wall-clock: the change cuts GC self-time by
42 ms despite a much smaller allocation drop than fast-refs-class or
fast-dict-onebuf. The likely reason is that with all three
shape-changes in place, V8 sees fully monomorphic call sites for
PDFRef / PDFDict / PDFArray construction and method dispatch --
before the array change there was still one slow-property shape in
the mix dragging IC perf. Consistent with the cumulative
process-time arc:

  baseline (fast-refs)                 1.13 s    87 ms GC
  + fast-refs-class                    0.99 s    82 ms GC
  + fast-dict-onebuf class shape       1.03 s   101 ms GC
  + fast-array-onebuf class shape      0.90 s    59 ms GC

The dict-only state had a slight CPU regression (+40 ms vs
refs-class) that the array change undid and then some. Ship the
combo, not just the two big-row ones.

Cumulative across the three commits in this round (baseline ->
array-class):

  Process wall-clock              1.13 s   -> 0.90 s    (-230 ms, -20 %)
  Total sampled heap              45.26 MB -> 33.68 MB  (-11.58 MB, -25.6 %)
  GC self-time                    87 ms    -> 59 ms     (-32 %)
---
 docs/lib/fast-array-onebuf.mjs | 21 ++++++---
 perf/README.md                 | 20 ++++++---
 perf/notes/08-pdf-lib.md       | 82 +++++++++++++++++++++++++++++++++-
 3 files changed, 111 insertions(+), 12 deletions(-)

diff --git a/docs/lib/fast-array-onebuf.mjs b/docs/lib/fast-array-onebuf.mjs
index 66cf3a78..5ed833ce 100644
--- a/docs/lib/fast-array-onebuf.mjs
+++ b/docs/lib/fast-array-onebuf.mjs
@@ -138,12 +138,23 @@ function _cow(pa) {
 }
 
 // ---- Construction --------------------------------------------------
+//
+// Use a plain-function constructor (`_FastArray`) with the prototype
+// aliased to PDFArray.prototype instead of `Object.create + writes`.
+// Same shape change fast-refs-class and fast-dict-onebuf made: V8
+// gives `new`-built instances a stable hidden class from the first
+// instance and drops per-instance cost vs the slow-property path
+// taken by Object.create + later property writes.
+//
+// No subclass dispatch needed -- PDFArray has no subclasses in
+// pdf-lib (unlike PDFDict's PDFCatalog / PDFPageTree / PDFPageLeaf).
+
+function _FastArray(d) { this.d = d; }
+_FastArray.prototype = PDFArray.prototype;
 
 function _makeFromRange(start, length, ctx) {
   _registerContext(ctx);
-  const pa = Object.create(PDFArray.prototype);
-  pa.d = pack(start, length);
-  return pa;
+  return new _FastArray(pack(start, length));
 }
 
 function _makeFromAppend(arr, ctx) {
@@ -238,9 +249,7 @@ if (!PDFArray.prototype.__fastArrayOnebufInstalled) {
     for (let i = 0; i < length; i++) arrayMain[arrayMainLen + i] = arrayMain[start + i];
     arrayMainLen += length;
     _registerContext(context || _singletonContext);
-    const c = Object.create(PDFArray.prototype);
-    c.d = pack(newStart, length);
-    return c;
+    return new _FastArray(pack(newStart, length));
   };
 
   PDFArray.prototype.toString = function () {
diff --git a/perf/README.md b/perf/README.md
index 21ea19ae..9edbfb7f 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -207,10 +207,19 @@ Flag rationale:
   `_dictTemp`. Mutations: in-place replace for `set`, in-place
   extend at HWM for `push`, COW for everything else. Singleton
   context is duplicated (10 lines) rather than shared so each shim
-  stays independently injectable. ~19 MB process-phase heap traffic
-  drops -- collapses parseArray's `this.array = []` + grow doublings
-  across ~79 k PDFArrays. Composes with `--fast-dict-onebuf`.
-  Production runs through it. See
+  stays independently injectable. Wrapper instances built via a
+  `_FastArray` plain-function constructor (prototype aliased to
+  `PDFArray.prototype`) rather than `Object.create + writes`, the
+  same shape change `fast-refs-class` and `fast-dict-onebuf` made
+  on their factory paths -- worth ~22 B/PDFArray × ~80 k instances
+  = ~1.7 MB heap, but the headline win is that with all three
+  shape changes in place V8 sees fully monomorphic call sites for
+  PDFRef / PDFDict / PDFArray construction and method dispatch,
+  collapsing GC self-time 101 → 59 ms (-42 %) and process
+  wall-clock 1.03 → 0.90 s (-130 ms, -13 %). ~19 MB process-phase
+  heap traffic drops -- collapses parseArray's `this.array = []`
+  + grow doublings across ~79 k PDFArrays. Composes with
+  `--fast-dict-onebuf`. Production runs through it. See
   [notes/08-pdf-lib.md "One-buffer PDFArray"](notes/08-pdf-lib.md).
 - `--measure-pass` -- inject
   [docs/lib/measure-pass.mjs](../docs/lib/measure-pass.mjs), the
@@ -638,6 +647,7 @@ file documenting each:
 | `skipJibberish` digit-byte fast path (peek before speculative `matchKeyword` + `matchIndirectObjectHeader`) | [08](notes/08-pdf-lib.md) | load mean 0.518 → 0.455 s (-62 ms, -6 %); save flat; byte-identical output |
 | Class-constructor `PDFRef` shape (`new _FastRef(...)` for stable V8 hidden class) | [08](notes/08-pdf-lib.md) | per-PDFRef ~60 B → ~44 B; total process heap 45.3 MB → 41.4 MB (-8.5 %); process wall 1.13 s → 0.99 s (-140 ms, -12 %) |
 | Class-constructor `PDFDict` shape (`_FastDict` / `_FastCatalog` / `_FastPageTree` / `_FastPageLeaf` per-subclass constructors) | [08](notes/08-pdf-lib.md) | `_makeFromRange (dict)` 16.5 MB → 11.4 MB; total process heap 41.4 MB → 35.4 MB (-14.4 %); cumulative -77 % since Map-backed PDFDict |
+| Class-constructor `PDFArray` shape (`_FastArray` factory + monomorphic call-site unlock across all three Fast classes) | [08](notes/08-pdf-lib.md) | total process heap 35.4 MB → 33.7 MB (-4.9 %); process wall 1.03 s → 0.90 s (-130 ms); GC self-time 101 ms → 59 ms (-42 %); cumulative -78 % heap since Map-backed PDFDict, -20 % process across the three shape-change commits |
 
 What was tried and didn't ship:
 
@@ -664,4 +674,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 59d29c27..5ff377e0 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4262,6 +4262,85 @@ dict range changed, not any content path. The change is local to
 no production import or flag change needed since
 `--fast-dict-onebuf` was already wired up.
 
+## Class-constructor `PDFArray` shape
+
+The same shape change applied to PDFArray's factory paths. PDFArray
+has no subclasses in pdf-lib (unlike PDFDict), so a single
+`_FastArray` constructor covers both `_makeFromRange` and the COW
+path inside `set`:
+
+```js
+function _FastArray(d) { this.d = d; }
+_FastArray.prototype = PDFArray.prototype;
+
+function _makeFromRange(start, length, ctx) {
+  _registerContext(ctx);
+  return new _FastArray(pack(start, length));
+}
+```
+
+### Measured heap
+
+Paired profile, prior commit's dict-class baseline vs + this
+change:
+
+| Allocator                       | Pre        | Post       | Delta              |
+|---------------------------------|-----------:|-----------:|-------------------:|
+| Total sampled                   |  35.41 MB  |  33.68 MB  | **-1.73 MB (-4.9 %)** |
+| `fastParseArrayOneBuf` row      |   4 372 KB |   3 334 KB | -1 038 KB          |
+| `create` (builtin)              |     921 KB | out of top 15 | -921 KB        |
+
+Per-PDFArray saving: ~22 B/instance × ~80 k = ~1.7 MB. Matches the
+row delta + builtin drop.
+
+### Measured CPU -- the unexpected GC win
+
+| Row                | Pre       | Post     | Delta              |
+|--------------------|----------:|---------:|-------------------:|
+| Process wall-clock | 1.03 s    | 0.90 s   | **-130 ms (-13 %)** |
+| GC self-time       | 100.9 ms  | 58.7 ms  | **-42 ms (-42 %)**  |
+
+A surprising GC + wall-clock win for the smallest of the three
+heap drops. The likely reason is that with all three shape changes
+in place, V8 sees fully monomorphic call sites for PDFRef /
+PDFDict / PDFArray construction *and* method dispatch -- before
+the array change there was still one slow-property shape in the
+mix dragging IC perf. Consistent with the cumulative process arc:
+
+| State                                  | process  | GC     |
+|----------------------------------------|---------:|-------:|
+| baseline (fast-refs)                   | 1.13 s   | 87 ms  |
+| + fast-refs-class                      | 0.99 s   | 82 ms  |
+| + fast-dict-onebuf class shape         | 1.03 s   | 101 ms |
+| + fast-array-onebuf class shape        | **0.90 s** | **59 ms** |
+
+The dict-only state had a slight CPU regression (+40 ms vs
+fast-refs-class) that the array change undid and then some.
+Argues strongly for shipping the full combo, not just the two
+big-heap-row ones.
+
+### Cumulative across the three shape-change commits
+
+Baseline (`fast-refs`) → all-three (`fast-array-onebuf class
+shape`):
+
+| Metric              | Pre        | Post       | Delta                |
+|---------------------|-----------:|-----------:|---------------------:|
+| Process wall-clock  | 1.13 s     | 0.90 s     | **-230 ms (-20 %)**  |
+| Total sampled heap  | 45.26 MB   | 33.68 MB   | **-11.58 MB (-25.6 %)** |
+| GC self-time        | 87 ms      | 59 ms      | **-32 %**            |
+
+Cumulative process-phase heap reduction since the Map-backed
+PDFDict baseline now stands at **~78 %** (152 MB → 33.7 MB).
+
+### Validation
+
+Output PDF byte-identical modulo `/CreationDate` + `/ModDate`
+timestamps. The change is local to
+[`docs/lib/fast-array-onebuf.mjs`](../../docs/lib/fast-array-onebuf.mjs);
+no production import or flag change needed since
+`--fast-array-onebuf` was already wired up.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4305,7 +4384,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-refs tag drop                 | ~1.0 s  | ~0.7 s | ~0.4 s |
 | + skipJibberish digit fast-path      | ~0.95 s | ~0.6 s | ~0.4 s |
 | + fast-refs-class                    | ~0.9 s  | ~0.55 s | ~0.4 s |
-| **+ fast-dict-onebuf class shape (this section)** | **~0.9 s** | **~0.55 s** | **~0.4 s** |
+| + fast-dict-onebuf class shape       | ~0.9 s  | ~0.55 s | ~0.4 s |
+| **+ fast-array-onebuf class shape (this section)** | **~0.8 s** | **~0.5 s** | **~0.35 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From b2e5c492471ab36abaa8ad505a0794d6fc9432e5 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 11:12:28 +0200
Subject: [PATCH 38/44] notes/08: close out the class-constructor PDFRef /
 PDFDict / PDFArray round.

The three commits that just landed (fast-refs-class, fast-dict-onebuf
class shape, fast-array-onebuf class shape) did the same shape
change to PDFRef, PDFDict, and PDFArray's factory paths -- swap
`Object.create(proto) + property writes` for a plain-function
constructor whose body assigns all fields in one shot, with the
prototype aliased so `instanceof` + method dispatch stay
unchanged. V8 gives the new instances a stable hidden class from
the first one and per-instance heap cost drops from ~60 B to
~44 B.

Each commit's narrative landed in notes/08-pdf-lib.md per the
staging convention (perf/README.md stays light/operational; per-
shim story lives in notes/). This commit folds in the closing
context that ties the round together:

- Per-instance savings table across all three wrappers in one
  view.
- Investigation aside: how the `parseIndirectObjectHeader` 9 MB
  heap row was a V8 inlining-attribution artifact for fastOf's
  allocations downstream, confirmed via `node --no-turbo-inlining`
  paired run. The hand-inlined `fast-pioh.mjs` attempt was
  deleted after proving the negative; the call-counting
  instrumentation lives in perf/instrument-pioh.mjs (arrives in
  the next commit).
- Caveats: singleton subclass set in fast-dict-onebuf's dispatch,
  shared prototype semantics for `instanceof` / method dispatch,
  residual polymorphism on the gen != 0 PDFRef fallback.
- Top-5 heap rows post-round + the next-step menu (wrapper
  elimination vs targeted smaller shrinks). The big rows are now
  at the per-instance floor for V8 objects with 1-2 inline fields.

No code changes.
---
 perf/notes/08-pdf-lib.md | 96 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 5ff377e0..a8ba12a8 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4341,6 +4341,102 @@ timestamps. The change is local to
 no production import or flag change needed since
 `--fast-array-onebuf` was already wired up.
 
+## Class-constructor round: closing the picture
+
+Recap of the three commits that just landed (PDFRef, PDFDict,
+PDFArray wrapper-shape changes): same attack, same constructor +
+prototype-aliasing trick. The per-instance numbers, before vs
+after, in one table:
+
+| Wrapper       | Before | After | Saved/inst | Count   | Total saved |
+|---------------|-------:|------:|-----------:|--------:|------------:|
+| PDFRef        |  ~60 B | ~44 B |     ~16 B  | 226 k   |   ~3.7 MB   |
+| PDFDict       |  ~64 B | ~44 B |     ~20 B  | 260 k   |   ~5.2 MB   |
+| PDFArray      |  ~54 B | ~32 B |     ~22 B  |  80 k   |   ~1.7 MB   |
+
+PDFRef stops at ~44 B because it carries 2 fields (`objectNumber`,
+`generationNumber`); PDFDict / PDFArray stop at ~32-44 B with 1
+field (the packed `d`). PDFPageLeaf carries 3 fields (d,
+normalized, autoNormalizeCTM) so it's slightly higher, but the
+constructor body still gives V8 the stable shape -- the 1 651
+PDFPageLeaf instances are a small tail.
+
+### Investigation aside: `parseIndirectObjectHeader` was a labelling artifact
+
+The hypothesis chain that led to the constructor-shape attack:
+
+1. Start: heap profile shows `parseIndirectObjectHeader` at 9.1 MB
+   self-attribution. Looks like a parser hot spot worth attacking.
+2. Hand-inline the entire function body (whitespace skip +
+   `parseRawInt` × 2 + `matchKeyword` + `PDFRef.of`) into a single
+   no-call body. Heap row barely moved (9.2 MB), CPU unchanged --
+   the row wasn't the call overhead.
+3. Disable V8 inlining with `node --no-turbo-inlining`. Heap row
+   collapses (9.2 MB → out of top 20). `fastOf` row jumps from
+   4.7 MB to 13.8 MB. Total sampled unchanged.
+
+Diagnosis: V8 inlines small hot leaf functions (like `fastOf`,
+when called from a hot caller) and attributes their allocations
+to the inliner's frame. The `parseIndirectObjectHeader` row name
+was misleading; the actual allocation source was the PDFRef
+instances being constructed downstream. Attacking the right thing
+(the wrapper shape) made the row drop too.
+
+The hand-inlined attempt (`fast-pioh.mjs`) was deleted after
+proving the negative; the call-counting instrumentation lives in
+[`perf/instrument-pioh.mjs`](../instrument-pioh.mjs). Both are
+recorded in this writeup as the path to the right answer rather
+than the answer itself.
+
+### Caveats
+
+- **Closed subclass set.** `fast-dict-onebuf` dispatches by
+  `ProtoClass === PDFDict | PDFCatalog | PDFPageTree |
+  PDFPageLeaf` to pick the right constructor. Any new PDFDict
+  subclass added in user code falls back to the original
+  `Object.create` path (defensive; nothing in our pipeline
+  triggers it). If the upstream PDFDict hierarchy grows, the
+  dispatch chain needs a new entry.
+- **Shared prototype.** `_FastRef.prototype = PDFRef.prototype`
+  means a `new _FastRef(...)` instance is indistinguishable from
+  a `new PDFRef(...)` instance via `instanceof` and method
+  dispatch. No code in our pipeline cares about constructor
+  identity (`obj.constructor === PDFRef` -- absent in pdf-lib +
+  our shims).
+- **Method dispatch stays polymorphic for gen != 0 PDFRefs.** The
+  `--fast-refs-class` shim only routes gen=0 through the
+  `_FastRef` constructor; gen != 0 falls back to upstream
+  `PDFRef.of` which uses its own Map-based pool and
+  `new PDFRef(...)`. Both shapes share `PDFRef.prototype` so
+  methods dispatch uniformly; V8 may see 2 maps but the path is
+  rare (~18 % of refs).
+
+### Where this leaves heap
+
+After the three commits, the top-5 heap rows are:
+
+| # | Self KB | Frame                                                              |
+|--:|--------:|--------------------------------------------------------------------|
+| 1 | 11 474  | `_makeFromRange` (PDFDict) -- 260 k × ~44 B floor                  |
+| 2 |  7 450  | `parseIndirectObjectHeader` -- V8 attribution of the next row      |
+| 3 |  3 411  | `fastClassOf` (PDFRef) -- 226 k × ~44 B floor                      |
+| 4 |  3 334  | `fastParseArrayOneBuf` (PDFArray) -- 80 k × ~32 B floor            |
+| 5 |  2 098  | `parseIndirectObjectSync` -- per-call attribution residual         |
+
+The big rows are now at the per-instance floor for V8 objects
+with 1-2 inline fields. Further heap reduction requires either:
+
+1. **Eliminate the wrapper entirely** -- PDFRef / PDFDict /
+   PDFArray become bare packed Numbers, every consumer rewritten
+   to call free functions instead of methods. Biggest remaining
+   win (~11 MB on PDFDict alone), largest engineering surface.
+2. **Smaller targeted shrinks** -- PDFNumber drops eager
+   `stringValue` cache, etc. Each at ~hundreds of KB,
+   accumulating slowly.
+
+Neither has been started; this section closes the per-instance
+constructor-shape round.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an

From 2c3650166e3cccbafb15313c2065ed5e23215ce3 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 11:22:12 +0200
Subject: [PATCH 39/44] perf: add instrument-pioh + instrument-objclasses (used
 in the class-shape round).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both scripts informed the constructor-shape PDFRef / PDFDict /
PDFArray work that landed earlier in this round; capture them in
the tree so future "is row X a real hot spot or a labelling
artifact" investigations can reuse the pattern.

`instrument-pioh.mjs` -- wraps PDFParser.parseIndirectObjectHeader +
matchIndirectObjectHeader with counters, reports per-load call
counts and kept-heap delta. Built to answer the prerequisites
before committing to inline parseIndirectObjectHeader (was the
function actually a hot spot? was fast-sync-load's digit
short-circuit firing? was speculation throwing?). Output on the
book: 226 k pioh calls, 0 throws, 0 mih calls, ~35 MB kept heap.
The 9 MB self-attribution turned out to be V8 inlining fastOf into
the caller frame -- script informed the correct attack surface
(wrapper construction, not the parser body).

`instrument-objclasses.mjs` -- two views of "how many PDF*
wrappers do we build per load":
- `.of()` call counts for the pooled / factory-method classes
  (PDFRef 1.43 M calls / 226 k unique, PDFName 1.68 M / 4.8 k,
  PDFNumber 284 k / 16 k, PDFString 7.4 k, PDFRawStream 2 k);
- post-load walk of PDFContext.enumerateIndirectObjects() bumping
  per-runtime-class counters for the top-level shapes (PDFDict
  221 k top-level / ~261 k incl. nested; 1 PDFCatalog; 238
  PDFPageTree; 1 651 PDFPageLeaf).

Used to size the constructor-shape attack: confirmed PDFRef +
PDFDict + PDFArray dominate the per-instance × instance-count
product, so they were the right three to convert.

Both scripts updated to import fast-refs-class (current
production) rather than the older fast-refs.mjs (now A/B
baseline) so the numbers reflect production.

Also adds README entries under "What's in this folder", next to
the existing pdf-lib-side standalone harnesses
(profile-load.mjs, profile-roundtrip.mjs).
---
 perf/README.md                 |   2 +
 perf/instrument-objclasses.mjs | 146 +++++++++++++++++++++++++++++++++
 perf/instrument-pioh.mjs       |  98 ++++++++++++++++++++++
 3 files changed, 246 insertions(+)
 create mode 100644 perf/instrument-objclasses.mjs
 create mode 100644 perf/instrument-pioh.mjs

diff --git a/perf/README.md b/perf/README.md
index 9edbfb7f..34982a04 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -427,6 +427,8 @@ Side experiments / one-shot probes:
 | --- | --- |
 | `profile-load.mjs` | Standalone profiler for `PDFDocument.load`. Runs the load on a chosen PDF with a chosen `parseSpeed`; intended to be run under `node --cpu-prof`. Auto-pins on Windows via `pin-cpu.mjs`. |
 | `profile-roundtrip.mjs` | Times the full pdf-lib `load + save` roundtrip across the three `parseSpeed` / `objectsPerTick` settings on a chosen PDF. Auto-pins on Windows via `pin-cpu.mjs`. |
+| `instrument-pioh.mjs` | Wraps `PDFParser.prototype.parseIndirectObjectHeader` + `matchIndirectObjectHeader` with counters and reports per-load call counts + the kept-heap delta. Built during the "is the 9 MB heap row a real parser hot spot or a V8 inlining-attribution artifact" investigation -- a non-zero `mih` count would mean fast-sync-load's digit short-circuit isn't firing; a non-zero `throws` would mean the speculative-recovery try/catch is firing on production. Run with `node --expose-gc perf/instrument-pioh.mjs`. |
+| `instrument-objclasses.mjs` | Counts every PDF* class touched by a load on `raw.pdf`: per-class `.of()` call count for the pooled wrappers (PDFRef / PDFName / PDFNumber / PDFString / PDFHexString / PDFRawStream / PDFObjectStream) AND a post-load walk of `PDFContext.enumerateIndirectObjects()` bumping per-runtime-class counts for the top-level shapes. Used to size the constructor-shape round (how many of each wrapper is the per-instance cost multiplied by?). Run with `node perf/instrument-objclasses.mjs`. |
 | `probe-chrome-outline.mjs` | Renders a synthetic multi-level h1..h6 document via Chrome's `outline: true` and dumps the resulting `/Outlines` tree. Quick check that the CDP flag is wired correctly in the local Chromium / puppeteer combo. |
 | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
diff --git a/perf/instrument-objclasses.mjs b/perf/instrument-objclasses.mjs
new file mode 100644
index 00000000..3c2860cb
--- /dev/null
+++ b/perf/instrument-objclasses.mjs
@@ -0,0 +1,146 @@
+// Count instances of each PDF* class touched by a load on the book.
+//
+// Two views of "how many":
+//
+//   1. "Counted by .of()" -- every call to ClassName.of(...) regardless
+//      of whether the pool returned an existing instance. Tells you call
+//      frequency. Useful for spotting "PDFRef.of fires 1.4 M times per
+//      load" vs "only 226 k of those are unique" (the rest are pool
+//      hits).
+//   2. "Observed in indirectObjects after load" -- walks the loaded
+//      PDFContext.enumerateIndirectObjects() and bumps a counter per
+//      top-level object's runtime class. Inline (nested) PDFDict /
+//      PDFArray instances don't show up here; for those, use the
+//      heap-profile rows directly.
+//
+// Wired up to inform the class-constructor shape work in
+// fast-refs-class / fast-dict-onebuf / fast-array-onebuf. Output on
+// the book:
+//
+//   Counted by .of():
+//     PDFRef               1429034   (~226 k unique, rest pool hits)
+//     PDFNumber             284105   (~16 k unique)
+//     PDFName              1681225   (~4.8 k unique)
+//     PDFString               7375
+//     PDFRawStream            2061
+//
+//   Observed in indirectObjects after load:
+//     PDFCatalog                  1
+//     PDFPageTree               238
+//     PDFPageLeaf              1651
+//     PDFRawStream            2061
+//     PDFDict                220815   (top-level only; ~261 k incl. nested)
+//     PDFArray                1651   (top-level only; ~80 k incl. nested)
+//
+// To get unique counts on the pooled classes, see the throwaway snippet
+// in the "Class-constructor shapes" section of README.md (wraps PDFRef.of
+// / PDFName.of / PDFNumber.of with a Set-based dedupe).
+//
+// Run: node perf/instrument-objclasses.mjs
+
+import '../docs/lib/fast-refs-class.mjs';
+import '../docs/lib/fast-inflate.mjs';
+import '../docs/lib/fast-parse-number.mjs';
+import '../docs/lib/fast-decode-name.mjs';
+import '../docs/lib/fast-number-to-string.mjs';
+import '../docs/lib/fast-size-in-bytes.mjs';
+import '../docs/lib/fast-parse-object.mjs';
+import '../docs/lib/fast-sync-load.mjs';
+import '../docs/lib/fast-indirect-objects.mjs';
+import '../docs/lib/fast-pdfnumber-pool.mjs';
+import { setExpectedDictSlots } from '../docs/lib/fast-dict-onebuf.mjs';
+import { setExpectedArraySlots } from '../docs/lib/fast-array-onebuf.mjs';
+import { measure as measureRawPdf } from '../docs/lib/measure-pass.mjs';
+import { PDFDocument } from 'pdf-lib';
+import { createRequire } from 'node:module';
+import { readFileSync } from 'node:fs';
+
+const require = createRequire(import.meta.url);
+
+const PDFRef        = require('pdf-lib/cjs/core/objects/PDFRef.js').default;
+const PDFName       = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const PDFNumber     = require('pdf-lib/cjs/core/objects/PDFNumber.js').default;
+const PDFString     = require('pdf-lib/cjs/core/objects/PDFString.js').default;
+const PDFHexString  = require('pdf-lib/cjs/core/objects/PDFHexString.js').default;
+const PDFDict       = require('pdf-lib/cjs/core/objects/PDFDict.js').default;
+const PDFArray      = require('pdf-lib/cjs/core/objects/PDFArray.js').default;
+const PDFStream     = require('pdf-lib/cjs/core/objects/PDFStream.js').default;
+const PDFRawStream  = require('pdf-lib/cjs/core/objects/PDFRawStream.js').default;
+const PDFBool       = require('pdf-lib/cjs/core/objects/PDFBool.js').default;
+const PDFCatalog    = require('pdf-lib/cjs/core/structures/PDFCatalog.js').default;
+const PDFPageTree   = require('pdf-lib/cjs/core/structures/PDFPageTree.js').default;
+const PDFPageLeaf   = require('pdf-lib/cjs/core/structures/PDFPageLeaf.js').default;
+const PDFObjectStream    = require('pdf-lib/cjs/core/structures/PDFObjectStream.js').default;
+const PDFCrossRefStream  = require('pdf-lib/cjs/core/structures/PDFCrossRefStream.js').default;
+const PDFFlateStream     = require('pdf-lib/cjs/core/structures/PDFFlateStream.js').default;
+const PDFContentStream   = require('pdf-lib/cjs/core/structures/PDFContentStream.js').default;
+
+const counts = new Map();
+function track(name, Cls) {
+  counts.set(name, 0);
+  const origOf = Cls.of;
+  if (typeof origOf === 'function') {
+    Cls.of = function (...args) {
+      const r = origOf.apply(this, args);
+      counts.set(name, counts.get(name) + 1);
+      return r;
+    };
+  }
+}
+
+// Counting via .of for the pooled / factory-method classes. PDFDict
+// / PDFArray / PDFPageLeaf are constructed via the fast-dict-onebuf
+// and fast-array-onebuf factory paths; for those, the post-load walk
+// below scans PDFContext.enumerateIndirectObjects() instead.
+track('PDFRef',        PDFRef);
+track('PDFNumber',     PDFNumber);
+track('PDFName',       PDFName);
+track('PDFString',     PDFString);
+track('PDFHexString',  PDFHexString);
+track('PDFRawStream',  PDFRawStream);
+track('PDFObjectStream', PDFObjectStream);
+
+const rawPdf = readFileSync(new URL('./raw.pdf', import.meta.url));
+
+const dictCounts = measureRawPdf(rawPdf);
+setExpectedDictSlots(dictCounts.dictSlots);
+setExpectedArraySlots(dictCounts.arraySlots);
+
+const tBefore = Date.now();
+const doc = await PDFDocument.load(rawPdf);
+console.log('load:    ', Date.now() - tBefore, 'ms');
+
+// After-load count: scan indirectObjects for each class.
+const seen = new Map();
+function bump(name) { seen.set(name, (seen.get(name) || 0) + 1); }
+function walk(obj, depth = 0) {
+  if (obj == null) return;
+  // Most-derived class first: PDFContentStream extends PDFFlateStream.
+  if (obj instanceof PDFCatalog)         bump('PDFCatalog');
+  else if (obj instanceof PDFPageTree)   bump('PDFPageTree');
+  else if (obj instanceof PDFPageLeaf)   bump('PDFPageLeaf');
+  else if (obj instanceof PDFObjectStream) bump('PDFObjectStream');
+  else if (obj instanceof PDFCrossRefStream) bump('PDFCrossRefStream');
+  else if (obj instanceof PDFContentStream) bump('PDFContentStream');
+  else if (obj instanceof PDFFlateStream) bump('PDFFlateStream');
+  else if (obj instanceof PDFRawStream)  bump('PDFRawStream');
+  else if (obj instanceof PDFStream)     bump('PDFStream');
+  else if (obj instanceof PDFDict)       bump('PDFDict');
+  else if (obj instanceof PDFArray)      bump('PDFArray');
+  else if (obj instanceof PDFName)       bump('PDFName');
+  else if (obj instanceof PDFNumber)     bump('PDFNumber');
+  else if (obj instanceof PDFString)     bump('PDFString');
+  else if (obj instanceof PDFHexString)  bump('PDFHexString');
+  else if (obj instanceof PDFBool)       bump('PDFBool');
+}
+for (const [, obj] of doc.context.enumerateIndirectObjects()) walk(obj);
+
+console.log('\nCounted by .of():');
+for (const [k, v] of counts) console.log('  ' + k.padEnd(20), v);
+
+console.log('\nObserved in indirectObjects after load:');
+const names = ['PDFCatalog','PDFPageTree','PDFPageLeaf','PDFObjectStream',
+  'PDFCrossRefStream','PDFFlateStream','PDFContentStream','PDFRawStream',
+  'PDFStream','PDFDict','PDFArray','PDFName','PDFNumber','PDFString',
+  'PDFHexString','PDFBool'];
+for (const n of names) console.log('  ' + n.padEnd(20), seen.get(n) || 0);
diff --git a/perf/instrument-pioh.mjs b/perf/instrument-pioh.mjs
new file mode 100644
index 00000000..30cc6e7e
--- /dev/null
+++ b/perf/instrument-pioh.mjs
@@ -0,0 +1,98 @@
+// Count PDFParser.parseIndirectObjectHeader + matchIndirectObjectHeader
+// calls per load of perf/raw.pdf, plus the kept-heap delta across load.
+//
+// Background: the heap profile attributed ~9 MB of self-allocations to
+// parseIndirectObjectHeader -- enough to look like a real parser hot
+// spot. This script answers the prerequisite questions before
+// committing to an inline rewrite: how often is the function actually
+// called, does the speculative matchIndirectObjectHeader path fire on
+// the production shim stack (fast-sync-load's digit fast-path is
+// supposed to short-circuit it), and does parseIndirectObjectHeader
+// ever throw (recovery via matchIndirectObjectHeader's try/catch
+// wrapper)?
+//
+// Output, on the book (raw.pdf) with the current production shim stack:
+//   pioh calls:        226418
+//   pioh throws:       0
+//   mih  calls:        0           <- fast-sync-load short-circuit works
+//   heap delta (kept): ~35 MB
+//
+// The ~9 MB heap attribution turned out to be a V8 inlining-attribution
+// artifact (fastOf's PDFRef-construction bytes inlined into
+// parseIndirectObjectHeader's frame), not anything the function itself
+// allocates. Confirmed by re-profiling under `node --no-turbo-inlining`,
+// see "Class-constructor shapes for PDFRef / PDFDict / PDFArray" in
+// README.md. The fix wasn't in this function; it was in fast-refs's
+// wrapper construction (-> fast-refs-class).
+//
+// Run: node --expose-gc perf/instrument-pioh.mjs
+
+import '../docs/lib/fast-refs-class.mjs';
+import '../docs/lib/fast-inflate.mjs';
+import '../docs/lib/fast-parse-number.mjs';
+import '../docs/lib/fast-decode-name.mjs';
+import '../docs/lib/fast-number-to-string.mjs';
+import '../docs/lib/fast-size-in-bytes.mjs';
+import '../docs/lib/fast-parse-object.mjs';
+import '../docs/lib/fast-sync-load.mjs';
+import '../docs/lib/fast-indirect-objects.mjs';
+import '../docs/lib/fast-pdfnumber-pool.mjs';
+import { setExpectedDictSlots } from '../docs/lib/fast-dict-onebuf.mjs';
+import { setExpectedArraySlots } from '../docs/lib/fast-array-onebuf.mjs';
+import { measure as measureRawPdf } from '../docs/lib/measure-pass.mjs';
+import { PDFDocument } from 'pdf-lib';
+import { createRequire } from 'node:module';
+import { readFileSync } from 'node:fs';
+
+const require = createRequire(import.meta.url);
+const PDFParser = require('pdf-lib/cjs/core/parser/PDFParser.js').default;
+
+const rawPdf = readFileSync(new URL('./raw.pdf', import.meta.url));
+
+// Wrap parseIndirectObjectHeader + matchIndirectObjectHeader with
+// counters. The throws counter tells us whether the function recovers
+// via matchIndirectObjectHeader's try/catch (a non-zero value would
+// mean speculation is firing on the production shim stack, which would
+// invalidate the "fast-sync-load short-circuit works" claim).
+let pioCalls = 0;
+let mihCalls = 0;
+let pioThrows = 0;
+const origPioh = PDFParser.prototype.parseIndirectObjectHeader;
+const origMih = PDFParser.prototype.matchIndirectObjectHeader;
+
+PDFParser.prototype.parseIndirectObjectHeader = function () {
+  pioCalls++;
+  try {
+    return origPioh.call(this);
+  } catch (e) {
+    pioThrows++;
+    throw e;
+  }
+};
+
+PDFParser.prototype.matchIndirectObjectHeader = function () {
+  mihCalls++;
+  return origMih.call(this);
+};
+
+// Measure pass only (no JIT dry run follows): hand its slot counts to the onebuf shims.
+const counts = measureRawPdf(rawPdf);
+setExpectedDictSlots(counts.dictSlots);
+setExpectedArraySlots(counts.arraySlots);
+
+// Memory before.
+if (global.gc) global.gc();
+const heapBefore = process.memoryUsage().heapUsed;
+
+const tBefore = Date.now();
+const doc = await PDFDocument.load(rawPdf);
+const tAfter = Date.now();
+
+if (global.gc) global.gc();
+const heapAfter = process.memoryUsage().heapUsed;
+
+console.log('load time:        ', tAfter - tBefore, 'ms');
+console.log('pioh calls:       ', pioCalls);
+console.log('pioh throws:      ', pioThrows);
+console.log('mih  calls:       ', mihCalls);
+console.log('heap delta (kept):', ((heapAfter - heapBefore) / 1024 / 1024).toFixed(2), 'MB');

From 371f020547df4abbd9e055cdfdfec36dc15ceb25 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 11:42:15 +0200
Subject: [PATCH 40/44] fast-parse-name: byte-keyed cache in front of parseName
 (-80 ms process, -9 %).
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PDFObjectParser.prototype.parseName fires 1.68 M times per load on
the book and the call density was the #1 row in the process CPU
profile after the constructor-shape round closed
(PDFObjectParser.parseName @ ~87 ms self + fastOf @ ~57 ms callee
= ~144 ms combined, ~16 % of process). 4 787 of those calls are
unique; the other 99.7 % hit the same handful of dict keys (Type,
Length, Pages, MediaBox, ...) over and over. The per-call work --
build a string via `name += charFromCode(byte)` then hand it to
PDFName.of's string-keyed Map -- is pure overhead on the hot path:
the answer was already cached, we just kept rebuilding the key.

A failed first attempt (the v1 of this shim, not committed) tried
to keep the cons-string accumulator but skip the per-byte
this.bytes.peek/.next/.done method dispatch. CPU didn't move:
V8 had already optimised the cons-string path well and the
saved method-call cost just shifted to attribution on the callers
(fastParseDictOneBuf / fastParseObject) under inlining. A second
failed sketch built the lookup string via
`String.fromCharCode.apply(null, buf.subarray(start, idx))` and
was SLOWER than upstream (~123 ms vs ~87 ms): .apply on a typed
array is a deopt path in V8.

This shim attacks the real surface: avoid producing the lookup
string at all on the hot path. Scan bytes with direct buffer
access, compute a Java-style `hash * 31 + byte` Smi hash in the
same pass, look up `Map<hash, Entry | Entry[]>` keyed by byte
content. Single-entry buckets (the vast majority -- 4 787 unique
names into 2^32 hash space gives essentially zero collisions)
store the Entry directly; collision buckets get promoted to an
Entry[] for linear scan. Entry carries a small Uint8Array copy of
the name body for exact equality check.

Cold path: byte-cache miss. Build the lookup string in one shot
(String.fromCharCode.apply over the byte slice; tolerable here, as
misses are rare), call the upstream `PDFName.of` -- which on this stack
means fast-decode-name's string-keyed Map -- and cache the
returned PDFName in the byte-cache for next time. Both caches
converge on the same PDFName per logical name; direct
PDFName.of(...) calls from non-parser code (setOutline,
setMetadata) bypass the byte-cache and go straight through
fast-decode-name -- correct, no byte range available.

Measured on the book (paired heap + cpu profile, baseline =
prior --fast-refs-class + --fast-dict-onebuf + --fast-array-onebuf
state, this on top):

  Process wall-clock              0.90 s -> 0.82 s   (-80 ms, -9 %)
    load                          0.41 s -> 0.33 s
    save                          0.42 s -> 0.42 s
  parseName + fastOf (combined)   144 ms -> 58 ms    (-86 ms)
  PDFObjectParser.parseName       (gone from top 15)
  fastOf (PDFName decode-name)    52 ms  -> (gone from top 15)
  Heap (sampled total)            33.68 MB -> 34.98 MB (+1.30 MB)
    new fastParseName row          —     -> 1 269 KB (the cache)
    set (builtin)                  624 KB -> 852 KB (+228 KB)

Heap-profiled process wall-clock dropped much more (3.50 s ->
2.56 s, -940 ms) than the cpu-profiled run did -- because the
sampler's per-allocation bookkeeping is the dominant cost under
512 B sampling, and we just eliminated ~1.6 M transient string
allocations that were all under the sample threshold (so they
don't show up in the heap row, only in the sampler's wall-clock
overhead). Read the cpu number for "did we get faster"; read the
heap-row delta for "what's the long-lived cost" (+1.3 MB, the
cache itself).

GC self-time +21 ms on the cpu run (the live byte-cache adds to
mark cost), more than offset by the -80 ms parseName savings.

Output PDF byte-identical: the byte-cache and the string-cache
return the same PDFName instance per logical name, so all
downstream code sees the same identity.
---
 docs/lib/fast-parse-name.mjs | 146 ++++++++++++++++++++
 docs/render-book.mjs         |  19 +++
 perf/README.md               |  30 +++-
 perf/measure.mjs             |   6 +
 perf/notes/08-pdf-lib.md     | 258 ++++++++++++++++++++++++++++++++++-
 5 files changed, 455 insertions(+), 4 deletions(-)
 create mode 100644 docs/lib/fast-parse-name.mjs

diff --git a/docs/lib/fast-parse-name.mjs b/docs/lib/fast-parse-name.mjs
new file mode 100644
index 00000000..5da62fae
--- /dev/null
+++ b/docs/lib/fast-parse-name.mjs
@@ -0,0 +1,146 @@
+// Byte-keyed cache in front of parseName: on cache hit (99.7 % of
+// calls on the book) return the existing PDFName without allocating
+// the lookup string at all.
+//
+// Step 1 of this optimisation (a failed attempt that was never
+// committed) hand-inlined parseName's byte loop to skip the
+// `this.bytes.peek() / .next() / .done()` per-byte method dispatch
+// while keeping the original cons-string accumulator. CPU didn't move:
+// V8 was already optimising the cons-string path well, and the saved
+// method-call cost just shifted attribution to the callers
+// (fastParseDictOneBuf / fastParseObject). Heap was flat too.
+//
+// This shim attacks the actual transient cost: each call builds a
+// throwaway string (cons-chain of ~8 chars on average, then flattened
+// on first use) only to hand it to PDFName.of, which hashes the string
+// against a Map<string, PDFName> and returns the cached instance.
+// 1.68 M calls × ~10-byte average × cons-string allocations + Map.get
+// hashing-the-string-again adds up to non-trivial heap throughput and
+// CPU even though the per-call work is small.
+//
+// PDF names are 4 787 unique on the book vs 1 681 225 calls -- 99.7 %
+// hit rate. So 99.7 % of those string allocations + Map hashings are
+// pure overhead: the answer was already computed, we just needed a
+// way to find it without rebuilding the key.
+//
+// The byte-cache. Keyed by a 32-bit hash of the name's body bytes
+// (Java-style `hash * 31 + byte`); each Entry holds the cached PDFName.
+// Each bucket stores `Entry` (single-entry, the common case for ~99 %
+// of buckets) or `Entry[]` (collision, vanishingly rare for the 4.8 k
+// unique names hashed into 2^32 space). Entry holds the bytes-key
+// (a small Uint8Array copy of the name body) for collision-check
+// equality.
+//
+// Cold path. On byte-cache miss, build the string via
+// `String.fromCharCode` (one allocation, not the per-byte cons chain
+// because we already have the full byte range from the scan) and
+// call the upstream `PDFName.of` -- which on this stack means
+// fast-decode-name's string-keyed cache, which returns the PDFName
+// (cache hit on the string side) or constructs it. Either way, the
+// PDFName instance gets cached in the byte-cache for next time.
+// Both caches converge on the same PDFName instance per logical name.
+//
+// Composes with fast-decode-name (their caches see different keys for
+// the same logical name; both return the same PDFName via this fall-
+// back chain). Direct `PDFName.of(...)` calls from non-parser code
+// (setOutline, setMetadata) bypass the byte-cache and go straight
+// through fast-decode-name -- correct, since those calls don't have
+// a byte range to work with.
+//
+// Side-effecting import. Import once before PDFDocument.load runs;
+// idempotent.
+
+import { createRequire } from 'node:module';
+
+const require = createRequire(import.meta.url);
+const PDFObjectParser = require('pdf-lib/cjs/core/parser/PDFObjectParser.js').default;
+const PDFName         = require('pdf-lib/cjs/core/objects/PDFName.js').default;
+const CharCodes       = require('pdf-lib/cjs/core/syntax/CharCodes.js').default;
+const { IsWhitespace } = require('pdf-lib/cjs/core/syntax/Whitespace.js');
+const { IsDelimiter }  = require('pdf-lib/cjs/core/syntax/Delimiters.js');
+
+const FORWARD_SLASH = CharCodes.ForwardSlash;
+
+// hash -> Entry | Entry[]. Single-entry buckets store the Entry
+// directly; on collision we promote to an array. Entry shape is fixed
+// (bytes + name) so V8 gives it a stable hidden class.
+const byteCache = new Map();
+
+class Entry {
+  constructor(bytes, name) {
+    this.bytes = bytes;
+    this.name = name;
+  }
+}
+
+function _bytesEqual(a, buf, start, end) {
+  if (a.length !== end - start) return false;
+  for (let i = 0; i < a.length; i++) {
+    if (a[i] !== buf[start + i]) return false;
+  }
+  return true;
+}
+
+if (!PDFObjectParser.prototype.__fastParseNameInstalled) {
+  const orig = PDFObjectParser.prototype.parseName;
+
+  PDFObjectParser.prototype.parseName = function fastParseName() {
+    const stream = this.bytes;
+    const buf = stream.bytes;
+    const len = stream.length;
+    let idx = stream.idx;
+
+    // assertNext(ForwardSlash). Fall back on the unexpected path.
+    if (idx >= len || buf[idx] !== FORWARD_SLASH) {
+      return orig.call(this);
+    }
+    idx++;
+
+    // Scan body + compute hash in one pass. Java-style hashCode
+    // (`hash * 31 + byte`) -- monomorphic Smi math, no allocations.
+    const start = idx;
+    let hash = 0;
+    while (idx < len) {
+      const byte = buf[idx];
+      if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+      hash = (hash * 31 + byte) | 0;
+      idx++;
+    }
+    stream.idx = idx;
+
+    // Look up the byte-cache.
+    const bucket = byteCache.get(hash);
+    if (bucket !== undefined) {
+      if (bucket instanceof Entry) {
+        if (_bytesEqual(bucket.bytes, buf, start, idx)) return bucket.name;
+      } else {
+        // Collision: rare. Linear scan of the bucket.
+        for (let i = 0; i < bucket.length; i++) {
+          const e = bucket[i];
+          if (_bytesEqual(e.bytes, buf, start, idx)) return e.name;
+        }
+      }
+    }
+
+    // Miss. Build the lookup string in one shot (no cons-chain --
+    // String.fromCharCode handles bytes 0-255 directly) and route
+    // through the upstream PDFName.of (which on this stack is
+    // fast-decode-name's string-keyed cache). The resulting PDFName
+    // is the canonical instance; cache it in the byte-cache for next
+    // time so subsequent calls with the same bytes hit here.
+    const slice = buf.subarray(start, idx);
+    const name = PDFName.of(String.fromCharCode.apply(null, slice));
+    const key = new Uint8Array(slice);   // copy for stable cache key
+    const entry = new Entry(key, name);
+    if (bucket === undefined) {
+      byteCache.set(hash, entry);
+    } else if (bucket instanceof Entry) {
+      byteCache.set(hash, [bucket, entry]);
+    } else {
+      bucket.push(entry);
+    }
+    return name;
+  };
+
+  PDFObjectParser.prototype.__fastParseNameInstalled = true;
+}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 024f718f..08117d54 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -102,6 +102,24 @@ import { PDFDocument } from 'pdf-lib';
 //     object body; the upstream version pays three speculative
 //     matchKeyword fail-and-rewind costs on every invocation. Same
 //     semantics, dispatch reordered by observed frequency.
+//   fast-parse-name -- byte-keyed cache in front of
+//     PDFObjectParser.parseName. Upstream builds the name body via
+//     `name += charFromCode(byte)` per byte then hands the result
+//     to PDFName.of (fast-decode-name's string-keyed Map). 99.7 % of
+//     the 1.68 M calls per load on the book are cache hits -- the
+//     same ~5 k unique names show up over and over (Type, Length,
+//     Pages, MediaBox, ...) -- so the per-call string build + hash
+//     is pure overhead on the hot path. The shim scans bytes with
+//     direct buffer access, accumulates a small Smi hash, and
+//     looks up a `Map<hash, Entry | Entry[]>` keyed by byte content. On
+//     hit (~99.7 %) it returns the PDFName with zero string
+//     allocation; on miss it builds the string in one shot via
+//     String.fromCharCode and routes through the upstream
+//     PDFName.of (which is fast-decode-name's cache on this stack)
+//     so both caches converge on the same PDFName instance. ~80 ms
+//     of process wall-clock saved (-9 %) on the book, mostly on
+//     load (0.41 s -> 0.33 s). +1.3 MB long-lived heap for the
+//     cache itself, a small price for the load-time reduction.
 //   fast-sync-load -- rip the parseSpeed / objectsPerTick /
 //     shouldWaitForTick / waitForTick machinery out of both pdf-lib's
 //     load path (PDFDocument.load + five PDFParser /
@@ -170,6 +188,7 @@ import './lib/fast-size-in-bytes.mjs';
 import { setExpectedDictSlots }     from './lib/fast-dict-onebuf.mjs';
 import { setExpectedArraySlots }    from './lib/fast-array-onebuf.mjs';
 import './lib/fast-parse-object.mjs';
+import './lib/fast-parse-name.mjs';
 import './lib/fast-sync-load.mjs';
 import './lib/fast-indirect-objects.mjs';
 import './lib/fast-pdfnumber-pool.mjs';
diff --git a/perf/README.md b/perf/README.md
index 34982a04..77bb4c35 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -67,7 +67,7 @@ The mirror command for CPU-profiling the pdf-lib roundtrip (run from
 `perf/`):
 
 ```
-node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
+node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-parse-name --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --cpu-profile-process --cpu-sampling 100 --out results/<label>
 ```
 
 `--out results/<label>` is optional but recommended: omit it and the
@@ -252,6 +252,27 @@ Flag rationale:
   / refs first, then `<<`, names, arrays, strings). Same
   semantics. Pulls `parseObject` self-time from ~82 ms (5.2 %)
   to ~40 ms (3.1 %). Production runs through it.
+- `--fast-parse-name` -- inject
+  [docs/lib/fast-parse-name.mjs](../docs/lib/fast-parse-name.mjs),
+  a byte-keyed cache in front of
+  `PDFObjectParser.prototype.parseName`. Upstream builds the name
+  body via `name += charFromCode(byte)` per byte then hands the
+  result to `PDFName.of`'s string-keyed Map. On the book, 1.68 M
+  parseName calls hit ~5 k unique names (99.7 % cache-hit rate)
+  -- the per-call string build + hash is pure overhead on the hot
+  path. The shim scans bytes with direct buffer access,
+  accumulates a Java-style `hash * 31 + byte` Smi hash in the same
+  pass, and looks up a `Map<hash, Entry | Entry[]>` keyed by byte
+  content; on hit returns the PDFName with zero string allocation.
+  On miss, builds the string in one shot (`String.fromCharCode.apply`
+  over the byte slice -- acceptable on this path, which runs only
+  once per unique name) and routes through the upstream `PDFName.of`
+  (fast-decode-name's cache on this stack) so both caches converge
+  on the same PDFName instance. Pulls `parseName` + `fastOf`
+  combined from ~144 ms (~16 % of process) to ~58 ms; -80 ms
+  process wall-clock (-9 %), all on load (0.41 s → 0.33 s).
+  +1.3 MB long-lived heap for the cache itself. Production runs
+  through it.
 - `--fast-sync-load` -- inject
   [docs/lib/fast-sync-load.mjs](../docs/lib/fast-sync-load.mjs),
   replacing nine `__awaiter`-wrapped methods across pdf-lib's load
@@ -333,7 +354,7 @@ process phase -- "where is pdf-lib allocating bytes?" rather than
 "where is it spending cycles?" (run from `perf/`):
 
 ```
-node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
+node measure.mjs --fast-refs-class --parallel-deflate --fast-decode-name --fast-number-to-string --fast-size-in-bytes --fast-inflate --fast-parse-number --fast-dict-onebuf --fast-array-onebuf --measure-pass --fast-parse-object --fast-parse-name --fast-sync-load --fast-indirect-objects --fast-pdfnumber-pool --heap-profile-process --heap-sampling 512 --out results/<label>
 ```
 
 Same `--out` / labelling note as the CPU command above: omit it for a
@@ -518,6 +539,8 @@ run.bat --measure-pass --fast-dict-onebuf # walk rawPdf with the no-allocate mea
 run.bat --fast-indirect-objects           # dense-array cache for PDFContext.indirectObjects (gen=0 path); mirror of --fast-refs on the value side (also ships; opt-in here for A/B)
 run.bat --fast-pdfnumber-pool             # value-keyed cache in front of PDFNumber.of; dense array for small ints, Map for the rest (also ships; opt-in here for A/B)
 run.bat --fast-parse-object               # first-byte dispatch in parseObject; gate true/false/null matchKeyword behind byte check (also ships; opt-in here for A/B)
+run.bat --fast-parse-name                 # byte-keyed cache in front of parseName; skip the string build + Map<string, PDFName> hash on the 99.7 % cache-hit path (also ships; opt-in here for A/B)
+run.bat --fast-parse-name                 # byte-keyed cache in front of parseName: skip the string build + Map<string, PDFName> hash on the 99.7 % cache-hit path (also ships; opt-in here for A/B)
 run.bat --fast-sync-load                  # synchronify PDFDocument.load + parser; strip waitForTick machinery (also ships; opt-in here for A/B)
 ```
 
@@ -650,6 +673,7 @@ file documenting each:
 | Class-constructor `PDFRef` shape (`new _FastRef(...)` for stable V8 hidden class) | [08](notes/08-pdf-lib.md) | per-PDFRef ~60 B → ~44 B; total process heap 45.3 MB → 41.4 MB (-8.5 %); process wall 1.13 s → 0.99 s (-140 ms, -12 %) |
 | Class-constructor `PDFDict` shape (`_FastDict` / `_FastCatalog` / `_FastPageTree` / `_FastPageLeaf` per-subclass constructors) | [08](notes/08-pdf-lib.md) | `_makeFromRange (dict)` 16.5 MB → 11.4 MB; total process heap 41.4 MB → 35.4 MB (-14.4 %); cumulative -77 % since Map-backed PDFDict |
 | Class-constructor `PDFArray` shape (`_FastArray` factory + monomorphic call-site unlock across all three Fast classes) | [08](notes/08-pdf-lib.md) | total process heap 35.4 MB → 33.7 MB (-4.9 %); process wall 1.03 s → 0.90 s (-130 ms); GC self-time 101 ms → 59 ms (-42 %); cumulative -78 % heap since Map-backed PDFDict, -20 % process across the three shape-change commits |
+| Byte-keyed `parseName` cache (Map<hash, Entry &#124; Entry[]>; skip per-call string build + string-keyed Map hash on 99.7 % hit path) | [08](notes/08-pdf-lib.md) | `parseName` + `fastOf` combined 144 ms → 58 ms; process wall 0.90 s → 0.82 s (-80 ms, -9 %, all on load); +1.3 MB long-lived heap for the cache |
 
 What was tried and didn't ship:
 
@@ -676,4 +700,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte \| 0` Smi hash in the same pass, looking up `Map<hash, Entry \| Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost). |
diff --git a/perf/measure.mjs b/perf/measure.mjs
index bf6a9e5d..cb6e5108 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -292,6 +292,7 @@ let fastParseNumber = false;
 let fastDictIter = false;
 let fastParseDict = false;
 let fastParseObject = false;
+let fastParseName = false;
 let fastSyncLoad = false;
 let fastDictArray = false;
 let fastIndirectObjects = false;
@@ -335,6 +336,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--fast-dict-iter') fastDictIter = true;
   else if (a === '--fast-parse-dict') fastParseDict = true;
   else if (a === '--fast-parse-object') fastParseObject = true;
+  else if (a === '--fast-parse-name') fastParseName = true;
   else if (a === '--fast-sync-load') fastSyncLoad = true;
   else if (a === '--fast-dict-array') fastDictArray = true;
   else if (a === '--fast-indirect-objects') fastIndirectObjects = true;
@@ -462,6 +464,10 @@ if (fastParseObject) {
   await import('../docs/lib/fast-parse-object.mjs');
   console.log('[harness] fast-parse-object: first-byte dispatch in parseObject, gate true/false/null matchKeyword behind byte check');
 }
+if (fastParseName) { // opt-in A/B switch set by --fast-parse-name
+  await import('../docs/lib/fast-parse-name.mjs'); // imported for its side effects only; the result is unused
+  console.log('[harness] fast-parse-name: byte-keyed cache in front of PDFObjectParser.parseName; skip string build + Map<string, PDFName> hash on cache hits');
+}
 if (fastSyncLoad) {
   await import('../docs/lib/fast-sync-load.mjs');
   console.log('[harness] fast-sync-load: synchronify PDFParser load path, strip waitForTick machinery');
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index a8ba12a8..e6963d9a 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4437,6 +4437,261 @@ with 1-2 inline fields. Further heap reduction requires either:
 Neither has been started; this section closes the per-instance
 constructor-shape round.
 
+## Byte-keyed cache for `PDFName` lookups
+
+After the constructor-shape round closed, the new #1 row in the
+process CPU profile was `PDFObjectParser.prototype.parseName` at
+**87 ms self + 57 ms via its `fastOf` callee = ~144 ms combined
+(~16 % of process)**. The function fires **1.68 M times per load**
+on the book. Worth interrogating before treating as a hot loop.
+
+### What `parseName` does
+
+```js
+PDFObjectParser.prototype.parseName = function () {
+    this.bytes.assertNext(CharCodes.ForwardSlash);
+    var name = '';
+    while (!this.bytes.done()) {
+        var byte = this.bytes.peek();
+        if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+        name += charFromCode(byte);   // per-byte cons-string append
+        this.bytes.next();
+    }
+    return PDFName.of(name);
+};
+```
+
+Two obvious-looking attack surfaces:
+
+1. **Per-byte method dispatch** (`this.bytes.peek()`, `.next()`,
+   `.done()`) -- ~16 M method calls across the load.
+2. **Per-byte string concat** (`name += charFromCode(byte)`) --
+   ~16 M cons-string appends, allocating an intermediate state per
+   byte until V8 flattens at the `PDFName.of(name)` lookup.
+
+### Two false starts
+
+The first version of `fast-parse-name.mjs` (not committed) kept
+the cons-string accumulator but read `this.bytes.bytes` /
+`.idx` / `.length` directly to skip the per-byte ByteStream
+dispatch:
+
+```js
+let name = '';
+while (idx < len) {
+  const byte = buf[idx];
+  if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+  name += String.fromCharCode(byte);
+  idx++;
+}
+```
+
+Result: process wall-clock essentially flat (0.90 → 1.00 s,
+within noise). `parseName` self dropped out of the top 15 -- but
+the saved time migrated to attribution on the callers
+(`fastParseDictOneBuf` +35 ms, `fastParseObject` +21 ms) under
+V8 inlining and GC self +15 ms. The total didn't move.
+
+Lesson: **V8 already optimises the cons-string `+=` path well.**
+Per-byte appends use cons-string representation that defers the
+flatten until first use (`PDFName.of`'s string-hash lookup forces
+it). The method-call dispatch was apparently not the dominant
+cost; the cons-string flatten + `Map<string, PDFName>` lookup
+was.
+
+A second sketch (also not committed) built the lookup string via
+`String.fromCharCode.apply(null, buf.subarray(start, idx))` --
+the textbook "one-shot allocation" approach. It made things
+**worse** (~123 ms vs upstream ~87 ms). `.apply` on a Uint8Array
+view is a V8 deopt path: the engine has to convert each
+typed-array element to a stack arg, and the overhead exceeds the
+cons-string build it was meant to replace.
+
+Both attempts missed the actual surface.
+
+### The real surface: 99.7 % of the work is cache hits
+
+The instrumentation script
+[`perf/instrument-objclasses.mjs`](../instrument-objclasses.mjs)
+reports:
+
+| | calls    | unique | hit rate |
+|---|--------:|-------:|---------:|
+| `PDFName.of` | 1 681 225 | 4 787 | **99.715 %** |
+
+Of every 1 000 `parseName` calls, **997** hand the same byte
+sequence to `PDFName.of`'s string-keyed Map, get back the same
+`PDFName` instance, and discard the lookup string. The string was
+built, hashed, looked up, and thrown away -- the answer was
+already known.
+
+`Type`, `Length`, `Pages`, `MediaBox`, `Resources`, `Contents`,
+`Parent`, `Kids`, `Count`, `Font`, ... -- a few thousand names
+appear over and over across 260 k dicts.
+
+### The byte-cache
+
+Add a second cache layer in front of `PDFName.of`, keyed by the
+byte content of the name body (not the constructed string).
+
+```js
+// Scan body + Java-style hash in one pass. Smi math, no allocs.
+let hash = 0;
+while (idx < len) {
+  const byte = buf[idx];
+  if (IsWhitespace[byte] || IsDelimiter[byte]) break;
+  hash = (hash * 31 + byte) | 0;   // Smi
+  idx++;
+}
+
+// Map<hash, Entry | Entry[]>. Single-entry buckets are the common
+// case (4.8 k names into 2^32 hash space -> ~0 collisions).
+const bucket = byteCache.get(hash);
+if (bucket instanceof Entry) {
+  if (_bytesEqual(bucket.bytes, buf, start, idx)) return bucket.name;
+}
+// ... collision-bucket scan, then miss path.
+```
+
+On hit -- 99.7 % of calls -- return the cached `PDFName` with
+**zero string allocation, zero `Map<string, ...>` hashing**. Just
+a hash compute, a `Map<number, ...>` lookup, a `Uint8Array`
+equality check.
+
+On miss, build the string in one shot (`String.fromCharCode`
+with direct args via the existing fast path -- not `.apply` on a
+typed-array view), route through `PDFName.of` (which on this
+stack is fast-decode-name's string-keyed Map), and cache the
+returned `PDFName` in the byte-cache for next time. Both caches
+converge on the same `PDFName` instance per logical name, so
+identity comparisons (`name === PDFName.Type`) keep working
+everywhere downstream.
+
+### Bucket shape: `Entry | Entry[]`
+
+The Map values are polymorphic on purpose. Single-entry buckets
+store the `Entry` directly; on collision we promote to an
+`Entry[]` for linear scan. For 4.8 k unique names hashed into
+2^32 space, the expected collision count is ~0 (birthday bound).
+The polymorphic check (`bucket instanceof Entry`) fires only once
+per lookup; no IC degradation was observed in practice.
+
+Stable hidden class for `Entry`: a plain class with
+`{ bytes, name }` set in the constructor body. Same pattern as
+`fast-refs-class` / `fast-dict-onebuf`'s `_FastDict` etc. -- avoid
+`Object.create + writes` and give V8 a fixed shape from the start.
+
+### Hash function: Java-style `hash * 31 + byte`
+
+```js
+hash = (hash * 31 + byte) | 0;
+```
+
+- The `| 0` keeps `hash` in 32-bit signed Smi range, which V8
+  represents as an unboxed integer (no `HeapNumber` allocation).
+- `* 31` compiles to `(x << 5) - x` which is cheap.
+- Length is implicit in the iteration count (different-length
+  names with the same byte sums hash differently).
+- Collisions for the 4.8 k unique book names are zero in practice;
+  even if they occurred, the bucket scan catches them.
+
+FNV-1a was considered but adds two more shift-add ops per byte
+without measurable improvement for this collision count.
+
+### Measured
+
+Paired heap + cpu profile, baseline = the array-class state from
+the constructor-shape round, this on top:
+
+| Frame                          | Pre (ms) | Post (ms) | Delta              |
+|--------------------------------|---------:|----------:|-------------------:|
+| `PDFObjectParser.parseName`    |    87.14 |  (gone)   | **-87+ ms** (out of top 15) |
+| `fastOf` (PDFName decode-name) |    52.76 |  (gone)   | **-52+ ms** (out of top 15) |
+| `fastParseName` (new row)      |       -- |    58.52  | +58 ms (the cache lookup itself) |
+| `(garbage collector)`          |    58.69 |    80.31  | +21 ms (live-cache mark cost) |
+| Combined parseName + fastOf    |    ~144  |    ~58    | **-86 ms (-60 %)** |
+
+| Phase / metric                | Pre      | Post     | Delta               |
+|-------------------------------|---------:|---------:|--------------------:|
+| Process wall-clock (cpu run)  |  0.90 s  |  0.82 s  | **-80 ms (-9 %)**   |
+|   load                        |  0.41 s  |  0.33 s  | -80 ms              |
+|   save                        |  0.42 s  |  0.42 s  | flat                |
+| Heap (sampled total)          | 33.68 MB | 34.98 MB | +1.30 MB (cache)    |
+|   new `fastParseName` row     |        0 | 1 269 KB | the cache itself    |
+|   `set` (builtin)             |   624 KB |   852 KB | +228 KB (Map.set)   |
+
+The CPU win is all on load (which is where `parseName` runs);
+save is unchanged. Heap is +1.3 MB long-lived (the cache + 4.8 k
+`Uint8Array` byte-keys + Entry objects + `Map<number, ...>`
+overhead), a fixed cost for a workload-bounded cache.
+
+### A note on the heap-profile wall-clock
+
+Under `--heap-profile-process --heap-sampling 512`, the same run
+shows a much bigger speedup than the cpu-profile run:
+
+|                                  | Pre (heap-prof) | Post (heap-prof) | Delta    |
+|----------------------------------|----------------:|-----------------:|---------:|
+| Process wall-clock (heap run)    |          3.50 s |           2.56 s | -940 ms  |
+
+That 940 ms is **not a real wall-clock win** -- it's the
+sampler's per-allocation bookkeeping overhead dropping in step
+with the ~1.6 M transient allocations we just eliminated. The
+sampler fires once every 512 B; even at 64 B per allocation
+that's ~12 % of allocations sampled, but the bookkeeping work
+runs on **every** allocation to decide whether to sample.
+
+Read the cpu-profile number (-80 ms) for "did we get faster";
+read the heap-row delta (+1.3 MB) for "what's the long-lived
+cost". The 940 ms drop under heap profile is a secondary signal
+that confirms the allocation count dropped a lot even though most
+of those allocations were under the 512 B sample threshold and
+don't appear in the heap table.
+
+### Caveats
+
+- **Cache is process-lifetime.** Same as fast-decode-name. No
+  eviction; on the book the long-lived size stabilises at
+  ~1.3 MB (4.8 k entries × ~270 B amortised). For a workload
+  with very many unique names this would grow; for PDFs it
+  doesn't.
+- **`Map<number, value>` for the hash bucket.** V8's Map handles
+  Smi keys well, but allocates Map entry objects on `.set`.
+  The +228 KB in the `set` builtin row is mostly that.
+- **The byte-cache and fast-decode-name's string-cache are not
+  the same Map.** Both cache `PDFName` lookups, keyed
+  differently. Direct `PDFName.of("Foo")` calls (from non-parser
+  code) skip the byte-cache and hit fast-decode-name directly;
+  subsequent parser hits on the same name use the byte-cache.
+  Both return the same `PDFName` instance because the miss path
+  of the byte-cache goes through `PDFName.of`, which is
+  fast-decode-name.
+
+### What this teaches
+
+The two failed first attempts share a lesson: **V8 is good at
+the things you'd naively want to avoid** (cons-string `+=`,
+method dispatch through a small wrapper class). The wins come
+from eliminating the actual repeated work, not from rewriting
+the loop body.
+
+`parseName` looked like a hot loop. It was actually a hot lookup
+that built the lookup key by hand on every call. Move the
+key-build out of the hot path -- by caching the answer keyed on
+the raw input -- and the loop becomes irrelevant. Same lesson as
+the constructor-shape round (`Object.create + writes` is slow
+because V8 takes a different IC path, not because the writes
+themselves are slow); same lesson likely lurking in other "hot
+loop" rows in the profile.
+
+### Shipped
+
+[`docs/render-book.mjs`](../../docs/render-book.mjs) adds
+`import './lib/fast-parse-name.mjs';` next to the other parser-
+shim imports. The shim is idempotent on import and global on
+install -- no opt-out at production, the `--fast-parse-name`
+flag exists for A/B harness work.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4481,7 +4736,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + skipJibberish digit fast-path      | ~0.95 s | ~0.6 s | ~0.4 s |
 | + fast-refs-class                    | ~0.9 s  | ~0.55 s | ~0.4 s |
 | + fast-dict-onebuf class shape       | ~0.9 s  | ~0.55 s | ~0.4 s |
-| **+ fast-array-onebuf class shape (this section)** | **~0.8 s** | **~0.5 s** | **~0.35 s** |
+| + fast-array-onebuf class shape      | ~0.8 s  | ~0.5 s  | ~0.35 s |
+| **+ fast-parse-name (this section)** | **~0.75 s** | **~0.4 s** | **~0.35 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 02aa7fbeb0adf77c2b3e40f1a372fd45c708af96 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 11:52:42 +0200
Subject: [PATCH 41/44] pipeline-deflate: overlap buffer-build with libuv
 deflate (-47 ms save, -10 %).

---
 docs/lib/parallel-deflate.mjs |   9 ++-
 perf/README.md                |  10 ++-
 perf/notes/08-pdf-lib.md      | 129 +++++++++++++++++++++++++++++++++-
 3 files changed, 144 insertions(+), 4 deletions(-)

diff --git a/docs/lib/parallel-deflate.mjs b/docs/lib/parallel-deflate.mjs
index b7b71499..00698712 100644
--- a/docs/lib/parallel-deflate.mjs
+++ b/docs/lib/parallel-deflate.mjs
@@ -106,8 +106,13 @@ class ParallelStreamWriter extends PDFStreamWriter {
     );
 
     if (this._parallel && this.encodeStreams && objectStreams.length > 0) {
-      const unencoded = objectStreams.map(os => os.getUnencodedContents());
-      const deflated = await Promise.all(unencoded.map(buf => deflateAsync(buf)));
+      // Fire each deflate onto libuv as soon as its buffer is built,
+      // so deflate of stream N runs concurrently with the build of
+      // N+1..453 instead of after all 453 builds finish. Saves the
+      // main-thread idle wait at the Promise.all (~30 ms on the book).
+      const deflated = await Promise.all(
+        objectStreams.map(os => deflateAsync(os.getUnencodedContents())),
+      );
       for (let i = 0; i < objectStreams.length; i++) {
         objectStreams[i].contentsCache.value = deflated[i];
       }
diff --git a/perf/README.md b/perf/README.md
index 77bb4c35..0cea424a 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -123,6 +123,13 @@ Flag rationale:
   `objectsPerStream: 500`. Production runs through it; same logic.
   Moves ~300 ms of zlib work off the main thread, and routes every
   deflate call through `node:zlib` (no pdf-lib pure-JS fallback).
+  Phase 2's buffer-build + deflate is pipelined: each stream's
+  `deflateAsync(os.getUnencodedContents())` fires on libuv as soon
+  as its buffer is built, overlapping with the build of the next
+  stream instead of running build × 453 then deflate × 453 as
+  serial passes. Saves another ~47 ms on save (-10 %); the
+  `(idle)` row at the `Promise.all` gate (was 21 ms / 2.8 %)
+  drops out of the CPU top-15.
 - `--fast-decode-name` -- inject
   [docs/lib/fast-decode-name.mjs](../docs/lib/fast-decode-name.mjs), a
   parallel `Map<string, PDFName>` in front of `PDFName.of` that
@@ -674,6 +681,7 @@ file documenting each:
 | Class-constructor `PDFDict` shape (`_FastDict` / `_FastCatalog` / `_FastPageTree` / `_FastPageLeaf` per-subclass constructors) | [08](notes/08-pdf-lib.md) | `_makeFromRange (dict)` 16.5 MB → 11.4 MB; total process heap 41.4 MB → 35.4 MB (-14.4 %); cumulative -77 % since Map-backed PDFDict |
 | Class-constructor `PDFArray` shape (`_FastArray` factory + monomorphic call-site unlock across all three Fast classes) | [08](notes/08-pdf-lib.md) | total process heap 35.4 MB → 33.7 MB (-4.9 %); process wall 1.03 s → 0.90 s (-130 ms); GC self-time 101 ms → 59 ms (-42 %); cumulative -78 % heap since Map-backed PDFDict, -20 % process across the three shape-change commits |
 | Byte-keyed `parseName` cache (Map<hash, Entry &#124; Entry[]>; skip per-call string build + string-keyed Map hash on 99.7 % hit path) | [08](notes/08-pdf-lib.md) | `parseName` + `fastOf` combined 144 ms → 58 ms; process wall 0.90 s → 0.82 s (-80 ms, -9 %, all on load); +1.3 MB long-lived heap for the cache |
+| Pipeline `parallel-deflate` (overlap buffer-build with libuv deflate by folding two `.map`s into one) | [08](notes/08-pdf-lib.md) | save 0.467 s → 0.420 s (-47 ms, -10 %); `(idle)` row at `Promise.all` gate drops out of CPU top-15 |
 
 What was tried and didn't ship:
 
@@ -700,4 +708,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte | 0` Smi hash in the same pass, looking up `Map<hash, Entry | Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte | 0` Smi hash in the same pass, looking up `Map<hash, Entry | Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost); the next row to drop was `PDFObjectStream.getUnencodedContents` (#4 at 46 ms self / 124 ms with callees) paired with a fat `(idle)` row at 32 ms / 3.4 % -- both attributable to `parallel-deflate.mjs`'s phase 2 running build + deflate as two strictly serial passes (`objectStreams.map(os => os.getUnencodedContents())` followed by `Promise.all(unencoded.map(buf => deflateAsync(buf)))`, the first ~120 ms of main-thread block then ~30 ms of main-thread idle awaiting libuv), so folding the two `.map`s into one (`Promise.all(objectStreams.map(os => deflateAsync(os.getUnencodedContents())))`) pipelines build with deflate -- each deflate fires on libuv as its buffer is built, overlapping with the build of the next stream rather than after all 453 builds complete -- and the await resolves almost immediately by the time the build loop finishes (by then ~430 of 453 deflates have run on the 4-worker pool, each ~0.3 ms compute); paired 3-run A/B with the rest of the shipped flag set on confirms save 0.467 s → 0.420 s (-47 ms, -10 %), process 0.887 s → 0.833 s (-54 ms, -6 %), load + setOutline flat as expected; the `(idle)` row drops out of the CPU top-15 entirely and `getUnencodedContents` self-time also drops (31.56 → 22.25 ms) as V8's task scheduling between build and the fire-and-forget Promise creation reattributes some samples -- 
a 47 ms vs 32 ms estimate gap accounted for by microtask-queue drain at the `Promise.all` gate + libuv callback marshalling now spread across the build loop instead of bunched at the end. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index e6963d9a..669d672a 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4692,6 +4692,132 @@ shim imports. The shim is idempotent on import and global on
 install -- no opt-out at production, the `--fast-parse-name`
 flag exists for A/B harness work.
 
+## Pipeline the deflate: overlap buffer-build with libuv
+
+After `fast-parse-name` shipped, the CPU profile of the process
+phase showed `PDFObjectStream.getUnencodedContents` at #4 (45.97
+ms self, 123.75 ms with callees) and a fat `(idle)` row at
+31.82 ms / 3.4 %. The two are joined at the hip:
+`parallel-deflate.mjs`'s phase 2 ran the build and the deflate
+in two strictly serial passes:
+
+```js
+// before
+const unencoded = objectStreams.map(os => os.getUnencodedContents());        // ~120 ms main-thread block
+const deflated = await Promise.all(unencoded.map(buf => deflateAsync(buf))); // ~30 ms main-thread idle
+```
+
+Pass 1 built all 453 buffers on the main thread (the
+`getUnencodedContents` total subtree). Pass 2 fired all 453
+deflates into libuv at once and awaited them as a batch. The
+`(idle)` row was the main thread sleeping during that await.
+
+### The fix: fold the two `.map`s
+
+```js
+// after
+const deflated = await Promise.all(
+  objectStreams.map(os => deflateAsync(os.getUnencodedContents())),
+);
+```
+
+`.map` still iterates 453 times sequentially on the main thread,
+but each iteration now does build + dispatch in one step.
+`deflateAsync(buf)` returns a Promise immediately and the libuv
+worker picks up the buffer while the main thread starts building
+the next one. By the time the build loop finishes at ~120 ms,
+the first ~430 deflates have already run on the 4-worker pool
+(each takes ~0.3 ms compute); only the last handful are still in
+flight. The `await Promise.all` resolves almost immediately.
+
+### Why the savings are bounded
+
+The build loop is ~120 ms of main-thread JS; the total deflate
+compute is ~130 ms across 4 libuv threads, i.e. ~33 ms of wall.
+Pipelining overlaps the 33 ms of deflate-wall with the 120 ms of
+build-wall. Max possible win: the 33 ms idle. Build itself stays
+single-threaded -- pipelining can't shrink that.
+
+A bigger win would require putting the build itself on workers,
+but `getUnencodedContents` dispatches `.copyBytesInto()` on
+PDFDict / PDFArray / PDFNumber / PDFName / PDFRef / PDFString /
+PDFStream wrappers, and JS object wrappers can't cross
+`worker_threads` boundaries (the byte ranges live in `mainBuf` /
+`arrayMain`, but the dispatch logic is in pdf-lib + our shims).
+Either we duplicate ~500 lines of byte-emission into a worker
+file with SharedArrayBuffer views of the buffers, or we rewrite
+it as a native addon. Neither pays for itself at this row size.
+
+### Measured wins
+
+A/B on the book, 3 runs each, same shipped flag set, paired
+`--cpu-profile-process` (Windows /affinity-pinned):
+
+| Run                | process | load   | setOutline | save   |
+|--------------------|--------:|-------:|-----------:|-------:|
+| baseline A         |  0.89 s | 0.34 s |    0.01 s  | 0.48 s |
+| baseline B         |  0.90 s | 0.35 s |    0.01 s  | 0.47 s |
+| baseline C         |  0.87 s | 0.35 s |    0.01 s  | 0.45 s |
+| **baseline avg**   | **0.887 s** | 0.347 s | 0.010 s | **0.467 s** |
+| pipelined A        |  0.84 s | 0.34 s |    0.01 s  | 0.43 s |
+| pipelined B        |  0.83 s | 0.34 s |    0.01 s  | 0.42 s |
+| pipelined C        |  0.83 s | 0.34 s |    0.01 s  | 0.41 s |
+| **pipelined avg**  | **0.833 s** | 0.340 s | 0.010 s | **0.420 s** |
+| **delta**          | **-54 ms (-6.1 %)** | flat | flat | **-47 ms (-10.1 %)** |
+
+Load is flat (as expected -- no change touched it). Save dropped
+47 ms consistently across all 3 runs. The smoking gun in the CPU
+profile: baseline's `(idle)` row sat at 21 ms / 2.8 % (rank #9 of
+top 15); after pipelining the row drops out of the top 15
+entirely. That's the deflate-await idle being absorbed into the
+build wall, exactly as predicted.
+
+`getUnencodedContents` self-time also dropped (31.56 → 22.25 ms
+in the paired profiles), probably because in the baseline its
+samples were sandwiched between a sync build and a sync await
+with no other work to attribute against; in the pipelined version
+V8 task scheduling between the build and the fire-and-forget
+Promise creation absorbs some of that attribution. Either way the
+row stays in the top 15 -- the build itself is unchanged.
+
+### Why estimate (~32 ms) and actual (~47 ms) differ
+
+The estimate was derived from the `(idle)` row alone. The actual
+save delta is larger because the await also paid for:
+
+- Microtask-queue drain at the `Promise.all` gate (a few ms
+  across 453 settled promises).
+- libuv callback marshalling for the batch (the 24.75 ms
+  `writeSync` row in the baseline -- the inspector's name for
+  the deflate-result callback, not `fs.writeFileSync`). In the
+  pipelined version those callbacks fire spread out during the
+  build loop instead of bunched at the end.
+
+Both are small but real. Together they explain the gap between
+the 32 ms idle estimate and the 47 ms save delta.
+
+### What this teaches
+
+Two serial `.map`s with an `await` on the second are almost always
+a missed pipeline. The fix is mechanical (fold into one `.map`),
+but the win only shows up when the second stage runs on a
+different execution context -- here, libuv's thread pool. For two
+main-thread stages there'd be no overlap to harvest and the diff
+would be a wash.
+
+The `(idle)` row in a CPU profile is the cheapest "next win" to
+spot: any time it's >2 %, the main thread reached an `await`
+before the work it waits on had finished. Worth grepping for.
+
+### Shipped
+
+In-place edit to
+[`docs/lib/parallel-deflate.mjs`](../../docs/lib/parallel-deflate.mjs)
+at the `parallelSave` path; the harness's `--parallel-deflate`
+flag continues to flip the whole `parallelSave` path on, no new
+knob. `render-book.mjs` already calls `parallelSave` so the
+change is live in production without a new import.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4737,7 +4863,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-refs-class                    | ~0.9 s  | ~0.55 s | ~0.4 s |
 | + fast-dict-onebuf class shape       | ~0.9 s  | ~0.55 s | ~0.4 s |
 | + fast-array-onebuf class shape      | ~0.8 s  | ~0.5 s  | ~0.35 s |
-| **+ fast-parse-name (this section)** | **~0.75 s** | **~0.4 s** | **~0.35 s** |
+| + fast-parse-name                    | ~0.75 s | ~0.4 s  | ~0.35 s |
+| **+ pipeline-deflate (this section)** | **~0.7 s** | **~0.4 s** | **~0.3 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 40a6d513e1c9cece78d81b0325cbf25c6f64e6d4 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 13:09:00 +0200
Subject: [PATCH 42/44] fast-dict-onebuf: pack PDFPageLeaf flags into d's gap
 bits.

d layout shifts from start[0:23] / length[24:37] to start[0:22] /
gap[23:24] / length[25:40], freeing bits 23 and 24 for PDFPageLeaf's
normalized + autoNormalizeCTM. _FastPageLeaf collapses to a single d
field; the booleans become prototype getters/setters that mask in/out
of d. start drops 24 -> 23 bits (8.4 M slots, still well above the
~2.3 M mainLen on the book); length grows 14 -> 16 bits (65 535,
ample headroom over the 8 706 observed max).

Heap saving on the 1 651 page leaves is sub-row at the 512 B sampler
resolution but real (~26 KB). Output byte-identical to baseline. CPU
flat (no PDFPageLeaf mutation paths fire on the book).
---
 docs/lib/fast-dict-onebuf.mjs | 119 ++++++++++++++++++++++++++--------
 perf/README.md                |  29 +++++----
 perf/notes/08-pdf-lib.md      |  98 +++++++++++++++++++++++++++-
 3 files changed, 207 insertions(+), 39 deletions(-)

diff --git a/docs/lib/fast-dict-onebuf.mjs b/docs/lib/fast-dict-onebuf.mjs
index 1d705369..888705cd 100644
--- a/docs/lib/fast-dict-onebuf.mjs
+++ b/docs/lib/fast-dict-onebuf.mjs
@@ -7,10 +7,23 @@
 // ever read from main, so the bufIdx field disappears from the
 // packed value -- frees up bits.
 //
-// 38-bit packed Number layout (well within Number.MAX_SAFE_INTEGER):
-//   bits  0-23: start  (24 bits, max 16 M slots in main)
-//   bits 24-37: length (14 bits, max 16 384 slots; max observed 8 706)
-//   bits 38-52: spare (15 bits)
+// 41-bit packed Number layout (well within Number.MAX_SAFE_INTEGER):
+//   bits  0-22: start  (23 bits, max 8.4 M slots in main; mainLen ~2.3 M today)
+//   bit     23: PDFPageLeaf `normalized` flag (zero on all other dict subtypes)
+//   bit     24: PDFPageLeaf `autoNormalizeCTM` flag (zero on all other dict subtypes)
+//   bits 25-40: length (16 bits, max 65 535 slots; max observed 8 706)
+//   bits 41-52: spare (12 bits; unused, available headroom)
+//
+// V8 Smi (31-bit signed) covers values < 2^30. start + length*2^25 stays
+// Smi iff length < 32 (the 2^30 boundary). Beyond that, `d` boxes to a
+// HeapNumber but bit math via `& MASK_*` and `+`/`-` continues to work --
+// reads still extract bits 0..30 correctly via Int32 coercion, writes
+// use arithmetic so high bits survive.
+//
+// PDFPageLeaf collapses to the same single-`d` field as plain PDFDict;
+// `normalized` and `autoNormalizeCTM` are getters/setters that mask
+// in/out of `d`'s bits 23 and 24. Heap floor matches `_FastDict` (no
+// separate boolean property slots).
 //
 // Recursion. Outer parseDict pushes entries onto temp. Calling
 // this.parseObject() to parse a value may recurse to inner
@@ -99,21 +112,30 @@ export function setExpectedDictSlots(slots, slack = 1.0) {
 
 // ---- Bit-packing helpers --------------------------------------------
 
-const POW_24 = 16777216;          // 2^24
-const MASK_24 = 0xFFFFFF;
-const MASK_14 = 0x3FFF;
+const POW_23  = 1 << 23;            // 8 388 608  -- gap-bit base / start ceiling
+const POW_25  = 1 << 25;            // 33 554 432 -- length multiplier
+const MASK_23 = 0x7FFFFF;           // 23-bit start mask
+const MASK_16 = 0xFFFF;             // 16-bit length mask
+
+const NORM_BIT = POW_23;            // bit 23: PDFPageLeaf `normalized`
+const AUTO_BIT = POW_23 * 2;        // bit 24: PDFPageLeaf `autoNormalizeCTM`
+const GAP_MASK = NORM_BIT | AUTO_BIT;
 
-const MAX_START  = POW_24;          // exclusive
-const MAX_LENGTH = 1 << 14;         // 16384, exclusive
+const MAX_START  = POW_23;          // exclusive
+const MAX_LENGTH = 1 << 16;         // 65536, exclusive
 
 function pack(start, length) {
-  if (start  >= MAX_START)  throw new Error(`fast-dict-onebuf: start ${start} exceeds 24-bit budget`);
-  if (length >= MAX_LENGTH) throw new Error(`fast-dict-onebuf: length ${length} exceeds 14-bit budget`);
-  return start + length * POW_24;
+  if (start  >= MAX_START)  throw new Error(`fast-dict-onebuf: start ${start} exceeds 23-bit budget`);
+  if (length >= MAX_LENGTH) throw new Error(`fast-dict-onebuf: length ${length} exceeds 16-bit budget`);
+  return start + length * POW_25;
 }
 
-function _start(d)  { return d & MASK_24; }
-function _length(d) { return Math.floor(d / POW_24) & MASK_14; }
+// Read start (bits 0-22) and length (bits 25-40). Both work on
+// HeapNumber'd d: `& MASK_23` lives in low 32 bits (Int32 coercion
+// reads it correctly); `Math.floor(d / POW_25)` operates on the full
+// Number range before the `& MASK_16` truncates.
+function _start(d)  { return d & MASK_23; }
+function _length(d) { return Math.floor(d / POW_25) & MASK_16; }
 
 // ---- Singleton context ---------------------------------------------
 
@@ -145,6 +167,11 @@ function _appendArray(arr) {
 // COW: copy this dict's range to main's tail, return the new packed
 // value anchored at the new range. If we're already at the HWM,
 // nothing to copy -- return d unchanged.
+//
+// Gap bits (bits 23-24, used by PDFPageLeaf for normalized /
+// autoNormalizeCTM) are preserved across the repack. For non-PageLeaf
+// dicts the mask is zero, so `+ (d & GAP_MASK)` is a no-op. Addition
+// is used instead of `|` so the high bits of HeapNumber'd d survive.
 function _cow(pd) {
   const d = pd.d;
   const start = _start(d);
@@ -153,7 +180,7 @@ function _cow(pd) {
   const newStart = mainLen;
   for (let i = 0; i < length; i++) main[mainLen + i] = main[start + i];
   mainLen += length;
-  return pack(newStart, length);
+  return pack(newStart, length) + (d & GAP_MASK);
 }
 
 // ---- Construction ---------------------------------------------------
@@ -170,11 +197,15 @@ function _cow(pd) {
 // row.
 //
 // One constructor per subclass so V8 sees a single fixed shape per
-// kind. PDFPageLeaf carries extra fields (normalized,
-// autoNormalizeCTM) -- they're assigned in the constructor body so
-// the shape stays fixed. Any unknown PDFDict subclass falls back to
-// the original Object.create path so the shim doesn't crash on
-// downstream extensions (none in our pipeline; defensive only).
+// kind. PDFPageLeaf collapses to the same single-`d` shape as plain
+// PDFDict; `normalized` defaults to false (gap bit 23 clear) and
+// `autoNormalizeCTM` defaults to true (gap bit 24 set) -- the bit
+// is set by the constructor below via addition, not `|` (so HeapNumber'd
+// d doesn't lose high bits to Int32 coercion). Both flags become
+// prototype getters/setters that mask in/out of bits 23-24.
+// Any unknown PDFDict subclass falls back to the original
+// Object.create path so the shim doesn't crash on downstream
+// extensions (none in our pipeline; defensive only).
 
 function _FastDict(d) { this.d = d; }
 _FastDict.prototype = PDFDict.prototype;
@@ -185,11 +216,11 @@ _FastCatalog.prototype = PDFCatalog.prototype;
 function _FastPageTree(d) { this.d = d; }
 _FastPageTree.prototype = PDFPageTree.prototype;
 
-function _FastPageLeaf(d) {
-  this.d = d;
-  this.normalized = false;
-  this.autoNormalizeCTM = true;
-}
+// d arrives from pack(start, length) so bits 23-24 are zero;
+// `+ AUTO_BIT` sets bit 24 unconditionally (autoNormalizeCTM = true
+// default). Use addition not `|`: if length >= 32, d > 2^30 (HeapNumber)
+// and `|` would truncate to Int32 losing high bits.
+function _FastPageLeaf(d) { this.d = d + AUTO_BIT; }
 _FastPageLeaf.prototype = PDFPageLeaf.prototype;
 
 function _makeFromRange(ProtoClass, start, length, ctx) {
@@ -268,7 +299,9 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
     main[mainLen++] = key;
     main[mainLen++] = value;
     const start = _start(dNow);
-    this.d = pack(start, length0 + 2);
+    // Preserve gap bits (PageLeaf flags) from dNow into the freshly
+    // packed value. Zero for non-PageLeaf dicts.
+    this.d = pack(start, length0 + 2) + (dNow & GAP_MASK);
   };
 
   PDFDict.prototype.get = function (key, preservePDFNull) {
@@ -315,7 +348,8 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
       if (i === foundIdx || i === foundIdx + 1) continue;
       main[mainLen++] = main[start0 + i];
     }
-    this.d = pack(newStart, length0 - 2);
+    // Preserve gap bits (PageLeaf flags); zero for non-PageLeaf dicts.
+    this.d = pack(newStart, length0 - 2) + (d0 & GAP_MASK);
     return true;
   };
 
@@ -386,6 +420,37 @@ if (!PDFDict.prototype.__fastDictOnebufInstalled) {
     configurable: true,
   });
 
+  // ---- PDFPageLeaf flag accessors -----------------------------------
+  //
+  // `normalized` and `autoNormalizeCTM` live in bits 23 and 24 of
+  // `d`. Reads use `& BIT` -- safe on HeapNumber'd d because both
+  // bits are in the low 32 (Int32 coercion reads them correctly).
+  // Writes use arithmetic (`d + BIT` / `d - BIT`) gated on the
+  // current bit state, so high bits of HeapNumber'd d survive.
+  // No-ops when the flag is already in the requested state.
+
+  Object.defineProperty(PDFPageLeaf.prototype, 'normalized', {
+    get() { return (this.d & NORM_BIT) !== 0; },
+    set(v) {
+      const d = this.d;
+      const has = (d & NORM_BIT) !== 0;
+      if (v && !has)      this.d = d + NORM_BIT;
+      else if (!v && has) this.d = d - NORM_BIT;
+    },
+    configurable: true,
+  });
+
+  Object.defineProperty(PDFPageLeaf.prototype, 'autoNormalizeCTM', {
+    get() { return (this.d & AUTO_BIT) !== 0; },
+    set(v) {
+      const d = this.d;
+      const has = (d & AUTO_BIT) !== 0;
+      if (v && !has)      this.d = d + AUTO_BIT;
+      else if (!v && has) this.d = d - AUTO_BIT;
+    },
+    configurable: true,
+  });
+
   // ---- PDFDict factories --------------------------------------------
 
   PDFDict.withContext = function (context) {
diff --git a/perf/README.md b/perf/README.md
index 0cea424a..234f89d5 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -183,21 +183,27 @@ Flag rationale:
   a stack of recursion frames; each parseDict invocation appends
   to temp, commits its frame to main in one contiguous append,
   and pops temp back. PDFDicts only ever read from main, so the
-  whole instance state packs into one 53-bit Number (24-bit start
-  + 14-bit length + 1-bit owned). Owned dicts (factory-created
+  whole instance state packs into one 41-bit Number (23-bit start
+  + 1-bit `normalized` flag + 1-bit `autoNormalizeCTM` flag +
+  16-bit length, in that bit order). Owned dicts (factory-created
   post-parse, COW results) also append to main. Mutations:
   in-place replace for existing keys, COW (copy range to tail,
-  append new pair, update encoded range) for new keys or delete.
-  The wrapper instances themselves use the same constructor-based
-  shape `fast-refs-class` introduced for PDFRef -- one
+  append new pair, update encoded range) for new keys or delete --
+  all preserve the two gap bits via `+ (d & GAP_MASK)` after the
+  repack. The wrapper instances themselves use the
+  constructor-based shape `fast-refs-class` introduced for PDFRef -- one
   plain-function constructor per subclass (`_FastDict`,
   `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the
   prototype aliased to the upstream prototype, so V8 sees a stable
-  hidden class from the first instance. Saves ~20 B/PDFDict ×
-  260 k = ~5.2 MB heap on top of the storage refactor.
-  Mutually exclusive with the other dict-shape shims. ~77 %
-  cumulative heap reduction since the original Map-backed PDFDict
-  (152 -> 35 MB). Production runs through it. See
+  hidden class from the first instance. PDFPageLeaf collapses to
+  the same single-`d` shape as plain PDFDict, with `normalized` /
+  `autoNormalizeCTM` as prototype getters/setters that mask in/out
+  of bits 23-24. Saves ~20 B/PDFDict × 260 k = ~5.2 MB heap on
+  top of the storage refactor, plus ~26 KB on the 1 651 page
+  leaves from the flag-packing. Mutually exclusive with the other
+  dict-shape shims. ~77 % cumulative heap reduction since the
+  original Map-backed PDFDict (152 -> 35 MB). Production runs
+  through it. See
   [notes/08-pdf-lib.md "One-buffer PDFDict"](notes/08-pdf-lib.md).
 - `--fast-array-onebuf` -- inject
   [docs/lib/fast-array-onebuf.mjs](../docs/lib/fast-array-onebuf.mjs).
@@ -682,6 +688,7 @@ file documenting each:
 | Class-constructor `PDFArray` shape (`_FastArray` factory + monomorphic call-site unlock across all three Fast classes) | [08](notes/08-pdf-lib.md) | total process heap 35.4 MB → 33.7 MB (-4.9 %); process wall 1.03 s → 0.90 s (-130 ms); GC self-time 101 ms → 59 ms (-42 %); cumulative -78 % heap since Map-backed PDFDict, -20 % process across the three shape-change commits |
 | Byte-keyed `parseName` cache (Map<hash, Entry &#124; Entry[]>; skip per-call string build + string-keyed Map hash on 99.7 % hit path) | [08](notes/08-pdf-lib.md) | `parseName` + `fastOf` combined 144 ms → 58 ms; process wall 0.90 s → 0.82 s (-80 ms, -9 %, all on load); +1.3 MB long-lived heap for the cache |
 | Pipeline `parallel-deflate` (overlap buffer-build with libuv deflate by folding two `.map`s into one) | [08](notes/08-pdf-lib.md) | save 0.467 s → 0.420 s (-47 ms, -10 %); `(idle)` row at `Promise.all` gate drops out of CPU top-15 |
+| Pack PDFPageLeaf flags into `d`'s gap bits (`_FastPageLeaf` collapses to single-`d` shape; bit layout shifts to start[0:22] / norm[23] / auto[24] / length[25:40]) | [08](notes/08-pdf-lib.md) | ~26 KB on 1 651 page leaves (sub-row at 512 B sampler); output byte-identical; CPU flat |
 
 What was tried and didn't ship:
 
@@ -708,4 +715,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte | 0` Smi hash in the same pass, looking up `Map<hash, Entry | Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost); the next row to drop was `PDFObjectStream.getUnencodedContents` (#4 at 46 ms self / 124 ms with callees) paired with a fat `(idle)` row at 32 ms / 3.4 % -- both attributable to `parallel-deflate.mjs`'s phase 2 running build + deflate as two strictly serial passes (`objectStreams.map(os => os.getUnencodedContents())` followed by `Promise.all(unencoded.map(buf => deflateAsync(buf)))`, the first ~120 ms of main-thread block then ~30 ms of main-thread idle awaiting libuv), so folding the two `.map`s into one (`Promise.all(objectStreams.map(os => deflateAsync(os.getUnencodedContents())))`) pipelines build with deflate -- each deflate fires on libuv as its buffer is built, overlapping with the build of the next stream rather than after all 453 builds complete -- and the await resolves almost immediately by the time the build loop finishes (by then ~430 of 453 deflates have run on the 4-worker pool, each ~0.3 ms compute); paired 3-run A/B with the rest of the shipped flag set on confirms save 0.467 s → 0.420 s (-47 ms, -10 %), process 0.887 s → 0.833 s (-54 ms, -6 %), load + setOutline flat as expected; the `(idle)` row drops out of the CPU top-15 entirely and `getUnencodedContents` self-time also drops (31.56 → 22.25 ms) as V8's task scheduling between build and the fire-and-forget Promise creation reattributes some samples -- 
a 47 ms vs 32 ms estimate gap accounted for by microtask-queue drain at the `Promise.all` gate + libuv callback marshalling now spread across the build loop instead of bunched at the end. |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte | 0` Smi hash in the same pass, looking up `Map<hash, Entry | Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost); the next row to drop was `PDFObjectStream.getUnencodedContents` (#4 at 46 ms self / 124 ms with callees) paired with a fat `(idle)` row at 32 ms / 3.4 % -- both attributable to `parallel-deflate.mjs`'s phase 2 running build + deflate as two strictly serial passes (`objectStreams.map(os => os.getUnencodedContents())` followed by `Promise.all(unencoded.map(buf => deflateAsync(buf)))`, the first ~120 ms of main-thread block then ~30 ms of main-thread idle awaiting libuv), so folding the two `.map`s into one (`Promise.all(objectStreams.map(os => deflateAsync(os.getUnencodedContents())))`) pipelines build with deflate -- each deflate fires on libuv as its buffer is built, overlapping with the build of the next stream rather than after all 453 builds complete -- and the await resolves almost immediately by the time the build loop finishes (by then ~430 of 453 deflates have run on the 4-worker pool, each ~0.3 ms compute); paired 3-run A/B with the rest of the shipped flag set on confirms save 0.467 s → 0.420 s (-47 ms, -10 %), process 0.887 s → 0.833 s (-54 ms, -6 %), load + setOutline flat as expected; the `(idle)` row drops out of the CPU top-15 entirely and `getUnencodedContents` self-time also drops (31.56 → 22.25 ms) as V8's task scheduling between build and the fire-and-forget Promise creation reattributes some samples -- 
a 47 ms vs 32 ms estimate gap accounted for by microtask-queue drain at the `Promise.all` gate + libuv callback marshalling now spread across the build loop instead of bunched at the end; the class-shape round left PDFPageLeaf as the only subclass with extra fields (`normalized` default false + `autoNormalizeCTM` default true, both written in the `_FastPageLeaf` constructor body) so the 1 651 page leaves on the book were ~24 B larger than plain `_FastDict` instances -- packing both booleans into `d`'s gap bits collapses PageLeaf to the same single-`d` shape (bit layout shifts from start[0:23] + length[24:37] to start[0:22] + norm[23] + auto[24] + length[25:40], dropping start from 24 to 23 bits / 8.4 M slots vs ~2.3 M mainLen, growing length from 14 to 16 bits / 65 535 vs 8 706 observed max) with the booleans as prototype getters/setters that mask in/out of bits 23-24, and the V8 Smi gotcha worth recording: Smi is 31-bit signed so d > 2^30 (i.e. length >= 32) boxes to HeapNumber where `d | NORM_BIT` would truncate to Int32 and lose the length, so all writes use arithmetic (`d + NORM_BIT` / `d - NORM_BIT` gated on the current bit state) and the COW / set / delete paths preserve the gap bits via `+ (d & GAP_MASK)` after the repack; saves ~26 KB on the 1 651 page leaves (sub-row at 512 B sampler resolution but real, calculated per-instance), output byte-identical, CPU flat (no PageLeaf mutation paths fire on the render-only workflow). |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index 669d672a..c069089a 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4818,6 +4818,101 @@ flag continues to flip the whole `parallelSave` path on, no new
 knob. `render-book.mjs` already calls `parallelSave` so the
 change is live in production without a new import.
 
+## Pack `PDFPageLeaf` flags into `d`'s gap bits
+
+`fast-dict-onebuf`'s class-shape change (above) left PDFPageLeaf as
+the only subclass with extra fields: `normalized` (default false) +
+`autoNormalizeCTM` (default true), both written in the
+`_FastPageLeaf` constructor body. V8 sees a fixed shape per
+subclass but PageLeaf instances are ~24 B larger than plain
+`_FastDict` (the two boolean slots plus their map entries). 1 651
+PageLeaves on the book × ~24 B = ~26 KB -- sub-row at the 512 B
+sampler resolution, but the same tax was waiting on every other
+`Object.create + writes`-style wrapper we'd want to apply this
+shape to.
+
+The packed `d` had 15 unused bits between the 14-bit length field
+and the 53-bit `Number.MAX_SAFE_INTEGER` ceiling. Two booleans fit
+in two bits.
+
+### The new layout
+
+```
+bits  0-22: start  (23 bits, max 8.4 M slots; mainLen ~2.3 M today)
+bit     23: PDFPageLeaf `normalized` flag (zero on all other subtypes)
+bit     24: PDFPageLeaf `autoNormalizeCTM` flag (zero on all other subtypes)
+bits 25-40: length (16 bits, max 65 535 slots; observed max 8 706)
+bits 41-52: spare (12 bits; available headroom)
+```
+
+`start` drops 24 → 23 bits (8.4 M slots, well above the ~2.3 M
+mainLen seen today); `length` grows 14 → 16 bits (65 535,
+comfortable headroom over the observed max).
+
+PDFPageLeaf collapses to the same single-`d` shape as plain
+PDFDict; `normalized` and `autoNormalizeCTM` become prototype
+getters/setters that mask in/out of bits 23-24.
+
+### The V8 Smi gotcha (worth recording)
+
+V8's Smi (31-bit signed integer) range covers values up to 2^30 - 1.
+`start + length * 2^25` stays Smi iff `length < 32`; beyond that
+`d` boxes to a HeapNumber. Two consequences:
+
+1. **Reads stay correct.** `d & MASK_23` lives in the low 32 bits;
+   V8 coerces `d` to Int32 for the `&`, which reads bits 0..30
+   correctly even when `d` is HeapNumber-boxed.
+   `Math.floor(d / POW_25) & MASK_16` operates on the full Number
+   range before the mask truncates.
+2. **Writes must use arithmetic, not bitwise OR.** `d | NORM_BIT`
+   on a HeapNumber'd d truncates to Int32 and loses the high bits
+   (the length). Use `d + NORM_BIT` / `d - NORM_BIT` gated on the
+   current bit state -- arithmetic addition operates on the full
+   Number range.
+
+The setters all follow the pattern:
+
+```js
+set(v) {
+  const d = this.d;
+  const has = (d & NORM_BIT) !== 0;
+  if (v && !has)      this.d = d + NORM_BIT;
+  else if (!v && has) this.d = d - NORM_BIT;
+}
+```
+
+`_cow`, `set` (the COW-on-mutation path), and `delete` all preserve
+the gap bits when they repack `d` by adding `(d & GAP_MASK)` back
+into the freshly packed value. For non-PageLeaf dicts the mask is
+zero so the add is a no-op; for PageLeaf the flags survive any
+backing-buffer move.
+
+### Constructor default
+
+`_FastPageLeaf` defaults `autoNormalizeCTM` to `true` (upstream
+behavior). Since `pack(start, length)` produces a value with bits
+23-24 cleared, the constructor sets bit 24 via addition:
+
+```js
+function _FastPageLeaf(d) { this.d = d + AUTO_BIT; }
+```
+
+Addition not `|` for the same reason -- if `length >= 32`, `d`
+exceeds 2^30 and the boxed Number's high bits would be lost to
+Int32 coercion via `|`.
+
+### Measured
+
+Heap saving on the 1 651 page leaves is sub-row at the 512 B
+sampler resolution but real (~26 KB by per-instance arithmetic).
+Output PDF byte-identical to baseline. CPU flat -- no PDFPageLeaf
+mutation paths fire on the book's render-only workflow.
+
+The change is local to
+[`docs/lib/fast-dict-onebuf.mjs`](../../docs/lib/fast-dict-onebuf.mjs);
+no production import or flag change needed since
+`--fast-dict-onebuf` was already wired up.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4864,7 +4959,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-dict-onebuf class shape       | ~0.9 s  | ~0.55 s | ~0.4 s |
 | + fast-array-onebuf class shape      | ~0.8 s  | ~0.5 s  | ~0.35 s |
 | + fast-parse-name                    | ~0.75 s | ~0.4 s  | ~0.35 s |
-| **+ pipeline-deflate (this section)** | **~0.7 s** | **~0.4 s** | **~0.3 s** |
+| + pipeline-deflate                   | ~0.7 s  | ~0.4 s  | ~0.3 s  |
+| **+ PageLeaf flag-packing (this section)** | **~0.7 s** | **~0.4 s** | **~0.3 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From d22ff7466cd5c5badabbf62e424ab5a399d34704 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 13:11:37 +0200
Subject: [PATCH 43/44] fast-refs-class: two-shape PDFRef, gen=0 single-slot
 (-1.88 MB heap).

Split _FastRef into two constructors: _FastRef(objectNumber) for the
gen=0 path (every PDFRef on fresh-Chrome workloads) carrying a single
inline data field, and _FastRefGen(objectNumber, generationNumber)
for the rare gen!=0 path (the xref free entry at object 0). A
default `generationNumber = 0` on PDFRef.prototype supplies the
missing field for _FastRef instances via prototype lookup, so reads
of either property stay as plain data-property loads -- no accessor-
property boundary that would deopt the IC at upstream call sites
(PDFCrossRefSection.append, PDFCrossRefStream entry tuples,
PDFWriter.serializeToBuffer, fast-indirect-objects, ...).

Per-instance: 24 B (two slots) -> 16 B (one slot). On 226 k unique
PDFRefs, ~1.88 MB heap saved (paired heap profile: 34.96 MB ->
33.08 MB total sampled). Min CPU time is flat (no-profile wall-clock
reads 0.70 s vs ~0.83 s before, but the heap-saving change isn't the
source of that movement). PDF output byte-identical.

An accessor-property variant (single packed d + getters for
objectNumber/generationNumber) was tried first and rejected: it
regressed heap by +1.6 MB and CPU by +70 ms because the getter
dispatch broke V8's monomorphic ICs at the upstream xref-write
sites, forcing recompilation paths that couldn't elide the
{ref, offset, deleted} object literals in PDFCrossRefSection.addEntry
as aggressively.
---
 docs/lib/fast-refs-class.mjs |  72 +++++++++++++------
 perf/README.md               |  35 +++++----
 perf/notes/08-pdf-lib.md     | 135 ++++++++++++++++++++++++++++++++++-
 3 files changed, 207 insertions(+), 35 deletions(-)

diff --git a/docs/lib/fast-refs-class.mjs b/docs/lib/fast-refs-class.mjs
index ea53af03..c1c11e29 100644
--- a/docs/lib/fast-refs-class.mjs
+++ b/docs/lib/fast-refs-class.mjs
@@ -10,12 +10,25 @@
 //
 // This shim swaps the `Object.create + writes` pattern for a constructor
 // that sets both fields in one shot, giving V8 a stable hidden class
-// from the first instance. Same external behaviour (pool semantics,
-// prototype methods, instanceof checks all work) -- the only change is
-// the construction style.
+// from the first instance.
 //
-// Expected win: ~6 MB heap reduction on the book (226 k PDFRef instances
-// × ~30 B saved by skipping the slow-property path).
+// Two-shape variant: most PDFRefs on fresh-Chrome workloads are gen=0
+// and don't need to carry generationNumber at all. We allocate them via
+// _FastRef (single `objectNumber` inline slot) and let the prototype
+// supply a default `generationNumber = 0`. The rare gen!=0 path (PDF
+// spec allows it; our workload only hits it for the xref "free" entry
+// at object 0) uses _FastRefGen with both fields as own data properties.
+// V8 sees a bounded 2-shape polymorphism on PDFRef.prototype, and the
+// monomorphic hot path (gen=0 instances) keeps inline-field-read speed
+// for `.objectNumber` and `.generationNumber` reads -- no accessor-
+// property boundary to break inlining at upstream pdf-lib call sites
+// (PDFCrossRefSection.append, PDFCrossRefStream entry tuples,
+// PDFWriter.serializeToBuffer, our fast-indirect-objects shim, ...).
+//
+// Expected per-gen=0 instance: header (12 B) + 1 inline slot (4 B) =
+// 16 B, no padding -- versus 12 + 2*4 = 20 B raw, aligned to
+// 24 B for a 2-slot instance. Saves 8 B per gen=0 PDFRef * ~226 k unique
+// = ~1.8 MB heap on the book.
 //
 // Mutually exclusive with --fast-refs in the harness.
 
@@ -46,38 +59,55 @@ function _digitCount(n) {
   return d;
 }
 
-// ---- the constructor-based fast PDFRef shape ---------------------------
+// ---- the constructor-based fast PDFRef shapes --------------------------
 
-// Plain function used as a constructor (V8 gives `new`-built instances a
-// stable hidden class derived from the assignment order in the body).
-// Aliasing the prototype to PDFRef.prototype keeps `instanceof PDFRef`
-// satisfied AND means method dispatch resolves on the shared prototype
-// (no extra proto-chain hop).
-function _FastRef(objectNumber, generationNumber) {
+// gen=0 instances: single inline `objectNumber` slot. `generationNumber`
+// is supplied as a data-property default on PDFRef.prototype (set below),
+// so reads return 0 without any accessor dispatch.
+function _FastRef(objectNumber) {
   this.objectNumber = objectNumber;
-  this.generationNumber = generationNumber;
 }
 _FastRef.prototype = PDFRef.prototype;
 
+// gen!=0 instances: both fields as own data properties, shadowing the
+// prototype default. V8 sees a second hidden class -- bounded 2-shape
+// polymorphism, well-handled by inline caches.
+function _FastRefGen(objectNumber, generationNumber) {
+  this.objectNumber = objectNumber;
+  this.generationNumber = generationNumber;
+}
+_FastRefGen.prototype = PDFRef.prototype;
+
 if (!PDFRef.__fastRefsClassInstalled) {
-  const original = PDFRef.of;
-  const pool0 = [];
+  const pool0 = [];                // dense gen=0 cache, indexed by objectNumber
+  const poolGenN = new Map();      // gen!=0 cache, keyed by "N M" string
 
   PDFRef.of = function fastClassOf(objectNumber, generationNumber) {
     if (generationNumber === undefined || generationNumber === 0) {
       const existing = pool0[objectNumber];
       if (existing) return existing;
-      const fresh = new _FastRef(objectNumber, 0);
+      const fresh = new _FastRef(objectNumber);
       pool0[objectNumber] = fresh;
       return fresh;
     }
-    // gen != 0: fall back to upstream PDFRef.of (its Map-based pool).
-    return original.call(PDFRef, objectNumber, generationNumber);
+    // gen != 0: this path is dead on fresh-Chrome workloads except for
+    // the xref "free" entry at object 0. Kept for spec correctness.
+    const key = objectNumber + ' ' + generationNumber;
+    const existing = poolGenN.get(key);
+    if (existing) return existing;
+    const fresh = new _FastRefGen(objectNumber, generationNumber);
+    poolGenN.set(key, fresh);
+    return fresh;
   };
 
-  // Replace prototype methods to ignore the upstream `tag` field (the
-  // gen != 0 fallback path still sets it, but our overrides recompute
-  // from objectNumber / generationNumber so the tag is unused).
+  // Default generationNumber on the prototype. _FastRef instances inherit
+  // this (no own property); _FastRefGen instances shadow it with their
+  // own data property. Both look like data-property reads to V8's IC.
+  PDFRef.prototype.generationNumber = 0;
+
+  // Hot prototype methods read `objectNumber` / `generationNumber` as
+  // regular data properties. The upstream `tag` string is gone -- no
+  // instance carries it any more.
   PDFRef.prototype.toString = function () {
     return this.objectNumber + ' ' + this.generationNumber + ' R';
   };
diff --git a/perf/README.md b/perf/README.md
index 234f89d5..3bd88e59 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -105,18 +105,26 @@ Flag rationale:
 - `--fast-refs-class` -- inject the
   [docs/lib/fast-refs-class.mjs](../docs/lib/fast-refs-class.mjs)
   shipping fix. Same dense-array cache + tag-drop as
-  `--fast-refs`, but the PDFRef instance is built via a
-  constructor (`new _FastRef(...)` with both fields set in the
-  body) rather than `Object.create + property writes`. V8 gives
-  `new`-built instances a stable hidden class from the first
-  instance; on the book that's ~16 B/instance × 226 k unique refs
-  = ~3.87 MB heap (-8.5 % of total process-phase allocation) and
-  ~140 ms wall-clock (-12 % of process) on top of the tag-drop
-  refinement. `_FastRef.prototype = PDFRef.prototype` keeps
-  `instanceof PDFRef` satisfied and resolves method dispatch on
-  the shared prototype (no extra proto-chain hop). gen != 0 still
-  falls back to the upstream `PDFRef.of` Map-based pool (rare on
-  freshly-parsed PDFs). Production runs through it.
+  `--fast-refs`, but PDFRef instances are built via plain-function
+  constructors rather than `Object.create + property writes`. Two
+  shapes: `_FastRef(objectNumber)` for the gen=0 path (one inline
+  slot) and `_FastRefGen(objectNumber, generationNumber)` for the
+  rare gen!=0 path (two slots, only the xref free entry at
+  object 0 on fresh-Chrome workloads). `generationNumber = 0` is a
+  data-property default on `PDFRef.prototype` so reads on gen=0
+  instances return 0 without an accessor dispatch -- keeps every
+  upstream `.objectNumber` / `.generationNumber` IC monomorphic on
+  the data-property path. V8 gives `new`-built instances a stable
+  hidden class from the first instance. Per-instance size is 16 B
+  aligned (one slot) for gen=0 vs 24 B for the legacy two-slot
+  shape. Savings: ~3.87 MB heap and ~140 ms wall-clock from the
+  constructor-shape change, plus another ~1.88 MB heap from the
+  single-slot variant on top.
+  `_FastRef.prototype = PDFRef.prototype` keeps `instanceof PDFRef`
+  satisfied and resolves method dispatch on the shared prototype
+  (no extra proto-chain hop). gen != 0 has its own `poolGenN` Map
+  keyed by `"N M"` -- the shim is the entire `PDFRef.of` factory
+  now, no upstream pool involved. Production runs through it.
 - `--parallel-deflate` -- swap `pdfDoc.save()` for `parallelSave`
   from [docs/lib/parallel-deflate.mjs](../docs/lib/parallel-deflate.mjs),
   which pre-deflates object streams in parallel on libuv's pool with
@@ -689,6 +697,7 @@ file documenting each:
 | Byte-keyed `parseName` cache (Map<hash, Entry &#124; Entry[]>; skip per-call string build + string-keyed Map hash on 99.7 % hit path) | [08](notes/08-pdf-lib.md) | `parseName` + `fastOf` combined 144 ms → 58 ms; process wall 0.90 s → 0.82 s (-80 ms, -9 %, all on load); +1.3 MB long-lived heap for the cache |
 | Pipeline `parallel-deflate` (overlap buffer-build with libuv deflate by folding two `.map`s into one) | [08](notes/08-pdf-lib.md) | save 0.467 s → 0.420 s (-47 ms, -10 %); `(idle)` row at `Promise.all` gate drops out of CPU top-15 |
 | Pack PDFPageLeaf flags into `d`'s gap bits (`_FastPageLeaf` collapses to single-`d` shape; bit layout shifts to start[0:22] / norm[23] / auto[24] / length[25:40]) | [08](notes/08-pdf-lib.md) | ~26 KB on 1 651 page leaves (sub-row at 512 B sampler); output byte-identical; CPU flat |
+| Two-shape `PDFRef` (gen=0 single-slot `_FastRef` + gen!=0 two-slot `_FastRefGen`; `generationNumber = 0` as prototype default keeps IC monomorphic at every caller) | [08](notes/08-pdf-lib.md) | per-instance 24 B → 16 B aligned; total process heap 34.96 MB → 33.08 MB (-1.88 MB) |
 
 What was tried and didn't ship:
 
@@ -715,4 +724,4 @@ order; later ones reference earlier ones for context.
 | [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
 | [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
 | [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
-| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte | 0` Smi hash in the same pass, looking up `Map<hash, Entry | Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost); the next row to drop was `PDFObjectStream.getUnencodedContents` (#4 at 46 ms self / 124 ms with callees) paired with a fat `(idle)` row at 32 ms / 3.4 % -- both attributable to `parallel-deflate.mjs`'s phase 2 running build + deflate as two strictly serial passes (`objectStreams.map(os => os.getUnencodedContents())` followed by `Promise.all(unencoded.map(buf => deflateAsync(buf)))`, the first ~120 ms of main-thread block then ~30 ms of main-thread idle awaiting libuv), so folding the two `.map`s into one (`Promise.all(objectStreams.map(os => deflateAsync(os.getUnencodedContents())))`) pipelines build with deflate -- each deflate fires on libuv as its buffer is built, overlapping with the build of the next stream rather than after all 453 builds complete -- and the await resolves almost immediately by the time the build loop finishes (by then ~430 of 453 deflates have run on the 4-worker pool, each ~0.3 ms compute); paired 3-run A/B with the rest of the shipped flag set on confirms save 0.467 s → 0.420 s (-47 ms, -10 %), process 0.887 s → 0.833 s (-54 ms, -6 %), load + setOutline flat as expected; the `(idle)` row drops out of the CPU top-15 entirely and `getUnencodedContents` self-time also drops (31.56 → 22.25 ms) as V8's task scheduling between build and the fire-and-forget Promise creation reattributes some samples -- 
a 47 ms vs 32 ms estimate gap accounted for by microtask-queue drain at the `Promise.all` gate + libuv callback marshalling now spread across the build loop instead of bunched at the end; the class-shape round left PDFPageLeaf as the only subclass with extra fields (`normalized` default false + `autoNormalizeCTM` default true, both written in the `_FastPageLeaf` constructor body) so the 1 651 page leaves on the book were ~24 B larger than plain `_FastDict` instances -- packing both booleans into `d`'s gap bits collapses PageLeaf to the same single-`d` shape (bit layout shifts from start[0:23] + length[24:37] to start[0:22] + norm[23] + auto[24] + length[25:40], dropping start from 24 to 23 bits / 8.4 M slots vs ~2.3 M mainLen, growing length from 14 to 16 bits / 65 535 vs 8 706 observed max) with the booleans as prototype getters/setters that mask in/out of bits 23-24, and the V8 Smi gotcha worth recording: Smi is 31-bit signed so d > 2^30 (i.e. length >= 32) boxes to HeapNumber where `d | NORM_BIT` would truncate to Int32 and lose the length, so all writes use arithmetic (`d + NORM_BIT` / `d - NORM_BIT` gated on the current bit state) and the COW / set / delete paths preserve the gap bits via `+ (d & GAP_MASK)` after the repack; saves ~26 KB on the 1 651 page leaves (sub-row at 512 B sampler resolution but real, calculated per-instance), output byte-identical, CPU flat (no PageLeaf mutation paths fire on the render-only workflow). |
+| [08-pdf-lib.md](notes/08-pdf-lib.md) | Profiling the process phase via `--cpu-profile-process`; pako's per-stream init dominates with ~4 500 small streams (routing pdf-lib's `deflate` + `inflate` through `node:zlib` saves ~1.5 s); `PDFRef.of`'s string-keyed Map lookup at 1.2 M calls per load (dense-array gen=0 cache saves ~0.2 s); parallelising save's per-stream deflate on libuv's pool with `objectsPerStream: 500` (~0.3 s off the main thread; PDF -5 %); `decodeName`'s regex scan on 2.76 M `PDFName.of` calls per load with a 0.0001 % hit rate (no-`#` cache saves ~0.5 s); `numberToString`'s redundant `toString`/`split`/`parseInt` on the 100 % no-`e` path; `sizeInBytes` allocating `n.toString(2)` on ~300 k xref-writer calls (short-circuit ladder saves ~60 ms); `PDFDict.entries` allocating `Array.from(map.entries())` on every dict serialisation (`Map.forEach` with hoisted callbacks saves ~80 ms); `parseDict`'s type-dispatch tail re-running `PDFName.of('Type'/'Catalog'/'Pages'/'Page')` per dict (hoisted sentinel constants drop `fastOf` self-time by 22 %); pdf-lib's `__awaiter`/`__generator` scaffolding on nine load + save methods costing ~135 ms of attributed self-time + ~50 ms GC (synchronified twins save 0.36 s of process); `parseObject`'s three speculative `matchKeyword(true/false/null)` scans on every dispatch (first-byte peek + gated keyword scans halve `parseObject` self-time); the sampling heap profile pointing at `new Map()` + `Map.prototype.set` at ~50 % of process-phase allocations (replacing `PDFDict`'s backing `Map` with a flat alternating `[k,v,k,v,...]` array drops Map+set heap traffic ~80 % and subsumes the earlier `fast-dict-iter` + `fast-parse-dict` shims); the only hot `Map.set` left being `PDFContext.assign`'s `indirectObjects.set(ref, object)` (replacing the Map with a dense gen=0 array indexed by `objectNumber` drops `assign` out of the CPU top-15 and halves the remaining `set` heap traffic); the residual `set` after that being the upstream 
`PDFRef.of` pool.set on cache miss (directly constructing the `PDFRef` via `Object.create(PDFRef.prototype)` on the gen=0 miss path bypasses the redundant upstream pool entirely, dropping `set` heap traffic from 7.7 MB to 0.5 MB and saving another ~93 ms on `PDFRef.of` CPU); `parseNumberOrRef` as the next-largest heap row at 15 MB of inlined `new PDFNumber(value)` calls -- PDFs reuse a few thousand unique numeric values hundreds of thousands of times (page indices, `/Count`, `/MediaBox` dimensions, font sizes), so pooling `PDFNumber` by value drops `parseNumberOrRef` off the top 10 and total process heap by ~13 %; `fastParseDictArray` at ~85 % FixedArray-growth garbage on 261 k dicts with a 5-entry median (pre-sizing the accumulator at `new Array(10)` collapses the per-call growth chain, dropping the row -25 % and total heap another -14 %); the next layer being the irreducible "one `new Array(10)` + one PDFDict instance per parsed dict" floor with ~1780 backing arrays still in flight after `--fast-dict-array` (collapse them into one long-lived mainBuf where every committed PDFDict entry lives, a per-parser temp stack handles parseDict recursion, owned dicts append to main and mutate in-place / COW-to-tail, and the whole PDFDict instance state packs into one 53-bit Number -- 24-bit start + 14-bit length + 1-bit owned -- so the per-dict object header collapses to a single field; total process heap drops 92 MB → 66 MB (-28 %), cumulative -57 % since the Map-backed baseline); GC self-time still ~150 ms / 15 % of process after fast-dict-onebuf (V8-flag knobs don't move it -- mark cost is dominated by walking the ~2.4 M Object-ref slots in mainBuf), so the next attack surface is shrinking the *live set* V8 has to mark by encoding slots as Numbers in a Float64Array mainBuf instead of Object references; that needs exact pool sizing, which is best done by a separate measure-allocate-work pass (Phase 0 viability gate: a no-allocate byte walker prototype clocks 135 ms vs 
PDFDocument.load at 1238 ms on the 39 MB Chrome-output PDF, ~9x cheaper, so even an 80 %-of-current work pass would land net-positive on CPU before any GC reduction; the prototype lives in `perf/phase0-measure.mjs` with `--dump-raw-pdf` on `measure.mjs` for capturing the canonical input); Phase 1 productionises that walker as `docs/lib/measure-pass.mjs` and adds `setExpectedDictSlots()` to fast-dict-onebuf so the harness's `--measure-pass` flag pre-sizes mainBuf exact (byte-identical output, +40 ms net process on the book) -- pipeline shape committed for Phase 2 to layer on, even though Phase 1 alone doesn't move the heap; a V8 IC-invalidation gotcha worth recording: rebinding the module-level `main` (rather than resizing it in place via `main.length = N`) made `_appendEntries` deopt and the heap jump 65 → 92 MB despite the resized Array being shape-identical; post-Phase-1 cleanup: the One-buffer `owned` bit was over-cautious -- `start + length === mainLen` is enough to know slots past mainLen are claimable regardless of whether the range came from the parser or a factory call, so the bit drops out of the packed value entirely (set's COW gate simplifies, _cow collapses to one branch, bit 38 becomes spare; byte-identical output, heap flat); slot-type histogram on `main[0..mainLen)` confirms keys are 100 % PDFName and the four big pools (Name / Ref / Number / Dict) cover 96.4 % of all slots, so a Float64Array mainBuf with a side `Object[]` for the residual ~3.5 % (Array / String / HexString) would collapse ~96 % of GC slot-marks at the price of a side-pool indirection -- `perf/instrument-slot-types.mjs` does the classification in 39 ms via `--instrument-slot-types`; built the Float64Array mainBuf with 4-bit tag + 49-bit payload encoding (subsuming fast-refs and fast-pdfnumber-pool's pool IDs, adding new pools for PDFArray / PDFString / PDFHexString) and confirmed byte-identical output -- but the measured win was a wash: pointer-array marks turn out to be fast in V8, 
encoding overhead at parse + decode at save roughly cancels the savings, and the new pool Maps cost ~3 MB heap, so the architecture stays on the shelf rather than shipping; mirroring the same shape to PDFArray (each instance as a view into a shared arrayBuf Float64Array via `this.d = (start, length)`, same temp-then-commit parser pattern) lands the expected heap win of -7.6 MB / -12 % and pulls parseArray's 19.6 MB attribution row off the top 10 entirely, but introduces a +360 ms wall-clock regression at save -- ~3 M per-slot `decodeValue` calls across copyBytesInto, ~100 ns each, V8 can't inline across the prototype boundary -- so the architecture stays off staging too; the natural follow-up is to hand-inline decodeValue's 10-case switch into all four hot save methods (Phase 3β), which recovers the function-call overhead and lands net wins on GC (-19 ms) and parseName (-17 ms, a downstream V8 re-optimization once the call sites became monomorphic per case branch) -- but the inlined switch body itself adds +23 ms at encode-at-parse and +22 ms at copyBytesInto, and the net of all of Phase 2 + 3 + β is "real heap+GC win, ~+200 ms wall-clock loss across many frames", so the architecture stays off staging on the simpler conclusion that V8 marks pointer arrays faster than expected and the original polymorphic `main[i].copyBytesInto()` was fine; Phase 1 wires into production via a `setExpectedDictSlots()` call in `docs/render-book.mjs` that runs the measure walker on rawPdf and hands the exact dictSlot count to fast-dict-onebuf -- the bound on mainBuf isn't material on its own (~60 K slots out of 2.4 M of slack) but commits the two-pass shape so any future shim swap doesn't have to re-do the wiring; finally, mirroring fast-dict-onebuf's range-view shape onto PDFArray (every committed element in a single append-only `arrayMain`, each PDFArray a view via packed `(start, length)`) is the lever the encoded approach was reaching for, with plain Object[] storage skipping the 
per-slot decode cost that killed Phase 3 -- ~19.6 MB `parseArray` allocation row drops off the top 15, total process heap 66 MB → 52 MB (-21 %), cumulative -66 % since the Map-backed baseline; once `fast-array-onebuf` shipped the next heap row was `PDFParser.parseIndirectObjectHeader` at 13.7 MB (25 % of total), attributed via `find-heap-callers.mjs` to V8 inlining `fastOf` into `skipJibberish`'s speculative `matchIndirectObjectHeader` call (~150 k tag-string allocations of ~25-35 B each), dropping the per-instance `PDFRef.tag` field entirely and computing `toString` / `sizeInBytes` / `copyBytesInto` from `objectNumber` / `generationNumber` directly via `_writeUint` + `_digitCount` helpers cuts `parseIndirectObjectHeader` to 9.3 MB (-4.3 MB), `fastOf` 7.7 → 4.8 MB (-2.9 MB), total process heap 51.9 → 45.2 MB (-13 %), with byte-identical output verified by inflating + diffing all 453 ObjStm streams; the same chain pointed at a redundancy on the CPU side -- `parseDocument`'s inner loop calls `skipJibberish` ~150 k times per load to recover from invalid PDFs that wedge garbage between indirect objects, but on valid PDFs every call speculatively runs `matchKeyword(xref/trailer/startxref)` (all fail on a digit) + `matchIndirectObjectHeader` (a `try` / `catch` around `parseIndirectObjectHeader` + `parseRawInt`x2 + `matchKeyword('obj')` + `fastOf`), all to confirm what the outer `while`'s `IsDigit` check already proved, so peeking the byte first and `continue`-ing on a digit (falling through to `skipJibberish` only on xref/trailer/startxref keyword starts or real jibberish) saves ~62 ms on load (mean 0.518 → 0.455 s, ~6 % of process); the next attack surface after that was the construction style itself -- `fast-refs`'s `Object.create(PDFRef.prototype) + fresh.objectNumber = ... 
+ fresh.generationNumber = ...` routes V8 through the slow-property path with intermediate hidden-class transitions per write, putting PDFRef at ~60 B/instance vs PDFName's ~31 B (built via `new PDFName(...)` with a real constructor), so swapping to a plain function used as a constructor (`function _FastRef(o, g) { this.objectNumber = o; this.generationNumber = g; }` + `_FastRef.prototype = PDFRef.prototype`) gives V8 a stable hidden class from the first instance, drops per-PDFRef cost to ~44 B for ~3.87 MB heap (-8.5 %) and ~140 ms wall-clock (-12 % of process) on the book's 226 k unique PDFRefs (paired heap+cpu profile, --fast-refs vs --fast-refs-class with the rest of production on), with `parseIndirectObjectHeader` dropping 9.1 MB → 7.4 MB and `fastOf` 4.7 MB → 3.4 MB -- the `Object.create + writes` shim stays in the tree as A/B baseline (mutex-checked in measure.mjs); the same shape change applied symmetrically to the four PDFDict factory paths in `fast-dict-onebuf` (`_makeFromRange` + the COW path inside `set` both build wrappers via `Object.create(ProtoClass.prototype) + pd.d = ...`, with PageLeaf carrying extra `normalized` / `autoNormalizeCTM` writes) -- one plain-function constructor per subclass (`_FastDict`, `_FastCatalog`, `_FastPageTree`, `_FastPageLeaf`) with the prototype aliased to the upstream prototype drops 260 k+ wrapper instances ~20 B each for `_makeFromRange (dict)` 16.5 MB → 11.4 MB, `create` builtin 2.6 MB → 0.9 MB, total process heap 41.4 MB → 35.4 MB (-14.4 %), cumulative -22 % over the two shape-change commits and -77 % since the Map-backed PDFDict baseline (152 MB → 35.4 MB); wall-clock roughly flat (0.99 → 1.03 s under cpu profile, within noise) with GC self-time +18 ms (82 → 101 ms) as expected -- the dominant GC cost is the live mainBuf scan rather than allocation rate, so cutting allocations doesn't move single-shot mark time; mirroring the same change to PDFArray's `_makeFromRange` and COW paths with a single `_FastArray` 
constructor (no subclass dispatch needed -- PDFArray has none in pdf-lib) drops ~22 B/PDFArray × ~80 k = ~1.7 MB heap, but the surprise win is on CPU + GC: with all three shape changes in place V8 sees fully monomorphic call sites for PDFRef / PDFDict / PDFArray construction and method dispatch, undoing the dict-only state's +18 ms GC regression and then some -- GC self-time 101 → 59 ms (-42 %), process wall-clock 1.03 → 0.90 s (-130 ms, -13 %), so cumulative across the three shape-change commits (fast-refs-class + fast-dict-onebuf class + fast-array-onebuf class) the process drops 1.13 → 0.90 s (-230 ms, -20 %), total heap 45.3 → 33.7 MB (-25.6 %), GC self-time 87 → 59 ms (-32 %), with output byte-identical modulo timestamps; with the constructor-shape round closed, the new #1 row in the process CPU profile was `PDFObjectParser.prototype.parseName` at 87 ms self + 57 ms via its `fastOf` callee = 144 ms combined (~16 % of process) firing 1.68 M times per load, of which 4 787 are unique (99.7 % cache-hit rate -- the same handful of dict keys like Type, Length, Pages, MediaBox over and over) -- two failed first attempts (skip per-byte ByteStream method dispatch via direct buffer access while keeping the cons-string accumulator: V8's cons-string optimisation was already covering the cost so no movement; and `String.fromCharCode.apply(null, buf.subarray(...))` as a one-shot allocation: SLOWER at ~123 ms vs ~87 ms because `.apply` on a typed-array view is a V8 deopt path) pointed at the wrong surface, the real win was caching the answer keyed on the byte content, scanning bytes with direct buffer access while accumulating a Java-style `hash * 31 + byte | 0` Smi hash in the same pass, looking up `Map<hash, Entry | Entry[]>` keyed by byte content (single-entry buckets the common case at 4.8 k names into 2^32 hash space, collision-bucket scan via `instanceof Entry` check), with cold path building the string in one shot via `String.fromCharCode` direct args and routing 
through fast-decode-name so both caches converge on the same PDFName instance -- pulls `parseName` + `fastOf` combined from 144 ms to 58 ms (-60 %), -80 ms process wall-clock (-9 %), all on load (0.41 → 0.33 s); +1.3 MB long-lived heap (4.8 k Entry objects + Uint8Array byte-keys + Map<number, ...> overhead) is a fixed cost for a workload-bounded cache; the heap-profile run shows a much bigger drop (3.50 → 2.56 s, -940 ms) -- not a real wall-clock win, just the sampler's per-allocation bookkeeping dropping in step with the ~1.6 M transient string allocations we eliminated (read cpu numbers for "did we get faster", heap numbers for long-lived cost); the next row to drop was `PDFObjectStream.getUnencodedContents` (#4 at 46 ms self / 124 ms with callees) paired with a fat `(idle)` row at 32 ms / 3.4 % -- both attributable to `parallel-deflate.mjs`'s phase 2 running build + deflate as two strictly serial passes (`objectStreams.map(os => os.getUnencodedContents())` followed by `Promise.all(unencoded.map(buf => deflateAsync(buf)))`, the first ~120 ms of main-thread block then ~30 ms of main-thread idle awaiting libuv), so folding the two `.map`s into one (`Promise.all(objectStreams.map(os => deflateAsync(os.getUnencodedContents())))`) pipelines build with deflate -- each deflate fires on libuv as its buffer is built, overlapping with the build of the next stream rather than after all 453 builds complete -- and the await resolves almost immediately by the time the build loop finishes (by then ~430 of 453 deflates have run on the 4-worker pool, each ~0.3 ms compute); paired 3-run A/B with the rest of the shipped flag set on confirms save 0.467 s → 0.420 s (-47 ms, -10 %), process 0.887 s → 0.833 s (-54 ms, -6 %), load + setOutline flat as expected; the `(idle)` row drops out of the CPU top-15 entirely and `getUnencodedContents` self-time also drops (31.56 → 22.25 ms) as V8's task scheduling between build and the fire-and-forget Promise creation reattributes some samples -- 
a 47 ms vs 32 ms estimate gap accounted for by microtask-queue drain at the `Promise.all` gate + libuv callback marshalling now spread across the build loop instead of bunched at the end; the class-shape round left PDFPageLeaf as the only subclass with extra fields (`normalized` default false + `autoNormalizeCTM` default true, both written in the `_FastPageLeaf` constructor body) so the 1 651 page leaves on the book were ~24 B larger than plain `_FastDict` instances -- packing both booleans into `d`'s gap bits collapses PageLeaf to the same single-`d` shape (bit layout shifts from start[0:23] + length[24:37] to start[0:22] + norm[23] + auto[24] + length[25:40], dropping start from 24 to 23 bits / 8.4 M slots vs ~2.3 M mainLen, growing length from 14 to 16 bits / 65 535 vs 8 706 observed max) with the booleans as prototype getters/setters that mask in/out of bits 23-24, and the V8 Smi gotcha worth recording: Smi is 31-bit signed so d > 2^30 (i.e. length >= 32) boxes to HeapNumber where `d | NORM_BIT` would truncate to Int32 and lose the length, so all writes use arithmetic (`d + NORM_BIT` / `d - NORM_BIT` gated on the current bit state) and the COW / set / delete paths preserve the gap bits via `+ (d & GAP_MASK)` after the repack; saves ~26 KB on the 1 651 page leaves (sub-row at 512 B sampler resolution but real, calculated per-instance), output byte-identical, CPU flat (no PageLeaf mutation paths fire on the render-only workflow); the same "shape change interior to construction, IC story at every caller" pattern that drove the PageLeaf collapse also yields a second-pass win on PDFRef -- single-shape `_FastRef` still allocated two inline slots for `objectNumber` + `generationNumber` but `generationNumber` is always zero on fresh-Chrome workloads except for the xref "free" entry at object 0, so splitting into `_FastRef(objectNumber)` (one slot, gen=0 path) + `_FastRefGen(objectNumber, generationNumber)` (two slots, rare gen!=0 path) with 
`PDFRef.prototype.generationNumber = 0` as a data-property default supplies the missing field via prototype lookup -- crucial that this is a data-property default not an accessor, because a first-attempt packed-`d` + getter variant regressed +1.6 MB heap / +70 ms CPU by breaking V8's monomorphic ICs at every caller of `ref.objectNumber` / `ref.generationNumber` (PDFCrossRefSection.append, PDFCrossRefStream entry tuples, PDFWriter.serializeToBuffer, fast-indirect-objects, the `{ref, offset, deleted}` literals in `addEntry`), couldn't elide the literals as aggressively under accessor dispatch, recompilation paths landed with worse code than the two-slot baseline; the two-shape data-property variant pays in a bounded place (one extra hidden class for the rare path) without touching any caller's IC, saving 8 B per gen=0 instance × 226 k unique = 1.88 MB heap on the book (34.96 MB → 33.08 MB total sampled), with output byte-identical and the gen!=0 Map (`poolGenN` keyed by `"N M"`) replacing the upstream PDFRef.of fallback entirely. |
diff --git a/perf/notes/08-pdf-lib.md b/perf/notes/08-pdf-lib.md
index c069089a..eae3a111 100644
--- a/perf/notes/08-pdf-lib.md
+++ b/perf/notes/08-pdf-lib.md
@@ -4913,6 +4913,138 @@ The change is local to
 no production import or flag change needed since
 `--fast-dict-onebuf` was already wired up.
 
+## Two-shape `PDFRef`: gen=0 single-slot
+
+`fast-refs-class`'s single-shape constructor still allocates two
+inline slots per PDFRef -- `objectNumber` and `generationNumber`.
+On fresh-Chrome workloads `generationNumber` is **zero on every
+ref** except the xref "free" entry at object 0, so the slot is dead
+weight on effectively all of the 226 k instances.
+
+### The shim
+
+Split `_FastRef` into two constructors keyed on whether
+`generationNumber` is needed:
+
+```js
+// gen=0 instances: single inline `objectNumber` slot.
+// `generationNumber` is supplied as a data-property default on
+// PDFRef.prototype (set below), so reads return 0 without any
+// accessor dispatch.
+function _FastRef(objectNumber) {
+  this.objectNumber = objectNumber;
+}
+_FastRef.prototype = PDFRef.prototype;
+
+// gen!=0 instances: both fields as own data properties, shadowing
+// the prototype default. V8 sees a second hidden class -- bounded
+// 2-shape polymorphism, well-handled by inline caches.
+function _FastRefGen(objectNumber, generationNumber) {
+  this.objectNumber = objectNumber;
+  this.generationNumber = generationNumber;
+}
+_FastRefGen.prototype = PDFRef.prototype;
+
+// Default generationNumber on the prototype. _FastRef instances
+// inherit this (no own property); _FastRefGen instances shadow it
+// with their own data property. Both look like data-property reads
+// to V8's IC.
+PDFRef.prototype.generationNumber = 0;
+```
+
+The critical design point: **prototype default, not accessor**. A
+data-property default on the prototype keeps `.generationNumber`
+reads on the hot path as plain data-property loads. V8's IC for
+`.generationNumber` sees at most two shapes, and both resolve to a
+data-property load -- from an own slot on `_FastRefGen` instances,
+or from the prototype one hop up the chain on `_FastRef` instances
+-- so no caller ever pays for accessor dispatch.
+
+### The accessor variant that didn't work
+
+A first attempt packed `(objectNumber, generationNumber)` into a
+single `d` field with `objectNumber` / `generationNumber` as
+getter accessors on the prototype:
+
+```js
+// rejected
+function _FastRefPacked(d) { this.d = d; }
+Object.defineProperty(PDFRef.prototype, 'objectNumber', {
+  get() { return this.d & MASK_obj; },
+});
+Object.defineProperty(PDFRef.prototype, 'generationNumber', {
+  get() { return this.d >>> SHIFT_gen; },
+});
+```
+
+Result: **+1.6 MB heap and +70 ms CPU** vs the two-shape variant.
+The accessor-property boundary broke V8's monomorphic ICs at every
+upstream pdf-lib call site that reads `ref.objectNumber` /
+`ref.generationNumber` -- `PDFCrossRefSection.append`,
+`PDFCrossRefStream` entry tuples, `PDFWriter.serializeToBuffer`,
+our `fast-indirect-objects` shim, plus all the small `{ref, offset,
+deleted}` literals in `addEntry`. V8 couldn't elide those object
+literals as aggressively once the property read became an accessor
+dispatch; recompilation paths landed with worse code than the
+two-slot baseline. **Same property name, same return value, but a
+different IC slot type -- and the difference shows up at every
+caller.**
+
+The two-shape variant pays for the win in a bounded place (one
+extra hidden class for the rare gen!=0 path) without touching any
+caller's IC.
+
+### Pool changes
+
+gen=0: same dense array indexed by `objectNumber` (unchanged).
+
+gen!=0: instead of falling back to the upstream `PDFRef.of`'s
+Map-based pool, the shim now keeps its own `poolGenN` Map keyed by
+`"N M"`. This means we never call into upstream PDFRef.of at all
+-- the entire `PDFRef.of` factory is ours. Path is dead on
+fresh-Chrome workloads except for the xref free entry at object 0,
+so the Map stays tiny.
+
+### Measured heap
+
+Paired heap profile (single-shape baseline vs + this change):
+
+| Allocator       | Pre        | Post       | Delta            |
+|-----------------|-----------:|-----------:|-----------------:|
+| Total sampled   |   34.96 MB |   33.08 MB | **-1.88 MB**     |
+
+Per-instance arithmetic: V8 aligns object headers + inline slots
+to 8-byte boundaries. Two-slot `_FastRefGen` shape: 12 B header +
+2×4 B slots = 20 B raw, aligned to **24 B**. One-slot `_FastRef`
+shape: 12 B header + 1×4 B slot = 16 B raw, already **16 B**.
+8 B saved per gen=0 instance × 226 k unique = ~1.8 MB, in line
+with the measured -1.88 MB delta.
+
+### Measured CPU
+
+CPU is essentially flat: no-profile wall-clock reads 0.70 s vs
+~0.83 s pre, but the run-to-run variance overlaps, so the gap is not
+attributable to this heap-only change. PDF output byte-identical.
+
+### Validation
+
+Output PDF byte-identical -- same `PDFRef` identity per logical
+ref, same prototype methods, all callers see what they always did
+(`ref.objectNumber`, `ref.generationNumber`, `ref.toString()`,
+`ref.copyBytesInto(...)` all work). The change is local to
+[`docs/lib/fast-refs-class.mjs`](../../docs/lib/fast-refs-class.mjs);
+no production import or flag change needed since
+`--fast-refs-class` was already wired up.
+
+### What this teaches
+
+The shape change is interior to PDFRef construction; the IC story
+is at every caller. A two-shape polymorphism on one class is cheap
+when V8 sees it; an accessor-property change on a hot read is
+expensive everywhere that read happens. Prefer adding shape
+variants over swapping data properties for accessors when the
+property is hot.
+
 ## `@cantoo/pdf-lib`: not a drop-in replacement
 
 Spot-checked the maintained fork (`@cantoo/pdf-lib` 2.6.5) as an
@@ -4960,7 +5092,8 @@ Cumulative process-phase cost, baseline → after the shims to date:
 | + fast-array-onebuf class shape      | ~0.8 s  | ~0.5 s  | ~0.35 s |
 | + fast-parse-name                    | ~0.75 s | ~0.4 s  | ~0.35 s |
 | + pipeline-deflate                   | ~0.7 s  | ~0.4 s  | ~0.3 s  |
-| **+ PageLeaf flag-packing (this section)** | **~0.7 s** | **~0.4 s** | **~0.3 s** |
+| + PageLeaf flag-packing              | ~0.7 s  | ~0.4 s  | ~0.3 s  |
+| **+ two-shape PDFRef (this section)** | **~0.7 s** | **~0.4 s** | **~0.3 s** |
 
 The bottom-up after the latest pair is what's left of pdf-lib's
 genuine parser work: `PDFDict.entries`, `PDFObjectParser.parseName`,

From 80e25859bb442b1e0c147c128a40cb24f6a6941a Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sun, 24 May 2026 13:32:16 +0200
Subject: [PATCH 44/44] perf: add heap-subtree.mjs (subtree dump for a
 heap-profile frame).

Companion to analyze-heap.mjs / find-heap-callers.mjs. Prints every
match for a substring on functionName, with the matched frame's
self-size and each direct child's self + descendant total.

Built during the PDFRef class-shape round to investigate why
`maybeParseCrossRefSection` showed 3.4 MB self with <40 KB worth of
named children -- a V8 inlining-attribution case where the parent
frame's compiled code absorbed `PDFCrossRefSection.addEntry`'s
object-literal allocations. The flat top-15 view can't tell you that;
this script can.
---
 perf/README.md        |  1 +
 perf/heap-subtree.mjs | 64 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100644 perf/heap-subtree.mjs

diff --git a/perf/README.md b/perf/README.md
index 3bd88e59..f3637cc1 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -446,6 +446,7 @@ or `--tracing`):
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-heap-callers.mjs` | Heap-profile companion to `find-callers.mjs`. Walks a `.heapprofile` tree and attributes a target allocator's (e.g. `set`, `Map`, `String`) self+descendant bytes back to each direct caller. Useful for "where do all these Map.set calls come from?" questions. |
 | `find-heap-callees.mjs` | Other direction: walks a `.heapprofile` tree and lists a target frame's direct children with their (self + subtree) byte totals. Used to crack open mystery rows like "fastParseDictArray has 58 MB of self-size -- what's it actually allocating?". |
+| `heap-subtree.mjs` | "What does this frame actually allocate?" -- prints the subtree under every frame whose name matches a substring, with each direct child's self + descendant total. Companion to `analyze-heap.mjs` and `find-heap-callers.mjs`; use it when a top-15 row's self-size is big but its children look tiny (typical V8 inlining-attribution case). Built during the PDFRef class-shape round to confirm `maybeParseCrossRefSection` had inlined `PDFCrossRefSection.addEntry`'s object literals into its own compiled frame. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
 | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
 | `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. Auto-pins on Windows via `pin-cpu.mjs`. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
diff --git a/perf/heap-subtree.mjs b/perf/heap-subtree.mjs
new file mode 100644
index 00000000..8aaa2129
--- /dev/null
+++ b/perf/heap-subtree.mjs
@@ -0,0 +1,64 @@
+// "What does this frame actually allocate?" -- for every frame whose
+// name contains a substring, prints its self-size plus the self and
+// self + descendant byte totals of its 12 largest direct children.
+//
+// Companion to analyze-heap.mjs (bottom-up flat list) and
+// find-heap-callers.mjs (who called this allocator). Use this when a
+// row in the top-15 looks suspicious -- e.g. a big self-size with
+// invisible children -- and you want to see what was inlined into the
+// frame's compiled code. Built during the PDFRef class-shape round,
+// where `maybeParseCrossRefSection` showed 3.4 MB self but its named
+// children totalled <40 KB; the subtree view confirmed V8 had
+// inlined `PDFCrossRefSection.addEntry` and attributed its object-
+// literal allocations to the parent frame.
+//
+// Usage:
+//   node heap-subtree.mjs <path/to/process.heapprofile> <function-name-substring>
+//
+// The substring matches case-sensitively on the V8 frame's
+// `functionName` field; all matches are reported, so a needle like
+// "parseDict" surfaces every frame containing that name.
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+const [, , profilePath, needle] = process.argv;
+if (!profilePath || !needle) {
+  console.error('usage: node heap-subtree.mjs <process.heapprofile> <function-substring>');
+  process.exit(2);
+}
+
+const profile = JSON.parse(readFileSync(resolve(profilePath), 'utf8'));
+
+function findNodes(node, out, depth = 0) {
+  const fn = (node.callFrame && node.callFrame.functionName) || '';
+  if (fn.includes(needle)) out.push(node);
+  for (const c of (node.children || [])) findNodes(c, out, depth + 1);
+}
+
+const matches = [];
+findNodes(profile.head, matches);
+console.log(`Found ${matches.length} matching frame(s)\n`);
+
+for (const m of matches) {
+  const cf = m.callFrame;
+  console.log(`=== ${cf.functionName}  @  ${cf.url}:${(cf.lineNumber||0)+1} ===`);
+  console.log(`self: ${(m.selfSize/1024).toFixed(2)} KB`);
+  console.log(`children (sorted by total):`);
+  const summarize = (n) => {
+    let total = n.selfSize;
+    for (const c of (n.children || [])) total += summarize(c);
+    n._total = total;
+    return total;
+  };
+  for (const c of (m.children || [])) summarize(c);
+  const sorted = (m.children || []).slice().sort((a, b) => b._total - a._total);
+  for (const c of sorted.slice(0, 12)) {
+    const cf = c.callFrame || {};
+    const fn = cf.functionName || '(anonymous)';
+    const url = cf.url || '';
+    const tail = url.split(/[\\/]/).slice(-2).join('/');
+    console.log(`  ${(c._total/1024).toFixed(2).padStart(10)} KB total | ${(c.selfSize/1024).toFixed(2).padStart(8)} KB self | ${fn}  @  ${tail}:${(cf.lineNumber||0)+1}`);
+  }
+  console.log('');
+}